//! This example is a very simple approach to testing the validity of a chain //! file. To run the example, you'll need a reference FASTA, a query FASTA, as //! well as a chain file lifting over from the reference to the query. You can //! call the program like so: //! //! ``` //! cargo run --release --example chain_check //! ``` //! //! If all goes well, the program will print out the total number of positions //! compared, the total number of positions with mismatches, and the overall //! match rate between the two genomes (via the chain file). In some cases, you //! will experience errors: this indicates a malformed chain file or incorrect //! pairings of chain file/reference FASTA/query FASTA. //! //! Note that this program somewhat simplifies what a "match" between the two //! genomes means. For example, all nucleotides within both FASTA files are //! mapped to 'A', 'C', 'G', 'T', or 'Other'. Lowercase and uppercase letters in //! the FASTA file have semantic meaning, but for the purposes of this //! comparison, they are considered the same. use std::collections::HashMap; use std::env; use std::fs::File; use std::io::BufReader; use std::iter::zip; use chain::core::Coordinate; use chain::core::Interval; use chain::core::Position; use chain::core::Strand; use chain::liftover; use chainfile as chain; use flate2::read::GzDecoder; use noodles::fasta; fn main() -> Result<(), Box> { let mut total_positions = 0usize; let mut mismatches = 0usize; let chain_path = env::args().nth(1).expect("missing chain path"); let reference_path = env::args().nth(2).expect("missing reference path"); let query_path = env::args().nth(3).expect("missing query path"); let chain = File::open(chain_path) .map(GzDecoder::new) .map(BufReader::new) .map(chain::Reader::new)?; let machine = liftover::machine::builder::Builder::default().try_build_from(chain)?; let mut reference = HashMap::new(); for result in fasta::reader::Builder::default() .build_from_path(reference_path)? .records() { let record = result?; reference.insert(record.name().to_string(), record.sequence().clone()); } let mut query = HashMap::new(); for result in fasta::reader::Builder::default() .build_from_path(query_path)? .records() { let record = result?; query.insert(record.name().to_string(), record.sequence().clone()); } for (name, reference_sequence) in reference { let interval = Interval::try_new( Coordinate::try_new(name.clone(), 0, Strand::Positive)?, Coordinate::try_new(name, reference_sequence.len(), Strand::Positive)?, )?; let results = match machine.liftover(&interval) { Some(results) => results, None => { println!( "INFO: region {} is not included in the query genome.", interval ); continue; } }; for region in results { let query_sequence = query.get(region.query().contig()).unwrap_or_else(|| { panic!( "query fasta did not contain necessary contig: {}. \ Are the FASTA files and chain file correct?", region.query().contig() ) }); let reference = get_sequence_for_interval(&reference_sequence, region.reference()); let query = get_sequence_for_interval(query_sequence, region.query()); assert_eq!(reference.len(), query.len()); total_positions += reference.len(); mismatches += count_mismatches(&reference, &query); } } let result = (1.0 - (mismatches as f64 / total_positions as f64)) * 100.0; println!("Total mismatches found: {}", mismatches); println!("Total positions checked: {}", total_positions); println!("Percentage match for lifted regions: {:.1}%", result); Ok(()) } fn get_sequence_for_interval( sequence: &fasta::record::Sequence, interval: &chain::core::Interval, ) -> Vec { let result = sequence .slice(parse_interval(interval)) .expect("sequence lookup to succeed. Are the FASTA files and chain file correct?") .as_ref() .iter() .map(Nucleotide::from) .collect::>(); match interval.strand() { Strand::Positive => result, Strand::Negative => result .into_iter() .rev() .map(|n| n.complement()) .collect::>(), } } fn parse_interval(interval: &chain::core::Interval) -> noodles::core::region::Interval { let (start, end) = match interval.strand() { Strand::Positive => { let start = match interval.start().position() { Position::ZeroBased(position) => { noodles::core::Position::try_from(*position + 1).unwrap() } Position::NegativeBound => unreachable!(), }; let end = match interval.end().position() { Position::ZeroBased(position) => { noodles::core::Position::try_from(*position).unwrap() } Position::NegativeBound => unreachable!(), }; (start, end) } Strand::Negative => { let start = match interval.end().position() { Position::ZeroBased(position) => { noodles::core::Position::try_from(*position + 2).unwrap() } Position::NegativeBound => noodles::core::Position::try_from(1).unwrap(), }; let end = match interval.start().position() { Position::ZeroBased(position) => { noodles::core::Position::try_from(*position + 1).unwrap() } Position::NegativeBound => unreachable!(), }; (start, end) } }; noodles::core::region::Interval::from(start..=end) } #[derive(Debug, Eq, PartialEq)] pub enum Nucleotide { A, C, G, T, Other, } impl Nucleotide { pub fn complement(self) -> Nucleotide { match self { Nucleotide::A => Nucleotide::T, Nucleotide::C => Nucleotide::G, Nucleotide::G => Nucleotide::C, Nucleotide::T => Nucleotide::A, Nucleotide::Other => Nucleotide::Other, } } } impl From<&u8> for Nucleotide { fn from(value: &u8) -> Self { match value { b'A' => Nucleotide::A, b'a' => Nucleotide::A, b'C' => Nucleotide::C, b'c' => Nucleotide::C, b'G' => Nucleotide::G, b'g' => Nucleotide::G, b'T' => Nucleotide::T, b't' => Nucleotide::T, _ => Nucleotide::Other, } } } pub fn count_mismatches(reference: &[Nucleotide], query: &[Nucleotide]) -> usize { let mut result = 0; for (r, q) in zip(reference.iter(), query.iter()) { if r != q { result += 1; } } result }