# microBioRust ##Rust bioinformatics crate aimed at Microbial genomics
The aim of this crate is to provide Microbiology friendly Rust functions for bioinformatics.
Very much under construction!
Some concepts with many thanks to Rust-bio
To install Rust - please see here [Rust install](https://www.rust-lang.org/tools/install) or with Conda
Currently there is functionality for:
# A Genbank to GFF parser You are able to parse genbank and save as a GFF (gff3) format as well as extracting DNA sequences, gene DNA sequences (ffn) and protein fasta sequences (faa) ``` pub fn genbank_to_faa() -> Result<(), anyhow::Error> { let args: Vec = env::args().collect(); let config = Config::new(&args).unwrap_or_else(|err| { println!("Problem with parsing file arguments: {}", err); process::exit(1); }); let file_gbk = fs::File::open(config.filename)?; let mut reader = Reader::new(file_gbk); let mut records = reader.records(); let mut cds_counter: u32 = 0; loop { //collect from each record advancing on a next record basis, count cds records match records.next() { Some(Ok(mut record)) => { for (k, v) in &record.cds.attributes { match record.seq_features.get_sequence_faa(&k) { Some(value) => { let seq_faa = value.to_string(); println!(">{}|{}\n{}", &record.id, &k, seq_faa); }, _ => (), }; } cds_counter+=1; }, Some(Err(e)) => { println!("Error encountered - an err {:?}", e); }, None => { println!("finished iteration"); break; }, } } println!("Total records processed: {}", read_counter); return Ok(()); } ``` Example to save a provided multi- or single genbank file as a GFF file (by joining any multi-genbank) ``` pub fn genbank_to_gff() -> io::Result<()> { let args: Vec = env::args().collect(); let config = Config::new(&args).unwrap_or_else(|err| { println!("Problem with parsing file arguments: {}", err); process::exit(1); }); let file_gbk = fs::File::open(&config.filename)?; let prev_start: u32 = 0; let mut prev_end: u32 = 0; let mut reader = Reader::new(file_gbk); let mut records = reader.records(); let mut read_counter: u32 = 0; let mut seq_region: BTreeMap = BTreeMap::new(); let mut record_vec: Vec = Vec::new(); loop { match records.next() { Some(Ok(mut record)) => { //println!("next record"); //println!("Record id: {:?}", record.id); let source = record.source_map.source_name.clone().expect("issue collecting source name"); let beginning = match record.source_map.get_start(&source) { Some(value) => value.get_value(), _ => 0, }; let ending = match record.source_map.get_stop(&source) { Some(value) => value.get_value(), _ => 0, }; if ending + prev_end < beginning + prev_end { } seq_region.insert(source, (beginning + prev_end, ending + prev_end)); record_vec.push(record); // Add additional fields to print if needed read_counter+=1; prev_end+=ending; // create the joined record if there are multiple }, Some(Err(e)) => { println!("theres an err {:?}", e); }, None => { println!("finished iteration"); break; }, } } let output_file = format!("{}.gff", &config.filename); gff_write(seq_region.clone(), record_vec, &output_file, true); println!("Total records processed: {}", read_counter); return Ok(()); ``` Example to create a completely new record, use of setters or set_ functionality To write into GFF format requires gff_write(seq_region, record_vec, filename, true or false) The seq_region is the region of interest to save with name and DNA coordinates such as ``` seqregion.entry("source_1".to_string(), (1,897))``` This makes it possible to save the whole file or to subset it record_vec is a list of the records. If there is only one record, include this as a vec using ``` vec![record] ``` The boolean true/false describes whether the DNA sequence should be included in the GFF3 file To write into genbank format requires gbk_write(seq_region, record_vec, filename), no true or false since genbank format will include the DNA sequence ``` pub fn create_new_record() -> Result<(), anyhow::Error> { let filename = format!("new_record.gff"); let mut record = Record::new(); let mut seq_region: BTreeMap = BTreeMap::new(); //example from E.coli K12 seq_region.insert("source_1".to_string(), (1,897)); //Add the source into SourceAttributes record.source_map .set_counter("source_1".to_string()) .set_start(RangeValue::Exact(1)) .set_stop(RangeValue::Exact(897)) .set_organism("Escherichia coli".to_string()) .set_mol_type("DNA".to_string()) .set_strain("K-12 substr. MG1655".to_string()) .set_type_material("".to_string()) .set_db_xref("PRJNA57779".to_string()); //Add the features into FeatureAttributes, here we are setting two features, i.e. coding sequences or genes record.cds .set_counter("b3304".to_string()) .set_start(RangeValue::Exact(1)) .set_stop(RangeValue::Exact(354)) .set_gene("rplR".to_string()) .set_product("50S ribosomal subunit protein L18".to_string()) .set_codon_start(1) .set_strand(-1); record.cds .set_counter("b3305".to_string()) .set_start(RangeValue::Exact(364)) .set_stop(RangeValue::Exact(897)) .set_gene("rplF".to_string()) .set_product("50S ribosomal subunit protein L6".to_string()) .set_codon_start(1) .set_strand(-1); //Add the sequences for the coding sequence (CDS) into SequenceAttributes record.seq_features .set_counter("b3304".to_string()) .set_start(RangeValue::Exact(1)) .set_stop(RangeValue::Exact(354)) .set_sequence_ffn("ATGGATAAGAAATCTGCTCGTATCCGTCGTGCGACCCGCGCACGCCGCAAGCTCCAGGAG CTGGGCGCAACTCGCCTGGTGGTACATCGTACCCCGCGTCACATTTACGCACAGGTAATT GCACCGAACGGTTCTGAAGTTCTGGTAGCTGCTTCTACTGTAGAAAAAGCTATCGCTGAA CAACTGAAGTACACCGGTAACAAAGACGCGGCTGCAGCTGTGGGTAAAGCTGTCGCTGAA CGCGCTCTGGAAAAAGGCATCAAAGATGTATCCTTTGACCGTTCCGGGTTCCAATATCAT GGTCGTGTCCAGGCACTGGCAGATGCTGCCCGTGAAGCTGGCCTTCAGTTCTAA".to_string()) .set_sequence_faa("MDKKSARIRRATRARRKLQELGATRLVVHRTPRHIYAQVIAPNGSEVLVAASTVEKAIAE QLKYTGNKDAAAAVGKAVAERALEKGIKDVSFDRSGFQYHGRVQALADAAREAGLQF".to_string()) .set_codon_start(1) .set_strand(-1); record.seq_features .set_counter("bb3305".to_string()) .set_start(RangeValue::Exact(364)) .set_stop(RangeValue::Exact(897)) .set_sequence_ffn("ATGTCTCGTGTTGCTAAAGCACCGGTCGTTGTTCCTGCCGGCGTTGACGTAAAAATCAAC GGTCAGGTTATTACGATCAAAGGTAAAAACGGCGAGCTGACTCGTACTCTCAACGATGCT GTTGAAGTTAAACATGCAGATAATACCCTGACCTTCGGTCCGCGTGATGGTTACGCAGAC GGTTGGGCACAGGCTGGTACCGCGCGTGCCCTGCTGAACTCAATGGTTATCGGTGTTACC GAAGGCTTCACTAAGAAGCTGCAGCTGGTTGGTGTAGGTTACCGTGCAGCGGTTAAAGGC AATGTGATTAACCTGTCTCTGGGTTTCTCTCATCCTGTTGACCATCAGCTGCCTGCGGGT ATCACTGCTGAATGTCCGACTCAGACTGAAATCGTGCTGAAAGGCGCTGATAAGCAGGTG ATCGGCCAGGTTGCAGCGGATCTGCGCGCCTACCGTCGTCCTGAGCCTTATAAAGGCAAG GGTGTTCGTTACGCCGACGAAGTCGTGCGTACCAAAGAGGCTAAGAAGAAGTAA".to_string()) .set_sequence_faa("MSRVAKAPVVVPAGVDVKINGQVITIKGKNGELTRTLNDAVEVKHADNTLTFGPRDGYAD GWAQAGTARALLNSMVIGVTEGFTKKLQLVGVGYRAAVKGNVINLSLGFSHPVDHQLPAG ITAECPTQTEIVLKGADKQVIGQVAADLRAYRRPEPYKGKGVRYADEVVRTKEAKKK".to_string()) .set_codon_start(1) .set_strand(-1); //Add the full sequence of the entire record into the record.sequence record.sequence = "TTAGAACTGAAGGCCAGCTTCACGGGCAGCATCTGCCAGTGCCTGGACACGACCATGATA TTGGAACCCGGAACGGTCAAAGGATACATCTTTGATGCCTTTTTCCAGAGCGCGTTCAGC GACAGCTTTACCCACAGCTGCAGCCGCGTCTTTGTTACCGGTGTACTTCAGTTGTTCAGC GATAGCTTTTTCTACAGTAGAAGCAGCTACCAGAACTTCAGAACCGTTCGGTGCAATTAC CTGTGCGTAAATGTGACGCGGGGTACGATGTACCACCAGGCGAGTTGCGCCCAGCTCCTG GAGCTTGCGGCGTGCGCGGGTCGCACGACGGATACGAGCAGATTTCTTATCCATAGTGTT ACCTTACTTCTTCTTAGCCTCTTTGGTACGCACGACTTCGTCGGCGTAACGAACACCCTT GCCTTTATAAGGCTCAGGACGACGGTAGGCGCGCAGATCCGCTGCAACCTGGCCGATCAC CTGCTTATCAGCGCCTTTCAGCACGATTTCAGTCTGAGTCGGACATTCAGCAGTGATACC CGCAGGCAGCTGATGGTCAACAGGATGAGAGAAACCCAGAGACAGGTTAATCACATTGCC TTTAACCGCTGCACGGTAACCTACACCAACCAGCTGCAGCTTCTTAGTGAAGCCTTCGGT AACACCGATAACCATTGAGTTCAGCAGGGCACGCGCGGTACCAGCCTGTGCCCAACCGTC TGCGTAACCATCACGCGGACCGAAGGTCAGGGTATTATCTGCATGTTTAACTTCAACAGC ATCGTTGAGAGTACGAGTCAGCTCGCCGTTTTTACCTTTGATCGTAATAACCTGACCGTT GATTTTTACGTCAACGCCGGCAGGAACAACGACCGGTGCTTTAGCAACACGAGACAT".to_string(); gff_write(seq_region, vec![record], &filename, true); return Ok(()); } ```