@article{Gweon2019,
  author   = {Gweon, H. Soon and Shaw, Liam P. and Swann, Jeremy and Maio, Nicola De and AbuOun, Manal and Niehus, Rene and Hubbard, Alasdair T. M. and Bowes, Mike J. and Bailey, Mark J. and Peto, Tim E. A. and Hoosdally, Sarah J. and Walker, A. Sarah and Sebra, Robert P. and Crook, Derrick W. and Anjum, Muna F. and Read, Daniel S. and Stoesser, Nicole and Abuoun, M. and Anjum, M. and Bailey, M. J. and Barker, L. and Brett, H. and Bowes, M. J. and Chau, K. and Crook, D. W. and Maio, N. De and Gilson, D. and Gweon, H. S. and Hubbard, A. T. M. and Hoosdally, S. and Kavanagh, J. and Jones, H. and Peto, T. E. A. and Read, D. S. and Sebra, R. and Shaw, L. P. and Sheppard, A. E. and Smith, R. and Stubberfield, E. and Swann, J. and Walker, A. S. and Woodford, N.},
  title    = {{The impact of sequencing depth on the inferred taxonomic composition and AMR gene content of metagenomic samples}},
  journal  = {Environmental Microbiome},
  year     = {2019},
  volume   = {14},
  number   = {1},
  pages    = {7},
  doi      = {10.1186/s40793-019-0347-1},
  pmid     = {33902704},
  keywords = {amr,read-depth,metagenomics,classification,speciation},
  abstract = {{Shotgun metagenomics is increasingly used to characterise microbial communities, particularly for the investigation of antimicrobial resistance (AMR) in different animal and environmental contexts. There are many different approaches for inferring the taxonomic composition and AMR gene content of complex community samples from shotgun metagenomic data, but there has been little work establishing the optimum sequencing depth, data processing and analysis methods for these samples. In this study we used shotgun metagenomics and sequencing of cultured isolates from the same samples to address these issues. We sampled three potential environmental AMR gene reservoirs (pig caeca, river sediment, effluent) and sequenced samples with shotgun metagenomics at high depth (\textasciitilde{}200 million reads per sample). Alongside this, we cultured single-colony isolates of Enterobacteriaceae from the same samples and used hybrid sequencing (short- and long-reads) to create high-quality assemblies for comparison to the metagenomic data. To automate data processing, we developed an open-source software pipeline, ‘ResPipe’. Taxonomic profiling was much more stable to sequencing depth than AMR gene content. 1 million reads per sample was sufficient to achieve < 1\% dissimilarity to the full taxonomic composition. However, at least 80 million reads per sample were required to recover the full richness of different AMR gene families present in the sample, and additional allelic diversity of AMR genes was still being discovered in effluent at 200 million reads per sample. Normalising the number of reads mapping to AMR genes using gene length and an exogenous spike of Thermus thermophilus DNA substantially changed the estimated gene abundance distributions. While the majority of genomic content from cultured isolates from effluent was recoverable using shotgun metagenomics, this was not the case for pig caeca or river sediment. Sequencing depth and profiling method can critically affect the profiling of polymicrobial animal and environmental samples with shotgun metagenomics. Both sequencing of cultured isolates and shotgun metagenomics can recover substantial diversity that is not identified using the other methods. Particular consideration is required when inferring AMR gene content or presence by mapping metagenomic reads to a database. ResPipe, the open-source software pipeline we have developed, is freely available (https://gitlab.com/hsgweon/ResPipe).}}
}
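@comment{
  The Gweon2019 abstract refers to normalising AMR gene read counts by gene length and by an exogenous
  Thermus thermophilus spike. As a minimal sketch only (the symbols and the exact form are assumptions
  for illustration, not necessarily the formula implemented in ResPipe), one common way to write such a
  length- and spike-normalised abundance for a gene g is:
    \frac{r_g / L_g}{r_s / L_s}
  where r_g is the number of reads mapping to gene g, L_g is the gene length, r_s is the number of reads
  mapping to the spike, and L_s is the spike genome length.
}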
@article{Baccarella2018,
  author   = {Baccarella, Alyssa and Williams, Claire R. and Parrish, Jay Z. and Kim, Charles C.},
  title    = {{Empirical assessment of the impact of sample number and read depth on RNA-Seq analysis workflow performance}},
  journal  = {BMC Bioinformatics},
  year     = {2018},
  volume   = {19},
  number   = {1},
  pages    = {423},
  doi      = {10.1186/s12859-018-2445-2},
  pmid     = {30428853},
  keywords = {rna,rna-seq,read-depth},
  abstract = {{RNA-Sequencing analysis methods are rapidly evolving, and the tool choice for each step of one common workflow, differential expression analysis, which includes read alignment, expression modeling, and differentially expressed gene identification, has a dramatic impact on performance characteristics. Although a number of workflows are emerging as high performers that are robust to diverse input types, the relative performance characteristics of these workflows when either read depth or sample number is limited, a common occurrence in real-world practice, remain unexplored. Here, we evaluate the impact of varying read depth and sample number on the performance of differential gene expression identification workflows, as measured by precision, or the fraction of genes correctly identified as differentially expressed, and by recall, or the fraction of differentially expressed genes identified. We focus our analysis on 30 high-performing workflows, systematically varying the read depth and number of biological replicates of patient monocyte samples provided as input. We find that, in general for most workflows, read depth has little effect on workflow performance when held above two million reads per sample, with reduced workflow performance below this threshold. The greatest impact of decreased sample number is seen below seven samples per group, when more heterogeneity in workflow performance is observed. The choice of differential expression identification tool, in particular, has a large impact on the response to limited inputs. Among the tested workflows, the recall/precision balance remains relatively stable at a range of read depths and sample numbers, although some workflows are more sensitive to input restriction. At ranges typically recommended for biological studies, performance is more greatly impacted by the number of biological replicates than by read depth. Caution should be used when selecting analysis workflows and interpreting results from low sample number experiments, as all workflows exhibit poorer performance at lower sample numbers near typically reported values, with variable impact on recall versus precision. These analyses highlight the performance characteristics of common differential gene expression workflows at varying read depths and sample numbers, and provide empirical guidance in experimental and analytical design.}}
}
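@comment{
  Baccarella2018 scores workflows by precision (the fraction of genes called differentially expressed
  that truly are) and recall (the fraction of truly differentially expressed genes that are called).
  In the standard notation, with TP, FP and FN denoting true positives, false positives and false
  negatives, these definitions correspond to:
    \mathrm{precision} = \frac{TP}{TP + FP}, \qquad \mathrm{recall} = \frac{TP}{TP + FN}
}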
@article{Maio2019,
  author   = {Maio, Nicola De and Shaw, Liam P and Hubbard, Alasdair and George, Sophie and Sanderson, Nicholas D and Swann, Jeremy and Wick, Ryan and AbuOun, Manal and Stubberfield, Emma and Hoosdally, Sarah J and Crook, Derrick W and Peto, Timothy E A and Sheppard, Anna E and Bailey, Mark J and Read, Daniel S and Anjum, Muna F and Walker, A Sarah and Stoesser, Nicole and {on behalf of the REHAB Consortium}},
  title    = {{Comparison of long-read sequencing technologies in the hybrid assembly of complex bacterial genomes}},
  journal  = {Microbial Genomics},
  year     = {2019},
  volume   = {5},
  number   = {9},
  doi      = {10.1099/mgen.0.000294},
  pmid     = {31483244},
  pmcid    = {PMC6807382},
  keywords = {nanopore,assembly,bacterial-genomics,hybrid-assembly,benchmark},
  abstract = {{Illumina sequencing allows rapid, cheap and accurate whole genome bacterial analyses, but short reads (<300 bp) do not usually enable complete genome assembly. Long-read sequencing greatly assists with resolving complex bacterial genomes, particularly when combined with short-read Illumina data (hybrid assembly). However, it is not clear how different long-read sequencing methods affect hybrid assembly accuracy. Relative automation of the assembly process is also crucial to facilitating high-throughput complete bacterial genome reconstruction, avoiding multiple bespoke filtering and data manipulation steps. In this study, we compared hybrid assemblies for 20 bacterial isolates, including two reference strains, using Illumina sequencing and long reads from either Oxford Nanopore Technologies (ONT) or SMRT Pacific Biosciences (PacBio) sequencing platforms. We chose isolates from the family Enterobacteriaceae, as these frequently have highly plastic, repetitive genetic structures, and complete genome reconstruction for these species is relevant for a precise understanding of the epidemiology of antimicrobial resistance. We de novo assembled genomes using the hybrid assembler Unicycler and compared different read processing strategies, as well as comparing to long-read-only assembly with Flye followed by short-read polishing with Pilon. Hybrid assembly with either PacBio or ONT reads facilitated high-quality genome reconstruction, and was superior to the long-read assembly and polishing approach evaluated with respect to accuracy and completeness. Combining ONT and Illumina reads fully resolved most genomes without additional manual steps, and at a lower consumables cost per isolate in our setting. Automated hybrid assembly is a powerful tool for complete and accurate bacterial genome assembly.}}
}

@misc{filtlong,
  author    = {Wick, Ryan},
  title     = {Filtlong: Quality filtering tool for long reads},
  year      = {2021},
  publisher = {GitHub},
  journal   = {GitHub repository},
  url       = {https://github.com/rrwick/Filtlong}
}

@misc{seqtk,
  author    = {Li, Heng},
  title     = {Seqtk: Toolkit for processing sequences in FASTA/Q formats},
  year      = {2018},
  publisher = {GitHub},
  journal   = {GitHub repository},
  url       = {https://github.com/lh3/seqtk}
}