// Copyright (c) 2020 Robert Vaser #include #include "bioparser/fasta_parser.hpp" #include "bioparser/fastq_parser.hpp" #include "biosoup/sequence.hpp" #include "spoa/spoa.hpp" std::atomic biosoup::Sequence::num_objects{0}; namespace { static struct option options[] = { {"algorithm", required_argument, nullptr, 'l'}, {"result", required_argument, nullptr, 'r'}, {"dot", required_argument, nullptr, 'd'}, {"strand-ambiguous", no_argument, nullptr, 's'}, {"version", no_argument, nullptr, 'v'}, {"help", no_argument, nullptr, 'h'}, {nullptr, 0, nullptr, 0} }; std::unique_ptr> CreateParser( const std::string& path) { auto is_suffix = [] (const std::string& str, const std::string& suff) { return str.size() < suff.size() ? false : str.compare(str.size() - suff.size(), suff.size(), suff) == 0; }; if (is_suffix(path, ".fasta") || is_suffix(path, ".fasta.gz") || is_suffix(path, ".fna") || is_suffix(path, ".fna.gz") || is_suffix(path, ".faa") || is_suffix(path, ".faa.gz") || is_suffix(path, ".fa") || is_suffix(path, ".fa.gz")) { try { return bioparser::Parser::Create(path); // NOLINT } catch (const std::invalid_argument& exception) { std::cerr << exception.what() << std::endl; return nullptr; } } if (is_suffix(path, ".fastq") || is_suffix(path, ".fastq.gz") || is_suffix(path, ".fq") || is_suffix(path, ".fq.gz")) { try { return bioparser::Parser::Create(path); // NOLINT } catch (const std::invalid_argument& exception) { std::cerr << exception.what() << std::endl; return nullptr; } } std::cerr << "[spoa::CreateParser] error: file " << path << " has unsupported format extension (valid extensions: .fasta, " << ".fasta.gz, .fna, .fna.gz, .faa, .faa.gz, .fa, .fa.gz, .fastq, " << ".fastq.gz, .fq, .fq.gz)" << std::endl; return nullptr; } void Help() { std::cout << "usage: spoa [options ...] \n" "\n" " # default output is stdout\n" " \n" " input file in FASTA/FASTQ format (can be compressed with gzip)\n" "\n" " options:\n" " -m \n" " default: 5\n" " score for matching bases\n" " -n \n" " default: -4\n" " score for mismatching bases\n" " -g \n" " default: -8\n" " gap opening penalty (must be non-positive)\n" " -e \n" " default: -6\n" " gap extension penalty (must be non-positive)\n" " -q \n" " default: -10\n" " gap opening penalty of the second affine function\n" " (must be non-positive)\n" " -c \n" " default: -4\n" " gap extension penalty of the second affine function\n" " (must be non-positive)\n" " -l, --algorithm \n" " default: 0\n" " alignment mode:\n" " 0 - local (Smith-Waterman)\n" " 1 - global (Needleman-Wunsch)\n" " 2 - semi-global\n" " -r, --result (option can be used multiple times)\n" " default: 0\n" " result mode:\n" " 0 - consensus (FASTA)\n" " 1 - multiple sequence alignment (FASTA)\n" " 2 - 0 & 1 (FASTA)\n" " 3 - partial order graph (GFA)\n" " 4 - 0 & 3 (GFA)\n" " -d, --dot \n" " output file for the partial order graph in DOT format\n" " -s, --strand-ambiguous\n" " for each sequence pick the strand with the better alignment\n" " --version\n" " prints the version number\n" " -h, --help\n" " prints the usage\n" "\n" " gap mode:\n" " linear if g >= e\n" " affine if g <= q or e >= c\n" " convex otherwise (default)\n"; } void PrintGfa( const spoa::Graph& graph, const std::vector& headers, const std::vector& is_reversed, bool include_consensus = false) { if (headers.size() < graph.sequences().size()) { std::cerr << "[spoa::PrintGfa] error: missing header(s)" << std::endl; return; } if (!is_reversed.empty() && is_reversed.size() < graph.sequences().size()) { std::cerr << "[spoa::PringGfa] error: missing reversion flag(s)" << std::endl; // NOLINT return; } std::vector is_consensus_node(graph.nodes().size(), false); for (const auto& it : graph.consensus()) { is_consensus_node[it->id] = true; } std::cout << "H\tVN:Z:1.0" << std::endl; for (const auto& it : graph.nodes()) { std::cout << "S\t" << it->id + 1 << "\t" << static_cast(graph.decoder(it->code)); if (is_consensus_node[it->id]) { std::cout << "\tic:Z:true"; } std::cout << std::endl; for (const auto& jt : it->outedges) { std::cout << "L\t" << it->id + 1 << "\t" << "+\t" << jt->head->id + 1 << "\t" << "+\t" << "OM\t" << "ew:f:" << jt->weight; if (is_consensus_node[it->id] && is_consensus_node[jt->head->id]) { std::cout << "\tic:Z:true"; } std::cout << std::endl; } } for (std::uint32_t i = 0; i < graph.sequences().size(); ++i) { std::cout << "P\t" << headers[i] << "\t"; std::vector path; auto curr = graph.sequences()[i]; while (true) { path.emplace_back(curr->id + 1); if (!(curr = curr->Successor(i))) { break; } } bool ir = !is_reversed.empty() && is_reversed[i]; if (ir) { std::reverse(path.begin(), path.end()); } for (std::uint32_t j = 0; j < path.size(); ++j) { if (j != 0) { std::cout << ","; } std::cout << path[j] << (ir ? "-" : "+"); } std::cout << "\t*" << std::endl; } if (include_consensus) { std::cout << "P\tConsensus\t"; for (std::uint32_t i = 0; i < graph.consensus().size(); ++i) { if (i != 0) { std::cout << ","; } std::cout << graph.consensus()[i]->id + 1 << "+"; } std::cout << "\t*" << std::endl; } } } // namespace int main(int argc, char** argv) { std::int8_t m = 5; std::int8_t n = -4; std::int8_t g = -8; std::int8_t e = -6; std::int8_t q = -10; std::int8_t c = -4; std::uint8_t algorithm = 0; std::vector results = { 0 }; std::string dot_path{}; bool is_strand_ambiguous = false; std::string optstr = "m:n:g:e:q:c:l:r:d:sh"; int opt; while ((opt = getopt_long(argc, argv, optstr.c_str(), options, nullptr)) != -1) { // NOLINT switch (opt) { case 'm': m = atoi(optarg); break; case 'n': n = atoi(optarg); break; case 'g': g = atoi(optarg); break; case 'e': e = atoi(optarg); break; case 'q': q = atoi(optarg); break; case 'c': c = atoi(optarg); break; case 'l': algorithm = atoi(optarg); break; case 'r': results.emplace_back(atoi(optarg)); break; case 'd': dot_path = optarg; break; case 's': is_strand_ambiguous = true; break; case 'v': std::cout << VERSION << std::endl; return 0; case 'h': Help(); return 0; default: return 1; } } if (results.size() > 1) { results.erase(results.begin()); } if (optind >= argc) { std::cerr << "[spoa::] error: missing input file!" << std::endl; Help(); return 1; } auto sparser = CreateParser(argv[optind]); if (sparser == nullptr) { return 1; } std::unique_ptr alignment_engine; try { alignment_engine = spoa::AlignmentEngine::Create( static_cast(algorithm), m, n, g, e, q, c); } catch(std::invalid_argument& exception) { std::cerr << exception.what() << std::endl; return 1; } std::vector> sequences; sequences = sparser->Parse(-1); std::size_t max_sequence_len = 0; for (const auto& it : sequences) { max_sequence_len = std::max(max_sequence_len, it->data.size()); } try { alignment_engine->Prealloc(max_sequence_len, 4); } catch (std::invalid_argument& exception) { std::cerr << exception.what() << std::endl; return 1; } spoa::Graph graph{}; std::vector is_reversed; for (const auto& it : sequences) { std::int32_t score = 0; spoa::Alignment alignment; try { alignment = alignment_engine->Align(it->data, graph, &score); } catch (std::invalid_argument& exception) { std::cerr << exception.what() << std::endl; return 1; } if (is_strand_ambiguous) { it->ReverseAndComplement(); std::int32_t score_rev = 0; spoa::Alignment alignment_rev; try { alignment_rev = alignment_engine->Align(it->data, graph, &score_rev); } catch (std::invalid_argument& exception) { std::cerr << exception.what() << std::endl; return 1; } if (score >= score_rev) { it->ReverseAndComplement(); is_reversed.push_back(false); } else { alignment = alignment_rev; is_reversed.push_back(true); } } try { if (it->quality.empty()) { graph.AddAlignment(alignment, it->data); } else { graph.AddAlignment(alignment, it->data, it->quality); } } catch(std::invalid_argument& exception) { std::cerr << exception.what() << std::endl; return 1; } } for (const auto& it : results) { switch (it) { case 0: { auto consensus = graph.GenerateConsensus(); std::cout << ">Consensus LN:i:" << consensus.size() << std::endl << consensus << std::endl; break; } case 1: case 2: { auto msa = graph.GenerateMultipleSequenceAlignment(it == 2); for (std::uint32_t i = 0; i < msa.size(); ++i) { std::string name = i < sequences.size() ? sequences[i]->name : "Consensus"; // NOLINT std::cout << ">" << name << std::endl << msa[i] << std::endl; } break; } case 3: case 4: { std::vector headers; for (const auto& it : sequences) { headers.emplace_back(it->name); } graph.GenerateConsensus(); PrintGfa(graph, headers, is_reversed, it == 4); break; } default: break; } } graph.PrintDot(dot_path); return 0; }