/* * db_construction.cpp * * Created on: 2016/8/31 * Author: Tsukasa Fukunaga */ #include "db_construction.h" #include "fastafile_reader.h" #include "encoder.h" #include "sais.h" #include "raccess.h" #include #include void DbConstruction::Run(const DbConstructionParameters parameters){ vector sequences; sequences.reserve(10); vector encoded_sequences; vector suffix_array; vector > start_hash; start_hash.reserve(parameters.GetHashSize()); vector > end_hash; start_hash.reserve(parameters.GetHashSize()); ReadFastaFile(parameters, sequences); CalculateAccessibility(parameters,sequences); ConstructSuffixArray(parameters, sequences,encoded_sequences,suffix_array); ConstructHashForShortSubstring(parameters, encoded_sequences, suffix_array, start_hash, end_hash); SaveIndexData(parameters.GetDbFilename(), suffix_array, start_hash, end_hash); SaveSequenceData(parameters.GetDbFilename(), sequences, encoded_sequences); } void DbConstruction::ReadFastaFile(const DbConstructionParameters parameters, vector &sequences){ vector names; names.reserve(10); FastafileReader fastafile_reader; fastafile_reader.ReadFastafile(parameters.GetInputFilename(), sequences, names); SaveBasicInformation(parameters, names); } void DbConstruction::CalculateAccessibility(const DbConstructionParameters parameters, vector &sequences){ Raccess raccess(parameters.GetDbFilename(), parameters.GetMaximalSpan(), parameters.GetMinAccessibleLength()); for (int i = 0; i < sequences.size() ; i++){ raccess.Run(sequences[i]); } } void DbConstruction::ConstructSuffixArray(const DbConstructionParameters parameters, vector &sequences, vector &encoded_sequences, vector &suffix_array){ Encoder encoder(parameters.GetRepeatFlag()); encoder.Encode(sequences, encoded_sequences, 1); suffix_array.resize(encoded_sequences.size()); sais(&encoded_sequences[0], &suffix_array[0], encoded_sequences.size()); }; void DbConstruction::ConstructHashForShortSubstring(const DbConstructionParameters parameters, vector &encoded_sequences, vector &suffix_array, vector > &start_hash, vector > &end_hash){ int nucleotide_size = 4; for(int i = 0; i < parameters.GetHashSize(); i++){ vector temp_vector; temp_vector.reserve((int)pow(nucleotide_size,i+1)); start_hash.push_back(temp_vector); vector temp_vector2; temp_vector2.reserve((int)pow(nucleotide_size,i+1)); end_hash.push_back(temp_vector2); } int start = 0; int end = 0; for(int i = 0; i < parameters.GetHashSize(); i++){ for(int j =0; j < (int)pow(nucleotide_size,i+1); j++){ unsigned char c = (j%4)+2; if(i == 0){ start = 0; end = suffix_array.size()-1; }else{ start = start_hash[i-1][j/4]; end = end_hash[i-1][j/4]; } Search(encoded_sequences, suffix_array, &start, &end, c, i); start_hash[i].push_back(start); end_hash[i].push_back(end); } } } void DbConstruction::SaveSequenceData(string file_name, vector &sequences, vector &encoded_sequences){ ofstream of((file_name+".seq").c_str(), ios::out | ios::binary); int number_of_seq = sequences.size(); of.write(reinterpret_cast(&number_of_seq), sizeof(int)); for(int i = 0; i < number_of_seq; i++){ int seq_size = sequences[i].size(); of.write(reinterpret_cast(&seq_size), sizeof(int)); } int count = encoded_sequences.size(); vector::iterator it; of.write(reinterpret_cast(&count), sizeof(int)); for(it = encoded_sequences.begin(); it != encoded_sequences.end(); it++){ of.write(reinterpret_cast(&*it), sizeof(unsigned char)); } of.close(); } void DbConstruction::SaveIndexData(string file_name, vector &suffix_array, vector > &start_hash, vector > &end_hash){ ofstream of((file_name+".ind").c_str(), ios::out | ios::binary); int count = suffix_array.size(); vector::iterator it; of.write(reinterpret_cast(&count), sizeof(int)); for(it = suffix_array.begin(); it != suffix_array.end(); it++){ of.write(reinterpret_cast(&*it), sizeof(int)); } for(int i = 0; i< start_hash.size();i++){ for(it = start_hash[i].begin(); it != start_hash[i].end(); it++){ of.write(reinterpret_cast(&*it), sizeof(int)); } } for(int i = 0; i< end_hash.size();i++){ for(it = end_hash[i].begin(); it != end_hash[i].end(); it++){ of.write(reinterpret_cast(&*it), sizeof(int)); } } of.close(); } void DbConstruction::SaveBasicInformation(DbConstructionParameters parameters, vector &names){ ofstream of((parameters.GetDbFilename()+".bas").c_str(), ios::out | ios::binary); int hash_size = parameters.GetHashSize(); int repeat_flag = parameters.GetRepeatFlag(); int maximal_span = parameters.GetMaximalSpan(); int min_accessible_length = parameters.GetMinAccessibleLength(); of.write(reinterpret_cast(&hash_size), sizeof(int)); of.write(reinterpret_cast(&repeat_flag), sizeof(int)); of.write(reinterpret_cast(&maximal_span), sizeof(int)); of.write(reinterpret_cast(&min_accessible_length), sizeof(int)); of.close(); of.open((parameters.GetDbFilename()+".nam").c_str(), ios::out); for(int i = 0; i < names.size(); i++){ of << names[i] << endl; } of.close(); } void DbConstruction::Search(vector &encoded_sequences, vector &suffix_array, int* start, int* end, unsigned char c, int offset){ int s = *start; int e = *end; int m; if (suffix_array[s] + offset >= encoded_sequences.size()) { ++(*start); } if(s>e){ *start = 1; *end = 0; return; }else if (s == e) { if (encoded_sequences[suffix_array[s]+offset] == c) { return; } else { *start = 1; *end = 0; return; } } if (encoded_sequences[suffix_array[s]+offset] != c) { while (s < e - 1) { m = (s + e) / 2; if (encoded_sequences[suffix_array[m]+offset] < c) { s = m; } else { e = m; } } if (encoded_sequences[suffix_array[e]+offset] != c) { *start = 1; *end = 0; return; } *start = e; s = e; e = *end; } if (encoded_sequences[suffix_array[e]+offset] != c) { while (s < e - 1) { m = (s + e) / 2; if (encoded_sequences[suffix_array[m]+offset] > c) { e = m; } else { s = m; } } if (encoded_sequences[suffix_array[s]+offset] != c) { *start = 1; *end = 0; return; } *end = s; } return; }