/* * rna_interaction_search.cpp * * Created on: 2016/8/31 * Author: Tsukasa Fukunaga */ #include "rna_interaction_search.h" #include "fastafile_reader.h" #include "encoder.h" #include "sais.h" #include "raccess.h" #include "seed_search.h" #include "ungapped_extension.h" #include "gapped_extension.h" #include #include #include bool compare( const Hit& left, const Hit& right ) { if(left.GetDbSp() != right.GetDbSp()){ return(left.GetDbSp() < right.GetDbSp()); }else if(left.GetQSp() != right.GetQSp()){ return(left.GetQSp() < right.GetQSp()); }else if(left.GetDbLength() != right.GetDbLength()){ return(left.GetDbLength() > right.GetDbLength()); }else{ return(left.GetQLength() > right.GetQLength()); } } bool base_pair_compare( const BasePair& left, const BasePair& right ) { return(left.qpos < right.qpos); } void RnaInteractionSearch::Run(const RnaInteractionSearchParameters parameters){ string query_sequence; string query_name; vector query_encoded_sequence; query_encoded_sequence.reserve(10); vector query_suffix_array; vector query_accessibility; query_accessibility.reserve(10); vector query_conditional_accessibility; query_conditional_accessibility.reserve(10); LoadDatabase(parameters.GetDbFilename(), parameters.GetHashSize()); ReadFastaFile(parameters, query_sequence, query_name); CalculateAccessibility(parameters, query_sequence, query_accessibility, query_conditional_accessibility); ConstructSuffixArray(parameters, query_sequence, query_encoded_sequence, query_suffix_array); vector hit_result; hit_result.reserve(10000000); SearchSeed(parameters,hit_result, query_encoded_sequence, query_suffix_array, query_accessibility, query_conditional_accessibility); ExtendWithoutGap(parameters,hit_result, query_encoded_sequence, query_accessibility, query_conditional_accessibility); ExtendWithGap(parameters,hit_result, query_encoded_sequence, query_accessibility, query_conditional_accessibility); Output(parameters,hit_result,query_name); } void RnaInteractionSearch::ReadFastaFile(const RnaInteractionSearchParameters parameters, string &query_sequence, string &query_name){ FastafileReader fastafile_reader; fastafile_reader.ReadFastafile(parameters.GetInputFilename(), query_sequence, query_name); }; void RnaInteractionSearch::CalculateAccessibility(const RnaInteractionSearchParameters parameters, string &query_sequence, vector &query_accessibility, vector &query_conditional_accessibility){ FastafileReader fastafile_reader; Raccess raccess(parameters.GetMaximalSpan(), parameters.GetMinAccessibleLength()); raccess.Run(query_sequence, query_accessibility, query_conditional_accessibility); }; void RnaInteractionSearch::ConstructSuffixArray(const RnaInteractionSearchParameters parameters, string &query_sequence, vector &query_encoded_sequence, vector &query_suffix_array){ Encoder encoder(parameters.GetRepeatFlag()); encoder.Encode(query_sequence, query_encoded_sequence, 0); query_suffix_array.resize(query_encoded_sequence.size()); sais(&query_encoded_sequence[0], &query_suffix_array[0], query_encoded_sequence.size()); }; void RnaInteractionSearch::SearchSeed(const RnaInteractionSearchParameters parameters, vector &hit_result, vector &query_encoded_sequence, vector &query_suffix_array, vector &query_accessibility, vector &query_conditional_accessibility){ SeedSearch seed_search(parameters.GetHashSize(), parameters.GetMaxSeedLength(), parameters.GetMinAccessibleLength(), parameters.GetHybridEnergyThreshold()); seed_search.Run(query_encoded_sequence, query_suffix_array, _db_seq, _db_suffix_array, _start_hash, _end_hash); seed_search.CalcInteractionEnergy(hit_result, query_suffix_array, _db_suffix_array,query_accessibility, query_conditional_accessibility, _db_accessibility, _db_conditional_accessibility, _db_seq_length, _db_seq_start_position); } void RnaInteractionSearch::ExtendWithoutGap(const RnaInteractionSearchParameters parameters, vector &hit_result, vector &query_encoded_sequence, vector &query_accessibility, vector &query_conditional_accessibility){ UngappedExtension ungapped_extension(parameters.GetMinAccessibleLength(), parameters.GetDropOutLengthWoGap()); ungapped_extension.Run(hit_result, query_encoded_sequence, _db_seq, query_accessibility, query_conditional_accessibility, _db_accessibility, _db_conditional_accessibility); double energy_threshold = parameters.GetInteractionEnergyThreshold(); sort(hit_result.begin(), hit_result.end(), compare); for(int i = 0; i energy_threshold){ hit_result[i].SetFlag(); } if(!hit_result[i].GetFlag()){ int a_QSp = hit_result[i].GetQSp(); int a_DbSp = hit_result[i].GetDbSp(); int a_QEp = a_QSp+hit_result[i].GetQLength()-1; int a_DbEp = a_DbSp+hit_result[i].GetDbLength()-1; for(int j = i+1; j=b_QEp && a_QSp <= b_QSp && a_DbEp >= b_DbEp){ hit_result[j].SetFlag(); } } } } } hit_result.erase(remove_if(hit_result.begin(), hit_result.end(), CheckFlag()), hit_result.end()); GetBasePair(hit_result, query_encoded_sequence); } void RnaInteractionSearch::ExtendWithGap(const RnaInteractionSearchParameters parameters, vector &hit_result, vector &query_encoded_sequence, vector &query_accessibility, vector &query_conditional_accessibility){ GappedExtension gapped_extension(parameters.GetMinAccessibleLength(), parameters.GetDropOutLengthWGap()); gapped_extension.Run(hit_result, query_encoded_sequence, _db_seq, query_accessibility, query_conditional_accessibility, _db_accessibility, _db_conditional_accessibility, _db_seq_length); for(int i = 1; i &hit_result, string q_name){ // ofstream ofs; FILE *fl_pntr; // ofs.open(parameters.GetOutputFilename().c_str(),ios::out); fl_pntr = fopen(parameters.GetOutputFilename().c_str(), "wb"); /* if (!ofs){ cout << "Error: can't open output_file:"+parameters.GetOutputFilename()+"." < gpls_ps(0, basepair_length - 1); for(int j = basepair_length / 2; j > 0; j--) { if(hit_result[i].GetBasePairFirst(j) - 1 != hit_result[i].GetBasePairFirst(j - 1)) { break; } gpls_ps.first = j - 1; } for(int j = basepair_length / 2 + 1; j < basepair_length; j++) { if(hit_result[i].GetBasePairFirst(j) + 1 != hit_result[i].GetBasePairFirst(j + 1)) { break; } gpls_ps.second = j + 1; } for(int j = 0; j < gpls_ps.first; j++) { int dbpos = (length-1) - ( hit_result[i].GetBasePairSecond(j)- seq_start_position); string bs_pr = "(" + to_string(hit_result[i].GetBasePairFirst(j)) + ":" + to_string(dbpos) + ") "; fwrite(bs_pr.c_str(), sizeof(char), bs_pr.size(), fl_pntr); } pair db_ps_pr((length-1) - ( hit_result[i].GetBasePairSecond(gpls_ps.first) - seq_start_position), (length-1) - ( hit_result[i].GetBasePairSecond(gpls_ps.second) - seq_start_position)); string bs_prs = "([" + to_string(hit_result[i].GetBasePairFirst(gpls_ps.first)) + "," + to_string(hit_result[i].GetBasePairFirst(gpls_ps.second)) + "]:[" + to_string(db_ps_pr.first) + "," + to_string(db_ps_pr.second) + "]) "; fwrite(bs_prs.c_str(), sizeof(char), bs_prs.size(), fl_pntr); for(int j = gpls_ps.second + 1; j < basepair_length; j++) { int dbpos = (length-1) - ( hit_result[i].GetBasePairSecond(j)- seq_start_position); string bs_pr = "(" + to_string(hit_result[i].GetBasePairFirst(j)) + ":" + to_string(dbpos) + ") "; fwrite(bs_pr.c_str(), sizeof(char), bs_pr.size(), fl_pntr); } /* for(int j = 0; j &hit_result, vector &query_encoded_sequence){ for(int i = 0; i< hit_result.size(); i++){ int q_start = hit_result[i].GetQSp(); int db_start = hit_result[i].GetDbSp(); int length = hit_result[i].GetQLength(); for(int j = 0; j(&temp_i), sizeof(int)); _number_of_db_seq = temp_i; vector temp_iv; vector::iterator i_it; temp_iv.assign(temp_i, 0.0); for(i_it=temp_iv.begin();i_it!=temp_iv.end();i_it++){ ifs.read(reinterpret_cast(&*i_it),sizeof(int)); } int temp = 0; for(int i = 0; i ::iterator c_it; ifs.read(reinterpret_cast(&count), sizeof(int)); _db_seq.resize(count); for(c_it=_db_seq.begin();c_it!=_db_seq.end();c_it++){ ifs.read(reinterpret_cast(&*c_it),sizeof(unsigned char)); } ifs.close(); //load acc file ifs.open((db_file_name+".acc").c_str(), ios::in | ios::binary); if (!ifs){ cout << "Error: can't open " << db_file_name << ".acc." <::iterator it; vector temp; vector c_temp; ifs.read(reinterpret_cast(&a_count), sizeof(int)); temp.assign(a_count, 0.0); for(it=temp.begin();it!=temp.end();it++){ ifs.read(reinterpret_cast(&*it),sizeof(float)); } ifs.read(reinterpret_cast(&a_count), sizeof(int)); c_temp.assign(a_count, 0.0); for(it=c_temp.begin();it!=c_temp.end();it++){ ifs.read(reinterpret_cast(&*it),sizeof(float)); } _db_accessibility.push_back(temp); _db_conditional_accessibility.push_back(c_temp); } ifs.close(); //load nam file ifs.open((db_file_name+".nam").c_str(), ios::in); if (!ifs){ cout << "Error: can't open " << db_file_name << ".nam." <::iterator it; ifs.read(reinterpret_cast(&count), sizeof(int)); _db_suffix_array.resize(count); for(it=_db_suffix_array.begin();it!=_db_suffix_array.end();it++){ ifs.read(reinterpret_cast(&*it),sizeof(int)); } for(int i = 0; i < hash_size; i++){ for(it=_start_hash[i].begin();it!=_start_hash[i].end();it++){ ifs.read(reinterpret_cast(&*it),sizeof(int)); } } for(int i = 0; i < hash_size; i++){ for(it=_end_hash[i].begin();it!=_end_hash[i].end();it++){ ifs.read(reinterpret_cast(&*it),sizeof(int)); } } ifs.close(); }