/* * seed_search.cpp * * Created on: 2016/8/31 * Author: Tsukasa Fukunaga */ #include "seed_search.h" #include #include void SeedSearch::Run(vector &query_seq, vector &query_suffix_array, vector &db_seq, vector &db_suffix_array, vector > &start_hash, vector > &end_hash){ vector db_seed; db_seed.reserve(_max_seed_length); vector q_seed; q_seed.reserve(_max_seed_length); SeedSearchCore(query_seq, query_suffix_array, db_seq, db_suffix_array, start_hash, end_hash, db_seed, q_seed, 0, query_suffix_array.size()-1, 0, db_suffix_array.size()-1, 0.0, 0); return; } void SeedSearch::CalcInteractionEnergy(vector &hit_result, vector &query_suffix_array, vector &db_suffix_array, vector &query_accessibility, vector &query_conditional_accessibility, vector > &db_accessibility,vector > &db_conditional_accessibility, vector &db_seq_length, vector &db_seq_start_position){ int count = 0; for(int i = 0;i < _hit_candidate_result.size(); i++){ int sp_q_sa = _hit_candidate_result[i].GetSpQSa(); int ep_q_sa = _hit_candidate_result[i].GetEpQSa(); int sp_db_sa = _hit_candidate_result[i].GetSpDbSa(); int ep_db_sa = _hit_candidate_result[i].GetEpDbSa(); int length = _hit_candidate_result[i].GetLength(); double temp_score = _hit_candidate_result[i].GetEnergy(); vector q_sp_vector; q_sp_vector.reserve(10000); vector qa_vector; qa_vector.reserve(10000); for(int j = sp_q_sa; j <=ep_q_sa ; j++){ q_sp_vector.push_back(query_suffix_array[j]); qa_vector.push_back(CalcAccessibility(query_accessibility, query_conditional_accessibility, query_suffix_array[j], length)); } for(int k = sp_db_sa; k <=ep_db_sa ; k++){ int db_sp = db_suffix_array[k]; int dbseq_start = 0; int dbseq_id = 0; GetSeqIdAndStart(db_seq_length, db_seq_start_position, &dbseq_id, &dbseq_start, db_sp, length); double dba = CalcAccessibility(db_accessibility[dbseq_id], db_conditional_accessibility[dbseq_id], dbseq_start, length); for(int j = sp_q_sa; j <= ep_q_sa; j++){ count += 1; double interaction_energy = qa_vector[j-sp_q_sa] + dba + temp_score; if(interaction_energy < 0){ Hit temp_hit( q_sp_vector[j-sp_q_sa], db_sp, length, interaction_energy); temp_hit.SetDbSeqId(dbseq_id); temp_hit.SetDbSeqIdStart(dbseq_start); hit_result.push_back(temp_hit); } } } } return; } void SeedSearch::GetSeqIdAndStart(vector &db_seq_length, vector &db_seq_start_position, int* seq_id, int* start, int sp, int length){ int s = 0; int e = db_seq_start_position.size()-1; int m = (s+e)/2; while(true){ if(e-s == 0){ *seq_id = s; *start = db_seq_length[s] - (sp - db_seq_start_position[s]) - length; break; }else if(e-s==1){ if(sp >= db_seq_start_position[s] && sp < db_seq_start_position[s+1]){ *seq_id = s; *start = db_seq_length[s] - (sp - db_seq_start_position[s]) - length; break; }else{ *seq_id = e; *start = db_seq_length[e] - (sp - db_seq_start_position[e]) - length; break; } }else{ if(sp >= db_seq_start_position[m] && sp < db_seq_start_position[m+1]){ *seq_id = m; *start = db_seq_length[m] - (sp - db_seq_start_position[m]) - length; break; }else{ if(sp >= db_seq_start_position[m]){ s = m; m = (s+e)/2; }else{ e = m; m = (s+e)/2; } } } } return; } double SeedSearch::CalcAccessibility(vector accessibility, vector conditional_accessibility, int sp, int length){ double temp = accessibility[sp]; for(int i =_min_accessible_length; i < length; i++){ temp += conditional_accessibility[sp+i]; } return(temp); } void SeedSearch::SeedSearchCore(vector &query_seq, vector &query_suffix_array, vector &db_seq, vector &db_suffix_array, vector > &start_hash, vector > &end_hash, vector &db_seed, vector &q_seed, int sp_q, int ep_q, int sp_db, int ep_db, double score, int length){ if(length < _max_seed_length){ vector sp_q_result; sp_q_result.reserve(_pair_size); vector ep_q_result; ep_q_result.reserve(_pair_size); vector sp_db_result; sp_db_result.reserve(_pair_size); vector ep_db_result; ep_db_result.reserve(_pair_size); for(int i = 0; i< _pair_size; i++){ int start = sp_q; int end = ep_q; int nucleotide_size = 4; SeedSearchNextCharacter(query_seq, query_suffix_array, &start, &end, _stem_pair[i][0], length); sp_q_result.push_back(start); ep_q_result.push_back(end); start = sp_db; end = ep_db; if(length+1 > _hash_size){ SeedSearchNextCharacter(db_seq, db_suffix_array, &start, &end, _stem_pair[i][1], length); }else{ int temp = _stem_pair[i][1]-2; for(int j = 0; j 0){ int type = BP_pair[q_seed[length-1]-1][db_seed[length-1]-1]; int type2 = BP_pair[_stem_pair[i][0]-1][_stem_pair[i][1]-1]; type2 = rtype[type2]; temp_score = score + ((double)stack37[type][type2])/100; } if(temp_score < _hybrid_energy_threshold && length+1 >= _min_accessible_length){ Hit_candidate temp_hit_candidate(sp_q_result[i], ep_q_result[i], sp_db_result[i], ep_db_result[i], length+1, temp_score); _hit_candidate_result.push_back(temp_hit_candidate); }else{ q_seed.push_back(_stem_pair[i][0]); db_seed.push_back(_stem_pair[i][1]); SeedSearchCore(query_seq, query_suffix_array, db_seq, db_suffix_array, start_hash, end_hash, db_seed, q_seed, sp_q_result[i], ep_q_result[i], sp_db_result[i], ep_db_result[i], temp_score, length+1); } } } } q_seed.pop_back(); db_seed.pop_back(); return; } void SeedSearch::SeedSearchNextCharacter(vector &encoded_sequences, vector &suffix_array, int* start, int* end, unsigned char c, int offset){ int s = *start; int e = *end; int m; if (suffix_array[s] + offset >= encoded_sequences.size()) { ++(*start); } if(s>e){ *start = 1; *end = 0; return; }else if (s == e) { if (encoded_sequences[suffix_array[s]+offset] == c) { return; } else { *start = 1; *end = 0; return; } } if (encoded_sequences[suffix_array[s]+offset] != c) { while (s < e - 1) { m = (s + e) / 2; if (encoded_sequences[suffix_array[m]+offset] < c) { s = m; } else { e = m; } } if (encoded_sequences[suffix_array[e]+offset] != c) { *start = 1; *end = 0; return; } *start = e; s = e; e = *end; } if (encoded_sequences[suffix_array[e]+offset] != c) { while (s < e - 1) { m = (s + e) / 2; if (encoded_sequences[suffix_array[m]+offset] > c) { e = m; } else { s = m; } } if (encoded_sequences[suffix_array[s]+offset] != c) { *start = 1; *end = 0; return; } *end = s; } return; }