#! /usr/bin/env python import utils from Bio import SeqIO import seaborn from matplotlib import pyplot import os from sklearn.metrics import roc_curve import math from math import sqrt import multiprocessing import numpy import glob from Bio import AlignIO import pandas from statistics import mean seaborn.set(font_scale = 1.2) color_palette = seaborn.color_palette() color_palette_2 = seaborn.color_palette("Set2") white = "#F2F2F2" pyplot.rcParams["figure.dpi"] = 600 def main(): (current_work_dir_path, asset_dir_path, program_dir_path, conda_program_dir_path) = utils.get_dir_paths() num_of_threads = multiprocessing.cpu_count() image_dir_path = asset_dir_path + "/images" if not os.path.exists(image_dir_path): os.mkdir(image_dir_path) consalign_elapsed_time_data_file_path = asset_dir_path + "/consalign_elapsed_time_data.dat" raf_elapsed_time_data_file_path = asset_dir_path + "/raf_elapsed_time_data.dat" locarna_elapsed_time_data_file_path = asset_dir_path + "/locarna_elapsed_time_data.dat" dafs_elapsed_time_data_file_path = asset_dir_path + "/dafs_elapsed_time_data.dat" sparse_elapsed_time_data_file_path = asset_dir_path + "/sparse_elapsed_time_data.dat" turbofold_elapsed_time_data_file_path = asset_dir_path + "/turbofold_elapsed_time_data.dat" image_dir_path = asset_dir_path + "/images" if not os.path.exists(image_dir_path): os.mkdir(image_dir_path) seq_lens_avg, seq_nums, consalign_elapsed_times = utils.read_elapsed_time_data(consalign_elapsed_time_data_file_path) data = {"Average RNA sequence length": seq_lens_avg} data_frame = pandas.DataFrame(data = data) num_of_range_short = len(data_frame.query("`Average RNA sequence length` < 100.")) num_of_range_mid = len(data_frame.query("100. <= `Average RNA sequence length` < 200.")) num_of_range_long = len(data_frame.query("200. <= `Average RNA sequence length`")) print("# entries in Rfam = %d" % (num_of_range_short + num_of_range_mid + num_of_range_long)) ax = pyplot.pie([num_of_range_short, num_of_range_mid, num_of_range_long], labels = ["Short", "Medium", "Long"], counterclock = False, startangle = 90, autopct = '%1.1f%%', pctdistance = 0.7) pyplot.tight_layout() pyplot.savefig(image_dir_path + "/avg_seq_len_dist.svg", bbox_inches = "tight") pyplot.clf() data = {"Number of RNA sequences": seq_nums} data_frame = pandas.DataFrame(data = data) num_of_range_few = len(data_frame.query("`Number of RNA sequences` < 6")) num_of_range_mid = len(data_frame.query("6 <= `Number of RNA sequences` < 11")) num_of_range_many = len(data_frame.query("11 <= `Number of RNA sequences`")) ax = pyplot.pie([num_of_range_few, num_of_range_mid, num_of_range_many], labels = ["Few", "Medium", "Many"], counterclock = False, startangle = 90, autopct = '%1.1f%%', pctdistance = 0.7) pyplot.tight_layout() pyplot.savefig(image_dir_path + "/seq_num_dist.svg", bbox_inches = "tight") pyplot.clf() _, _, raf_elapsed_times = utils.read_elapsed_time_data(raf_elapsed_time_data_file_path) _, _, locarna_elapsed_times = utils.read_elapsed_time_data(locarna_elapsed_time_data_file_path) _, _, dafs_elapsed_times = utils.read_elapsed_time_data(dafs_elapsed_time_data_file_path) _, _, sparse_elapsed_times = utils.read_elapsed_time_data(sparse_elapsed_time_data_file_path) _, _, turbofold_elapsed_times = utils.read_elapsed_time_data(turbofold_elapsed_time_data_file_path) elapsed_times = consalign_elapsed_times + raf_elapsed_times + locarna_elapsed_times + dafs_elapsed_times + sparse_elapsed_times + turbofold_elapsed_times data = {"Average RNA sequence length": seq_lens_avg * 6, "# RNA sequences": seq_nums * 6, "Running time (s)": elapsed_times, "RNA structural aligner": ["ConsAlign"] * len(consalign_elapsed_times) + ["RAF"] * len(raf_elapsed_times) + ["LocARNA"] * len(locarna_elapsed_times) + ["DAFS"] * len(dafs_elapsed_times) + ["SPARSE"] * len(sparse_elapsed_times) + ["LinearTurboFold"] * len(turbofold_elapsed_times)} data_frame = pandas.DataFrame(data = data) ax = seaborn.lmplot(x = "Average RNA sequence length", y = "Running time (s)", data = data_frame, lowess = True, hue = "RNA structural aligner", scatter = False) seaborn.move_legend(ax, "upper right", bbox_to_anchor = (1.3, 1)) pyplot.tight_layout() pyplot.savefig(image_dir_path + "/rna_aligner_reg_plot_elapsed_time.svg", bbox_inches = "tight") pyplot.clf() ax = seaborn.lmplot(x = "# RNA sequences", y = "Running time (s)", data = data_frame, lowess = True, hue = "RNA structural aligner", scatter = False, legend = False) pyplot.tight_layout() pyplot.savefig(image_dir_path + "/rna_aligner_reg_plot_elapsed_time_seq_num.svg", bbox_inches = "tight") pyplot.clf() def get_bin_counts(params): (estimated_ss_file_path, ref_css, rna_seq_lens) = params num_of_rnas = len(rna_seq_lens) estimated_css = utils.get_css(estimated_ss_file_path) if estimated_ss_file_path.endswith(".sth") else utils.get_sss(estimated_ss_file_path) tp = fp = tn = fn = 0 for m in range(0, num_of_rnas): sub_estimated_css = estimated_css[m] sub_ref_css = ref_css[m] rna_seq_len_1 = rna_seq_lens[m] for i in range(0, rna_seq_len_1): for j in range(i + 1, rna_seq_len_1): estimated_bin = (i, j) in sub_estimated_css ref_bin = (i, j) in sub_ref_css if estimated_bin == ref_bin: if estimated_bin == True: tp += 1 else: tn += 1 else: if estimated_bin == True: fp += 1 else: fn += 1 return tp, tn, fp, fn def get_metrics(params): (estimated_sa_file_path, ref_sa, rna_seq_lens) = params num_of_rnas = len(rna_seq_lens) estimated_sa = AlignIO.read(estimated_sa_file_path, "stockholm" if estimated_sa_file_path.endswith(".sth") else "fasta") tp = total = 0 estimated_sa_len = len(estimated_sa[0]) num_of_rnas = len(estimated_sa) pos_map_sets_estimated = [] for i in range(num_of_rnas): pos_maps_estimated = [] pos = -1 for j in range(estimated_sa_len): c = estimated_sa[i][j] if c != "-": pos += 1 if c != "-": pos_maps_estimated.append(pos) else: pos_maps_estimated.append(-1) pos_map_sets_estimated.append(pos_maps_estimated) aligned_pair_sets_estimated = {} for m in range(0, num_of_rnas): pos_maps_estimated = pos_map_sets_estimated[m] for n in range(m + 1, num_of_rnas): aligned_pairs_estimated = set() pos_maps_estimated_2 = pos_map_sets_estimated[n] for i in range(estimated_sa_len): pos_pair = (pos_maps_estimated[i], pos_maps_estimated_2[i]) if pos_pair[0] != -1 or pos_pair[1] != -1: aligned_pairs_estimated.add(pos_pair) total += 1 aligned_pair_sets_estimated[(m, n)] = aligned_pairs_estimated ref_sa_len = len(ref_sa[0]) ref_pos_map_sets = [] for i in range(num_of_rnas): ref_pos_maps = [] pos = -1 for j in range(ref_sa_len): c = ref_sa[i][j] if c != "-": pos += 1 if c != "-": ref_pos_maps.append(pos) else: ref_pos_maps.append(-1) ref_pos_map_sets.append(ref_pos_maps) for m in range(0, num_of_rnas): ref_pos_maps = ref_pos_map_sets[m] for n in range(m + 1, num_of_rnas): ref_pos_maps_2 = ref_pos_map_sets[n] aligned_pos_pairs_estimated = aligned_pair_sets_estimated[(m, n)] for i in range(ref_sa_len): pos_pair = (ref_pos_maps[i], ref_pos_maps_2[i]) if pos_pair[0] != -1 or pos_pair[1] != -1: if pos_pair in aligned_pos_pairs_estimated: tp += 1 f = open(estimated_sa_file_path, "r") line = f.readlines()[1] splits = line.split() string = splits[-1] expected_sps = float(string.split("=")[-1]) string = splits[-2] gamma_bp = float(string.split("=")[-1]) string = splits[-3] gamma_align = float(string.split("=")[-1]) return (tp / total, expected_sps, gamma_bp, gamma_align) def get_sci(params): (estimated_sa_file_path, ref_sa, rna_seq_lens) = params rnaalifold_command = "RNAalifold %s --sci" % estimated_sa_file_path (output, _, _) = utils.run_command(rnaalifold_command) output = str(output) splits = output.split("sci = ") sci = splits[-1].strip() sci = float(sci[:-4]) return sci def get_f1_score(result): (tp, tn, fp, fn) = result denom = tp + 0.5 * (fp + fn) return tp / denom def get_mcc(result): (tp, tn, fp, fn) = result denom = sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) return (tp * tn - fp * fn) / denom if __name__ == "__main__": main()