############################################################################ # demotic_fasta package # Parses fasta or ssearch output, stores extracted information in convenient vars. # SRE, Wed Jun 25 13:41:41 2003 # ############################################################################ package demotic_fasta; # parse(\*STDIN) would parse ssearch output # coming in via stdin. # sub parse (*) { my $fh = shift; my $parsing_header = 1; my $parsing_hitlist = 0; my $parsing_alilist = 0; my $target; my $alilinecount = 0; my $prvaliline_isquery = 0; my $ali_qline; my $ali_tline; my $ali_qasq; my $ali_tasq; my $margin; # Initialize everything... so we can call the parser # repeatedly, one call per ssearch output. # # This section also documents what's available in # variables within the package. # # Arrays are indexed [0..nhits-1] or [0..nali-1] # $queryname = ""; # Name of the query sequence $querydesc = ""; # Description line for the query (or "") $querylen = 0; # Length of the query in residues $db = ""; # Name of the target database file $db_nseq = 0; # Number of sequences in the target database $db_nletters = ""; # Number of residues in the target database # (stored as a string so we can have large #'s) # The top hit list (still in rank order) $nhits = 0; # Number of entries in the hit list @hit_target = (); # Target sequence name (by rank) %target_desc = (); # Target sequence description (by targ name) %target_len = (); # Length of target sequence @hit_score = (); # Raw score (by rank) @hit_bitscore = (); # Bit score (by rank) @hit_Eval = (); # E-value (by rank) # The alignment output (in order it came in) # all indexed by # of alignment [0..nali-1] $nali = 0; # Number of alignments @ali_target = (); # Target sequence name @ali_score = (); # Smith/Waterman raw score of alignment @ali_bitscore = (); # bit score @ali_evalue = (); # E-value @ali_nident = (); # Number of identical residues @ali_alen = (); # Length of alignment (overlap) @ali_identity = (); # Percent identity # @ali_npos = (); # Number of positives (similar positions) # @ali_positive = (); # Percent of positions matched or similar @ali_qstart = (); # Start position on query @ali_qend = (); # End position on query @ali_tstart = (); # Start position on target @ali_tend = (); # End position on target @ali_qali = (); # Aligned string from query @ali_tali = (); # Aligned string from target (subject) @ali_qmask = (); # line in between the two aligned strings @ali_tmask = (); # line in between the two aligned strings @ali_hitidx = (); # index of hit $hitidx = -1; if (defined($save_querycount) && $save_querycount > 1) { # on subsequent queries, we had to use the >>> start line to detect # the end of the prev query; we socked the necessary info away in tmp vars. $querycount = $save_querycount; $queryname = $save_queryname; $querydesc = $save_querydesc; $querylen = $save_querylen; } # Now, loop over the lines of our input, and parse 'em. # my $line; my $prvline; while (<$fh>) { $line = $_; if ($parsing_header) { if (/^The best scores are:/) { # start of hit list $parsing_header = 0; $parsing_hitlist = 1; next; } elsif (/^!! No sequences/) { # no hit list: no hits; just return return 1; # return "success" } elsif (/^\s+(\d+)>>>\s*(\S*)\s*(.*)\s*-\s*(\d+) \S\S$/) { # allows blank query. \S\S is "nt" or "aa" $querycount = $1; $queryname = $2; $querydesc = $3; $querylen = $4; if ($queryname eq "") { $queryname = "unnamed_query"; } } elsif (/^\s+(\d+)\s+residues in\s+(\d+)\s+sequences\s*$/) { $db_nletters = $1; $db_nseq = $2; } } elsif ($parsing_hitlist) { if (/^\s*$/) { # blank line marks end of hit list, start of alignments $parsing_hitlist = 0; $parsing_alilist = 1; next; } elsif (/^(\S+)\s*(.*\S?)\s*\(\s*(\d+)\)\s+(\d+)\s+(\S+)\s+(\S+)\s*$/) { $hit_target[$nhits] = $1; $target_desc{$1} = $2; $target_len{$1} = $3; $hit_score[$nhits] = $4; $hit_bitscore[$nhits] = $5; $hit_Eval[$nhits] = $6; $nhits++; } } elsif ($parsing_alilist) { if (/^>>(\S+)\s*(.*)\s+\((\d+) \S\S\)\s*$/) { # the \S\S is either nt or aa $target = $1; $hitidx ++; $target_desc{$target} = $2; if ($3 != $target_len{$target}) { die "can't happen.", "1)", $3, "2)", $target_len{$target}; } } elsif (/^ s-w opt:\s+(\d+)\s+Z-score:\s*(\S+)\s+bits:\s+(\S+)\s+E\(\d*\):\s+(\S+)\s*$/) { # SSEARCH $nali++; $ali_target[$nali-1] = $target; $ali_score[$nali-1] = $1; $ali_bitscore[$nali-1] = $3; $ali_evalue[$nali-1] = $4; $ali_hitidx[$nali-1] = $hitidx; } elsif (/^ initn:\s*\d+\s*init1:\s*\d+\s*opt:\s*(\d+)\s*Z-score:\s*(\S+)\s*bits:\s*(\S+)\s*E\(\d*\):\s*(\S+)\s*$/) { # FASTA $nali++; $ali_target[$nali-1] = $target; $ali_score[$nali-1] = $1; $ali_bitscore[$nali-1] = $3; $ali_evalue[$nali-1] = $4; $ali_hitidx[$nali-1] = $hitidx; } elsif (/^Smith-Waterman score:\s+(\d+);\s+(\S+)% identity .* in (\d+) \S\S overlap \((\d+)-(\d+):(\d+)-(\d+)\)\s*/) { $ali_identity[$nali-1] = $2; $ali_alen[$nali-1] = $3; $ali_qstart[$nali-1] = $4; $ali_qend[$nali-1] = $5; $ali_tstart[$nali-1] = $6; $ali_tend[$nali-1] = $7; # $ali_nident[$nali-1] = $1; # $ali_npos[$nali-1] = $4; # $ali_positive[$nali-1] = $5; $alilinecount = 0; $ali_qali[$nali-1] = ""; $ali_tali[$nali-1] = ""; $ali_qmask[$nali-1] = ""; $ali_tmask[$nali-1] = ""; } elsif (/^(\S+)\s+(\S+)\s*$/) { # only ali lines are right-flushed # the usual alignment display is # ali_query_line # mask # ali_target_line # # (1) ends are not flushed, and they can have "extra stuff" # function calculate_flushedmask() corrects that # # (2) alingments do not need to be complete. (particularly using option -a ) # Meaning at the end AND at the beginning as well, you can end up with one single query line # or one single target line. # # ali_query_line # (no mask) # (no ali_target_line) # # or # # (no ali_query_line) # (no mask) # ali_target_line # # or even # # aliq_query_line # (no mask) # ali_target_Line # # why? oh! why? check for that and fix it. my $name = $1; my $asq = $2; #carefull, the alignment part, truncates the names of the query and targets # this is a problem specially if the query and target names start similarly. # it appears that querynames have been truncated to 5 characters and targetnames to 6 # also check for a prvline with numbers, but if len < 10 those do not show up either if ($queryname =~ /^$name/ && (length($name) <= 5 || $prvline =~ /\s+(\d+)\s*/)) { $prvaliline_isquery = 1; $ali_qline = $_; $ali_qline =~ s/\n//; $ali_qasq = $asq; $mask = ""; } elsif ($ali_target[$nali-1] =~ /^$name/) { $talilinecount ++; $ali_tline = $_; $ali_tline =~ s/\n//; $ali_tasq = $asq; if ($prvaliline_isquery) { $ali_qali[$nali-1] .= $ali_qasq; $ali_tali[$nali-1] .= $ali_tasq; $ali_qmask[$nali-1] .= calculate_flushedmask($ali_qline, $mask); $ali_tmask[$nali-1] .= calculate_flushedmask($ali_tline, $mask); } $prvaliline_isquery = 0; } $alilinecount++; } elsif (/^(\s*[\.\:][\s\.\:]*)$/) { $mask .= $1; } elsif (/^\s+(\d+)>>>\s*(\S*)\s*(.*)\s*-\s*(\d+) \S\S$/) { # next query is starting. \S\S is "nt" or "aa" $save_querycount = $1; $save_queryname = $2; $save_querydesc = $3; $save_querylen = $4; if ($save_queryname eq "") { $save_queryname = "unnamed_query"; } return 1; # normal return. We've finished output for a query, and stored some info about the next one. } } $prvline = $line; } # this closes the loop over lines in the input stream: at EOF. if ($parsing_alilist) { for (my $ali = 0; $ali < $nali; $ali ++) { # the ali lines come along with more residues that are not part of the alignment. Why? oh! why?. REMOVE # you cannot remove using ali_{q,t}start and $ali_{q,t}end because those are # relative to the full sequence. Need to do it by parsing also the line in between (what I call the "mask") # and finding the first and last ":" or "." $ali_qali[$ali] = ali_removeflanking($ali_qali[$ali], $ali_qmask[$ali], $ali_qstart[$ali], $ali_qend[$ali]); $ali_tali[$ali] = ali_removeflanking($ali_tali[$ali], $ali_tmask[$ali], $ali_tstart[$ali], $ali_tend[$ali]); } return 1; } else { return 0; } # at EOF: normal return if we were in the alignment section. } sub exblxout { my $ofh = shift; my $i; for ($i = 0; $i <= $nali-1; $i++) { printf $ofh "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%s\n", $ali_evalue[$i], $ali_identity[$i], $ali_tstart[$i], $ali_tend[$i], $ali_target[$i], $ali_qstart[$i], $ali_qend[$i], $queryname; } } sub tblout { my $ofh = shift; my $i; for ($i = 0; $i <= $nali-1; $i++) { printf $ofh "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%s\t%s\n", $ali_evalue[$i], $ali_identity[$i], $ali_qstart[$i], $ali_qend[$i], $queryname, $ali_tstart[$i], $ali_tend[$i], $ali_target[$i], $target_desc{$ali_target[$i]}; } } sub gffout { my $ofh = shift; my $source = shift; my $feature = shift; my $i; my $strand; for ($i = 0; $i <= $nali-1; $i++) { if ($ali_qstart[$i] > $ali_qend[$i]) { $strand = "-"; } else { $strand = "+"; } printf $ofh "%s\t%s\t%s\t%d\t%d\t%.1f\t%s\t.\tgene \"%s\"\n", $ali_target[$i], $source, $feature, $ali_tstart[$i], $ali_tend[$i], $ali_bitscore[$i], $strand, $queryname; } } sub profmarkout { my $ofh = shift; my $i; for ($i = 0; $i < $nhits; $i++) { printf $ofh "%g\t%.1f\t%s\t%s\n", $hit_Eval[$i], $hit_bitscore[$i], $hit_target[$i], $queryname; } } sub calculate_flushedmask { my ($asq, $mask) = @_; my $lremove = -1; my $flushedasq; if ($asq =~ /^(\S+\s+)(\S+)\s*$/) { $lremove = length($1); $flushedasq = $2; } my $flushedmask = $mask; $flushedmask =~ s/^(.{$lremove})//; $flushedmask =~ s/\n//; if (length($flushedmask) > length($flushedasq)) { while (length($flushedmask) > length($flushedasq)) { if ($flushedmask =~ /(\s)$/) { $flushedmask =~ s/(\s)$//; } } } if (length($flushedmask) < length($flushedasq)) { while (length($flushedmask) < length($flushedasq)) { $flushedmask .= " "; } } #printf("\naseq|$asq|\nmask|$mask|\n"); #printf("^^aseq|$flushedasq|\n^^mask|$flushedmask|\n"); return $flushedmask; } sub ali_removeflanking { my ($aseq, $mask) = @_; my $taseq = ""; my $alen = length($aseq); my $apos_start = 0; while ($apos_start < $alen) { my $mval = substr($mask, $apos_start, 1); if ($mval =~ /[\.\:]/) { last; } $apos_start ++; } my $apos_end = $alen-1; while ($apos_end >= 0) { my $mval = substr($mask, $apos_end, 1); if ($mval =~ /[\.\:]/) { last; } $apos_end --; } for (my $apos = $apos_start; $apos <= $apos_end; $apos++) { $taseq .= substr($aseq, $apos, 1); } #print "B:$aseq\nM:$mask\nA:$taseq\n"; return $taseq; } 1; __END__