############################################################################ # demotic_blast package # Parses blast output, stores extracted information in convenient vars. # Works for both WU-BLAST and NCBI BLAST output files, # and for PSIBLAST runs from checkpoint files (as I use in profmark). # (SRE originated, 10/2000) # ############################################################################ package demotic_blast; # parse(\*STDIN) would parse BLAST output # coming in via stdin. # sub parse (*) { my $fh = shift; my $parsing_header = 1; my $parsing_hitlist = 0; my $parsing_alilist = 0; my $is_wublast = 0; my $target; my $firstchunk; # Initialize everything... so we can call the parser # repeatedly, one call per BLAST output. # # This section also documents what's available in # variables within the package. # # Arrays are indexed [0..nhits-1] or [0..nali-1] # $queryname = ""; # Name of the query sequence $querydesc = ""; # Description line for the query (or "") $querylen = 0; # Length of the query in residues $db = ""; # Name of the target database file $db_nseq = 0; # Number of sequences in the target database $db_nletters = ""; # Number of residues in the target database # (stored as a string so we can have large #'s) # The top hit list (still in rank order) $nhits = 0; # Number of entries in the hit list @hit_target = (); # Target sequence name (by rank) %target_desc = (); # Target sequence description (by targ name) @hit_bitscore = (); # Raw score (by rank) @hit_Eval = (); # E-val (by rank) # The alignment output (in order it came in) # all indexed by # of alignment [0..nali-1] $nali = 0; # Number of alignments @ali_target = (); # Target sequence name @ali_score = (); # Raw score of alignment @ali_bitscore = (); # bit score @ali_evalue = (); # E-value @ali_pvalue = (); # P-value @ali_nident = (); # Number of identical residues @ali_alen = (); # Length of alignment @ali_identity = (); # Percent identity @ali_npos = (); # Number of positives (similar positions) @ali_positive = (); # Percent of positions matched or similar @ali_qstart = (); # Start position on query @ali_qend = (); # End position on query @ali_tstart = (); # Start position on target @ali_tend = (); # End position on target @ali_qali = (); # Aligned string from query @ali_tali = (); # Aligned string from target (subject) @ali_hitidx = (); # index of hit $hitidx = -1; # Now, loop over the lines of our input, and parse 'em. # while (<$fh>) { if ($parsing_header) { if (/^\s*\*+ No hits found \*+\s*$/) { return 1; } if (/^Sequences producing /) { # wu and ncbi share this $parsing_header = 0; $parsing_hitlist = 1; <$fh>; # This discards the next (blank) line (ncbi, wu) next; } elsif (/^Query=\s*(\S*)\s*(.*)\s*$/) { # allows blank query $queryname = $1; $querydesc = $2; chomp $querydesc; if ($queryname eq "") { $queryname = "unnamed_query"; } while (1) { $_ = <$fh>; # perl quirk: unless part of a while() # loop, line must be explicitly assigned # to $_ or else it will be lost. if (/^\s+\((\S+) letters/) { $querylen = $1; last; } elsif (/^Length=(\d+)/) { $querylen = $1; last; } # NCBI elsif (/^\s*( .+)\s*$/) { $querydesc .= $1; chomp $querydesc; } } } elsif (/^Database:\s*(.+)\s*$/) { $db = $1; $_ = <$fh>; if (/^\s+(\S+) sequences; (\S+) total letters/) { $db_nseq = $1; $db_nletters = $2; $db_nseq =~ s/,//g; $db_nletters =~ s/,//g; } } elsif (/^Copyright.+Washington University/) { $is_wublast = 1; } } elsif ($parsing_hitlist) { # WUBLAST: Sequences producing High-scoring Segment Pairs: Score P(N) N # 2-Hacid_dh/1/17-338/439-757 domains: PTXD_PSEST/5-326 O28... 299 4.7e-27 1 # NCBI+: Sequences producing significant alignments: (Bits) Value # 2-Hacid_dh/4/753-1076/1224-1507 domains: Q20595_CAEEL/181-504 P... 95.1 4e-20 # NCBI: Sequences producing significant alignments: (bits) Value # 2-Hacid_dh/4/753-1076/1224-1507 domains: Q20595_CAEEL/181-504 PD... 95 4e-20 # In NCBI E-values, beware "e-xxx" without any leading number. # In NCBI+, beware bit scores can now be real numbers, not just integers. # if (/^\s*$/) { $parsing_hitlist = 0; $parsing_alilist = 1; next; } elsif (( $is_wublast && /^\s*(\S+)\s+(.+)\s+(\d+)\s+(\S+)\s+\d+\s*$/) || (! $is_wublast && /^\s*(\S+)\s+(.+)\s+(\S+)\s+(\S+)\s*$/)) { $hit_target[$nhits] = $1; $target_desc{$1} = $2; $hit_bitscore[$nhits] = $3; if ($is_wublast) { $hit_Eval[$nhits] = $4; } # actually WU reports P-value else { $hit_Eval[$nhits] = &repair_evalue($4); } $nhits++; } } elsif ($parsing_alilist) { if (/^>\s*(\S+)\s*(.*)$/) { $target = $1; $hitidx ++; $target_desc{$target} = $2; $_ = <$fh>; if (/^\s+Length = (\S+)/) { $target_len{$target} = $1; } } elsif (/^ Score =\s+(\d+) \((\S+) bits\), Expect\s+=\s+(\S+), /) { # WU $nali++; $ali_target[$nali-1] = $target; $ali_score[$nali-1] = $1; $ali_bitscore[$nali-1] = $2; $ali_evalue[$nali-1] = $3; $ali_hitidx[$nali-1] = $hitidx; } elsif (/^ Score =\s+(\S+) bits \((\S+)\),\s*Expect\s+=\s+(\S+), /) { # NCBI $nali++; $ali_target[$nali-1] = $target; $ali_bitscore[$nali-1] = $1; $ali_score[$nali-1] = $2; $ali_evalue[$nali-1] = &repair_evalue($3); $ali_hitidx[$nali-1] = $hitidx; } elsif (/^ Identities = (\d+)\/(\d+) \((\d+)%\).+Positives = (\d+).+\((\d+)%/) { # NCBI or WU $ali_nident[$nali-1] = $1; $ali_alen[$nali-1] = $2; $ali_identity[$nali-1] = $3; $ali_npos[$nali-1] = $4; $ali_positive[$nali-1] = $5; $ali_hitidx[$nali-1] = $hitidx; $firstchunk = 1; } elsif (/^ Identities = (\d+)\/(\d+) \((\d+)%\).+/) { # NCBI megablast : no Positives $ali_nident[$nali-1] = $1; $ali_alen[$nali-1] = $2; $ali_identity[$nali-1] = $3; $ali_npos[$nali-1] = $1; $ali_positive[$nali-1] = $3; $ali_hitidx[$nali-1] = $hitidx; $firstchunk = 1; } elsif (/^Query:?\s+(\d+)\s+(\S+)\s+(\d+)\s*$/) { if ($firstchunk) { $ali_qstart[$nali-1] = $1; } $ali_qali[$nali-1] .= $2; $ali_qend[$nali-1] = $3; } elsif (/^Sbjct:?\s+(\d+)\s+(\S+)\s+(\d+)\s*$/) { if ($firstchunk) { $ali_tstart[$nali-1] = $1; } $ali_tali[$nali-1] .= $2; $ali_tend[$nali-1] = $3; $firstchunk = 0; } elsif (/^BLAST/) { return 1; } # normal return; output from a new query is starting elsif (/^Effective search space used:/) { return 1; } #normal end of query for NCBI BLAST+ } } # this closes the loop over lines in the input stream. if ($parsing_alilist) { return 1; } else { return 0; } } # NCBI BLAST now has a nasty habit of abbreviating # 1e-100 as e-100, which isn't a number format. sub repair_evalue { my $value = shift; if ($value =~ /^e-\d+$/) { $value = "1$value"; } $value; } sub exblxout { my $ofh = shift; my $i; for ($i = 0; $i <= $nali-1; $i++) { printf $ofh "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%s\n", $ali_evalue[$i], $ali_identity[$i], $ali_tstart[$i], $ali_tend[$i], $ali_target[$i], $ali_qstart[$i], $ali_qend[$i], $queryname; } } sub tblout { my $ofh = shift; my $i; for ($i = 0; $i <= $nali-1; $i++) { printf $ofh "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%s\t%s\n", $ali_evalue[$i], $ali_identity[$i], $ali_qstart[$i], $ali_qend[$i], $queryname, $ali_tstart[$i], $ali_tend[$i], $ali_target[$i], $target_desc{$ali_target[$i]}; } } sub gffout { my $ofh = shift; my $source = shift; my $feature = shift; my $i; my $strand; for ($i = 0; $i <= $nali-1; $i++) { if ($ali_qstart[$i] > $ali_qend[$i]) { $strand = "-"; } else { $strand = "+"; } printf $ofh "%s\t%s\t%s\t%d\t%d\t%.1f\t%s\t.\tgene \"%s\"\n", $ali_target[$i], $source, $feature, $ali_tstart[$i], $ali_tend[$i], $ali_bitscore[$i], $strand, $queryname; } } sub profmarkout { my $ofh = shift; my $i; for ($i = 0; $i < $nhits; $i++) { printf $ofh "%g\t%.1f\t%s\t%s\n", $hit_Eval[$i], $hit_bitscore[$i], $hit_target[$i], $queryname; } } 1; __END__