# Demotic parser for HMMER3 output # # Works for phmmer, hmmsearch output # Does not parse env coords nor alignments yet, # but is suitable for use in profmark. # # SRE, Tue Apr 22 13:13:51 2008 package demotic_hmmer; # parse(\*STDIN) would parse HMMER output # coming in via stdin. # sub parse (*) { my $fh = shift; my $parsing_header = 1; my $parsing_seqs = 0; my $parsing_domains = 0; my $parsing_alis = 0; # Initialize everything... so we can call the parser # repeatedly, one call per hmmsearch output. # # This section also documents what's available in # variables within the package. # # Arrays are indexed [0..nhits-1] or [0..nali-1] # $queryname = ""; # Name of the query sequence $querydesc = ""; # Description line for the query (or "") $querylen = 0; # Length of the query in residues $db = ""; # Name of the target database file $db_nseq = 0; # Number of sequences in the target database $db_nletters = ""; # Number of residues in the target database # (stored as a string so we can have large #'s) # The top hit list (still in rank order) $nhits = 0; # Number of entries in the hit list @hit_target = (); # Target sequence name (by rank) %target_desc = (); # Target sequence description (by targ name) @hit_bitscore = (); # Raw score (by rank) @hit_Eval = (); # P-val or E-val (by rank) # The alignment output (in order it came in) # all indexed by # of alignment [0..nali-1] $nali = 0; # Number of alignments @ali_target = (); # Target sequence name @ali_bitscore = (); # bit score @ali_evalue = (); # E-value @ali_qstart = (); # Start position on query @ali_qend = (); # End position on query @ali_tstart = (); # Start position on target @ali_tend = (); # End position on target @ali_qali = (); # Aligned string from query @ali_tali = (); # Aligned string from target (subject) @ali_hitidx = (); # index of hit # Now, loop over the lines of our input, and parse 'em. # while (<$fh>) { if ($parsing_header) { if (/^Scores for complete sequences /) { # seq section starts with this line $parsing_header = 0; $parsing_seqs = 1; <$fh>; # This discards the next line: --- full sequence ---- ---- single best dom ---- <$fh>; # and discard another line: E-value score bias score bias <$fh>; # and discard another line: ------- ------- ------ ------- ------ ---------- next; } elsif (/Query:\s*(\S+)\s*\[[LM]=(\d+)\]\s*$/) { $queryname = $1; $querylen = $2; } elsif (/Description:\s*(.*)$/) { $querydesc = $1; } } elsif ($parsing_seqs) { if (/^\s*$/) { # seq section ends with a blank line $parsing_seqs = 0; $parsing_domains = 1; next; } elsif (/^\s*(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+\S+\s+\S+\s+(\S+)\s+(\d+)\s+(\S+)\s+(.+)$/) { # evalue score bias evalue score bias exp N target desc # 1 2 3 4 5 6 7 $hit_target[$nhits] = $6; $target_desc{$6} = $7; $hit_bitscore[$nhits] = $2; $hit_Eval[$nhits] = $1; $nhits++; } } elsif ($parsing_domains) { if (/^\s*$/) { } elsif (/^\s*Domain annotation/) { } elsif (/^\>\>\s*(\S+)\s*/) { $curr_ali_target = $1; } elsif (/^\s*\d+\s+\S\s+(\S+)\s+\S+\s+(\S+)\s+\S+\s+(\d+)\s+(\d+)\s+\S\S\s+(\d+)\s+(\d+)\s+\S\S\s+(\d+)\s+(\d+)\s+\S\S\s+\S+\s*$/) # # score bias c-Evalue i-Evalue hmmfrom hmm to alifrom ali to envfrom env to acc # --- ------ ----- --------- --------- ------- ------- ------- ------- ------- ------- ---- # 1 ! 387.6 1.4 7.5e-120 2.1e-115 1 181 [] 1 181 [] 1 181 [] 1.00 { $ali_target[$nali] = $curr_ali_target; $ali_bitscore[$nali] = $1; $ali_evalue[$nali] = $2; $ali_qstart[$nali] = $3; $ali_qend[$nali] = $4; $ali_tstart[$nali] = $5; $ali_tend[$nali] = $6; $ali_hitidx[$nali] = $nhits-1; $nali ++; } elsif (/^\s+Alignments for each domain/) { $parsing_domains = 0; $parsing_alis = 1; } } elsif ($parsing_alis) { if (/^\s*$/) { } elsif (/^\s*\=\=\s+domain\s+(\d+)\s+/) { $whichali = $1-1; } elsif (/^\s+$queryname\s+\d+\s+(\S+)\s+\d+\s*$/) { $ali_qali[$whichali] .= $1; } elsif (/^\s+$ali_target[$whichali]\s+\d+\s+(\S+)\s+\d+\s*$/) { $ali_tali[$whichali] .= $1; } elsif (/\/\//) { return 1; } # normal return after each query. else { next; } } } if ($parsing_alis) { return 1; } else { return 0; } } sub exblxout { my $ofh = shift; my $i; for ($i = 0; $i <= $nali-1; $i++) { printf $ofh "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%s\n", $ali_evalue[$i], $ali_identity[$i], $ali_tstart[$i], $ali_tend[$i], $ali_target[$i], $ali_qstart[$i], $ali_qend[$i], $queryname; } } sub tblout { my $ofh = shift; my $i; for ($i = 0; $i <= $nali-1; $i++) { printf $ofh "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%s\t%s\n", $ali_evalue[$i], $ali_identity[$i], $ali_qstart[$i], $ali_qend[$i], $queryname, $ali_tstart[$i], $ali_tend[$i], $ali_target[$i], $target_desc{$ali_target[$i]}; } } sub gffout { my $ofh = shift; my $source = shift; my $feature = shift; my $i; my $strand; for ($i = 0; $i <= $nali-1; $i++) { if ($ali_qstart[$i] > $ali_qend[$i]) { $strand = "-"; } else { $strand = "+"; } printf $ofh "%s\t%s\t%s\t%d\t%d\t%.1f\t%s\t.\tgene \"%s\"\n", $ali_target[$i], $source, $feature, $ali_tstart[$i], $ali_tend[$i], $ali_bitscore[$i], $strand, $queryname; } } sub profmarkout { my $ofh = shift; my $i; for ($i = 0; $i < $nhits; $i++) { printf $ofh "%g\t%.1f\t%s\t%s\n", $hit_Eval[$i], $hit_bitscore[$i], $hit_target[$i], $queryname; } } 1; __END__ |more