############################################################################
# demotic_infernal_tab package   
#    Parses "tabfile" output generated by cmsearch rc1.0
#    (TAJ 6/2008) based on demotic_search3.pm; itself based on SRE demotic_wublast.pm
############################################################################

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-
# TODO List:
#     (1) Parse "Post-search" number of hits (expected, actual) and surv fraction (expected, actual)
#     (2) Possibly offload gff output functionality from calling script to this pm. 
#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-

# Example output line (ignoring the leading "#"): 
# ////////////////////////////////////////////////////////////////////////////////////////////
## target name                   start        stop  start   stop    bit sc   E-value  GC%
## -----------------------  ----------  ----------  -----  -----  --------  --------  ---
#  Contig10/506498-506590            1          65      1     80     35.79  1.00e-08   40
# ////////////////////////////////////////////////////////////////////////////////////////////
#
# Example "post_search" content:
# ////////////////////////////////////////////////////////////////////////////////////////////
# Post-search info for CM 1: SmY
#
#                              number of hits       surv fraction  
#                            -------------------  -----------------
# rnd  mod  alg  cfg   beta    expected   actual  expected   actual
# ---  ---  ---  ---  -----  ----------  -------  --------  -------
#   1   cm  ins  glc  1e-15       1.000       15    0.0601   0.7829
#
# expected time    actual time
# -------------  -------------
#   00:00:01.04    00:00:06.00
# ////////////////////////////////////////////////////////////////////////////////////////////


package demotic_infernal_tab;
# Don't have "parse(\*STDIN)" option, since "tabfile" output always written to file

sub parse (*) {
    my $fh                = shift; # if this isn't a file handle to a tabfile, some ass needs kickin'
    my $parsing_header    = 1;
    my $parsing_presearch = 0; 
    my $parsing_hits      = 0; 
    my $post_search       = 0;
    my $nhits             = 0;   # temp counter for number of hits in current model; (tabulated)
    my $index             = 0;   # indexes CM number; ie 1st model is index 0

### NOTE:
### Some arrays (0..(N-1)) contain values from each search (1..N). Recall there is 1 search per model.
###      Ex: @cm_name      # lists names of all CMs used, say: (SmY.1, SmY.2, SmY.3)
###
### Other arrays are a "list of lists" (see LoL note below). In these cases, the Nth position 
### in the array, will be an anonymous reference to a list of features for _all_ hits to model (N+1).
###
###      Ex: @t_name       # position N in the list corresponds to one model (N+1) in the search, 
###                        # which is an anonymous ref to a list of ctgs for all hits from that model.
###      Ex: @{$t_name[1]} # lists ctgs for all hits to the 2nd query model; eg (ChrI, ChrI, ChrIII, ChrV)


# Document all the variables available in the package; initialize everything
# 
# ===========  Header details (once/report)  ================================
    $command   = "";   # command line used to run search
    $date      = "";   # date search was run
    $db_recs   = "";   # number of records in target DB
    $db_size   = "";   # DB size (in MB)

# ===========  Pre-search details (once/CM) =================================
# Note: details of the "pre-search" filtering rounds not captured currently 
    @cm_name   = ();   # Covariation Model(s) name; 1st model is 0th in array

# ===========  Details from each *hit* (0-many/CM) ==========================
########################################################################################################
# NOTE: (LoL) == List of Lists; the (N)th position in the following arrays contains a list             #
#                of that feature for all hits from the (N+1)th CM in the report. The order of          # 
#                features in the anonymous array @{$foo[$i]} corresponds to the order in the report.   #
#                EXAMPLE:                                                                              #
#                   @GC              # list of anonymous refs, one per CM; refs list all GCs in search # 
#                   @{$GC[0]}        # contains the list of GC values for all hits to the 1st CM       # 
#                   ${$GC[1]}->[3]   # GC value of 4th (3+1) hit in the 2nd (1+1) search               # 
########################################################################################################
    @t_name    = ();   # (LoL); target fasta record name
    @t_start   = ();   # (LoL); start location in fasta target
    @t_stop    = ();   # (LoL); stop  location in fasta target
    @q_start   = ();   # (LoL); start location in CM query
    @q_stop    = ();   # (LoL); stop  location in CM query 
    @bitscore  = ();   # (LoL); bit score
    @Eval      = ();   # (LoL); E-value  // can be in scientific notation
    @GC        = ();   # (LoL); %GC

# ===================== "Post-search info" (each CM report) ==================
# NOTE: I'm not capturing all available information here, because I believe it can
#       be quite variable. I'm not expert on all possible output that can be generated
    @time_expect = ();  # expected run time (quoted, not converting into hr:min)
    @time_actual = ();  # actual run time   (quoted, not converting into hr:min)
    @num_hits    = ();  # tracks number of hits for each model

# ====================== Cumulative Data (all query outputs) ================
    $model_num   = 0;   # number of CM's used in the search
    

    # Loop over tabfile input, and parse it (there can be >1 CM/report in each tabfile)
    while (<$fh>) {
	if ($parsing_header) {    # only one header section, even if multiple CM's in search
	    if (/^\#\sPre\-search\sinfo\sfor\sCM/) { # NB: model name here is picked up later
		$parsing_header = 0;
		$parsing_presearch = 1;
		next;
	    }
	    elsif (/^\#\scommand\:\s+(\S.+)$/) {
		$command = $1; 
	    }
	    elsif (/^\#\sdate:\s+(\S.+)$/) {
		$date = $1;
	    }
	    elsif (/^\#\snum\sseqs:\s+(\d+)/) { # could get fucked if commas appear in the number
		$db_recs = $1; 
	    }
	    elsif (/^\#\sdbsize\S+:\s+(\S+)/) { # $1 has a decimal; \d just don't cut it
		$db_size = $1; 
	    }
	}
	elsif ($parsing_presearch) {    # Each CM has it's own "pre-search"
	    if (/^\#\starget\sname/) {  # signals end of "Pre-search"; occurs once per CM
		$parsing_presearch = 0;
		$parsing_hits      = 1; 
		<$fh>;     # Discard next line (composed of dashes); then hitlist begins
		next; 
	    }
	    elsif (/^\#\sCM:\s(\S+)/) { # If multiple models/CM; format is "CM-NAME.n" (1..n)
		$model_num++;           # NB: correctly tallys models; use $index for array indices!
		push (@cm_name, $1); 
	    }
	}
	elsif ($parsing_hits) {         # every line not starting with "#" should be a "hit"
# Example hit [sans leading "#"]: 
#   Contig10                         1001         1065      1     80     35.79  1.00e-08   40
	    if (/^\s*(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s*$/) {  
		push (@{$t_name[$index]}  , $1);
		push (@{$t_start[$index]} , $2);
		push (@{$t_stop[$index]}  , $3);
		push (@{$q_start[$index]} , $4);
		push (@{$q_stop[$index]}  , $5);
		push (@{$bitscore[$index]}, $6);
		push (@{$Eval[$index]}    , $7);
		push (@{$GC[$index]}      , $8);
		$nhits++;  
	    }
	    elsif (/^\#/) {       # end of "hitlist" for this CM
		$parsing_hits = 0; 
		$post_search  = 1; 
		push (@num_hits, $nhits); 
		$nhits = 0;       # clear hit counter for next model 
		next; 
	    }
	    else { die "Parsing 'hitlist' -- should never get here! Current line is:\n$_"; }
	}
	elsif ($post_search) {           # Each CM has it's own "post-search"
	    if (/^\#\sexpected\stime/) { # Effectively marking the end of a single query's report
		<$fh>;       # discard next line of dashes
		$_ = <$fh>;  # get next line listing the times
		if (/^\#\s+(\S+)\s+(\S+)\s*$/) {
		    push (@time_expect, $1); # \__ not going to try parsing hours/minutes/seconds
		    push (@time_actual, $2); # /
		    $post_search       = 0;  # \__ if looping over multiple query CMs, next up is
		    $parsing_presearch = 1;  # /   "pre-search" for next model, or soon: EOF
		    $index++;                # LoL index; end of report i, prepare for report i+1
		}
	    }
	}
	else {  die "Should never get here! Dropped through WHILE loop, parsing line:\n$_";  }
    }   # this closes the loop over lines in the input stream.
}

1;
__END__