#! /usr/bin/env python3 # Do a chunk of a profmark benchmark, for hmmsearch. # # Usage: x-hmmsearch # Example: ./x-hmmsearch ~/releases/hmmer-3.0/build-icc-mpi ~/releases/hmmer-3.0 testdir test.tbl pmark.train.msa pmark.test.fa test.out # # This script is normally called by pmark_master.py, and its command # line syntax is tied to pmark_master.py. It can also be run by itself # (for testing, for example). # # Inputs: # : top level directory for the executables to be benchmarked. # program(s) will be run as ${top_builddir}/${program_name} # (Easel miniapps are assumed to be in your $PATH.) # : top level directory for finding scripts or data files. (Unused here.) # : name of a tmp directory to put tmpfiles in # : file with list of MSA names to use as queries - fetched from # : query MSAs are fetched from this file # : each query MSA is searched against these synthetic positives and negatives # # Outputs: # : output data for this chunk; one line per reasonably-significant hit at permissive E-value # threshold (we want to be able to see performance at up to 10-100 fp/query or so) # Each line has: # # Normally, pmark-master creates , using an input arg from # the user; creates as "/tbl.", one # parallelized chunk of the MSAs in the benchmark; and creates # as "/tbl.{n}.out". The and # were created by create-profmark with a shared benchmark_prefix name, # e.g. "pmark.{train.msa,test.fa}". # # For testing, you can make any list of MSA names you want (in # "test.tbl" for example), make a tmp dir ("testdir", for example), # and direct the output to any file you want ("test.out" for example). # import sys import os import subprocess usage = "x-hmmsearch " if len(sys.argv) != 8: sys.exit('Incorrect number of cmdline args.\nUsage: {}'.format(usage)) (top_builddir, top_srcdir, resultdir, tblfile, msafile, fafile, outfile) = sys.argv[1:] hmmbuild = '{}/src/hmmbuild'.format(top_builddir); hmmsearch = '{}/src/hmmsearch'.format(top_builddir); buildopts = ''; searchopts = '-E 200 --cpu 1'; if not os.path.isdir(top_builddir): sys.exit("didn't find top_builddir at {}".format(top_builddir)) if not os.path.isdir(top_srcdir): sys.exit("didn't find top_srcdir at {}".format(top_srcdir)) if not os.path.isdir(resultdir): sys.exit("didn't find resultdir at {}".format(resultdir)) if not os.path.isfile(tblfile): sys.exit("didn't find tblfile at {}".format(tblfile)) if not os.path.isfile(msafile): sys.exit("didn't find msafile at {}".format(msafile)) if not os.path.isfile(fafile): sys.exit("didn't find fafile at {}".format(fafile)) if not os.access(hmmbuild, os.X_OK): sys.exit("didn't find executable hmmbuild at {}".format(hmmbuild)) if not os.access(hmmsearch,os.X_OK): sys.exit("didn't find executable hmmsearch at {}".format(hmmsearch)) if not os.path.isfile('{}.ssi'.format(msafile)): sys.exit("msafile {} needs to have an SSI index.\nRun esl-afetch --index on it to create one.".format(msafile)) tblfp = open(tblfile) outfp = open(outfile, "w"); if not tblfp: sys.exit("failed to open tblfile {} for reading".format(tblfile)) if not outfp: sys.exit("failed to open outfile {} for writing".format(outfile)) for line in tblfp: msaname = line.split(None, 1)[0] cmd = 'esl-afetch -o {0}/{1}.sto {2} {1}'.format(resultdir, msaname, msafile) r = subprocess.run(cmd.split(), capture_output=True, encoding='utf-8') if r.returncode: sys.exit('FAILED: {}\n{}'.format(cmd, r.stderr)) cmd = '{0} {1} {2}/{3}.hmm {2}/{3}.sto'.format(hmmbuild, buildopts, resultdir, msaname) r = subprocess.run(cmd.split(), capture_output=True, encoding='utf-8') if r.returncode: sys.exit('FAILED: {}\n{}'.format(cmd, r.stderr)) cmd = '{0} {1} --tblout {2}/{3}.tmp {2}/{3}.hmm {4}'.format(hmmsearch, searchopts, resultdir, msaname, fafile) r = subprocess.run(cmd.split(), stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, encoding='utf-8') if r.returncode: sys.exit('FAILED: {}\n{}'.format(cmd, r.stderr)) resfp = open('{0}/{1}.tmp'.format(resultdir, msaname)) if not resfp: sys.exit('failed to open hmmsearch .tblout file {0}/{1}.tmp for reading'.format(resultdir, msaname)) for line in resfp: if line[0] == '#': continue fields = line.split() target = fields[0] pval = fields[4] bitscore = fields[5] outfp.write('{0} {1} {2} {3}\n'.format(pval, bitscore, target, msaname)) resfp.close() os.remove('{0}/{1}.hmm'.format(resultdir, msaname)) os.remove('{0}/{1}.sto'.format(resultdir, msaname)) os.remove('{0}/{1}.tmp'.format(resultdir, msaname)) tblfp.close() outfp.close()