#! /usr/bin/env python3 # Usage: # techtree.py # # Where input is created in Easel directory by clang preprocessing: # for cfile in esl_*.c easel.c; do clang -MM -c -I. -DeslENABLE_AVX -DeslENABLE_AVX512 -DeslENABLE_VMX -DeslENABLE_NEON -DeslENABLE_SSE -DHAVE_MPI ${cfile} >> easel.dep; done # # (Some harmless errors will come out on stderr about NEON and VMX instructions not being enabled; ignore them.) # # This input file consists of lists of dependencies, possibly wrapped to multiple lines, # as in: # # esl_cpu.o: esl_cpu.c esl_config.h easel.h esl_cpu.h # esl_dirichlet.o: esl_dirichlet.c esl_config.h easel.h esl_fileparser.h \ # esl_minimizer.h esl_random.h esl_stats.h esl_vectorops.h \ # esl_dirichlet.h # esl_distance.o: esl_distance.c esl_config.h easel.h esl_alphabet.h \ # esl_dmatrix.h esl_random.h esl_distance.h # # For each module, we extract its name ("esl_distance.o" => "distance"). We # ignore the dependency on a module's own .c and .h file, and on esl_config.h. # # Four global data structures describe the proposed layout of the tech tree, # as shown in the tech tree figure: # specifies an order that the groups organize into: # base, a layer of fundamental stuff, sequence handling, # statistics, misc stuff off the base, and MPI over all. # specifies how modules are assigned to groups; # is an ordered list of modules that belong to group # specifies dependencies at group level: # is a set of groups that group depends on. # specifies additional dependencies within a group: # is a set of modules that depends on within m's own group. # # The script validates this proposed layout, and checks that the actual dependencies # in Easel are consistent with it. # # The output is in two sections: the actual dependencies of each Easel module # (so you can consider improved layouts), followed by a list of any actual # dependencies that aren't satisfied by the current layout. # import sys import re easelgrps = [ 'BASE', 'BIOLOGICAL_SEQUENCES', 'NUMERICAL_METHODS', 'ALGORITHMS', 'FILE_INPUT', 'ADVANCED_SEQUENCES', 'MULTIPLE_ALIGNMENTS', 'MULTIPLE_ALIGNMENT_FILES', 'SEQUENCE_FILES', 'STATISTICAL_DISTRIBUTIONS', 'MIXTURE_DISTRIBUTIONS', 'COMMANDLINE', 'BENCHMARKS', 'SIMD_VECTORS', 'MULTITHREADING', 'MPI' ] easelmods = { 'BASE' : [ 'easel', 'mem', 'random', 'regexp', 'stack', 'vectorops' ], 'ALGORITHMS' : [ 'arr2', 'arr3', 'bitfield', 'cluster', 'dmatrix', 'heap', 'keyhash', 'matrixops', 'quicksort', 'red_black', 'varint', 'huffman', 'graph', 'tree' ], 'NUMERICAL_METHODS' : [ 'rootfinder', 'minimizer', 'rand64' ], 'BIOLOGICAL_SEQUENCES' : [ 'alphabet', 'composition', 'hmm' ], 'FILE_INPUT' : [ 'buffer', 'fileparser', 'recorder', 'ssi', 'json' ], 'COMMANDLINE' : [ 'getopts', 'subcmd' ], 'BENCHMARKS' : [ 'stopwatch' ], 'STATISTICAL_DISTRIBUTIONS' : [ 'stats', 'normal', 'histogram', 'exponential', 'gamma', 'gev', 'gumbel', 'stretchexp', 'weibull' ], 'MIXTURE_DISTRIBUTIONS': [ 'dirichlet', 'mixgev', 'hyperexp', 'mixdchlet' ], 'ADVANCED_SEQUENCES' : [ 'distance', 'wuss', 'paml', 'randomseq', 'ratematrix', 'scorematrix', 'swat' ], 'MULTIPLE_ALIGNMENTS' : [ 'msa', 'msacluster', 'msashuffle', 'msaweight' ], 'MULTIPLE_ALIGNMENT_FILES' : [ 'msafile', 'msafile2', 'msafile_a2m', 'msafile_afa', 'msafile_clustal', 'msafile_phylip', 'msafile_psiblast', 'msafile_selex', 'msafile_stockholm' ], 'SEQUENCE_FILES' : [ 'sq', 'sqio_ascii', 'sqio_ncbi', 'sqio', 'dsqdata', 'gencode' ], 'SIMD_VECTORS' : [ 'alloc', 'cpu', 'sse', 'avx', 'avx512', 'neon', 'vmx' ], 'MULTITHREADING' : [ 'threads', 'workqueue' ], 'MPI' : [ 'mpi' ], } easelparents = { 'BASE' : {}, 'BIOLOGICAL_SEQUENCES' : { 'BASE' }, 'NUMERICAL_METHODS' : { 'BASE' }, 'ALGORITHMS': { 'BASE' }, 'FILE_INPUT': { 'BASE' }, 'COMMANDLINE': { 'BASE' }, 'BENCHMARKS': { 'BASE' }, 'SIMD_VECTORS': { 'BASE' }, 'MULTITHREADING': { 'BASE' }, 'ADVANCED_SEQUENCES': { 'BIOLOGICAL_SEQUENCES', 'NUMERICAL_METHODS', 'ALGORITHMS', 'FILE_INPUT' }, 'STATISTICAL_DISTRIBUTIONS' : { 'NUMERICAL_METHODS' }, 'MIXTURE_DISTRIBUTIONS' : { 'STATISTICAL_DISTRIBUTIONS', 'ALGORITHMS', 'FILE_INPUT' }, 'MULTIPLE_ALIGNMENTS' : { 'ADVANCED_SEQUENCES' }, 'MULTIPLE_ALIGNMENT_FILES' : { 'MULTIPLE_ALIGNMENTS' }, 'SEQUENCE_FILES' : { 'MULTIPLE_ALIGNMENT_FILES' }, 'MPI': { 'BENCHMARKS', 'SEQUENCE_FILES' } } easeladd = { # BASE 'easel' : set(), 'mem' : { 'easel' }, 'random' : { 'easel' }, 'regexp' : { 'easel' }, 'stack' : { 'easel', 'random' }, 'vectorops' : { 'easel', 'random' }, # ALGORITHMS 'arr2' : set(), 'arr3' : set(), 'bitfield' : set(), 'cluster' : set(), 'dmatrix' : set(), 'heap' : set(), 'keyhash' : set(), 'matrixops' : set(), 'quicksort' : set(), 'red_black' : set(), 'varint' : set(), 'huffman' : { 'quicksort' }, 'graph' : { 'matrixops' }, 'tree' : { 'arr2', 'dmatrix', 'stack'}, # NUMERICAL_METHODS 'rootfinder' : set(), 'minimizer' : set(), 'rand64' : set(), # BIOLOGICAL_SEQUENCES 'alphabet' : set(), 'composition' : set(), 'hmm' : { 'alphabet' }, # FILE_INPUT 'buffer' : set(), 'fileparser' : set(), 'recorder' : set(), 'ssi' : set(), 'json' : { 'buffer' }, # COMMANDLINE 'getopts' : set(), 'subcmd' : { 'getopts' }, # BENCHMARKS 'stopwatch' : set(), # STATISTICAL_DISTRIBUTIONS 'stats' : set(), 'normal' : { 'stats' }, 'histogram' : { 'stats' }, 'exponential' : { 'stats', 'histogram' }, 'gamma' : { 'stats', 'histogram' }, 'gev' : { 'stats' }, 'gumbel' : { 'stats' }, 'stretchexp' : { 'stats', 'histogram' }, 'weibull' : { 'stats', 'histogram' }, # MIXTURE_DISTRIBUTIONS 'dirichlet' : set(), 'hyperexp' : set(), 'mixdchlet' : { 'dirichlet' }, 'mixgev' : { 'dirichlet' }, # ADVANCED_SEQUENCES 'distance' : set(), 'wuss' : set(), 'paml' : set(), 'randomseq' : set(), 'ratematrix' : set(), 'scorematrix' : { 'ratematrix' }, 'swat' : { 'scorematrix' }, # MULTIPLE_ALIGNMENTS 'msa' : set(), 'msacluster' : { 'msa' }, 'msashuffle' : { 'msa' }, 'msaweight' : { 'msa', 'msacluster' }, # MULTIPLE_ALIGNMENT_FILES 'msafile' : set(), 'msafile2' : set(), 'msafile_a2m' : { 'msafile' }, 'msafile_afa' : { 'msafile' }, 'msafile_clustal' : { 'msafile' }, 'msafile_phylip' : { 'msafile' }, 'msafile_psiblast' : { 'msafile' }, 'msafile_selex' : { 'msafile' }, 'msafile_stockholm' : { 'msafile' }, # SEQUENCE_FILES 'sq' : set(), 'sqio' : { 'sq' }, 'sqio_ascii' : { 'sq', 'sqio' }, 'sqio_ncbi' : { 'sq', 'sqio' }, 'dsqdata' : { 'sq', 'sqio', 'sqio_ascii' }, 'gencode' : { 'sq', 'sqio', 'sqio_ascii', 'sqio_ncbi' }, # SIMD_VECTORS 'alloc' : set(), 'cpu' : set(), 'sse' : set(), 'avx' : set(), 'avx512' : set(), 'neon' : set(), 'vmx' : set(), # MULTITHREADING 'threads' : set(), 'workqueue' : set(), # MPI 'mpi' : set(), } def main(): if (len(sys.argv) != 2): sys.exit("Usage: techtree.py <.dep file>"); validate_layout() Ma = expand_layout() # Ma[m] is a set of allowable dependencies for module , according to layout # Get the actual dependencies from the input file. # is a set of module dependencies for module # excluding esl_config.h and itself, as parsed out of input # depfile. # D = {} for line in open(sys.argv[1]): mat = re.match(r'(\S+): (.+)', line) # First line in a list of dependencies? if mat: m = process_token(mat.group(1)) fields = mat.group(2).split() D[m] = set(); else: fields = line.split() for s in fields: if s != "\\" and s != 'esl_config.h': # ignore \ and esl_config.h... s = process_token(s) # ... extract module name from filename if s != m: # ... and ignore dependencies on itself D[m].add(s) # ... otherwise, add dependencies to the set. output_direct_deplines(D) # First section: actual dependencies, organized for study. compare_layout(Ma, D) # Followed by any problems in them. def process_token(tok): """ """ tok = re.sub(r'^esl_', '', tok) tok = re.sub(r'.[coh]$', '', tok) return tok #### # validate_layout() # # Checks the four global data structures that define the figure # layout. # def validate_layout(): Gl = set(easelgrps) # list is all the group names Gm = set(easelmods.keys()) # keys are also all the group names Gp = set(easelparents.keys()) # keys are also all the group names if Gl != Gm: sys.exit("easelgrps and easelmods have different set of group names") if Gl != Gp: sys.exit("easelgrps and easelparents have different set of group names") Mm = [] # Get all the module names in 's sets. for g in easelmods.keys(): for m in easelmods[g]: Mm.append(m) if len(Mm) != len(set(Mm)): sys.exit("easelmods must have same module assigned to >1 group") Mm = set(Mm) Ma = set(easeladd.keys()) if Mm != Ma: sys.exit("easelmods and easeladd have a different set of module names") print("# {0} groups".format(len(Gm))) print("# {0} modules".format(len(Mm))) #### # expand_layout() # # Use the immediate group dependencies in to traverse # upwards through the graph for each group , to get the complete # set of dependencies for it. Use that to obtain its complete set of # module dependencies. # # Return a , a dict of sets: Ma[m] is a set of modules that module # depends on. # def expand_layout(): # Use a pushdown automaton to recursively find all group dependencies # for each group ; store this set in Ga[g]. Ga = {} for g in easelparents.keys(): Ga[g] = set() pda = list(easelparents[g]) while len(pda): g2 = pda.pop() Ga[g].add(g2) pda.extend(list(easelparents[g2])) # Using Ga, plus additional within-group module dependencies in , # expand set of group dependencies to set of module dependencies for each module . Ma = {} for g in easelmods.keys(): mdep = set() for g2 in Ga[g]: # for each group that depends on... mdep = mdep | set(easelmods[g2]) # add the modules in that group for m in easelmods[g]: Ma[m] = mdep | easeladd[m] return Ma #### def compare_layout(Ma, D): for m in Ma.keys(): if not Ma[m] >= D[m]: print(m, D[m] - Ma[m]) def output_direct_deplines(D): for g in easelgrps: print('# {0}'.format(g)) for m in easelmods[g]: print('{0:20s}: '.format(m), end='') for g2 in easelgrps: for m2 in easelmods[g2]: if m2 in D[m]: print('{0} '.format(m2), end='') print('') print('') if __name__ == "__main__": main()