#!/usr/bin/env python # -* coding: utf-8 -* """ This script reads the standard connectivty table provided by the PDB and produces a C++ header file that maps residue names to their connectivity by atom name. WARNING: The entire PDB dictionary is quite large, so it is recomended to only use part of it (see usage). The online version of this table is available here: http://ftp.wwpdb.org/pub/pdb/data/monomers/het_dictionary.txt """ import sys import re class Interner: def __init__(self): self.list = [] def get(self, string): for (i, known) in enumerate(self.list): if string == known: return i self.list.append(string) return len(self.list) - 1 INTERNER = Interner() def join_and_wrap_80(indent, list): result = "" line = indent for string in list: line += string + ", " if len(line) > 72: result += line + "\n" line = indent if line != indent: result += line return result class Residue: def __init__(self, code): self.code = code.strip() self.atoms = {} def __str__(self): connectivity = [] for atom, connections in self.atoms.iteritems(): for connected in connections: connectivity.append('{{{}, {}}}'.format(atom, connected)) connectivity = join_and_wrap_80(" ", connectivity) return '{{"{}", {{\n{} }}}}'.format(self.code, connectivity) def add_atom(self, name, connected): self.atoms[INTERNER.get(name)] = [INTERNER.get(s) for s in connected] def read_residues(path, accepted_residues=None): residues = [] read_residue = False with open(path) as f: for line in f: line = line.replace(r'"', r'\"') if line[:7] == "RESIDUE": read_residue = False if accepted_residues is None or line[10:13] in accepted_residues: residues.append(Residue(line[10:13])) read_residue = True if line[:6] == "CONECT" and read_residue: splitted = re.split(r'\s+', line)[1:-1] residues[-1].add_atom(splitted[0], splitted[2:]) return residues HEADER = """// Chemfiles, a modern library for chemistry file reading and writing // Copyright (C) Guillaume Fraux and contributors -- BSD license // !!!! AUTO-GENERATED FILE !!!! Do not edit. See pdb_connectivity.py for the code. // The data comes from the PDB's connectivity table: // http://ftp.wwpdb.org/pub/pdb/data/monomers/het_dictionary.txt #include "chemfiles/pdb_connectivity.hpp" using namespace chemfiles; """ def write_elements(path, residues): with open(path, "w") as fd: fd.write(HEADER) fd.write("// generated for the following residues\n") fd.write(join_and_wrap_80("// ", [r.code for r in residues])) fd.write("\n\n") fd.write("const std::vector InternedName::INTERNER_ = {\n") fd.write(join_and_wrap_80(" ", ['"{}"'.format(name) for name in INTERNER.list])) fd.write("};\n\n\n") fd.write("const PDBConnectivity::PDBConnectMap PDBConnectivity::PDB_CONNECTIVITY_MAP_ = {\n") for residue in residues: fd.write(" " + str(residue) + ",\n") fd.write("};\n\n") def usage(): print(sys.argv[0] + " path/to/het_dictionary.txt pdb_connectivity.cpp [Residue names...]") if __name__ == "__main__": if len(sys.argv) < 3: usage() sys.exit(0) if len(sys.argv) >= 4: residue_names = set(sys.argv[3:]) residue_names.update({ "GLU", "GLY", "PHE", "LEU", "SER", "LYS", "CYS", "TRP", "TYR", "PRO", "HIS", "GLN", "ARG", "ILE", "MET", "THR", "ASN", "ASP", "VAL", "ALA", "A ", "U ", "G ", "C ", "DA ", "DT ", "DG ", "DC " }); residues = read_residues(sys.argv[1], residue_names) else: residues = read_residues(sys.argv[1]) write_elements(sys.argv[2], residues)