// // Copyright (C) 2018 Boran Adas, Google Summer of Code // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include #include #include #include #include #include #include #include #include #include namespace RDKit { template FingerprintArguments::FingerprintArguments( const bool countSimulation, const std::vector countBounds, std::uint32_t fpSize, std::uint32_t numBitsPerFeature) : d_countSimulation(countSimulation), d_countBounds(countBounds), d_fpSize(fpSize), d_numBitsPerFeature(numBitsPerFeature) { PRECONDITION(!countSimulation || !countBounds.empty(), "bad count bounds provided"); PRECONDITION(d_numBitsPerFeature > 0, "numBitsPerFeature must be >0"); } template FingerprintArguments::FingerprintArguments( bool countSimulation, const std::vector countBounds, std::uint32_t fpSize, std::uint32_t numBitsPerFeature); template FingerprintArguments::FingerprintArguments( bool countSimulation, const std::vector countBounds, std::uint32_t fpSize, std::uint32_t numBitsPerFeature); template std::string FingerprintArguments::commonArgumentsString() const { return "Common arguments : countSimulation=" + std::to_string(d_countSimulation) + " fpSize=" + std::to_string(d_fpSize) + " bitsPerFeature=" + std::to_string(d_numBitsPerFeature); } template FingerprintGenerator::FingerprintGenerator( AtomEnvironmentGenerator *atomEnvironmentGenerator, FingerprintArguments *fingerprintArguments, AtomInvariantsGenerator *atomInvariantsGenerator, BondInvariantsGenerator *bondInvariantsGenerator, bool ownsAtomInvGenerator, bool ownsBondInvGenerator) : df_ownsAtomInvGenerator(ownsAtomInvGenerator), df_ownsBondInvGenerator(ownsBondInvGenerator) { this->dp_atomEnvironmentGenerator = atomEnvironmentGenerator; this->dp_fingerprintArguments = fingerprintArguments; this->dp_atomInvariantsGenerator = atomInvariantsGenerator; this->dp_bondInvariantsGenerator = bondInvariantsGenerator; } template FingerprintGenerator::FingerprintGenerator( AtomEnvironmentGenerator *atomEnvironmentGenerator, FingerprintArguments *fingerprintArguments, AtomInvariantsGenerator *atomInvariantsGenerator, BondInvariantsGenerator *bondInvariantsGenerator, bool ownsAtomInvGenerator, bool ownsBondInvGenerator); template FingerprintGenerator::FingerprintGenerator( AtomEnvironmentGenerator *atomEnvironmentGenerator, FingerprintArguments *fingerprintArguments, AtomInvariantsGenerator *atomInvariantsGenerator, BondInvariantsGenerator *bondInvariantsGenerator, bool ownsAtomInvGenerator, bool ownsBondInvGenerator); template FingerprintGenerator::~FingerprintGenerator() { delete dp_atomEnvironmentGenerator; delete dp_fingerprintArguments; if (df_ownsAtomInvGenerator) { delete dp_atomInvariantsGenerator; } if (df_ownsBondInvGenerator) { delete dp_bondInvariantsGenerator; } } template FingerprintGenerator::~FingerprintGenerator(); template FingerprintGenerator::~FingerprintGenerator(); template std::string FingerprintGenerator::infoString() const; template std::string FingerprintGenerator::infoString() const; template std::string FingerprintGenerator::infoString() const { std::string separator = " --- "; return dp_fingerprintArguments->commonArgumentsString() + separator + dp_fingerprintArguments->infoString() + separator + dp_atomEnvironmentGenerator->infoString() + separator + (dp_atomInvariantsGenerator ? (dp_atomInvariantsGenerator->infoString() + separator) : ("No atom invariants generator" + separator)) + (dp_bondInvariantsGenerator ? (dp_bondInvariantsGenerator->infoString()) : "No bond invariants generator"); } template SparseIntVect *FingerprintGenerator::getFingerprintHelper( const ROMol &mol, const std::vector *fromAtoms, const std::vector *ignoreAtoms, const int confId, const AdditionalOutput *additionalOutput, const std::vector *customAtomInvariants, const std::vector *customBondInvariants, const std::uint64_t fpSize) const { if (additionalOutput) { if (additionalOutput->atomCounts) { additionalOutput->atomCounts->resize(mol.getNumAtoms()); std::fill(additionalOutput->atomCounts->begin(), additionalOutput->atomCounts->end(), 0); } if (additionalOutput->atomToBits) { additionalOutput->atomToBits->resize(mol.getNumAtoms()); std::fill(additionalOutput->atomToBits->begin(), additionalOutput->atomToBits->end(), std::vector()); } if (additionalOutput->bitInfoMap) { additionalOutput->bitInfoMap->clear(); } if (additionalOutput->bitPaths) { additionalOutput->bitPaths->clear(); } } bool hashResults = false; if (fpSize != 0) { hashResults = true; } std::vector *atomInvariants = nullptr; if (customAtomInvariants) { atomInvariants = new std::vector(*customAtomInvariants); } else if (dp_atomInvariantsGenerator) { atomInvariants = dp_atomInvariantsGenerator->getAtomInvariants(mol); } std::vector *bondInvariants = nullptr; if (customBondInvariants) { bondInvariants = new std::vector(*customBondInvariants); } else if (dp_bondInvariantsGenerator) { bondInvariants = dp_bondInvariantsGenerator->getBondInvariants(mol); } // create all atom environments that will generate the bit-ids that will make // up the fingerprint std::vector *> atomEnvironments = dp_atomEnvironmentGenerator->getEnvironments( mol, dp_fingerprintArguments, fromAtoms, ignoreAtoms, confId, additionalOutput, atomInvariants, bondInvariants, hashResults); // allocate the result SparseIntVect *res = nullptr; if (fpSize != 0) { res = new SparseIntVect(fpSize); } else { res = new SparseIntVect(dp_fingerprintArguments->getResultSize()); } // define a mersenne twister with customized parameters. // The standard parameters (used to create boost::mt19937) // result in an RNG that's much too computationally intensive // to seed. // These are the parameters that have been used for the RDKit fingerprint. typedef boost::random::mersenne_twister rng_type; typedef boost::uniform_int<> distrib_type; typedef boost::variate_generator source_type; std::unique_ptr generator; // // if we generate arbitrarily sized ints then mod them down to the // appropriate size, we can guarantee that a fingerprint of // size x has the same bits set as one of size 2x that's been folded // in half. This is a nice guarantee to have. // std::unique_ptr dist; std::unique_ptr randomSource; if (dp_fingerprintArguments->d_numBitsPerFeature > 1) { // we will only create the RNG if we're going to need it generator.reset(new rng_type(42u)); dist.reset(new distrib_type(0, INT_MAX)); randomSource.reset(new source_type(*generator, *dist)); } // iterate over every atom environment and generate bit-ids that will make up // the fingerprint for (auto it = atomEnvironments.begin(); it != atomEnvironments.end(); it++) { OutputType seed = (*it)->getBitId(dp_fingerprintArguments, atomInvariants, bondInvariants, additionalOutput, hashResults, fpSize); auto bitId = seed; if (fpSize != 0) { bitId %= fpSize; } res->setVal(bitId, res->getVal(bitId) + 1); // do the additional bits if required: if (dp_fingerprintArguments->d_numBitsPerFeature > 1) { generator->seed(static_cast(seed)); for (boost::uint32_t bitN = 1; bitN < dp_fingerprintArguments->d_numBitsPerFeature; ++bitN) { bitId = (*randomSource)(); if (fpSize != 0) { bitId %= fpSize; } res->setVal(bitId, res->getVal(bitId) + 1); } } delete (*it); } delete atomInvariants; delete bondInvariants; return res; } template SparseIntVect *FingerprintGenerator::getSparseCountFingerprint( const ROMol &mol, const std::vector *fromAtoms, const std::vector *ignoreAtoms, const int confId, const AdditionalOutput *additionalOutput, const std::vector *customAtomInvariants, const std::vector *customBondInvariants) const { return getFingerprintHelper(mol, fromAtoms, ignoreAtoms, confId, additionalOutput, customAtomInvariants, customBondInvariants); } // todo getSparseFingerprint does not completely produce the same output as // getSparseCountFingerprint. Count simulation and potential 64 bit outputs // makes size limiting necessary for getSparseFingerprint. This can be // changed if there is another way to avoid the size limitation of SparseBitVect template SparseBitVect *FingerprintGenerator::getSparseFingerprint( const ROMol &mol, const std::vector *fromAtoms, const std::vector *ignoreAtoms, const int confId, const AdditionalOutput *additionalOutput, const std::vector *customAtomInvariants, const std::vector *customBondInvariants) const { // make sure the result will fit into SparseBitVect std::uint32_t resultSize = std::min((std::uint64_t)std::numeric_limits::max(), (std::uint64_t)dp_fingerprintArguments->getResultSize()); std::uint32_t effectiveSize = resultSize; if (dp_fingerprintArguments->d_countSimulation) { // effective size needs to be smaller than result size to compansate for // count simulation effectiveSize /= dp_fingerprintArguments->d_countBounds.size(); } SparseIntVect *tempResult = getFingerprintHelper( mol, fromAtoms, ignoreAtoms, confId, additionalOutput, customAtomInvariants, customBondInvariants, effectiveSize); auto *result = new SparseBitVect(resultSize); for (auto val : tempResult->getNonzeroElements()) { if (dp_fingerprintArguments->d_countSimulation) { for (unsigned int i = 0; i < dp_fingerprintArguments->d_countBounds.size(); ++i) { // for every bound in the d_countBounds in dp_fingerprintArguments, set // a bit if the occurrence count is equal or higher than the bound for // that bit const auto &bounds_count = dp_fingerprintArguments->d_countBounds; if (val.second >= static_cast(bounds_count[i])) { result->setBit(val.first * bounds_count.size() + i); } } } else { result->setBit(val.first); } } delete tempResult; return result; } template SparseIntVect *FingerprintGenerator::getCountFingerprint( const ROMol &mol, const std::vector *fromAtoms, const std::vector *ignoreAtoms, const int confId, const AdditionalOutput *additionalOutput, const std::vector *customAtomInvariants, const std::vector *customBondInvariants) const { SparseIntVect *tempResult = getFingerprintHelper( mol, fromAtoms, ignoreAtoms, confId, additionalOutput, customAtomInvariants, customBondInvariants, dp_fingerprintArguments->d_fpSize); auto *result = new SparseIntVect(dp_fingerprintArguments->d_fpSize); for (auto val : tempResult->getNonzeroElements()) { result->setVal(val.first, val.second); } delete tempResult; return result; } template ExplicitBitVect *FingerprintGenerator::getFingerprint( const ROMol &mol, const std::vector *fromAtoms, const std::vector *ignoreAtoms, const int confId, const AdditionalOutput *additionalOutput, const std::vector *customAtomInvariants, const std::vector *customBondInvariants) const { std::uint32_t effectiveSize = dp_fingerprintArguments->d_fpSize; if (dp_fingerprintArguments->d_countSimulation) { // effective size needs to be smaller than result size to compansate for // count simulation effectiveSize /= dp_fingerprintArguments->d_countBounds.size(); } SparseIntVect *tempResult = getFingerprintHelper( mol, fromAtoms, ignoreAtoms, confId, additionalOutput, customAtomInvariants, customBondInvariants, effectiveSize); auto *result = new ExplicitBitVect(dp_fingerprintArguments->d_fpSize); for (auto val : tempResult->getNonzeroElements()) { if (dp_fingerprintArguments->d_countSimulation) { for (unsigned int i = 0; i < dp_fingerprintArguments->d_countBounds.size(); ++i) { // for every bound in the d_countBounds in dp_fingerprintArguments, set // a bit if the occurrence count is equal or higher than the bound for // that bit const auto &bounds_count = dp_fingerprintArguments->d_countBounds; if (val.second >= static_cast(bounds_count[i])) { result->setBit(val.first * bounds_count.size() + i); } } } else { result->setBit(val.first); } } delete tempResult; return result; } template RDKIT_FINGERPRINTS_EXPORT SparseIntVect *FingerprintGenerator::getSparseCountFingerprint( const ROMol &mol, const std::vector *fromAtoms, const std::vector *ignoreAtoms, const int confId, const AdditionalOutput *additionalOutput, const std::vector *customAtomInvariants, const std::vector *customBondInvariants) const; template RDKIT_FINGERPRINTS_EXPORT SparseIntVect *FingerprintGenerator::getSparseCountFingerprint( const ROMol &mol, const std::vector *fromAtoms, const std::vector *ignoreAtoms, const int confId, const AdditionalOutput *additionalOutput, const std::vector *customAtomInvariants, const std::vector *customBondInvariants) const; template RDKIT_FINGERPRINTS_EXPORT SparseBitVect * FingerprintGenerator::getSparseFingerprint( const ROMol &mol, const std::vector *fromAtoms, const std::vector *ignoreAtoms, const int confId, const AdditionalOutput *additionalOutput, const std::vector *customAtomInvariants, const std::vector *customBondInvariants) const; template RDKIT_FINGERPRINTS_EXPORT SparseBitVect * FingerprintGenerator::getSparseFingerprint( const ROMol &mol, const std::vector *fromAtoms, const std::vector *ignoreAtoms, const int confId, const AdditionalOutput *additionalOutput, const std::vector *customAtomInvariants, const std::vector *customBondInvariants) const; template RDKIT_FINGERPRINTS_EXPORT SparseIntVect *FingerprintGenerator::getCountFingerprint( const ROMol &mol, const std::vector *fromAtoms, const std::vector *ignoreAtoms, const int confId, const AdditionalOutput *additionalOutput, const std::vector *customAtomInvariants, const std::vector *customBondInvariants) const; template RDKIT_FINGERPRINTS_EXPORT SparseIntVect *FingerprintGenerator::getCountFingerprint( const ROMol &mol, const std::vector *fromAtoms, const std::vector *ignoreAtoms, const int confId, const AdditionalOutput *additionalOutput, const std::vector *customAtomInvariants, const std::vector *customBondInvariants) const; template RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * FingerprintGenerator::getFingerprint( const ROMol &mol, const std::vector *fromAtoms, const std::vector *ignoreAtoms, const int confId, const AdditionalOutput *additionalOutput, const std::vector *customAtomInvariants, const std::vector *customBondInvariants) const; template RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * FingerprintGenerator::getFingerprint( const ROMol &mol, const std::vector *fromAtoms, const std::vector *ignoreAtoms, const int confId, const AdditionalOutput *additionalOutput, const std::vector *customAtomInvariants, const std::vector *customBondInvariants) const; SparseIntVect *getSparseCountFP(const ROMol &mol, FPType fPType) { std::vector tempVect(1, &mol); return (*getSparseCountFPBulk(tempVect, fPType))[0]; } SparseBitVect *getSparseFP(const ROMol &mol, FPType fPType) { std::vector tempVect(1, &mol); return (*getSparseFPBulk(tempVect, fPType))[0]; } SparseIntVect *getCountFP(const ROMol &mol, FPType fPType) { std::vector tempVect(1, &mol); return (*getCountFPBulk(tempVect, fPType))[0]; } ExplicitBitVect *getFP(const ROMol &mol, FPType fPType) { std::vector tempVect(1, &mol); return (*getFPBulk(tempVect, fPType))[0]; } std::vector *> *getSparseCountFPBulk( const std::vector molVector, FPType fPType) { FingerprintGenerator *generator = nullptr; switch (fPType) { case FPType::AtomPairFP: { generator = AtomPair::getAtomPairGenerator(); break; } case FPType::MorganFP: { generator = MorganFingerprint::getMorganGenerator(2); break; } case FPType::RDKitFP: { generator = RDKitFP::getRDKitFPGenerator(); break; } case FPType::TopologicalTorsionFP: { generator = TopologicalTorsion::getTopologicalTorsionGenerator(); break; } default: { throw UnimplementedFPException( "Fingerprint type not implemented for getSparseCountFP"); } } auto *res = new std::vector *>(); for (const auto *mol : molVector) { res->push_back(generator->getSparseCountFingerprint(*mol)); } delete generator; return res; } std::vector *getSparseFPBulk( const std::vector molVector, FPType fPType) { FingerprintGenerator *generator = nullptr; switch (fPType) { case FPType::AtomPairFP: { generator = AtomPair::getAtomPairGenerator(); break; } case FPType::MorganFP: { generator = MorganFingerprint::getMorganGenerator(2); break; } case FPType::RDKitFP: { generator = RDKitFP::getRDKitFPGenerator(); break; } case FPType::TopologicalTorsionFP: { generator = TopologicalTorsion::getTopologicalTorsionGenerator(); break; } default: { throw UnimplementedFPException( "Fingerprint type not implemented for getSparseFP"); } } auto *res = new std::vector(); for (const auto *mol : molVector) { res->push_back(generator->getSparseFingerprint(*mol)); } delete generator; return res; } std::vector *> *getCountFPBulk( const std::vector molVector, FPType fPType) { FingerprintGenerator *generator = nullptr; switch (fPType) { case FPType::AtomPairFP: { generator = AtomPair::getAtomPairGenerator(); break; } case FPType::MorganFP: { generator = MorganFingerprint::getMorganGenerator(2); break; } case FPType::RDKitFP: { generator = RDKitFP::getRDKitFPGenerator(); break; } case FPType::TopologicalTorsionFP: { generator = TopologicalTorsion::getTopologicalTorsionGenerator(); break; } default: { throw UnimplementedFPException( "Fingerprint type not implemented for getCountFP"); } } auto *res = new std::vector *>(); for (const auto *mol : molVector) { res->push_back(generator->getCountFingerprint(*mol)); } delete generator; return res; } std::vector *getFPBulk( const std::vector molVector, FPType fPType) { FingerprintGenerator *generator = nullptr; switch (fPType) { case FPType::AtomPairFP: { generator = AtomPair::getAtomPairGenerator(); break; } case FPType::MorganFP: { generator = MorganFingerprint::getMorganGenerator(2); break; } case FPType::RDKitFP: { generator = RDKitFP::getRDKitFPGenerator(); break; } case FPType::TopologicalTorsionFP: { generator = TopologicalTorsion::getTopologicalTorsionGenerator(); break; } default: { throw UnimplementedFPException( "Fingerprint type not implemented for getFP"); } } auto *res = new std::vector(); for (const auto *mol : molVector) { res->push_back(generator->getFingerprint(*mol)); } delete generator; return res; } } // namespace RDKit