// // Copyright (C) 2001-2020 Greg Landrum and Rational Discovery LLC // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include #include #include #include "SmilesParse.h" #include "SmilesParseOps.h" #include #include #include #include #include namespace SmilesParseOps { using namespace RDKit; void ClearAtomChemicalProps(RDKit::Atom *atom) { TEST_ASSERT(atom); atom->setIsotope(0); atom->setFormalCharge(0); atom->setNumExplicitHs(0); } void CheckRingClosureBranchStatus(RDKit::Atom *atom, RDKit::RWMol *mp) { // github #786 and #1652: if the ring closure comes after a branch, // the stereochem is wrong. // This function is called while closing a branch during construction of // the molecule from SMILES and corrects for what happens when parsing odd // (and arguably wrong) SMILES constructs like: // 1) [C@@](F)1(C)CCO1 // 2) C1CN[C@](O)(N)1 // 3) [C@](Cl)(F)1CC[C@H](F)CC1 // In the first two cases the stereochemistry at the chiral atom // needs to be reversed. In the third case the stereochemistry should be // reversed when the Cl is added, but left alone when the F is added. // We recognize these situations using the index of the chiral atom // and the degree of that chiral atom at the time the ring closure // digit is encountered during parsing. // ---------- // github #1972 adds these examples: // 1) [C@@]1(Cl)(F)I.Br1 (ok) // 2) [C@@](Cl)1(F)I.Br1 (reverse) // 3) [C@@](Cl)(F)1I.Br1 (ok) // 4) [C@@](Cl)(F)(I)1.Br1 (reverse) PRECONDITION(atom, "bad atom"); PRECONDITION(mp, "bad mol"); if (atom->getIdx() != mp->getNumAtoms(true) - 1 && (atom->getDegree() == 1 || (atom->getDegree() == 2 && atom->getIdx() != 0) || (atom->getDegree() == 3 && atom->getIdx() == 0)) && (atom->getChiralTag() == Atom::CHI_TETRAHEDRAL_CW || atom->getChiralTag() == Atom::CHI_TETRAHEDRAL_CCW)) { // std::cerr << "crcbs: " << atom->getIdx() << std::endl; atom->invertChirality(); } } void ReportParseError(const char *message, bool throwIt) { PRECONDITION(message, "bad message"); if (!throwIt) { BOOST_LOG(rdErrorLog) << "SMILES Parse Error: " << message << std::endl; } else { throw SmilesParseException(message); } } void CleanupAfterParseError(RWMol *mol) { PRECONDITION(mol, "no molecule"); // blow out any partial bonds: for (auto markI : *mol->getBondBookmarks()) { RWMol::BOND_PTR_LIST &bonds = markI.second; for (auto &bond : bonds) { delete bond; } } } namespace { bool couldBeRingClosure(int val) { return val < 100000 && val >= 0; } } // namespace // // set bondOrder to Bond::IONIC to skip the formation of a bond // between the fragment and the molecule // void AddFragToMol(RWMol *mol, RWMol *frag, Bond::BondType bondOrder, Bond::BondDir bondDir) { PRECONDITION(mol, "no molecule"); PRECONDITION(frag, "no fragment"); PRECONDITION(mol->getActiveAtom(), "no active atom"); Atom *lastAt = mol->getActiveAtom(); int nOrigAtoms = mol->getNumAtoms(); int nOrigBonds = mol->getNumBonds(); // // Add the fragment's atoms and bonds to the molecule: // mol->insertMol(*frag); // // update ring-closure order information on the added atoms: // for (const auto atom : frag->atoms()) { INT_VECT tmpVect; if (atom->getPropIfPresent(common_properties::_RingClosures, tmpVect)) { for (auto &v : tmpVect) { // if the ring closure is not already a bond, don't touch it: if (v >= 0) { v += nOrigBonds; } } auto newAtom = mol->getAtomWithIdx(nOrigAtoms + atom->getIdx()); newAtom->setProp(common_properties::_RingClosures, tmpVect); } } // // ses up the bond between the mol and the branch // if (bondOrder != Bond::IONIC) { // FIX: this is not so much with the elegance... auto firstAt = mol->getAtomWithIdx(nOrigAtoms); int atomIdx1 = firstAt->getIdx(); int atomIdx2 = lastAt->getIdx(); if (frag->hasBondBookmark(ci_LEADING_BOND)) { // std::cout << "found it" << std::endl; const ROMol::BOND_PTR_LIST &leadingBonds = frag->getAllBondsWithBookmark(ci_LEADING_BOND); for (auto leadingBond : leadingBonds) { // we've already got a bond, so just set its local info // and then add it to the molecule intact (no sense doing // any extra work). leadingBond->setOwningMol(mol); leadingBond->setEndAtomIdx(leadingBond->getBeginAtomIdx() + nOrigAtoms); leadingBond->setBeginAtomIdx(atomIdx2); mol->addBond(leadingBond, true); } mol->clearBondBookmark(ci_LEADING_BOND); } else { // SMARTS semantics: unspecified bonds can be single or aromatic if (bondOrder == Bond::UNSPECIFIED) { auto *newB = new QueryBond(Bond::SINGLE); newB->setQuery(makeSingleOrAromaticBondQuery()); newB->setOwningMol(mol); newB->setBeginAtomIdx(atomIdx1); newB->setEndAtomIdx(atomIdx2); newB->setProp(RDKit::common_properties::_unspecifiedOrder, 1); mol->addBond(newB); delete newB; } else { Bond::BondType bo = bondOrder; if (bo == Bond::DATIVEL) { std::swap(atomIdx1, atomIdx2); bo = Bond::DATIVE; } else if (bo == Bond::DATIVER) { bo = Bond::DATIVE; } int idx = mol->addBond(atomIdx2, atomIdx1, bo) - 1; mol->getBondWithIdx(idx)->setBondDir(bondDir); } } } // // okay, the next thing we have to worry about is the possibility // that there might be ring opening/closing in the fragment we just // dealt with e.g. for things like C1C(C1) and C1C.C1 // We deal with this by copying in the bookmarks and partial bonds // that exist in the fragment // for (auto atIt : *frag->getAtomBookmarks()) { // don't bother even considering bookmarks outside // the range used for cycles if (couldBeRingClosure(atIt.first)) { for (auto at2 : atIt.second) { int newIdx = at2->getIdx() + nOrigAtoms; mol->setAtomBookmark(mol->getAtomWithIdx(newIdx), atIt.first); while (frag->hasBondBookmark(atIt.first)) { Bond *b = frag->getBondWithBookmark(atIt.first); int atomIdx1 = b->getBeginAtomIdx() + nOrigAtoms; b->setOwningMol(mol); b->setBeginAtomIdx(atomIdx1); mol->setBondBookmark(b, atIt.first); frag->clearBondBookmark(atIt.first, b); } } } } frag->clearAllAtomBookmarks(); frag->clearAllBondBookmarks(); }; typedef std::pair SIZET_PAIR; typedef std::pair INT_PAIR; template bool operator<(const std::pair &p1, const std::pair &p2) { return p1.first < p2.first; } void AdjustAtomChiralityFlags(RWMol *mol) { PRECONDITION(mol, "no molecule"); for (auto atom : mol->atoms()) { Atom::ChiralType chiralType = atom->getChiralTag(); if (chiralType == Atom::CHI_TETRAHEDRAL_CW || chiralType == Atom::CHI_TETRAHEDRAL_CCW) { // // The atom is marked as chiral, set the SMILES-order of the // atom's bonds. This is easy for non-ring-closure bonds, // because the SMILES order is determined solely by the atom // indices. Things are trickier for ring-closure bonds, which we // need to insert into the list in a particular order // INT_VECT ringClosures; atom->getPropIfPresent(common_properties::_RingClosures, ringClosures); #if 0 std::cerr << "CLOSURES: "; std::copy(ringClosures.begin(), ringClosures.end(), std::ostream_iterator(std::cerr, " ")); std::cerr << std::endl; #endif std::list neighbors; // push this atom onto the list of neighbors (we'll use this // to find our place later): neighbors.emplace_back(atom->getIdx(), -1); std::list bondOrder; for (auto nbrIdx : boost::make_iterator_range(mol->getAtomNeighbors(atom))) { Bond *nbrBond = mol->getBondBetweenAtoms(atom->getIdx(), nbrIdx); if (std::find(ringClosures.begin(), ringClosures.end(), static_cast(nbrBond->getIdx())) == ringClosures.end()) { neighbors.emplace_back(nbrIdx, nbrBond->getIdx()); } } // sort the list of non-ring-closure bonds: neighbors.sort(); // find the location of this atom. it pretty much has to be // first in the list, e.g for smiles like [C@](F)(Cl)(Br)I, or // second (everything else). auto selfPos = neighbors.begin(); if (selfPos->first != atom->getIdx()) { ++selfPos; } CHECK_INVARIANT(selfPos->first == atom->getIdx(), "weird atom ordering"); // copy over the bond ids: INT_LIST bondOrdering; for (auto neighborIt = neighbors.begin(); neighborIt != neighbors.end(); ++neighborIt) { if (neighborIt != selfPos) { bondOrdering.push_back(rdcast(neighborIt->second)); } else { // we are not going to add the atom itself, but we will push on // ring closure bonds at this point (if required): bondOrdering.insert(bondOrdering.end(), ringClosures.begin(), ringClosures.end()); } } // ok, we now have the SMILES ordering of the bonds, figure out the // permutation order. // // This whole thing is necessary because the ring-closure bonds // in the SMILES come before the bonds to the other neighbors, but // they come after the neighbors in the molecule we build. // A crude example: // in F[C@](Cl)(Br)I the C-Cl bond is index 1 in both SMILES // and as built // in F[C@]1(Br)I.Cl1 the C-Cl bond is index 1 in the SMILES // and index 3 as built. // int nSwaps = atom->getPerturbationOrder(bondOrdering); // FIX: explain this one: // At least part of what's going on here for degree 3 atoms: // - The first part: if we're at the beginning of the SMILES and have // an explicit H, we need to add a swap. // This is to reflect that [C@](Cl)(F)C is equivalent to Cl[C@@](F)C // but [C@H](Cl)(F)C is fine as-is (The H-C bond is the one you look // down). // - The second part is more complicated and deals with situations like // F[C@]1CCO1. In this case we otherwise end up looking like we need // to invert the chirality, which is bogus. The chirality here needs // to remain @ just as it does in F[C@](Cl)CCO1 // - We have to be careful with the second part to not sweep things like // C[S@]2(=O).Cl2 into the same bin (was github #760). We detect // those cases by looking for unsaturated atoms // if (Canon::chiralAtomNeedsTagInversion( *mol, atom, atom->hasProp(common_properties::_SmilesStart), ringClosures.size())) { ++nSwaps; } // std::cerr << "nswaps " << atom->getIdx() << " " << nSwaps // << std::endl; // std::copy(bondOrdering.begin(), bondOrdering.end(), // std::ostream_iterator(std::cerr, ", ")); // std::cerr << std::endl; if (nSwaps % 2) { atom->invertChirality(); } } } } // namespace SmilesParseOps Bond::BondType GetUnspecifiedBondType(const RWMol *mol, const Atom *atom1, const Atom *atom2) { PRECONDITION(mol, "no molecule"); PRECONDITION(atom1, "no atom1"); PRECONDITION(atom2, "no atom2"); Bond::BondType res; if (atom1->getIsAromatic() && atom2->getIsAromatic()) { res = Bond::AROMATIC; } else { res = Bond::SINGLE; } return res; } void SetUnspecifiedBondTypes(RWMol *mol) { PRECONDITION(mol, "no molecule"); for (auto bond : mol->bonds()) { if (bond->hasProp(RDKit::common_properties::_unspecifiedOrder)) { bond->setBondType(GetUnspecifiedBondType(mol, bond->getBeginAtom(), bond->getEndAtom())); if (bond->getBondType() == Bond::AROMATIC) { bond->setIsAromatic(true); } else { bond->setIsAromatic(false); } } } } namespace { void swapBondDirIfNeeded(Bond *bond1, const Bond *bond2) { PRECONDITION(bond1, "bad bond1"); PRECONDITION(bond2, "bad bond2"); if (bond1->getBondDir() == Bond::NONE && bond2->getBondDir() != Bond::NONE) { bond1->setBondDir(bond2->getBondDir()); if (bond1->getBeginAtom() != bond2->getBeginAtom()) { switch (bond1->getBondDir()) { case Bond::ENDDOWNRIGHT: bond1->setBondDir(Bond::ENDUPRIGHT); break; case Bond::ENDUPRIGHT: bond1->setBondDir(Bond::ENDDOWNRIGHT); break; default: break; } } } } } // namespace void CloseMolRings(RWMol *mol, bool toleratePartials) { // Here's what we want to do here: // loop through the molecule's atom bookmarks // for each bookmark: // connect pairs of atoms sharing that bookmark // left to right (in the order in which they were // inserted into the molecule). // whilst doing this, we have to be cognizant of the fact that // there may well be partial bonds in the molecule which need // to be tied in as well. WOO HOO! IT'S A BIG MESS! PRECONDITION(mol, "no molecule"); auto bookmarkIt = mol->getAtomBookmarks()->begin(); while (bookmarkIt != mol->getAtomBookmarks()->end()) { auto &bookmark = *bookmarkIt; // don't bother even considering bookmarks outside // the range used for rings if (couldBeRingClosure(bookmark.first)) { RWMol::ATOM_PTR_LIST bookmarkedAtomsToRemove; auto atomIt = bookmark.second.begin(); auto atomsEnd = bookmark.second.end(); while (atomIt != atomsEnd) { Atom *atom1 = *atomIt; ++atomIt; if (!toleratePartials && atomIt == atomsEnd) { ReportParseError("unclosed ring"); } else if (atomIt != atomsEnd && *atomIt == atom1) { // make sure we don't try to connect an atom to itself // this was github #1925 auto fmt = boost::format{ "duplicated ring closure %1% bonds atom %2% to itself"} % bookmark.first % atom1->getIdx(); std::string msg = fmt.str(); ReportParseError(msg.c_str(), true); } else if (mol->getBondBetweenAtoms(atom1->getIdx(), (*atomIt)->getIdx()) != nullptr) { auto fmt = boost::format{ "ring closure %1% duplicates bond between atom %2% and atom " "%3%"} % bookmark.first % atom1->getIdx() % (*atomIt)->getIdx(); std::string msg = fmt.str(); ReportParseError(msg.c_str(), true); } else if (atomIt != atomsEnd) { // we actually found an atom, so connect it to the first Atom *atom2 = *atomIt; ++atomIt; int bondIdx = -1; // We're guaranteed two partial bonds, one for each time // the ring index was used. We give the first specification // priority. CHECK_INVARIANT(mol->hasBondBookmark(bookmark.first), "Missing bond bookmark"); // now use the info from the partial bond: // The partial bond itself will have a proper order and // directionality (with a minor caveat documented below) and will // have its beginning atom set already: RWMol::BOND_PTR_LIST bonds = mol->getAllBondsWithBookmark(bookmark.first); auto bondIt = bonds.begin(); CHECK_INVARIANT(bonds.size() >= 2, "Missing bond"); // get pointers to the two bonds: Bond *bond1 = *bondIt; ++bondIt; Bond *bond2 = *bondIt; // remove those bonds from the bookmarks: mol->clearBondBookmark(bookmark.first, bond1); mol->clearBondBookmark(bookmark.first, bond2); // Make sure the bonds have the correct starting atoms: CHECK_INVARIANT(bond1->getBeginAtomIdx() == atom1->getIdx(), "bad begin atom"); CHECK_INVARIANT(bond2->getBeginAtomIdx() == atom2->getIdx(), "bad begin atom"); // we use the _cxsmilesBondIdx value from the second one, if it's // there if (bond2->hasProp("_cxsmilesBondIdx")) { bond1->setProp("_cxsmilesBondIdx", bond2->getProp("_cxsmilesBondIdx")); } Bond *matchedBond; // figure out which (if either) bond has a specified type, we'll // keep that one. We also need to update the end atom index to // match FIX: daylight barfs when you give it multiple specs for the // closure // bond, we'll just take the first one and ignore others // NOTE: we used to do this the other way (take the last // specification), // but that turned out to be troublesome in odd cases like // C1CC11CC1. // std::cerr << ">-------------" << std::endl; // std::cerr << atom1->getIdx() << "-" << atom2->getIdx() << ": " // << bond1->getBondType() << "(" // << bond1->hasProp(common_properties::_unspecifiedOrder) // << "):" << bond1->getBondDir() << " " // << bond2->getBondType() << "(" // << bond2->hasProp(common_properties::_unspecifiedOrder) // << "):" << bond2->getBondDir() << std::endl; if (!bond1->hasProp(common_properties::_unspecifiedOrder)) { matchedBond = bond1; if (matchedBond->getBondType() == Bond::DATIVEL) { matchedBond->setBeginAtomIdx(atom2->getIdx()); matchedBond->setEndAtomIdx(atom1->getIdx()); matchedBond->setBondType(Bond::DATIVE); } else if (matchedBond->getBondType() == Bond::DATIVER) { matchedBond->setEndAtomIdx(atom2->getIdx()); matchedBond->setBondType(Bond::DATIVE); } else { matchedBond->setEndAtomIdx(atom2->getIdx()); } swapBondDirIfNeeded(bond1, bond2); delete bond2; } else { matchedBond = bond2; if (matchedBond->getBondType() == Bond::DATIVEL) { matchedBond->setBeginAtomIdx(atom1->getIdx()); matchedBond->setEndAtomIdx(atom2->getIdx()); matchedBond->setBondType(Bond::DATIVE); } else if (matchedBond->getBondType() == Bond::DATIVER) { matchedBond->setEndAtomIdx(atom1->getIdx()); matchedBond->setBondType(Bond::DATIVE); } else { matchedBond->setEndAtomIdx(atom1->getIdx()); } swapBondDirIfNeeded(bond2, bond1); delete bond1; } if (matchedBond->getBondType() == Bond::UNSPECIFIED && !matchedBond->hasQuery()) { Bond::BondType bondT = GetUnspecifiedBondType(mol, atom1, atom2); matchedBond->setBondType(bondT); } matchedBond->setOwningMol(mol); if (matchedBond->getBondType() == Bond::AROMATIC) { matchedBond->setIsAromatic(true); } // add the bond: bondIdx = mol->addBond(matchedBond, true); // we found a bond, so update the atom's _RingClosures // property: if (bondIdx > -1) { CHECK_INVARIANT( atom1->hasProp(common_properties::_RingClosures) && atom2->hasProp(common_properties::_RingClosures), "somehow atom doesn't have _RingClosures property."); INT_VECT closures; atom1->getProp(common_properties::_RingClosures, closures); auto closurePos = std::find(closures.begin(), closures.end(), -(bookmark.first + 1)); CHECK_INVARIANT(closurePos != closures.end(), "could not find bookmark in atom _RingClosures"); *closurePos = bondIdx - 1; atom1->setProp(common_properties::_RingClosures, closures); atom2->getProp(common_properties::_RingClosures, closures); closurePos = std::find(closures.begin(), closures.end(), -(bookmark.first + 1)); CHECK_INVARIANT(closurePos != closures.end(), "could not find bookmark in atom _RingClosures"); *closurePos = bondIdx - 1; atom2->setProp(common_properties::_RingClosures, closures); } bookmarkedAtomsToRemove.push_back(atom1); bookmarkedAtomsToRemove.push_back(atom2); } } int mark = bookmark.first; ++bookmarkIt; for (const auto atom : bookmarkedAtomsToRemove) { mol->clearAtomBookmark(mark, atom); } } else { ++bookmarkIt; } } }; void CleanupAfterParsing(RWMol *mol) { PRECONDITION(mol, "no molecule"); for (auto atom : mol->atoms()) { atom->clearProp(common_properties::_RingClosures); atom->clearProp(common_properties::_SmilesStart); } for (auto bond : mol->bonds()) { bond->clearProp(common_properties::_unspecifiedOrder); bond->clearProp("_cxsmilesBondIdx"); } for (auto sg : RDKit::getSubstanceGroups(*mol)) { sg.clearProp("_cxsmilesindex"); } } } // end of namespace SmilesParseOps