RDKit
Open-source cheminformatics and machine learning.
RGroupData.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2017 Novartis Institutes for BioMedical Research
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #ifndef RGROUP_DATA
11 #define RGROUP_DATA
12 
13 #include "../RDKitBase.h"
14 #include "RGroupUtils.h"
19 #include <boost/scoped_ptr.hpp>
20 #include <set>
21 #include <vector>
22 #include <regex>
23 
24 namespace RDKit {
25 
26 //! A single rgroup attached to a given core.
27 struct RGroupData {
28  boost::shared_ptr<RWMol> combinedMol;
29  std::vector<boost::shared_ptr<ROMol>> mols; // All the mols in the rgroup
30  std::vector<std::string> smilesVect; // used for rgroup equivalence
31  std::string
32  smiles; // smiles for all the mols in the rgroup (with attachments)
33  std::set<int> attachments; // core attachment points
34  std::unique_ptr<ExplicitBitVect>
35  fingerprint; // fingerprint for score calculations
36  std::vector<int> fingerprintOnBits;
37  bool is_hydrogen = false;
38  bool single_fragment = true;
39  bool multiple_attachments = false;
40  bool is_linker = false;
41  bool labelled = false;
42 
43  private:
44  RGroupData(const RGroupData &rhs);
45 
46  public:
48 
49  void add(boost::shared_ptr<ROMol> newMol,
50  const std::vector<int> &rlabel_attachments) {
51  // some fragments can be add multiple times if they are cyclic
52  for (auto &mol : mols) {
53  if (newMol.get() == mol.get()) {
54  return;
55  }
56  }
57 
58  if (mols.size() > 0) {
59  // don't add extraneous hydrogens
60  if (isMolHydrogen(*newMol)) {
61  return;
62  }
63  if (is_hydrogen) {
64  // if we are adding a heavy attachment to hydrogens, discard the
65  // hydrogen and start over
66  combinedMol = nullptr;
67  smilesVect.clear();
68  attachments.clear();
69  mols.clear();
70  }
71  }
72 
73  labelled = false;
74  std::copy(rlabel_attachments.begin(), rlabel_attachments.end(),
75  std::inserter(attachments, attachments.end()));
76 
77  mols.push_back(newMol);
78  static const std::regex remove_isotopes_regex("\\[\\d*\\*\\]");
79  // remove the isotope labels from the SMILES string to avoid
80  // that identical R-group are perceived as different when
81  // MCS alignment is not used (NoAlign flag)
82  smilesVect.push_back(std::regex_replace(MolToSmiles(*newMol, true),
83  remove_isotopes_regex, "*"));
84  if (!combinedMol.get()) {
85  combinedMol = boost::shared_ptr<RWMol>(new RWMol(*mols[0].get()));
86  } else {
87  ROMol *m = combineMols(*combinedMol.get(), *newMol.get());
88  single_fragment = false;
89  m->updateProps(*combinedMol.get());
90  combinedMol.reset(new RWMol(*m));
91  delete m;
92  }
93  smiles = getSmiles();
95  computeIsHydrogen();
96  is_linker = single_fragment && attachments.size() > 1;
97  }
98 
99  std::map<int, int> getNumBondsToRlabels() const {
100  std::map<int, int> rlabelsUsedCount;
101 
102  for (ROMol::AtomIterator atIt = combinedMol->beginAtoms();
103  atIt != combinedMol->endAtoms(); ++atIt) {
104  Atom *atom = *atIt;
105  int rlabel;
106  if (atom->getPropIfPresent<int>(RLABEL, rlabel)) {
107  rlabelsUsedCount[rlabel] += 1;
108  }
109  }
110  return rlabelsUsedCount;
111  }
112 
113  std::string toString() const {
114  auto attachmentString = std::accumulate(
115  attachments.cbegin(), attachments.cend(), std::string(),
116  [](std::string s, int a) {
117  return s.empty() ? std::to_string(a)
118  : std::move(s) + ',' + std::to_string(a);
119  });
120  std::stringstream ss;
121  ss << "RG " << attachmentString << " " << getSmiles();
122  return ss.str();
123  }
124 
125  private:
126  void computeIsHydrogen() { // is the rgroup all Hs
127  for (const auto &mol : mols) {
128  if (!isMolHydrogen(*mol)) {
129  is_hydrogen = false;
130  return;
131  }
132  }
133  is_hydrogen = true;
134  }
135 
136  bool isMolHydrogen(ROMol &mol) {
137  for (ROMol::AtomIterator atIt = mol.beginAtoms(); atIt != mol.endAtoms();
138  ++atIt) {
139  if ((*atIt)->getAtomicNum() > 1) {
140  return false;
141  }
142  }
143  return true;
144  }
145 
146  //! compute the canonical smiles for the attachments (bug: removes dupes since
147  //! we are using a set...)
148  std::string getSmiles() const {
149  std::string s;
150  for (const auto &it : smilesVect) {
151  if (s.length()) {
152  s += ".";
153  }
154  s += it;
155  }
156  return s;
157  }
158 };
159 } // namespace RDKit
160 
161 #endif
The class for representing atoms.
Definition: Atom.h:68
bool getPropIfPresent(const std::string &key, T &res) const
Definition: RDProps.h:116
RWMol is a molecule class that is intended to be edited.
Definition: RWMol.h:32
RDKIT_RDGENERAL_EXPORT const std::string internalRgroupSmiles
Std stuff.
Definition: Abbreviations.h:18
RDKIT_CHEMTRANSFORMS_EXPORT ROMol * combineMols(const ROMol &mol1, const ROMol &mol2, RDGeom::Point3D offset=RDGeom::Point3D(0, 0, 0))
Combines two molecules to create a new one.
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params)
returns canonical SMILES for a molecule
RDKIT_RGROUPDECOMPOSITION_EXPORT const std::string RLABEL
A single rgroup attached to a given core.
Definition: RGroupData.h:27
boost::shared_ptr< RWMol > combinedMol
Definition: RGroupData.h:28
std::vector< boost::shared_ptr< ROMol > > mols
Definition: RGroupData.h:29
std::set< int > attachments
Definition: RGroupData.h:33
std::vector< int > fingerprintOnBits
Definition: RGroupData.h:36
bool multiple_attachments
Definition: RGroupData.h:39
std::string toString() const
Definition: RGroupData.h:113
std::vector< std::string > smilesVect
Definition: RGroupData.h:30
std::string smiles
Definition: RGroupData.h:32
std::map< int, int > getNumBondsToRlabels() const
Definition: RGroupData.h:99
void add(boost::shared_ptr< ROMol > newMol, const std::vector< int > &rlabel_attachments)
Definition: RGroupData.h:49
std::unique_ptr< ExplicitBitVect > fingerprint
Definition: RGroupData.h:35