RDKit
Open-source cheminformatics and machine learning.
RGroupScore.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2017 Novartis Institutes for BioMedical Research
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #ifndef RGROUP_SCORE_H
11 #define RGROUP_SCORE_H
12 
13 #include "RGroupMatch.h"
14 #include <vector>
15 #include <deque>
16 #include <set>
17 namespace RDKit {
18 
//! iterate through all possible permutations of the rgroups
/*!
  Enumerates every combination of one index per row, where row i offers
  sizes[i] choices. Row 0 is incremented first and carries into the
  following rows when it wraps around.

  Typical use:
  \code
    CartesianProduct cp(sizes);
    while (cp.next()) {
      // use cp.permutation / cp.value()
    }
  \endcode
*/
// NOTE(review): the struct header and the declarations of maxPermutations and
// permutationCount were missing from the listing this block was recovered
// from; they are restored from how the remaining code uses them. The export
// macro referenced elsewhere in this file may belong on this declaration; it
// is omitted because every member is defined inline - confirm.
struct CartesianProduct {
  //! index currently selected in each row
  std::vector<size_t> permutation;
  //! number of choices available in each row
  std::vector<size_t> sizes;
  //! weight of each row in value(): bases[i] is the product of sizes[i+1..]
  std::deque<size_t> bases;
  //! product of all row sizes (1 when there are no rows)
  size_t maxPermutations;
  //! number of times next() has been called
  size_t permutationCount;

  //! \param inputSizes number of choices for each row
  CartesianProduct(const std::vector<size_t> &inputSizes)
      : permutation(inputSizes.size(), 0),
        sizes(inputSizes),
        maxPermutations(1),
        permutationCount(0) {
    // Walk the rows from last to first so that each row's weight is the
    // product of the sizes of the rows that follow it. Using the product of
    // the preceding rows instead makes value() collide when sizes differ
    // (e.g. sizes {2,3}: (1,0) and (0,2) would both map to 2). When all rows
    // have the same size both schemes yield the same weights.
    for (auto it = sizes.rbegin(); it != sizes.rend(); ++it) {
      bases.push_front(maxPermutations);
      maxPermutations *= *it;  // may overflow....
    }
  }

  //! advance to the next permutation
  /*!
    The first call leaves the all-zero permutation in place.
    \return false once all maxPermutations permutations have been visited
  */
  bool next() {
    ++permutationCount;
    if (permutationCount == 1) {
      return true;
    }

    return increment(0);
  }

  //! mixed-radix index of permutation \c p
  /*!
    \param p one index per row; must not have more entries than there are rows
    \return sum of bases[i] * p[i], unique for every valid permutation
  */
  size_t value(const std::vector<size_t> &p) const {
    size_t v = 0;
    for (size_t i = 0; i < p.size(); ++i) {
      v += bases[i] * p[i];
    }
    return v;
  }

  //! mixed-radix index of the current permutation
  size_t value() const { return value(permutation); }

  //! add one to row \c rowToIncrement, carrying into the following rows
  /*!
    \return false when the enumeration is exhausted or the carry runs past
            the last row, true otherwise
  */
  bool increment(size_t rowToIncrement) {
    // NOTE(review): the guard condition was missing from the listing; it is
    // reconstructed as "all permutations already produced" - confirm.
    if (permutationCount > maxPermutations) {
      return false;
    }

    for (size_t row = rowToIncrement; row < sizes.size(); ++row) {
      permutation[row] += 1;
      if (permutation[row] < sizes[row]) {
        return true;
      }
      // this row wrapped around: reset it and carry into the next one
      permutation[row] = 0;
    }
    // carried past the last row: nothing left to enumerate
    return false;
  }
};
70 
72  public:
74  RGroupScorer(const std::vector<std::vector<size_t>> &permutations,
75  double score);
76  //! score the passed permutation of matches
77  double matchScore(const std::vector<size_t> &permutation,
78  const std::vector<std::vector<RGroupMatch>> &matches,
79  const std::set<int> &labels);
80  //! set the passed permutation and score as the best one
81  void setBestPermutation(const std::vector<size_t> &permutation, double score);
82  //! return the best permutation found so far
83  const std::vector<size_t> &getBestPermutation() const {
84  return d_saved.permutation;
85  }
86  //! called when process() starts to initialize State
88  //! store the passed tied permutation for subsequent processing
89  void pushTieToStore(const std::vector<size_t> &permutation);
90  //! find the best permutation across the tied ones that were stored
91  void breakTies(const std::vector<std::vector<RGroupMatch>> &matches,
92  const std::set<int> &labels,
93  const std::unique_ptr<CartesianProduct> &iterator,
94  const std::chrono::steady_clock::time_point &t0,
95  double timeout);
96  //! clear all stored tied permutations
97  void clearTieStore();
98  //! number of stored tied permutations
99  size_t tieStoreSize() const { return d_store.size(); }
100  //! return the best score found so far
101  double getBestScore() const { return d_bestScore; }
102 
103  private:
104  void restoreInitialState() { d_current = d_initial; }
105  struct RLabelData {
106  int numRGroups = 0;
107  std::vector<std::map<std::string, unsigned int>> matchSetVect;
108  std::map<std::set<int>, size_t> linkerMatchSet;
109  };
110  // The State structure stores the state of the RGroupScorer
111  // This allows more efficient scoring of permutations, in that
112  // the score of pruned permutations, which are effectively frozen,
113  // are cached in the State rather than being recomputed on-the-fly
114  // while only permutations in the last chunk are actually scored
115  struct State {
116  // compute the criteria according to which the best
117  // permutation is found across the tied ones
118  void computeTieBreakingCriteria(
119  const std::vector<std::vector<RGroupMatch>> &matches,
120  const std::vector<int> &orderedLabels, std::vector<int> &heavyCounts) {
121  // heavyCounts is a vector which has the same size of labels
122  // for each label we add an increment if a molecule
123  // bears an R-group at that label
124  PRECONDITION(permutation.size() <= matches.size(), "permutation.size() should be <= matches.size()");
125  size_t offset = matches.size() - permutation.size();
126  // numMatchedUserRGroups counts the total number of user labelled r
127  // groups filled in this permutation. We want to maximize this number
128  size_t i = 0;
129  for (int label : orderedLabels) {
130  for (size_t m = 0; m < permutation.size(); ++m) { // for each molecule
131  // Negative labels are assigned to R-groups that were found along
132  // the way (when onlyMatchAtRGroups=false) rather than being
133  // user-specified. For each molecule, check if we add an R-group at
134  // this negative label; if we do, count it once. So we know how many
135  // different negative labels we have filled: we prefer permutations
136  // which fill less, as it means we have added less groups on different
137  // positions
138  const auto &match = matches[m + offset][permutation[m]];
139  auto rg = match.rgroups.find(label);
140  if (rg != match.rgroups.end() && !rg->second->is_hydrogen) {
141  if (label < 0 && heavyCounts.at(i) == 0) {
142  ++numAddedRGroups;
143  } else if (label > 0) {
144  ++numMatchedUserRGroups;
145  }
146  ++heavyCounts[i];
147  }
148  }
149  ++i;
150  }
151  }
152 
153  int N = 0;
154  int numAddedRGroups = 0;
155  int numMatchedUserRGroups = 0;
156  std::map<int, int> heavyCountPerLabel;
157  std::map<int, RLabelData> labelDataMap;
158  std::vector<size_t> permutation;
159  };
160  double d_bestScore = 0.0;
161  // the current State
162  State d_current;
163  // the initial state when process() is called
164  State d_initial;
165  // the best State found so far
166  State d_saved;
167  // the States associated to each tied permutation
168  std::deque<State> d_store;
169 };
170 
171 } // namespace RDKit
172 #endif
#define PRECONDITION(expr, mess)
Definition: Invariant.h:109
void pushTieToStore(const std::vector< size_t > &permutation)
store the passed tied permutation for subsequent processing
void startProcessing()
called when process() starts to initialize State
void setBestPermutation(const std::vector< size_t > &permutation, double score)
set the passed permutation and score as the best one
void breakTies(const std::vector< std::vector< RGroupMatch >> &matches, const std::set< int > &labels, const std::unique_ptr< CartesianProduct > &iterator, const std::chrono::steady_clock::time_point &t0, double timeout)
find the best permutation across the tied ones that were stored
void clearTieStore()
clear all stored tied permutations
RGroupScorer(const std::vector< std::vector< size_t >> &permutations, double score)
const std::vector< size_t > & getBestPermutation() const
return the best permutation found so far
Definition: RGroupScore.h:83
double matchScore(const std::vector< size_t > &permutation, const std::vector< std::vector< RGroupMatch >> &matches, const std::set< int > &labels)
score the passed permutation of matches
size_t tieStoreSize() const
number of stored tied permutations
Definition: RGroupScore.h:99
double getBestScore() const
return the best score found so far
Definition: RGroupScore.h:101
#define RDKIT_RGROUPDECOMPOSITION_EXPORT
Definition: export.h:385
Std stuff.
Definition: Abbreviations.h:18
iterate through all possible permutations of the rgroups
Definition: RGroupScore.h:20
std::vector< size_t > sizes
Definition: RGroupScore.h:22
std::deque< size_t > bases
Definition: RGroupScore.h:23
size_t value(const std::vector< size_t > &p) const
Definition: RGroupScore.h:46
CartesianProduct(const std::vector< size_t > &inputSizes)
Definition: RGroupScore.h:26
bool increment(size_t rowToIncrement)
Definition: RGroupScore.h:56
std::vector< size_t > permutation
Definition: RGroupScore.h:21