-
Notifications
You must be signed in to change notification settings - Fork 4
/
applicability_domains.py
208 lines (172 loc) · 7.29 KB
/
applicability_domains.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import uuid
from functools import partial
from multiprocessing import Pool
from time import gmtime, strftime
import json
import csv
import numpy as np
from guacamol.scoring_function import BatchScoringFunction
from rdkit import Chem
from rdkit.Chem import AllChem
from featurizers import ecfp4
import Levenshtein
class AD():
    """Base class for applicability domains (AD).

    Each applicability domain class inherits from the AD class.
    It must be initialized with a list of SMILES and a type of featurization,
    and implement a check_smiles_list method, that takes as input a list of
    SMILES, and outputs a list of 0 and 1 according to whether each SMILES
    corresponds to a molecule within the AD.
    """

    def __init__(self, reference_molecules, featurization):
        """Store the reference set and the featurizer for use by subclasses.

        Args:
            reference_molecules: list of SMILES defining the domain; kept
                as-is on ``self.ref``.
            featurization: featurizer kept on ``self.featurization``.
                Subclasses in this module call it as
                ``featurization(list_of_smiles)`` and index the result per
                molecule.
        """
        self.ref = reference_molecules
        self.featurization = featurization

    def check_smiles_list(self, smiles_list):
        """Return a list of 0/1 flags, one per SMILES in ``smiles_list``.

        The base class has no domain definition of its own, so concrete
        subclasses must override this method.

        Raises:
            NotImplementedError: always, when called on a class that did not
                override the method.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement check_smiles_list"
        )
class convex_hull(AD):
    """Check whether a molecule is inside the convex hull of the reference set.

    The test actually performed is per feature: every feature that takes one
    single value across the whole reference set must take that same value in
    the query molecule. Features that vary within the reference set are not
    constrained.
    """

    def __init__(self, reference_molecules, featurization):
        """Featurize the reference set and record its constant features.

        Args:
            reference_molecules: list of reference SMILES.
            featurization: callable applied to a list of SMILES; its result is
                indexed per molecule and each entry is converted with
                ``np.array``.

        Attributes set:
            table: raw featurization of the reference set.
            variations: integer array, 1 where a feature is constant over the
                reference set and 0 elsewhere.
            values: the constant value of each constant feature, 0 elsewhere.
            name: the string ``"convex_hull"``.
        """
        super().__init__(reference_molecules, featurization)
        self.table = self.featurization(self.ref)
        # Convert each row once instead of once per (row, feature) pair.
        rows = np.array([np.array(element) for element in self.table])
        # A feature is constant when every row agrees with the first row.
        constant = np.all(rows == rows[0], axis=0)
        self.variations = constant.astype(int)
        self.values = np.where(constant, rows[0], 0)
        self.name = "convex_hull"

    def check_smiles_list(self, smiles_list):
        """Flag each SMILES as inside (1) or outside (0) the domain.

        Args:
            smiles_list: list of SMILES strings to test.

        Returns:
            A list of ints (0 or 1), one per input SMILES. SMILES that RDKit
            cannot parse are reported as 0 without being featurized, in line
            with the other applicability domains of this module.
        """
        is_in_convex_hull = []
        for smiles in smiles_list:
            if Chem.MolFromSmiles(smiles) is None:
                is_in_convex_hull.append(0)
                continue
            X = np.array(self.featurization([smiles])[0])
            # Number of constant features on which the query disagrees with
            # the reference set; any disagreement puts it outside the domain.
            mismatches = np.dot(self.variations, X != self.values)
            is_in_convex_hull.append(int(mismatches == 0))
        return is_in_convex_hull
def jaccard(im1, im2):
    """Return the Jaccard index of two equally-shaped binary arrays.

    Both inputs are converted to boolean arrays (non-zero entries are True);
    the result is ``|im1 AND im2| / |im1 OR im2|``.

    Args:
        im1: array-like of values interpreted as booleans.
        im2: array-like with the same shape as ``im1``.

    Returns:
        The Jaccard index as a float in [0, 1]. When neither input has any
        set entry the union is empty and 0.0 is returned instead of NaN, so
        that the value can be safely compared with a threshold or passed to
        ``max``.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    # The builtin ``bool`` is used because the ``np.bool`` alias no longer
    # exists in recent NumPy releases.
    im1 = np.asarray(im1).astype(bool)
    im2 = np.asarray(im2).astype(bool)
    if im1.shape != im2.shape:
        raise ValueError("Shape mismatch: im1 and im2 must have the same shape.")
    union_size = np.logical_or(im1, im2).sum()
    if union_size == 0:
        return 0.0
    intersection_size = np.logical_and(im1, im2).sum()
    return float(intersection_size) / float(union_size)
class similarity_max(AD):
    """Applicability domain based on the maximum similarity to the reference set.

    A molecule is inside the domain when its highest Jaccard similarity to
    any reference molecule is strictly greater than ``threshold``.
    """

    def __init__(self, reference_molecules, featurization, threshold=0.5):
        """Featurize the reference set and store the similarity threshold.

        Args:
            reference_molecules: list of reference SMILES.
            featurization: callable applied to a list of SMILES; its result is
                indexed per molecule.
            threshold: similarity that must be strictly exceeded for a
                molecule to be inside the domain.
        """
        super().__init__(reference_molecules, featurization)
        self.table = self.featurization(self.ref)
        self.threshold = threshold
        self.name = "maxsim"

    def check_smiles_list(self, smiles_list):
        """Flag each SMILES as inside (1) or outside (0) the domain.

        Args:
            smiles_list: list of SMILES strings to test.

        Returns:
            A list of ints (0 or 1), one per input SMILES. SMILES that RDKit
            cannot parse are reported as 0.
        """
        is_in_AD = []
        for smiles in smiles_list:
            if Chem.MolFromSmiles(smiles):
                X = np.array(self.featurization([smiles])[0])
                # default=0.0 keeps an empty reference table from raising.
                score = max((jaccard(X, y) for y in self.table), default=0.0)
                # Compare first, then convert: ``1 * score > threshold`` would
                # yield a boolean rather than the documented 0/1 flag.
                is_in_AD.append(int(score > self.threshold))
            else:
                is_in_AD.append(0)
        return is_in_AD
class SMILESvalidity(AD):
    """Applicability domain that only requires the SMILES to be parsable."""

    def __init__(self, reference_molecules, featurization):
        """Store the constructor arguments and set the domain name."""
        super().__init__(reference_molecules, featurization)
        self.name = "smiles_validity"

    def check_smiles_list(self, smiles_list):
        """Return 1 for every SMILES RDKit turns into a molecule, 0 otherwise.

        Args:
            smiles_list: list of SMILES strings to test.

        Returns:
            A list of 0/1 flags, one per input SMILES.
        """
        return [1 if Chem.MolFromSmiles(candidate) else 0 for candidate in smiles_list]
class filtersvalidity(AD):
    """Applicability domain based on structural alerts.

    Alerts are SMARTS patterns read from ``data/alerts.csv``. A molecule is
    inside the domain when it matches no alert other than those already
    matched by at least one reference molecule.
    """

    def __init__(self, reference_molecules, featurization):
        """Load the alert collection and record the alerts seen in the references.

        The CSV file is read with the first row skipped; the alert name is
        taken from the third column and its SMARTS from the fourth.

        Args:
            reference_molecules: list of reference SMILES. Entries that RDKit
                cannot parse are skipped, since they cannot match any alert.
            featurization: stored by the base class; not used by this domain.

        Attributes set:
            names: alert names, in file order.
            smarts: alert SMARTS strings, aligned with ``names``.
            names_already_present: names of the alerts matched by at least
                one reference molecule, in order of first match.
        """
        super().__init__(reference_molecules, featurization)
        self.name = "filters_validity"
        alert_collection_path = "data/alerts.csv"
        names = []
        smarts = []
        # newline="" is what the csv module documentation asks for.
        with open(alert_collection_path, newline="") as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            next(csv_reader, None)  # skip the first (header) row
            for row in csv_reader:
                names.append(row[2])
                smarts.append(row[3])

        # Compile every SMARTS once; entries RDKit cannot compile are None
        # and are ignored everywhere below.
        patterns = [Chem.MolFromSmarts(motif) for motif in smarts]

        names_already_present = []
        seen = set()  # same content as the list, for O(1) membership tests
        for smiles in reference_molecules:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                continue
            for alert_name, pattern in zip(names, patterns):
                if pattern is None or alert_name in seen:
                    continue
                if mol.HasSubstructMatch(pattern):
                    seen.add(alert_name)
                    names_already_present.append(alert_name)

        self.names = names
        self.smarts = smarts
        self.names_already_present = names_already_present
        self._patterns = patterns

    def check_smiles_list(self, smiles_list):
        """Flag each SMILES as inside (1) or outside (0) the domain.

        Args:
            smiles_list: list of SMILES strings to test.

        Returns:
            A list of ints (0 or 1), one per input SMILES: 0 when the SMILES
            cannot be parsed or when the molecule matches an alert whose name
            is not in ``names_already_present``, 1 otherwise.
        """
        allowed = set(self.names_already_present)
        # Only alerts absent from the reference set can reject a molecule.
        forbidden = [
            pattern
            for alert_name, pattern in zip(self.names, self._patterns)
            if pattern is not None and alert_name not in allowed
        ]
        is_in_AD = []
        for smiles in smiles_list:
            mol = Chem.MolFromSmiles(smiles)
            if not mol:
                is_in_AD.append(0)
                continue
            has_new_alert = any(mol.HasSubstructMatch(pattern) for pattern in forbidden)
            is_in_AD.append(0 if has_new_alert else 1)
        return is_in_AD
class levenshtein(AD):
    """Applicability domain based on string similarity between SMILES.

    A molecule is inside the domain when the highest ``Levenshtein.ratio``
    between its SMILES and any reference SMILES is strictly greater than
    ``threshold``.
    """

    def __init__(self, reference_molecules, featurization, threshold=0.8):
        """Store the reference SMILES and the similarity threshold.

        Args:
            reference_molecules: list of reference SMILES, compared as raw
                strings.
            featurization: stored by the base class; not used by this domain.
            threshold: ratio that must be strictly exceeded for a molecule to
                be inside the domain.
        """
        super().__init__(reference_molecules, featurization)
        self.table = self.ref
        self.threshold = threshold
        self.name = "levenshtein"

    def check_smiles_list(self, smiles_list):
        """Flag each SMILES as inside (1) or outside (0) the domain.

        Args:
            smiles_list: list of SMILES strings to test.

        Returns:
            A list of ints (0 or 1), one per input SMILES. SMILES that RDKit
            cannot parse are reported as 0.
        """
        is_in_AD = []
        for smiles in smiles_list:
            if Chem.MolFromSmiles(smiles):
                # default=0.0 keeps an empty reference list from raising.
                score = max(
                    (Levenshtein.ratio(smiles, y) for y in self.table),
                    default=0.0,
                )
                # Compare first, then convert: ``1 * score > threshold`` would
                # yield a boolean rather than the documented 0/1 flag.
                is_in_AD.append(int(score > self.threshold))
            else:
                is_in_AD.append(0)
        return is_in_AD
class in_range(AD):
    """Applicability domain defined by per-feature value ranges.

    A molecule is inside the domain when each of its features lies within
    the [min, max] interval observed for that feature over the reference set.
    """

    def __init__(self, reference_molecules, featurization, tolerance=0):
        """Featurize the reference set and record per-feature bounds.

        Args:
            reference_molecules: list of reference SMILES.
            featurization: callable applied to a list of SMILES; its result is
                indexed per molecule and each entry must be convertible to a
                numeric array.
            tolerance: stored on ``self.tolerance``. NOTE(review): nothing in
                this class reads it; the intended semantics (absolute or
                relative margin) should be confirmed before it is applied.

        Attributes set:
            table: raw featurization of the reference set.
            mins, maxs: lists holding the smallest and largest reference
                value of each feature.
            name: the string ``"range"``.
        """
        super().__init__(reference_molecules, featurization)
        self.table = self.featurization(self.ref)
        rows = np.array([np.array(element, dtype=float) for element in self.table])
        # Start from +/-inf rather than fixed sentinels such as 1000/-1000,
        # which give wrong bounds for features outside that interval.
        # fmin/fmax skip NaN entries, like the comparisons they replace.
        self.mins = list(np.fmin.reduce(rows, axis=0, initial=np.inf))
        self.maxs = list(np.fmax.reduce(rows, axis=0, initial=-np.inf))
        self.tolerance = tolerance
        self.name = "range"

    def check_smiles_list(self, smiles_list):
        """Flag each SMILES as inside (1) or outside (0) the domain.

        Args:
            smiles_list: list of SMILES strings to test.

        Returns:
            A list of ints (0 or 1), one per input SMILES. A SMILES gets 0
            when RDKit cannot parse it, when its featurization raises, when
            its feature vector does not have the same shape as the reference
            bounds, or when any feature falls outside the reference range.
        """
        lower = np.asarray(self.mins)
        upper = np.asarray(self.maxs)
        is_in_AD = []
        for smiles in smiles_list:
            try:
                if Chem.MolFromSmiles(smiles):
                    X = np.array(self.featurization([smiles])[0])
                    if X.shape != upper.shape:
                        # Not comparable with the reference bounds.
                        is_in_AD.append(0)
                        continue
                    outside = np.any((X > upper) | (X < lower))
                    is_in_AD.append(0 if outside else 1)
                else:
                    is_in_AD.append(0)
            except Exception:
                # Best effort: a molecule that cannot be featurized or
                # compared is treated as outside the domain. Unlike a bare
                # ``except``, this lets KeyboardInterrupt/SystemExit through.
                is_in_AD.append(0)
        return is_in_AD