-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils_activeLearning_Oliver.py
206 lines (139 loc) · 5.33 KB
/
utils_activeLearning_Oliver.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import os, sys, time
import numpy as np
import pandas as pd
#import selfies
from tqdm import tqdm
#import selfies_024 as sf
import selfies as sf
#### Module-level data, loaded once at import time.

#### Library CSV used for the VAE (described as the ZINC training data by the
#### original author); it must provide a 'SELFIES' column.
csc_df = pd.read_csv('../Data/VAE_data/csc-library-SMILES.csv')

#### SELFIES strings of the training samples, as a plain Python list.
selfies_list = csc_df['SELFIES'].tolist()

#### SELFIES symbols ("alphabets") read from the 'alphabets' column.
alphabet_list = pd.read_csv('../Data/VAE_data/alphabet-list.csv')['alphabets'].tolist()

#### Symbol count of every training SELFIES (number of '[' characters) and the
#### largest of them, i.e. the maximum length for one-hot encoding.
selfies_len_list = [selfi.count('[') for selfi in selfies_list]
selfies_len_max = max(selfies_len_list)

#### dtype shared by all one-hot arrays built in this module.
hot_dt = np.dtype(bool)
def len_selfies(selfies):
    """
    Return the length of a SELFIES string, measured as the number of
    '[' characters plus the number of '.' characters it contains.
    """
    counted_markers = ("[", ".")
    return sum(selfies.count(marker) for marker in counted_markers)
# filter dataframe using alphabets of the molecules
def check_molecules_alphabets_exists(df):
    """
    Flag which molecules can be written with the training SELFIES alphabet.

    Every SMILES in ``df['smiles']`` is translated to SELFIES and split into
    symbols; a molecule passes only if each of its symbols is present in the
    module-level ``alphabet_list``.

    Parameters
    ----------
    df: DataFrame
        Dataframe containing SMILES of molecules being explored, in a column
        named 'smiles'.

    Returns
    -------
    alpha_exists: List[Bool]
        One entry per row of ``df``, in row order: True when all SELFIES
        symbols of that molecule exist in the alphabet, False otherwise.
    """
    # Build the lookup set once per call; testing against the list would
    # rescan it for every symbol of every molecule.
    known_symbols = set(alphabet_list)

    alpha_exists = []
    for smiles in df['smiles'].values:
        selfies_molecule = sf.encoder(smiles)
        if selfies_molecule is None:
            # Some selfies releases return None (instead of raising) for a
            # SMILES they cannot translate; such a molecule cannot be
            # expressed with the alphabet, so report it as not covered.
            alpha_exists.append(False)
            continue
        alpha_exists.append(
            all(symbol in known_symbols
                for symbol in sf.split_selfies(selfies_molecule))
        )
    return alpha_exists
def get_selfie_and_smiles_encodings_for_dataset(smilesDf):
    """
    Translate a dataframe of SMILES to SELFIES and gather the encodings,
    alphabets and maximum lengths for both representations.

    Parameters
    ----------
    smilesDf: Dataframe
        The SMILES strings must be available as the column 'smiles'.

    Returns
    -------
    A tuple of six items, in this order:
        - SELFIES encoding (list, one SELFIES per input SMILES)
        - SELFIES alphabet (the module-level ``alphabet_list``; it is not
          derived from the input)
        - length of the longest SELFIES, as measured by ``len_selfies``
        - SMILES encoding (array of the input column)
        - SMILES alphabet (distinct characters, plus ' ' for padding)
        - length of the longest SMILES string
    """
    smiles_array = np.asanyarray(smilesDf.smiles)

    # Character-level alphabet; the trailing blank is the padding character.
    smiles_alphabet = list(set(''.join(smiles_array))) + [' ']
    largest_smiles_len = max(map(len, smiles_array))

    print('--> Translating SMILES to SELFIES...')
    encoded_selfies = [sf.encoder(smiles) for smiles in smiles_array]
    largest_selfies_len = max(map(len_selfies, encoded_selfies))
    print('Finished translating SMILES to SELFIES.')

    return (
        encoded_selfies, alphabet_list, largest_selfies_len,
        smiles_array, smiles_alphabet, largest_smiles_len,
    )
def selfies_to_hot(molecule, largest_smile_len, alphabet):
    """
    One-hot encode a single SELFIES string.

    The string is first right-padded with '[epsilon]' symbols until it holds
    ``largest_smile_len`` symbols, then every symbol is mapped to its index
    in ``alphabet``.

    Parameters
    ----------
    molecule: String
        Input SELFIES string.
    largest_smile_len: Int
        Number of symbols to pad up to. A molecule that already has this
        many symbols (or more) is not padded and not truncated.
    alphabet: List
        Alphabets of SELFIES used in training; the position of a symbol in
        this list is its one-hot column.

    Returns
    -------
    Boolean array with one row per symbol (after padding) and one column
    per alphabet entry.

    Raises
    ------
    KeyError
        If a symbol (including the '[epsilon]' padding, when padding is
        applied) is not in ``alphabet``.
    """
    symbol_index = {symbol: position for position, symbol in enumerate(alphabet)}

    # Symbols are counted by their opening bracket; a non-positive repeat
    # count yields an empty string, i.e. no padding.
    missing = largest_smile_len - molecule.count('[')
    padded = molecule + '[epsilon]' * missing

    # Drop the outermost brackets, split on the symbol boundaries and put
    # the brackets back on each piece.
    symbols = ['[' + inner + ']' for inner in padded[1:-1].split('][')]
    columns = [symbol_index[symbol] for symbol in symbols]

    # One row per symbol with a single True in that symbol's column.
    onehot = np.zeros((len(columns), len(alphabet)), dtype=hot_dt)
    onehot[np.arange(len(columns)), columns] = True
    return onehot
def multiple_selfies_to_hot(selfies_list, largest_molecule_len, alphabet):
    """
    One-hot encode every SELFIES string of a list.

    Parameters
    ----------
    selfies_list: List[Strings]
        SELFIES of the probes to be converted to one-hot encoded vectors.
    largest_molecule_len: Int
        Symbol count each SELFIES is padded to (forwarded to
        ``selfies_to_hot``).
    alphabet: List[Strings]
        Alphabets of SELFIES used in training.

    Returns
    -------
    Array stacking the one-hot encoding of each SELFIES, in input order.
    """
    # tqdm only wraps the iteration to display a progress bar.
    encoded = [
        selfies_to_hot(single_selfies, largest_molecule_len, alphabet)
        for single_selfies in tqdm(selfies_list)
    ]
    return np.array(encoded, dtype=hot_dt)
def is_correct_smiles(smiles):
    """
    Check whether RDKit can parse a SMILES string.

    The string is handed to ``CChem.MolFromSmiles`` with ``sanitize=False``,
    so RDKit's sanitization step is skipped and only the parse itself is
    checked.
    NOTE(review): the original description promised a syntactic *and*
    semantic check; confirm whether ``sanitize=True`` was intended.

    Parameters
    ----------
    smiles: String
        SMILES string to check.

    Returns
    -------
    Bool
        False for the empty string, for a SMILES that yields no molecule,
        and when the parser raises; True otherwise.

    Raises
    ------
    NameError
        If ``CChem`` is not defined in this module.
    """
    if smiles == "":
        return False

    # Resolve the parser outside the try block. `CChem` is not imported in
    # the visible part of this module (presumably RDKit's `Chem` -- TODO
    # confirm and add the import); looking it up inside the try would turn
    # the resulting NameError into "invalid SMILES" for every input.
    mol_from_smiles = CChem.MolFromSmiles

    try:
        return mol_from_smiles(smiles, sanitize=False) is not None
    except Exception:
        # Best effort: any parser failure counts as an invalid SMILES.
        return False