-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathprocess.py
68 lines (57 loc) · 1.92 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from collections import defaultdict
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from math import log10
from rdkit import DataStructs
from rdkit.Chem import MolStandardize
from neutral import NeutraliseCharges
from multiprocessing import Pool
from rdkit import RDLogger
import pandas as pd
from rdkit.Chem.MolStandardize import rdMolStandardize
# Disable RDKit logging for every log channel matching 'rdApp.*' at import time.
RDLogger.DisableLog('rdApp.*')
def canonicalize_smiles(smiles):
    """Return a neutralised SMILES for the largest fragment of *smiles*.

    The input is parsed with RDKit, reduced to its largest fragment, written
    back as isomeric SMILES and passed through ``NeutraliseCharges``.

    :param smiles: A SMILES string.
    :returns: The processed SMILES, or ``''`` when the input is empty, is not
        a string, or cannot be parsed by RDKit.
    :rtype: str
    """
    # Missing values (None, or NaN coming out of a pandas column) are treated
    # like an empty string instead of raising TypeError from len().
    if not isinstance(smiles, str) or not smiles:
        return ''

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return ''

    # NOTE(review): the pure-Python MolStandardize.fragment module may not be
    # available in newer RDKit releases - confirm before upgrading RDKit.
    largest = MolStandardize.fragment.LargestFragmentChooser().choose(mol)
    fragment_smiles = Chem.MolToSmiles(largest, isomericSmiles=True)

    # NeutraliseCharges returns a pair; only the SMILES element is used here.
    neutral_smiles, _ = NeutraliseCharges(fragment_smiles)
    return neutral_smiles
def run(line, processes=30):
    """Apply ``process_tautomer`` to every SMILES of *line* in parallel.

    :param line: Object whose ``smiles`` attribute supports
        ``.values.tolist()`` (presumably a pandas DataFrame with a
        ``smiles`` column - verify against the caller).
    :param processes: Number of worker processes in the pool. Defaults to 30,
        the value that used to be hard-coded.
    :returns: A DataFrame with a single ``smiles`` column holding the results
        in input order.
    :rtype: pandas.DataFrame
    """
    smiles_list = line.smiles.values.tolist()

    # Nothing to process: avoid starting worker processes for an empty input.
    if not smiles_list:
        return pd.DataFrame({'smiles': smiles_list})

    # The context manager shuts the workers down once map() has returned;
    # previously the pool was never closed and its processes leaked.
    with Pool(processes) as pool:
        tautomers = pool.map(process_tautomer, smiles_list)

    # The former guard ``smi=='' is None`` was a chained comparison that is
    # always False, so a DataFrame was always returned; that behaviour is kept.
    return pd.DataFrame({'smiles': tautomers})
def process_tautomer(smi):
    """Return the SMILES of the canonical tautomer of *smi*.

    The input is first cleaned with ``canonicalize_smiles`` and the result is
    passed to RDKit's ``TautomerEnumerator.Canonicalize``.

    :param smi: A SMILES string.
    :returns: Isomeric SMILES of the canonical tautomer, or ``''`` when the
        cleaned SMILES is empty or cannot be parsed.
    :rtype: str
    """
    smiles = canonicalize_smiles(smi)
    if not smiles:
        return ''

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles yields None for unparseable input; return '' rather than
    # raising, so one bad structure does not abort a whole batch.
    if mol is None:
        return ''

    enumerator = rdMolStandardize.TautomerEnumerator()
    canonical = enumerator.Canonicalize(mol)
    return Chem.MolToSmiles(canonical, isomericSmiles=True)
def enumerate_tautomers_smiles(smiles):
    """Return a set of tautomers as SMILES strings, given a SMILES string.

    The input is cleaned with ``canonicalize_smiles`` before enumeration.

    :param smiles: A SMILES string.
    :returns: A set of isomeric SMILES strings, one per enumerated tautomer
        (the enumerator is configured with ``maxTautomers = 5``); an empty
        set when the cleaned SMILES cannot be parsed.
    :rtype: set of strings.
    """
    smiles = canonicalize_smiles(smiles)

    # NOTE(review): the molecule is parsed with sanitize=False, as before;
    # confirm that TautomerEnumerator accepts unsanitized molecules.
    mol = Chem.MolFromSmiles(smiles, sanitize=False)
    if mol is None:
        return set()

    # The limit only takes effect when the parameters object is handed to the
    # enumerator; previously it was built (under a misspelled class name) and
    # then never used.
    params = rdMolStandardize.CleanupParameters()
    params.maxTautomers = 5
    tautomers = rdMolStandardize.TautomerEnumerator(params).Enumerate(mol)
    return {Chem.MolToSmiles(m, isomericSmiles=True) for m in tautomers}