Spaces:
Running
on
Zero
Running
on
Zero
| from sklearn.metrics import mean_squared_error, roc_auc_score, r2_score | |
| from rdkit.Chem import QED, Crippen, MolFromSmiles, rdmolops, rdMolDescriptors, AllChem | |
| from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles | |
| import networkx as nx | |
| import os.path as op | |
| import math | |
| #from rdkit.six.moves import cPickle | |
| import _pickle as cPickle | |
| #from rdkit.six import iteritems | |
| from rdkit import Chem | |
| import pickle | |
| import numpy as np | |
| import sys | |
| import os | |
| from rdkit.Chem import RDConfig | |
| sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score')) | |
| import sascorer | |
| from rdkit.DataStructs.cDataStructs import TanimotoSimilarity | |
| from rdkit.Chem.Fingerprints import FingerprintMols | |
def compute_rmse(gt, pred):
    """Return the root-mean-squared error between ``gt`` and ``pred``.

    Args:
        gt: Ground-truth target values, passed straight to scikit-learn.
        pred: Predicted values, passed straight to scikit-learn.

    Returns:
        The RMSE as computed by scikit-learn.
    """
    try:
        # Dedicated RMSE function, available from scikit-learn 1.4 onwards.
        from sklearn.metrics import root_mean_squared_error
    except ImportError:
        # Older scikit-learn: the ``squared`` keyword is still supported there
        # (it was deprecated in 1.4 and removed in 1.6).
        return mean_squared_error(gt, pred, squared=False)
    return root_mean_squared_error(gt, pred)
def compute_r2score(gt, pred):
    """Return scikit-learn's ``r2_score`` of ``pred`` against ``gt``."""
    score = r2_score(gt, pred)
    return score
def compute_roc_auc(gt, pred):
    """Return scikit-learn's ``roc_auc_score`` for labels ``gt`` and scores ``pred``."""
    auc = roc_auc_score(gt, pred)
    return auc
def check_valid(smiles_list):
    """Return the fraction of entries in ``smiles_list`` that are not ``""``.

    An empty string is counted as an invalid entry.

    Args:
        smiles_list: List of SMILES strings.

    Returns:
        A float in [0, 1]; ``0.0`` when ``smiles_list`` is empty (previously
        this case raised ``ZeroDivisionError``).
    """
    total_num = len(smiles_list)
    if total_num == 0:
        return 0.0
    empty_num = smiles_list.count("")
    return 1 - empty_num / float(total_num)
def check_unique(smiles_list):
    """Return the number of distinct non-empty entries divided by the list length.

    Empty strings are excluded from the distinct count but still count toward
    the denominator.

    Args:
        smiles_list: List of SMILES strings.

    Returns:
        A float in [0, 1]; ``0.0`` when ``smiles_list`` is empty (previously
        this case raised ``ZeroDivisionError``).
    """
    total_num = len(smiles_list)
    if total_num == 0:
        return 0.0
    distinct = set(smiles_list)
    distinct.discard("")
    return len(distinct) / float(total_num)
def check_nolvelty(gen_smiles, train_smiles):
    """Return the percentage of ``gen_smiles`` entries absent from ``train_smiles``.

    Note that the result is a percentage (0-100), not a 0-1 ratio.

    Args:
        gen_smiles: Sized collection of generated SMILES strings.
        train_smiles: Collection of training SMILES strings to compare against.

    Returns:
        ``0.`` when ``gen_smiles`` is empty, otherwise
        ``100 * (#entries not found in train_smiles) / len(gen_smiles)``.
    """
    if len(gen_smiles) == 0:
        return 0.

    # Hash a list/tuple of training entries once so each membership test is
    # O(1) instead of a linear scan per generated molecule.
    lookup = train_smiles
    if isinstance(train_smiles, (list, tuple)):
        try:
            lookup = frozenset(train_smiles)
        except TypeError:
            # Unhashable entries: keep the original linear membership test.
            lookup = train_smiles

    try:
        seen = sum(1 for smi in gen_smiles if smi in lookup)
    except TypeError:
        # Unhashable generated entries cannot be probed in a frozenset.
        seen = sum(1 for smi in gen_smiles if smi in train_smiles)

    novel = len(gen_smiles) - seen
    return novel * 100. / len(gen_smiles)
# Module-level cache: fragment id -> score. Filled by readFragmentScores().
_fscores = None


def readFragmentScores(name='fpscores'):
    """Load the gzipped, pickled fragment-score table into ``_fscores``.

    The pickle is expected to hold an iterable of rows whose first element is
    a score and whose remaining elements are the fragment ids sharing that
    score. The rows are flattened into a ``{fragment_id: float(score)}`` dict.

    Args:
        name: Path of the score file without its ``.pkl.gz`` suffix. The
            default ``'fpscores'`` is resolved relative to this module's
            directory.

    Returns:
        None. The result is stored in the module-level ``_fscores``.
    """
    import gzip
    global _fscores
    # generate the full path filename:
    if name == "fpscores":
        name = op.join(op.dirname(__file__), name)
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the gzip handle, which was previously leaked.
    with gzip.open('%s.pkl.gz' % name) as fh:
        rows = cPickle.load(fh)
    # Assign the cache only once the dict is fully built, so a failure cannot
    # leave ``_fscores`` holding the raw row list.
    _fscores = {frag_id: float(row[0]) for row in rows for frag_id in row[1:]}
def numBridgeheadsAndSpiro(mol,ri=None):
    """Return ``(bridgehead_count, spiro_count)`` for ``mol``.

    Both counts come from ``rdMolDescriptors``. ``ri`` is accepted for
    call-site compatibility but is not used.
    """
    spiro_count = rdMolDescriptors.CalcNumSpiroAtoms(mol)
    bridgehead_count = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
    counts = (bridgehead_count, spiro_count)
    return counts
def calculateScore(m):
    """Compute the synthetic-accessibility score of molecule ``m``.

    The raw score is the sum of a fragment term (count-weighted mean of the
    per-fragment scores in ``_fscores``), a complexity penalty term (size,
    stereo centres, spiro atoms, bridgeheads, macrocycles) and a
    fingerprint-density correction. It is then rescaled and clamped to the
    range [1, 10].

    Args:
        m: RDKit molecule.

    Returns:
        The score as a float in [1, 10].

    Raises:
        ZeroDivisionError: If the Morgan fingerprint of ``m`` has no non-zero
            elements.
    """
    if _fscores is None:
        readFragmentScores()

    # fragment score
    fp = rdMolDescriptors.GetMorganFingerprint(m, 2)  # <- 2 is the *radius* of the circular fingerprint
    fps = fp.GetNonzeroElements()
    score1 = 0.
    nf = 0
    # ``iteritems`` (from rdkit.six) is no longer imported in this file, so
    # iterate the dict directly.
    for bitId, v in fps.items():
        nf += v
        # Fragments missing from the table get a score of -4.
        score1 += _fscores.get(bitId, -4) * v
    score1 /= nf

    # features score
    nAtoms = m.GetNumAtoms()
    nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
    ri = m.GetRingInfo()
    nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
    # Rings with more than 8 atoms are counted as macrocycles.
    nMacrocycles = sum(1 for ring in ri.AtomRings() if len(ring) > 8)

    sizePenalty = nAtoms**1.005 - nAtoms
    stereoPenalty = math.log10(nChiralCenters + 1)
    spiroPenalty = math.log10(nSpiro + 1)
    bridgePenalty = math.log10(nBridgeheads + 1)
    macrocyclePenalty = 0.
    # ---------------------------------------
    # This differs from the paper, which defines:
    #  macrocyclePenalty = math.log10(nMacrocycles+1)
    # This form generates better results when 2 or more macrocycles are present
    if nMacrocycles > 0:
        macrocyclePenalty = math.log10(2)

    score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty

    # correction for the fingerprint density
    # not in the original publication, added in version 1.1
    # to make highly symmetrical molecules easier to synthetise
    score3 = 0.
    if nAtoms > len(fps):
        score3 = math.log(float(nAtoms) / len(fps)) * .5

    sascore = score1 + score2 + score3

    # need to transform "raw" value into scale between 1 and 10
    # (named raw_min/raw_max so the builtins min/max are not shadowed)
    raw_min = -4.0
    raw_max = 2.5
    sascore = 11. - (sascore - raw_min + 1) / (raw_max - raw_min) * 9.
    # smooth the 10-end
    if sascore > 8.:
        sascore = 8. + math.log(sascore + 1. - 9.)
    if sascore > 10.:
        sascore = 10.0
    elif sascore < 1.:
        sascore = 1.0

    return sascore
def compute_plogp(mol):
    """Return the penalized logP of ``mol``.

    The value is the sum of three raw (not standardised) terms:
    Crippen logP, the negated score from ``calculateScore``, and the negated
    ring penalty. The ring penalty is ``size - 6`` for the largest cycle in
    the networkx cycle basis of the molecular graph when that cycle has more
    than 6 atoms, and 0 otherwise.
    """
    logp = Crippen.MolLogP(mol)
    sa_term = -calculateScore(mol)

    adjacency = rdmolops.GetAdjacencyMatrix(mol)
    rings = nx.cycle_basis(nx.Graph(adjacency))
    largest_ring = max((len(ring) for ring in rings), default=0)
    # Only rings larger than six atoms are penalised.
    excess = largest_ring - 6 if largest_ring > 6 else 0
    ring_term = -excess

    return logp + sa_term + ring_term
# Module-level cache for the unpickled classifier; filled by load_model().
clf_model = None


def load_model():
    """Unpickle ``drd2_current.pkl`` from this module's directory into ``clf_model``."""
    global clf_model
    module_dir = op.dirname(__file__)
    model_path = op.join(module_dir, 'drd2_current.pkl')
    with open(model_path, "rb") as handle:
        clf_model = pickle.load(handle)
def fingerprints_from_mol(mol):
    """Return a folded count fingerprint of ``mol`` as a ``(1, 2048)`` int32 array.

    A radius-3 Morgan fingerprint (with counts and features enabled) is
    computed, and each non-zero element's count is added to the bin
    ``index % 2048``.
    """
    n_bins = 2048
    morgan = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True)
    folded = np.zeros((1, n_bins), np.int32)
    for bit_id, count in morgan.GetNonzeroElements().items():
        # Ids that collide after folding accumulate in the same bin.
        folded[0, bit_id % n_bins] += int(count)
    return folded
def compute_drd2(mol):
    """Return the classifier's score for ``mol`` as a plain float.

    The classifier is loaded lazily through ``load_model()`` on first use.

    Args:
        mol: RDKit molecule, or a falsy value such as ``None``.

    Returns:
        ``0.0`` when ``mol`` is falsy; otherwise the second column of
        ``clf_model.predict_proba`` for the molecule's fingerprint
        (presumably the probability of the active class; confirm against
        the trained model).
    """
    if clf_model is None:
        load_model()
    if not mol:
        return 0.0
    fp = fingerprints_from_mol(mol)
    score = clf_model.predict_proba(fp)[:, 1]
    # ``score`` is a 1-element array (one input row). Index it explicitly:
    # float() on an array with ndim > 0 is deprecated since NumPy 1.25.
    return float(score[0])
def compute_qed(mol):
    """Return the value of RDKit's ``QED.qed`` for ``mol``."""
    qed_value = QED.qed(mol)
    return qed_value
def compute_logp(mol):
    """Return the value of RDKit's ``Crippen.MolLogP`` for ``mol``."""
    logp_value = Crippen.MolLogP(mol)
    return logp_value
def compute_tpsa(mol):
    """Return the value of RDKit's ``rdMolDescriptors.CalcTPSA`` for ``mol``."""
    tpsa_value = rdMolDescriptors.CalcTPSA(mol)
    return tpsa_value
def compute_sas(mol):
    """Return the score from the RDKit contrib ``sascorer.calculateScore`` for ``mol``.

    This delegates to the imported ``sascorer`` module, not to the
    ``calculateScore`` function defined in this file.
    """
    sa_value = sascorer.calculateScore(mol)
    return sa_value
def check_valid_unique(smiles_list):
    """Return ``(validity, uniqueness)`` for ``smiles_list``.

    ``validity`` is the fraction of entries that are not ``""``.
    ``uniqueness`` is the number of distinct non-empty entries divided by the
    number of non-empty entries.

    Args:
        smiles_list: List of SMILES strings.

    Returns:
        A 2-tuple of floats. ``(0.0, 0.0)`` is returned when the list is empty
        or holds only empty strings (both cases previously raised
        ``ZeroDivisionError``).
    """
    total_num = len(smiles_list)
    empty_num = smiles_list.count("")
    valid_num = total_num - empty_num
    if valid_num == 0:
        # Covers both an empty list and a list of only "" entries.
        return 0.0, 0.0
    distinct = set(smiles_list)
    distinct.discard("")
    return 1 - empty_num / float(total_num), len(distinct) / float(valid_num)
def get_similarity(smiles1, smiles2):
    """Return the Tanimoto similarity between two SMILES strings.

    Both molecules are fingerprinted with ``FingerprintMols.FingerprintMol``.

    Args:
        smiles1: First SMILES string.
        smiles2: Second SMILES string.

    Returns:
        The similarity as a float, or ``np.nan`` when either input is the
        empty string or cannot be parsed into a molecule.
    """
    if smiles1 == "" or smiles2 == "":
        return np.nan
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)
    # MolFromSmiles yields None for an unparseable SMILES; treat it like the
    # empty-string case instead of crashing inside the fingerprinter.
    if mol1 is None or mol2 is None:
        return np.nan
    sim = TanimotoSimilarity(FingerprintMols.FingerprintMol(mol1),
                             FingerprintMols.FingerprintMol(mol2))
    return sim
def get_scaffold(smiles):
    """Return the result of RDKit's ``MurckoScaffoldSmiles`` for ``smiles``."""
    return MurckoScaffoldSmiles(smiles)