# Sanity checks for the conversion helpers: to_smile and to_mol are each
# exercised with a SMILES string and with an already-built Mol as input.
# isinstance is used instead of `type(x) == T` so a subclass of the expected
# type still passes.
assert isinstance(to_smile('CCC'), str)
assert isinstance(to_mol('CCC'), Chem.Mol)
assert isinstance(to_smile(Chem.MolFromSmiles('CCC')), str)
assert isinstance(to_mol(Chem.MolFromSmiles('CCC')), Chem.Mol)

# SMILES -> SELFIES -> SMILES must reproduce the input string exactly.
s = 'O=C(NCc1ccc(Br)cc1F)C(=O)NCC1(Cc2ccccc2)CC1'
assert selfie_to_smile(smile_to_selfie(s)) == s
# Pass a raw RDKit function straight to maybe_parallel and report whether the
# call completed.
try:
    output = maybe_parallel(rdMolDescriptors.CalcExactMolWt, [to_mol('CCC')])
except Exception:
    # `except Exception` rather than a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate instead of being reported as a failed run.
    print('parallel execution failed')
else:
    print('Parallel execution succeeded')
def wrapper(mol):
    """Forward `mol` to rdMolDescriptors.CalcExactMolWt and return its result.

    Defined at module level so that maybe_parallel receives a plain Python
    function rather than the RDKit function object itself.
    """
    return rdMolDescriptors.CalcExactMolWt(mol)


# Same experiment as with the raw RDKit function, but going through the
# hand-written module-level wrapper.
try:
    output = maybe_parallel(wrapper, [to_mol('CCC')])
except Exception:
    # `except Exception` rather than a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate instead of being reported as a failed run.
    print('parallel execution failed')
else:
    print('Parallel execution succeeded')
Sadly, a generic wrapper constructor also fails to pickle: constructing the wrapper requires an RDKit function as input, which brings back the pickle problems (see the code example below). This leaves us with manually defining wrapper functions for RDKit functions.
def rdkit_wrapper(rdkit_func):
    """Return a one-argument closure that calls `rdkit_func` on a molecule.

    Args:
        rdkit_func: callable taking a single molecule argument.

    Returns:
        A function `wrapper(mol)` that returns `rdkit_func(mol)`.
    """
    def wrapper(mol):
        return rdkit_func(mol)
    return wrapper


# Try the generically constructed wrapper through maybe_parallel and report
# whether the call completed.
try:
    output = maybe_parallel(rdkit_wrapper(rdMolDescriptors.CalcExactMolWt), [to_mol('CCC')])
except Exception:
    # `except Exception` rather than a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate instead of being reported as a failed run.
    print('parallel execution failed')
else:
    print('Parallel execution succeeded')
# hbd must be usable with maybe_parallel; record the outcome and assert on it.
try:
    _ = maybe_parallel(hbd, [to_mol('CCC')])
except Exception:
    # `except Exception` rather than a bare `except:` so KeyboardInterrupt and
    # SystemExit are not silently converted into a 'fail' result.
    output = 'fail'
else:
    output = 'success'
assert output == 'success', 'maybe_parallel(hbd, ...) raised an exception'
Substructure Matching
This class is used for substructure matching an input Mol against a list of SMARTS.
Note: Substructure matching is tricky. Be sure to verify your SMARTS before putting a large number of them into a filter.
CatalogMatch functions as a base class to match Mol objects against any generic catalog. has_match returns a single boolean value indicating whether the Mol matches any of the filters in the catalog. get_matches returns a list of bools, one for each element in the catalog. percent_matches returns a list of floats giving the percentage of filters that match.
SMARTSMatch will generate a catalog from a list of SMARTS
PAINSMatch, PAINSAMatch, PAINSBMatch and PAINSCMatch specify different PAINS catalogs present in RDKit (see here)
# SMARTS patterns for the catalog: four patterns over a six-carbon aromatic
# ring with different substituent arrangements, and one six-membered aromatic
# ring containing a nitrogen.
smarts = [
'[*]-[#6]1:[#6]:[#6](-[#0]):[#6]:[#6](-[*]):[#6]:1',
'[*]-[#6]1:[#6]:[#6](-[*]):[#6]:[#6]:[#6]:1',
'[*]-[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1',
'[*]-[#6]1:[#6]:[#6](-[#7]-[*]):[#6]:[#6]:[#6]:1',
'[#6]1:[#6]:[#7]:[#6]:[#6]:[#6]:1'
]
sm = SmartsCatalog(smarts)
# Test molecules, written as SMILES and converted with to_mol.
smiles = [
'c1ccccc1',
'Cc1cc(NC)ccc1',
'Cc1cc(NC)cnc1',
'Cc1cccc(NCc2ccccc2)c1'
]
mols = [to_mol(i) for i in smiles]
# Calling the catalog on a list returns one boolean per molecule.
# NOTE(review): criteria=0.5 looks like a minimum fraction of patterns matched
# and criteria=3 like a minimum match count — confirm against SmartsCatalog.
assert sm(mols, criteria='any') == [False, True, True, True]
assert sm(mols, criteria=0.5) == [False, True, False, True]
# Calling the catalog on a single Mol returns a single value rather than a list.
assert sm(mols[1], criteria=3)==True
Fingerprints
This section deals with creating and manipulating molecular fingerprints. Below are functions for generating different forms of Morgan fingerprints (ECFP4, ECFP6, FCFP4, FCFP6). Fingerprints by default are generated as RDKit ExplicitBitVect objects, but can be converted to numpy arrays using the fp_to_array function.
Fingerprint similarity functions using Tanimoto, Dice and Cosine metrics are implemented for both ExplicitBitVect and ndarray objects.
Note: following cheminformatics convention, fingerprint metrics are implemented as similarities rather than distances. The metrics used have the relationship similarity = 1 - distance. When using fingerprint distance metrics in machine learning applications, be sure you are using the correct relationship (similarity vs. distance) for your task.
When computing similarities between fingerprints, several things need to be lined up. Different methods are needed for different fingerprint formats (ndarray vs ExplicitBitVect) and different distance metrics.
The FP class holds logic to make this easy.
The FP.get_fingerprint function allows for parallel processing of fingerprint generation.
The FP.fingerprint_similarity routes fingerprints to the correct similarity function based on the fingerprint's array type and the similarity metric used.
For cases where instantiating a class isn't helpful, get_fingerprint and fingerprint_similarities work as functional wrappers around FP.
# Generate ECFP4 fingerprints with output_type='rdkit', convert them with
# fp_to_array, and check that Tanimoto similarity computed from the two
# representations agrees.
fp = FP()
fps = fp.get_fingerprint(mols, fp_type='ECFP4', output_type='rdkit')
fps_np = fp_to_array(fps)
assert np.allclose(fp.fingerprint_similarity(fps, fps, 'tanimoto'),
fp.fingerprint_similarity(fps_np, fps_np, 'tanimoto'))
def my_fp(mol):
    """Return the RDKFingerprint of `mol`, after passing it through to_mol."""
    return AllChem.RDKFingerprint(to_mol(mol))


class MyFP(FP):
    """FP subclass that registers `my_fp` under the fingerprint type 'my_fp'."""

    def __init__(self):
        super().__init__()
        # Add the custom generator alongside whatever FP.__init__ registered.
        self.fps['my_fp'] = my_fp


# Use the custom fingerprint type through the normal FP interface.
fp = MyFP()
fps = fp.get_fingerprint(mols, fp_type='my_fp', output_type='rdkit')
def my_dist(fps1, fps2):
    """Return pairwise Russell-Rao similarity between two sets of fingerprints.

    Computed as 1 minus scipy's 'russellrao' distance from cdist, so the
    result has one row per entry of `fps1` and one column per entry of `fps2`.
    """
    # Check that the chosen metric behaves correctly on binary/boolean arrays!
    russellrao_distance = distance.cdist(fps1, fps2, metric='russellrao')
    return 1 - russellrao_distance
def my_dist_rd(fp, fps):
    """Return DataStructs.BulkRusselSimilarity of `fp` against `fps`."""
    # The RDKit result does not always equal the scipy one; verify they agree
    # before relying on both implementations of the same metric.
    similarities = DataStructs.BulkRusselSimilarity(fp, fps)
    return similarities
class MyFP(FP):
    """FP subclass that registers a custom similarity metric, 'my_metric'."""

    def __init__(self):
        super().__init__()
        # One implementation per fingerprint representation.
        metric_impls = {'rdkit' : my_dist_rd,
                        'numpy' : my_dist}
        self.similarities['my_metric'] = metric_impls


# Generate numpy ECFP6 fingerprints and score them with the custom metric.
fp = MyFP()
fps = fp.get_fingerprint(mols, fp_type='ECFP6', output_type='numpy')
fp.fingerprint_similarity(fps, fps, 'my_metric')
Mol Operations
Functions for editing or manipulating Mol objects.
Fragmenting functions like fragment_smile break molecules into fragments by cutting single bonds.
fuse_on_atom_mapping fuses fragments following RDKit's atom mapping conventions.
[*:1]-R1-[*:2] + [*:1]-R2 >> [*:2]-R1-R2
fuse_on_link relies on user-defined linkages such as heavy atoms.
[Rb]-R1-[Pb] + [Rb]-R2 >> [Pb]-R1-R2
# Fragment an 8-carbon chain; the return value is not captured here.
# NOTE(review): the meaning of the second argument ([1]) is not visible in this
# section — presumably the number of cuts to make; confirm against fragment_smile.
fragment_smile('CCCCCCCC', [1])
# Fragments sharing the same atom-map number are fused; input may be a SMILES
# string or a Mol.
assert fuse_on_atom_mapping('[*:1]CC.[*:1]CC') == 'CCCC'
assert fuse_on_atom_mapping(to_mol('[*:1]CC.[*:1]CC')) == 'CCCC'
# Different map numbers ([*:1] vs [*:2]) leave the fragments unfused.
assert fuse_on_atom_mapping('[*:1]CC.[*:2]CC') == 'CC[*:1].CC[*:2]'
# fuse_on_link joins fragments at the listed link atom ([Rb]) ...
assert fuse_on_link('[Rb]CC.[Rb]CC', ['[Rb]']) == 'CCCC'
# ... and leaves them unfused when the listed link atom ([Pb]) is not present.
assert fuse_on_link('[Rb]CC.[Rb]CC', ['[Pb]']) == 'CC[Rb].CC[Rb]'
# Four fragments whose attachment points carry atom-map numbers 1-3.
# The string is named `fragment_smiles` (not `fragment_smile`) so that it does
# not rebind and clobber the `fragment_smile` function called earlier in this
# file; with the old name, any later call to that function would fail with
# "'str' object is not callable".
fragment_smiles = 'C1CCC([*:1])CC1.C([*:3])CC.c1cncc([*:2])c1.c1nc([*:1])c2c([*:3])nc([*:2])cc2n1'
mol = to_mol(fragment_smiles)
mol  # bare expression; presumably kept to display the unfused fragments as cell output

fused_smile = fuse_on_atom_mapping(fragment_smiles)
new_mol = to_mol(fused_smile)
new_mol  # bare expression; presumably kept to display the fused result as cell output
# Compute the Murcko scaffold of an example molecule, plus the variant produced
# with generic=True, then pass the original and both scaffolds to draw_mols.
smile = 'Cc1cc(Oc2nccc(CCC)c2)ccc1'
scaffold = murcko_scaffold(smile)
scaffold_generic = murcko_scaffold(smile, generic=True)
draw_mols(to_mols([smile, scaffold, scaffold_generic]))
Structure Enumeration
Often it can be useful to enumerate variants of the same core structure. For example, generating every 6-membered ring variant with 2 nitrogens. The StructureEnumerator class provides a way of enumerating over a core structure defined by a SMARTS string and a set of user inputs. The structure enumerator can also add wildcard atoms.
For examples on using the StructureEnumerator class, see the Structure Enumeration tutorial page
# Call add_atom_combi on the ring 'C1CN=CCC1' with a list of element symbols.
# NOTE(review): the integer entries (-1, -2) are not explained in this section —
# presumably special codes understood by add_atom_combi; confirm.
out = add_atom_combi('C1CN=CCC1', ['C', 'N', 'O', 'F', -1, -2])
# Call add_bond_combi on the same structure. This rebinds `out`, so the
# add_atom_combi result above is discarded.
out = add_bond_combi('C1CN=CCC1')
Proteins
Functions designed for manipulating proteins as amino acid sequences.
Current Limitations
The underlying RDKit utils for amino acids are somewhat more restricted than those for SMILES strings. Only standard amino acid characters can be used (i.e., no wildcards).
Proteins are represented as FASTA sequences, e.g. MKDCSNGCSAECTGEGG.
# to_protein builds a Mol from an amino acid sequence, and to_sequence must
# recover the same sequence from it. isinstance is used instead of
# `type(x) == T` so a subclass of Chem.Mol still passes.
assert isinstance(to_protein('MKDCSNGCSAECTGEGG'), Chem.Mol)
assert to_sequence(to_protein('MKDCSNGCSAECTGEGG')) == 'MKDCSNGCSAECTGEGG'
Nucleic Acids
Functions designed for manipulating DNA/RNA as nucleic acid sequences.
Current Limitations
The underlying RDKit utils for nucleic acids are somewhat more restricted than those for SMILES strings. Only standard nucleic acid characters can be used. This means no wildcards (*) or hybrid nucleic acids (N).
Polynucleotides are represented as FASTA sequences, e.g. ATGCATGC. FASTA sequences are resolved into uncapped Polynucleotides.
# to_dna / to_rna build a Mol from a nucleotide sequence, and to_sequence must
# recover the same sequence from it. isinstance is used instead of
# `type(x) == T` so a subclass of Chem.Mol still passes.
assert isinstance(to_dna('ATGC'), Chem.Mol)
assert to_sequence(to_dna('ATGC')) == 'ATGC'
assert isinstance(to_rna('AUGC'), Chem.Mol)
assert to_sequence(to_rna('AUGC')) == 'AUGC'