chemistry.py
import functools
from typing import Callable, Optional, Iterable, Iterator
import numpy as np
from rdkit import Chem, DataStructs
import rdkit.Chem.rdMolDescriptors
# rdkit's DataStructs.ExplicitBitVect is more efficient for rdkit-internal use.
get_morgan_fp: Callable[[Chem.Mol], DataStructs.ExplicitBitVect] = functools.partial(
Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect, radius=2, nBits=2048
)
def tanimoto_sim(mol1: Chem.Mol, mol2: Chem.Mol) -> float:
"""Compute Tanimoto similarity for just two molecules."""
return DataStructs.FingerprintSimilarity(
get_morgan_fp(mol1), get_morgan_fp(mol2), metric=DataStructs.TanimotoSimilarity
)
def _bulk_similarity(
mols1: Iterable[Chem.Mol], mols2: Optional[Iterable[Chem.Mol]] = None
) -> Iterator[np.ndarray]:
if mols2 is None:
mols2 = mols1
mol1_fps = map(get_morgan_fp, mols1)
mol2_fps = tuple(map(get_morgan_fp, mols2))
for fp in mol1_fps:
yield DataStructs.BulkTanimotoSimilarity(fp, mol2_fps)
def canonical_smiles(smiles: str, kekulize: bool = False) -> str:
"""Use rdkit to convert the `smiles` string to canonical form"""
mol = Chem.MolFromSmiles(smiles)
if mol: # If a mol object was successfully create (i.e. not `None`)
if kekulize:
Chem.Kekulize(mol)
smiles = Chem.MolToSmiles(mol, isomericSmiles=True)
else: # No mol object means the `smiles` string was invalid
smiles = ""
return smiles