# Raw File: chemistry.py
import functools
from typing import Callable, Optional, Iterable, Iterator
import numpy as np

from rdkit import Chem, DataStructs
import rdkit.Chem.rdMolDescriptors

# Morgan fingerprint helper: takes a molecule and returns a 2048-bit
# fingerprint computed with radius 2. The result is kept as rdkit's own
# DataStructs.ExplicitBitVect, which is more efficient for rdkit-internal use.
get_morgan_fp: Callable[[Chem.Mol], DataStructs.ExplicitBitVect] = functools.partial(
    Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect,
    radius=2,
    nBits=2048,
)

def tanimoto_sim(mol1: Chem.Mol, mol2: Chem.Mol) -> float:
    """Return the Tanimoto similarity of a single pair of molecules.

    Each molecule is fingerprinted with ``get_morgan_fp`` and the two
    fingerprints are compared via ``DataStructs.FingerprintSimilarity`` with
    the Tanimoto metric.
    """
    first_fp = get_morgan_fp(mol1)
    second_fp = get_morgan_fp(mol2)
    return DataStructs.FingerprintSimilarity(
        first_fp,
        second_fp,
        metric=DataStructs.TanimotoSimilarity,
    )


def _bulk_similarity(
    mols1: Iterable[Chem.Mol], mols2: Optional[Iterable[Chem.Mol]] = None
) -> Iterator[np.ndarray]:
    """Lazily yield one row of Tanimoto similarities per molecule in `mols1`.

    Each yielded row is the result of ``DataStructs.BulkTanimotoSimilarity``
    for one fingerprint from `mols1` against the fingerprints of every
    molecule in `mols2`, in the iteration order of `mols2`.

    Args:
        mols1: Molecules that define the rows.
        mols2: Molecules that define the columns. When ``None``, `mols1` is
            compared against itself.

    Yields:
        The similarity row for the next molecule of `mols1`.
        NOTE(review): the annotation says ``np.ndarray``, but the value is
        returned unchanged from ``BulkTanimotoSimilarity``, which RDKit
        documents as a list of floats -- confirm and adjust the annotation.
    """
    if mols2 is None:
        # Self-comparison: fingerprint every molecule exactly once and reuse
        # the result for both sides. Re-iterating `mols1` would yield nothing
        # for a one-shot iterator (e.g. a generator) and would compute each
        # fingerprint twice for a re-iterable container.
        mol2_fps = tuple(map(get_morgan_fp, mols1))
        mol1_fps: Iterable[DataStructs.ExplicitBitVect] = mol2_fps
    else:
        # Only the column side must be materialised; rows stay lazy.
        mol1_fps = map(get_morgan_fp, mols1)
        mol2_fps = tuple(map(get_morgan_fp, mols2))
    for fp in mol1_fps:
        yield DataStructs.BulkTanimotoSimilarity(fp, mol2_fps)


def canonical_smiles(smiles: str, kekulize: bool = False) -> str:
    """Use rdkit to convert the `smiles` string to canonical form.

    Args:
        smiles: The SMILES string to canonicalise.
        kekulize: If true, kekulize the molecule and write the Kekule form
            of the SMILES instead of the aromatic form.

    Returns:
        The canonical isomeric SMILES, or ``""`` when rdkit could not build a
        molecule from `smiles`.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:  # No mol object means the `smiles` string was invalid
        return ""
    if kekulize:
        Chem.Kekulize(mol)
    # Kekulize() by default keeps the aromatic flags on the molecule, so the
    # writer must also be asked for the Kekule form; otherwise the `kekulize`
    # argument would not show up in the returned string.
    return Chem.MolToSmiles(mol, isomericSmiles=True, kekuleSmiles=bool(kekulize))
# back to top