util.py
import os
import numpy as np
from typing import Dict, List, NoReturn
from alphafold.data import pipeline
from alphafold.data import templates
from alphafold.data.tools import hhsearch
def mk_mock_template(seq: str) -> dict:
r"""Generates mock templates that will not influence prediction
Taken from ColabFold version 62d7558c91a9809712b022faf9d91d8b183c328c
Parameters
----------
seq: Query sequence
Returns
----------
Dictionary with blank/empty/meaningless features
"""
# Define constants
lentype = templates.residue_constants.atom_type_num
lseq = len(seq)
# Since alphafold's model requires a template input
# We create a blank example w/ zero input, confidence -1
aatypes = np.array(
templates.residue_constants.sequence_to_onehot(
"-" * lseq, templates.residue_constants.HHBLITS_AA_TO_ID
)
)
return {
"template_all_atom_positions": np.zeros((lseq, lentype, 3))[None],
"template_all_atom_masks": np.zeros((lseq, lentype))[None],
"template_sequence": [f"none".encode()],
"template_aatype": aatypes[None],
"template_confidence_scores": np.full(lseq, -1)[None],
"template_domain_names": [f"none".encode()],
"template_release_date": [f"none".encode()],
}
###############################
def mk_template(seq: str, a3m_lines=str, path=str) -> dict:
r"""Parses templates into features
Parameters
----------
seq : Query sequence
a3m_lines : Lines form MMSeqs2 alignment
path : Path to templates fetched using MMSeqs2
Returns
----------
Dictionary with features
"""
result = hhsearch.HHSearch(
binary_path="hhsearch", databases=[f"{ path }/pdb70"]
).query(a3m_lines)
return templates.HhsearchHitFeaturizer(
mmcif_dir=path,
max_template_date="2100-01-01",
max_hits=20,
kalign_binary_path="kalign",
release_dates_path=None,
obsolete_pdbs_path=None,
).get_templates(query_sequence=seq, hits=pipeline.parsers.parse_hhr(result))
###############################
def setup_features(seq: str, a3m_lines: list, tfeatures_in: dict) -> dict:
r"""Set up features for alphafold
Parameters
----------
seq : Sequence (string)
a3m_lines : Sequence alignment lines
tfeatures_in : Template features
Returns
----------
Alphafold features object
"""
msa = pipeline.parsers.parse_a3m(a3m_lines)
return {
**pipeline.make_sequence_features(
sequence=seq, description="none", num_res=len(seq)
),
**pipeline.make_msa_features(msas=[msa]),
**tfeatures_in,
}
def mutate_msa(
a3m_lines: str,
pos_res: Dict[int, str],
) -> str:
r"""Mutates every position in an MSA to a residue of interest
Example usage: mutate_msa( a3m_lines, { 15: "A", 155: "A" } )
This will mutate residues 15 and 155 to alanine throughout the MSA
Parameters
----------
a3m_lines : Sequence alignment
pos : Position to change
target_res : Residue to mutate to
Returns
----------
Sequence alignment (as string)
"""
for target_res in pos_res.values():
assert len(target_res) == 1
output = []
# Iterate over alignment lines
for line in a3m_lines.split("\n"):
if line.startswith(">"):
output.append(line)
elif len(line) > 1:
line = list(line)
for pos, res in pos_res.items():
if line[pos] in "ACDEFGHIKLMNPQRSTVWY":
line[pos] = res
output.append("".join(line))
else:
output.append(line)
return "\n".join(output)
def mutate(x, y):
mutate_msa(x, y) # Alias for brevity
def plddt_to_bfactor(filename: str, maxval: float = 100.0) -> NoReturn:
r"""Converts a pLDDT vals to a B factor
This equation is derived from the following publication:
"Improved protein structure refinement guided by deep learning based
accuracy estimation" by Hiranuma et al 2021
https://doi.org/10.1038/s41467-021-21511-x
Parameters
----------
filename : Name of PDB file
maxval : Set to 100 if using AF2 (or 1 if RoseTTAFold)
Returns
----------
None
"""
pdb = Bio.PDB.PDBParser().get_structure("TEMP", filename)
for atom in pdb.get_atoms():
rmsf = 1.5 * np.exp(4 * (0.7 - (atom.bfactor / maxval)))
atom.bfactor = (8.0 / 3.0) * (np.pi**2) * (rmsf**2)
pdbio = Bio.PDB.PDBIO()
pdbio.set_structure(pdb)
pdbio.save(filename)