mmseqs2.py
import hashlib
import numpy as np
import os
import re
import requests
import tarfile
import time
from absl import logging
from typing import List, NoReturn, Tuple
class MMSeqs2Runner:
r"""Runner object
Fetches sequence alignment and templates from MMSeqs2 server
Based on the function run_mmseqs2 from ColabFold (sokrypton/ColabFold)
Version 62d7558c91a9809712b022faf9d91d8b183c328c
Relevant publications
----------
* "Clustering huge protein sequence sets in linear time"
https://doi.org/10.1038/s41467-018-04964-5
* "MMseqs2 enables sensitive protein sequence searching for the analysis
of massive data sets"
https://doi.org/10.1038/nbt.3988
Private variables
----------
self.job: Job ID (five-char string)
self.seq: Sequence to search
self.host_url: URL address to ping for data
self.t_url: URL address to ping for templates from PDB
self.n_templates = Number of templates to fetch (default=20)
self.path: Path to use
self.tarfile: Compressed file archive to download
"""
def __init__(
self,
job: str,
seq: str,
host_url: str = "https://a3m.mmseqs.com",
t_url: str = "https://a3m-templates.mmseqs.com/template",
path_suffix: str = "env",
n_templates: int = 20,
):
r"""Initialize runner object
Parameters
----------
job : Job name
seq : Amino acid sequence
host_url : Website to ping for sequence data
t_url : Website to ping for template info
path_suffix : Suffix for path info
"""
# Clean up sequence
self.seq = self._cleanseq(seq.upper())
# Come up with unique job ID for MMSeqs
self.job = self._define_jobname(job)
# Save everything else
self.host_url = host_url
self.t_url = t_url
self.n_templates = n_templates
self.path = "_".join((self.job, path_suffix))
if not os.path.isdir(self.path):
os.system(f"mkdir { self.path }")
self.tarfile = f"{ self.path }/out.tar.gz"
def _cleanseq(self, seq) -> str:
r"""Cleans the sequence to remove whitespace and noncanonical letters
Parameters
----------
seq : Amino acid sequence (only all 20 here)
Returns
----------
Cleaned up amin acid sequence
"""
if any([aa in seq for aa in "BJOUXZ"]):
logging.warning("Sequence contains non-canonical amino acids!")
logging.warning("Removing B, J, O, U, X, and Z from sequence")
seq = re.sub(r"[BJOUXZ]", "", seq)
return re.sub(r"[^A-Z]", "", "".join(seq.split()))
def _define_jobname(self, job: str) -> str:
r"""Provides a unique five-digit identifier for the job name
Parameters
----------
job : Job name
Returns
----------
Defined job name
"""
return "_".join(
(
re.sub(r"\W+", "", "".join(job.split())),
hashlib.sha1(self.seq.encode()).hexdigest()[:5],
)
)
def _submit(self) -> dict:
r"""Submit job to MMSeqs2 server
Parameters
----------
None
Returns
----------
None
"""
data = {"q": f">101\n{ self.seq }", "mode": "env"}
res = requests.post(f"{ self.host_url }/ticket/msa", data=data)
try:
out = res.json()
except ValueError:
out = {"status": "UNKNOWN"}
return out
def _status(self, idx: str) -> dict:
r"""Check status of job
Parameters
----------
idx : Index assigned by MMSeqs2 server
Returns
----------
None
"""
res = requests.get(f"{ self.host_url }/ticket/{ idx }")
try:
out = res.json()
except ValueError:
out = {"status": "UNKNOWN"}
return out
def _download(self, idx: str, path: str) -> NoReturn:
r"""Download job outputs
Parameters
----------
idx : Index assigned by MMSeqs2 server
path : Path to download data
Returns
----------
None
"""
res = requests.get(f"{ self.host_url }/result/download/{ idx }")
with open(path, "wb") as out:
out.write(res.content)
def _search_mmseqs2(self) -> NoReturn:
r"""Run the search and download results
Heavily modified from ColabFold
Parameters
----------
None
Returns
----------
None
"""
if os.path.isfile(self.tarfile):
return
out = self._submit()
time.sleep(5 + np.random.randint(0, 5))
while out["status"] in ["UNKNOWN", "RATELIMIT"]:
# resubmit
time.sleep(5 + np.random.randint(0, 5))
out = self._submit()
logging.debug(f"ID: { out[ 'id' ] }")
while out["status"] in ["UNKNOWN", "RUNNING", "PENDING"]:
time.sleep(5 + np.random.randint(0, 5))
out = self._status(out["id"])
if out["status"] == "COMPLETE":
self._download(out["id"], self.tarfile)
elif out["status"] == "ERROR":
raise RuntimeError(
" ".join(
(
"MMseqs2 API is giving errors.",
"Please confirm your input is a valid protein sequence.",
"If error persists, please try again in an hour.",
)
)
)
def process_templates(self, templates: List[str] = []) -> str:
r"""Process templates and fetch from MMSeqs2 server
Parameters
----------
use_templates : True/False whether to use templates
max_templates : Maximum number of templates to use
Returns
----------
Directory containing templates (empty if not using templates)
"""
path = f"{ self.job }_env/templates_101"
if os.path.isdir(path):
os.system(f"rm { path }")
# templates = {}
logging.info("\t".join(("seq", "pdb", "cid", "evalue")))
pdbs = []
with open(f"{ self.path }/pdb70.m8", "r") as infile:
for line in infile:
sl = line.rstrip().split()
pdb = sl[1]
if pdb in templates:
pdbs.append(sl[1])
logging.info(f"{ sl[0] }\t{ sl[1] }\t{ sl[2] }\t{ sl[10] }")
if len(pdbs) == 0:
logging.warning("No templates found.")
return ""
else:
if not os.path.isdir(path):
os.mkdir(path)
pdbs = [t for t in pdbs if t in templates]
if len(templates) == 0 or len(pdbs) == 0:
pdbs = ",".join(templates[: self.n_templates])
else:
pdbs = ",".join(pdbs[: self.n_templates])
os.system(f"curl -v { self.t_url }/{ pdbs } |tar xzf - -C { path }/")
os.system(f"cp { path }/pdb70_a3m.ffindex { path }/pdb70_cs219.ffindex")
os.system(f"touch { path }/pdb70_cs219.ffdata")
return path
def _process_alignment(
self, a3m_files: list, templates: List[str] = []
) -> Tuple[str, str]:
r"""Process sequence alignment
(modified from ColabFold)
Parameters
----------
a3m_files : List of files to parse
token : Token to look for when parsing
Returns
----------
Tuple with [0] string with alignment, and [1] path to template
"""
a3m_lines = ""
for a3m_file in a3m_files:
for line in open(os.path.join(self.path, a3m_file), "r"):
if len(line) > 0:
a3m_lines += line.replace("\x00", "")
return a3m_lines, self.process_templates(templates)
def run_job(self, templates: List[str] = []) -> Tuple[str, str]:
r"""
Run sequence alignments using MMseqs2
Parameters
----------
use_templates: Whether to use templates
Returns
----------
Tuple with [0] string with alignment, and [1] path to template
"""
self._search_mmseqs2()
a3m_files = ["uniref.a3m", "bfd.mgnify30.metaeuk30.smag30.a3m"]
# extract a3m files
if not os.path.isfile(os.path.join(self.path, a3m_files[0])):
with tarfile.open(self.tarfile) as tar_gz:
tar_gz.extractall(self.path)
return self._process_alignment(a3m_files, templates)