import hashlib import numpy as np import os import re import requests import tarfile import time from absl import logging from typing import List, NoReturn, Tuple class MMSeqs2Runner: r"""Runner object Fetches sequence alignment and templates from MMSeqs2 server Based on the function run_mmseqs2 from ColabFold (sokrypton/ColabFold) Version 62d7558c91a9809712b022faf9d91d8b183c328c Relevant publications ---------- * "Clustering huge protein sequence sets in linear time" https://doi.org/10.1038/s41467-018-04964-5 * "MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets" https://doi.org/10.1038/nbt.3988 Private variables ---------- self.job: Job ID (five-char string) self.seq: Sequence to search self.host_url: URL address to ping for data self.t_url: URL address to ping for templates from PDB self.n_templates = Number of templates to fetch (default=20) self.path: Path to use self.tarfile: Compressed file archive to download """ def __init__( self, job: str, seq: str, host_url: str = "https://a3m.mmseqs.com", t_url: str = "https://a3m-templates.mmseqs.com/template", path_suffix: str = "env", n_templates: int = 20 ): r"""Initialize runner object Parameters ---------- job : Job name seq : Amino acid sequence host_url : Website to ping for sequence data t_url : Website to ping for template info path_suffix : Suffix for path info """ # Clean up sequence self.seq = self._cleanseq( seq.upper() ) # Come up with unique job ID for MMSeqs self.job = self._define_jobname( job ) # Save everything else self.host_url = host_url self.t_url = t_url self.n_templates = n_templates self.path = "_".join( ( self.job, path_suffix ) ) if not os.path.isdir( self.path ): os.system( f"mkdir { self.path }" ) self.tarfile = f'{ self.path }/out.tar.gz' def _cleanseq( self, seq ) -> str: r""" Cleans the sequence to remove whitespace and noncanonical letters Parameters ---------- seq : Amino acid sequence (only all 20 here) Returns ---------- Cleaned up amin acid sequence """ if any( [ aa in seq for aa in "BJOUXZ" ] ): logging.warning( "Sequence contains non-canonical amino acids!" ) logging.warning( "Removing B, J, O, U, X, and Z from sequence" ) seq = re.sub( r'[BJOUXZ]', '', seq ) return re.sub( r'[^A-Z]', '', "".join( seq.split() ) ) def _define_jobname( self, job: str ) -> str: r""" Provides a unique five-digit identifier for the job name Parameters ---------- job : Job name Returns ---------- Defined job name """ return "_".join( ( re.sub( r"\W+", "", "".join( job.split() ) ), hashlib.sha1( self.seq.encode() ).hexdigest()[ :5 ] ) ) def _submit( self ) -> dict: r"""Submit job to MMSeqs2 server Parameters ---------- None Returns ---------- None """ data = { 'q': f">101\n{ self.seq }", 'mode': "env" } res = requests.post( f'{ self.host_url }/ticket/msa', data=data ) try: out = res.json() except ValueError: out = { "status": "UNKNOWN" } return out def _status( self, idx: str ) -> dict: r"""Check status of job Parameters ---------- idx : Index assigned by MMSeqs2 server Returns ---------- None """ res = requests.get( f'{ self.host_url }/ticket/{ idx }' ) try: out = res.json() except ValueError: out = { "status": "UNKNOWN" } return out def _download( self, idx: str, path: str ) -> NoReturn: r"""Download job outputs Parameters ---------- idx : Index assigned by MMSeqs2 server path : Path to download data Returns ---------- None """ res = requests.get( f'{ self.host_url }/result/download/{ idx }' ) with open( path, "wb" ) as out: out.write( res.content ) def _search_mmseqs2( self ) -> NoReturn: r"""Run the search and download results Heavily modified from ColabFold Parameters ---------- None Returns ---------- None """ if os.path.isfile( self.tarfile ): return out = self._submit() time.sleep( 5 + np.random.randint( 0, 5 ) ) while out[ "status" ] in [ "UNKNOWN", "RATELIMIT" ]: # resubmit time.sleep( 5 + np.random.randint( 0, 5 ) ) out = self._submit() logging.debug( f"ID: { out[ 'id' ] }" ) while out[ "status" ] in [ "UNKNOWN", "RUNNING", "PENDING" ]: time.sleep( 5 + np.random.randint( 0, 5 ) ) out = self._status( out[ "id" ] ) if out[ "status" ] == "COMPLETE": self._download( out[ "id" ], self.tarfile ) elif out[ "status" ] == "ERROR": raise RuntimeError( " ".join( ( "MMseqs2 API is giving errors.", "Please confirm your input is a valid protein sequence.", "If error persists, please try again in an hour." ) ) ) def process_templates( self, templates: List[ str ] = [] ) -> str: r"""Process templates and fetch from MMSeqs2 server Parameters ---------- use_templates : True/False whether to use templates max_templates : Maximum number of templates to use Returns ---------- Directory containing templates (empty if not using templates) """ path = f"{ self.job }_env/templates_101" if os.path.isdir( path ): os.system( f"rm { path }" ) #templates = {} logging.info( "\t".join( ( "seq", "pdb", "cid", "evalue" ) ) ) pdbs = [] with open( f"{ self.path }/pdb70.m8", "r" ) as infile: for line in infile: sl = line.rstrip().split() pdb = sl[ 1 ] if pdb in templates: pdbs.append( sl[ 1 ] ) logging.info( f"{ sl[ 0 ] }\t{ sl[ 1 ] }\t{ sl[ 2 ] }\t{ sl[ 10 ] }" ) if len( pdbs ) == 0: logging.warning( "No templates found." ) return "" else: if not os.path.isdir( path ): os.mkdir( path ) pdbs = [ t for t in pdbs if t in templates ] if len( templates ) == 0 or len( pdbs ) == 0: pdbs = ",".join( templates[ :self.n_templates ] ) else: pdbs = ",".join( pdbs[ :self.n_templates ] ) os.system( f"curl -v { self.t_url }/{ pdbs }" f" | tar xzf - -C { path }/" ) os.system( f"cp { path }/pdb70_a3m.ffindex { path }/pdb70_cs219.ffindex" ) os.system( f"touch { path }/pdb70_cs219.ffdata" ) return path def _process_alignment( self, a3m_files: list, templates: List[ str ] = [] ) -> Tuple[ str, str ]: r""" Process sequence alignment (modified from ColabFold) Parameters ---------- a3m_files : List of files to parse token : Token to look for when parsing Returns ---------- Tuple with [0] string with alignment, and [1] path to template """ a3m_lines = "" for a3m_file in a3m_files: for line in open( os.path.join( self.path, a3m_file ), "r" ): if len( line ) > 0: a3m_lines += line.replace( "\x00", "" ) return a3m_lines, self.process_templates( templates ) def run_job( self, templates: List[ str ] = [] ) -> Tuple[ str, str ]: r""" Run sequence alignments using MMseqs2 Parameters ---------- use_templates: Whether to use templates Returns ---------- Tuple with [0] string with alignment, and [1] path to template """ self._search_mmseqs2() a3m_files = [ "uniref.a3m", "bfd.mgnify30.metaeuk30.smag30.a3m" ] # extract a3m files if not os.path.isfile( os.path.join( self.path, a3m_files[ 0 ] ) ): with tarfile.open( self.tarfile ) as tar_gz: tar_gz.extractall( self.path ) return self._process_alignment( a3m_files, templates )