Content - 434a36d4651ef2620a4480fd4cd4bc61dcc80bf2 - fde183d/metage2metabo/__main__.py

__main__.py
import logging
import argparse
import logging
import os
import pkg_resources
import re
import sys
import tarfile
import time
import traceback
import subprocess
from shutil import copyfile,which

from metage2metabo.m2m_workflow import run_workflow, recon, iscope, cscope, addedvalue, mincom, instance_community, metacom_analysis
from metage2metabo import sbml_management, utils

# Version of the installed metage2metabo distribution, read from its package metadata.
VERSION = pkg_resources.get_distribution("metage2metabo").version
# License notice appended to the version string in the command-line output.
LICENSE = """Copyright (C) Dyliss
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
metage2metabo is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.\n
"""
# One-paragraph summary of the tool, used as the argument parser description.
MESSAGE = """
From metabolic network reconstruction with annotated genomes to metabolic capabilities screening to identify organisms of interest in a large microbiota.
"""
# External software requirements, used as the argument parser epilog.
REQUIRES = """
Requires: Pathway Tools installed and in $PATH, and NCBI Blast
"""

# Package-level logger. It accepts every level; the handlers that decide what
# is actually written (file, console) are attached later by main().
logger = logging.getLogger('metage2metabo')
logger.setLevel(logging.DEBUG)

# Check ASP binaries.
if not which('clingo'):
    logger.critical('clingo is not in the Path, m2m can not work without it.')
    logger.critical('You can install with: pip install clyngor-with-clingo')
    sys.exit(1)

# Check if clingo is accessible.
try:
    # The output is not needed, so it is discarded with DEVNULL: an unread
    # PIPE could block the child process once the OS pipe buffer is full.
    subprocess.call(['clingo', '--version'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
except Exception:
    # "Exception" (not a bare except) so that Ctrl-C and sys.exit() are not
    # misreported as a clingo failure.
    logger.critical('Error with clingo.')
    logger.critical(traceback.format_exc())
    sys.exit(1)


def _single_option_parent(*flags, **options):
    """Build a help-less parent parser that carries exactly one option.

    Args:
        *flags: option strings, forwarded to ``add_argument``
        **options: keyword arguments, forwarded to ``add_argument``

    Returns:
        argparse.ArgumentParser: parser usable in the ``parents`` list of a subcommand
    """
    parent = argparse.ArgumentParser(add_help=False)
    parent.add_argument(*flags, **options)
    return parent


def _build_parser():
    """Create the ``m2m`` argument parser with all of its subcommands.

    Returns:
        argparse.ArgumentParser: the configured top-level parser; the selected
            subcommand is stored in the ``cmd`` attribute of the parsed namespace
    """
    parser = argparse.ArgumentParser(
        "m2m",
        description=MESSAGE + " For specific help on each subcommand use: m2m {cmd} --help",
        epilog=REQUIRES
    )
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version="%(prog)s " + VERSION + "\n" + LICENSE)

    # Parent parsers: each one holds a single option so that every subcommand
    # can pick exactly the options it supports.
    parent_parser_q = _single_option_parent(
        "-q",
        "--quiet",
        dest="quiet",
        help="quiet mode",
        required=False,
        action="store_true",
        default=None)
    parent_parser_c = _single_option_parent(
        "-c",
        "--cpu",
        help="cpu number for multi-process",
        required=False,
        type=int,
        default=1)
    parent_parser_o = _single_option_parent(
        "-o",
        "--out",
        dest="out",
        required=True,
        help="output directory path",
        metavar="OUTPUT_DIR")
    parent_parser_no = _single_option_parent(
        "--noorphan",
        help="use this option to ignore reactions without gene or protein association",
        required=False,
        action="store_true",
        default=False)
    parent_parser_s = _single_option_parent(
        "-s",
        "--seeds",
        help="seeds (growth medium) for metabolic analysis",
        required=True)
    parent_parser_n = _single_option_parent(
        "-n",
        "--networksdir",
        metavar="NETWORKS_DIR",
        help="metabolic networks directory",
        required=True)
    parent_parser_g = _single_option_parent(
        "-g", "--genomes", help="annotated genomes directory", required=True)
    parent_parser_cl = _single_option_parent(
        "--clean",
        help="clean PGDBs if already present",
        required=False,
        action="store_true",
        default=None)
    parent_parser_m = _single_option_parent(
        "-m",
        "--modelhost",
        help="host metabolic model for community analysis",
        required=False,
        default=None)
    parent_parser_l = _single_option_parent(
        "-l",
        "--level",
        help="Level for SBML creation, 2 or 3",
        required=False,
        type=int,
        choices=[2, 3],
        default=2)
    parent_parser_p = _single_option_parent(
        "-p",
        "--padmet",
        help="create padmet files",
        required=False,
        action="store_true",
        default=None)

    # subparsers
    subparsers = parser.add_subparsers(
        title='subcommands',
        description='valid subcommands:',
        dest="cmd")
    subparsers.add_parser(
        "recon",
        help="metabolic network reconstruction",
        parents=[
            parent_parser_g, parent_parser_o, parent_parser_c, parent_parser_q,
            parent_parser_l, parent_parser_no, parent_parser_p, parent_parser_cl
        ],
        description=
        "Run metabolic network reconstruction for each annotated genome of the input directory, using Pathway Tools"
    )
    subparsers.add_parser(
        "iscope",
        help="individual scope computation",
        parents=[
            parent_parser_n, parent_parser_s, parent_parser_o, parent_parser_q
        ],
        description=
        "Compute individual scopes (reachable metabolites from seeds) for each metabolic network of the input directory"
    )
    subparsers.add_parser(
        "cscope",
        help="community scope computation",
        parents=[
            parent_parser_n, parent_parser_s, parent_parser_o, parent_parser_m,
            parent_parser_q
        ],
        description="Compute the community scope of all metabolic networks")
    subparsers.add_parser(
        "addedvalue",
        help="added value of microbiota's metabolism over individual's",
        parents=[
            parent_parser_n, parent_parser_s, parent_parser_o, parent_parser_m,
            parent_parser_q
        ],
        description=
        "Compute metabolites that are reachable by the community/microbiota and not by individual organisms"
    )
    mincom_parser = subparsers.add_parser(
        "mincom",
        help="minimal community selection",
        parents=[
            parent_parser_n, parent_parser_s, parent_parser_o, parent_parser_m,
            parent_parser_q
        ],
        description=
        "Select minimal-size community to make reachable a set of metabolites")
    mincom_parser.add_argument(
        "-t",
        "--targets",
        help="targets for metabolic analysis",
        required=True
        )
    seeds_parser = subparsers.add_parser(
        "seeds",
        help="creation of seeds SBML file",
        parents=[parent_parser_o, parent_parser_q],
        description=
        "Create a SBML file starting from a simple text file with metabolic compounds identifiers"
    )
    seeds_parser.add_argument(
        "--metabolites",
        help=
        'metabolites file: one per line, encoded (XXX as in <species id="XXXX" .../> of SBML files)',
        required=True)
    subparsers.add_parser(
        "workflow",
        help="whole workflow",
        parents=[
            parent_parser_g, parent_parser_s, parent_parser_m, parent_parser_o,
            parent_parser_c, parent_parser_q, parent_parser_no, parent_parser_p, parent_parser_cl
        ],
        description=
        "Run the whole workflow: metabolic network reconstruction, individual and community scope analysis and community selection"
    )
    subparsers.add_parser(
        "metacom",
        help="whole metabolism community analysis",
        parents=[
            parent_parser_n, parent_parser_s, parent_parser_m, parent_parser_o,
            parent_parser_q
        ],
        description=
        "Run the whole metabolism community analysis: individual and community scope analysis and community selection"
    )
    subparsers.add_parser(
        "test",
        help="test on sample data from rumen experiments",
        parents=[
            parent_parser_q, parent_parser_c, parent_parser_o
        ],
        description=
        "Test the whole workflow on a data sample")

    return parser


def main():
    """Run the m2m command-line program.

    Parse the command line, check that the output directory is usable, attach
    a log file (``<out>/m2m_<cmd>.log``) and a console handler to the package
    logger, then dispatch to the function implementing the chosen subcommand.

    The process exits with status 1 when no argument is given, when the output
    directory cannot be used, or when an input path fails validation.
    """
    start_time = time.time()
    parser = _build_parser()
    args = parser.parse_args()

    # If no argument print the help.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    # Without a subcommand there is nothing to run: show version and help.
    if not args.cmd:
        # Printed directly: no handler is attached to the logger yet at this
        # point, so an INFO record would normally be dropped.
        print("m2m " + VERSION + "\n" + LICENSE)
        parser.print_help()
        sys.exit()

    # test writing in out_directory
    if not utils.is_valid_dir(args.out):
        logger.critical("Impossible to access/create output directory")
        sys.exit(1)

    # logger = logging.getLogger()    #TODO: get rid of it once mpwt's logger is fixed
    # logger.setLevel(logging.DEBUG)  #TODO: get rid of it once mpwt's logger is fixed
    # add logger in file
    formatter = logging.Formatter('%(message)s')
    file_handler = logging.FileHandler(f'{args.out}/m2m_{args.cmd}.log', 'w+')
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    # set up the default console logger; quiet mode only lets warnings through
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    if args.quiet:
        console_handler.setLevel(logging.WARNING)
    logger.addHandler(console_handler)

    # if modelhost is given as an arg: check the SBML level and turn it into 2 if needed
    if args.cmd in ["workflow", "metacom", "mincom", "cscope", "addedvalue"] and args.modelhost:
        new_arg_modelhost = check_sbml(args.modelhost, args.out, folder=False)
    else:
        new_arg_modelhost = None

    # deal with given subcommand
    if args.cmd == "workflow":
        main_workflow(args.genomes, args.out, args.cpu, args.clean, args.seeds,
                      args.noorphan, args.padmet, new_arg_modelhost)
    elif args.cmd in ["iscope", "cscope", "addedvalue", "mincom", "metacom"]:
        # All network-based subcommands share the same input validation.
        if not os.path.isdir(args.networksdir):
            logger.critical(args.networksdir + " is not a correct directory path")
            sys.exit(1)
        network_dir = check_sbml(args.networksdir, args.out)
        if not utils.is_valid_file(args.seeds):
            logger.critical(args.seeds + " is not a correct filepath")
            sys.exit(1)
        if args.cmd == "iscope":
            main_iscope(network_dir, args.seeds, args.out)
        elif args.cmd == "cscope":
            main_cscope(network_dir, args.seeds, args.out, new_arg_modelhost)
        elif args.cmd == "addedvalue":
            main_added_value(network_dir, args.seeds, args.out,
                             new_arg_modelhost)
        elif args.cmd == "mincom":
            if not utils.is_valid_file(args.targets):
                logger.critical(args.targets + " is not a correct filepath")
                sys.exit(1)
            main_mincom(network_dir, args.seeds, args.out, args.targets, new_arg_modelhost)
        elif args.cmd == "metacom":
            main_metacom(network_dir, args.out, args.seeds, new_arg_modelhost)
    elif args.cmd == "recon":
        main_recon(args.genomes, args.out, args.noorphan, args.padmet, args.level, args.cpu,
                   args.clean)
    elif args.cmd == "seeds":
        if not utils.is_valid_file(args.metabolites):
            logger.critical(args.metabolites + " is not a correct filepath")
            sys.exit(1)
        main_seeds(args.metabolites, args.out)
    elif args.cmd == 'test':
        main_test(args.out, args.cpu)

    logger.info("--- Total runtime %.2f seconds ---" % (time.time() - start_time))
    # Logged as a warning so that the log location is shown even in quiet mode.
    logger.warning(f'--- Logs written in {args.out}/m2m_{args.cmd}.log ---')


def main_workflow(*allargs):
    """Run the whole workflow.

    Thin wrapper: every positional argument is forwarded unchanged to
    ``run_workflow`` and the result of that call is not returned.

    Args:
        *allargs: positional arguments passed through to ``run_workflow``
    """
    run_workflow(*allargs)


def main_metacom(*allargs):
    """Run metacom command.

    Thin wrapper: every positional argument is forwarded unchanged to
    ``metacom_analysis`` and the result of that call is not returned.

    Args:
        *allargs: positional arguments passed through to ``metacom_analysis``
    """
    metacom_analysis(*allargs)


def main_recon(*allargs):
    """Run recon command and report where its outputs were written.

    Args:
        *allargs: positional arguments passed through to ``recon``; the fourth
            one (index 3) is read as the "create padmet files" switch
    """
    pgdb_location, sbml_location, padmet_location = recon(*allargs)

    logger.info("PGDB created in " + pgdb_location)

    # The PADMET location is only reported when PADMET creation was requested.
    padmet_requested = allargs[3]
    if padmet_requested:
        logger.info("PADMET files created in " + padmet_location)

    logger.info("SBML files created in " + sbml_location)


def main_iscope(*allargs):
    """Run iscope command.

    Thin wrapper: every positional argument is forwarded unchanged to
    ``iscope``.

    Args:
        *allargs: positional arguments passed through to ``iscope``

    Returns:
        whatever ``iscope`` returns, unchanged
    """
    return iscope(*allargs)


def main_cscope(*allargs):
    """Run cscope command, log the community scope and clean up.

    Args:
        *allargs: positional arguments passed through to ``cscope``

    Returns:
        the community scope returned by ``cscope`` (second element of its result)
    """
    instance_file, community_scope = cscope(*allargs)

    # Header with the number of metabolites, then one metabolite per line.
    header = f"\n{len(community_scope)} metabolites (excluding the seeds) reachable by the whole community/microbiota: \n"
    logger.info(header)
    listing = '\n'.join(community_scope)
    logger.info(listing)

    # The instance file is only an intermediate product of the computation.
    os.unlink(instance_file)
    return community_scope


def main_added_value(sbmldir, seeds, outdir, host):
    """Run addedvalue command.

    Compute the individual scopes, then the community scope, and hand both to
    ``addedvalue``. When the result is not empty it is written as a targets
    SBML file under ``<outdir>/community_analysis``.

    Args:
        sbmldir (str): SBML file directory
        seeds (str): SBML file for seeds
        outdir (str): results directory
        host (str): SBML file for host
    """
    individual_scope = main_iscope(sbmldir, seeds, outdir)
    community_scope = main_cscope(sbmldir, seeds, outdir, host)
    added_targets = addedvalue(individual_scope, community_scope, outdir)

    # Nothing to write when the result is empty.
    if len(added_targets) == 0:
        return

    targets_path = outdir + "/community_analysis/targets.sbml"
    sbml_management.create_species_sbml(added_targets, targets_path)
    logger.info("Target file created with the addedvalue targets in: " + targets_path)


def main_mincom(sbmldir, seedsfiles, outdir, targets, host):
    """Run mincom command.

    Build the community instance file, run the minimal community selection on
    it, then delete the instance file.

    Args:
        sbmldir (str): SBML files directory
        seedsfiles (str): SBML file for seeds
        outdir (str): results directory
        targets (str): targets SBML file
        host (str): SBML file for host
    """
    # Step 1: instance describing networks, seeds, targets and optional host.
    instance_file = instance_community(sbmldir, seedsfiles, outdir, targets, host)
    # Step 2: community selection on that instance.
    mincom(instance_file, outdir)
    # Step 3: the instance is an intermediate file, remove it once used.
    os.unlink(instance_file)


def main_seeds(metabolites_file, outdir):
    """Run seeds command.

    Read metabolite identifiers from a text file, validate them and write
    them as a seeds SBML file (``<outdir>/seeds.sbml``). Surrounding
    whitespace is removed from each line and blank lines are ignored.

    Args:
        metabolites_file (str): text file with metabolites IDs, one per line
        outdir (str): Results directory

    Raises:
        ValueError: if an identifier contains a character other than letters,
            digits, underscore or hyphen, or if it starts with a digit
    """
    outfile = outdir + "/seeds.sbml"
    with open(metabolites_file, "r") as f:
        # strip() also removes the "\r" of Windows line endings; empty lines
        # are skipped so that they cannot crash the first-character check.
        stripped_lines = (line.strip() for line in f)
        metabolites = [line for line in stripped_lines if line]

    # Explicit check instead of "assert", which is removed when Python runs
    # with -O. Identifiers are checked in file order.
    for metabolite in metabolites:
        if not re.fullmatch(r"[A-Za-z0-9_-]+", metabolite) or metabolite[0].isdigit():
            raise ValueError(
                "Seed file is not in the correct format. Error with %s. Example of a correct ID is M_OXYGEN__45__MOLECULE_c. Rules = only numbers, letters or underscore in IDs, not starting with a number. One ID per line." %(metabolite))

    metabolites_set = set(metabolites)
    sbml_management.create_species_sbml(metabolites_set, outfile)
    logger.info("Seeds SBML file created in " + outfile)


def main_test(outdir, cpu):
    """Run test command.

    Extract the genome archive bundled in the package ``workflow_data``
    directory into ``outdir`` and run the whole workflow on it with the
    bundled seeds file.

    Args:
        outdir (str): directory containing the test data and the test output
        cpu (int): number of cpu to use (recommended: 2)
    """
    # Retrieve package path and path to test data. os.path is used so that the
    # paths are built correctly whatever the platform path separator is.
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'workflow_data')
    genome_file = os.path.join(data_dir, 'workflow_genomes.tar.gz')
    seeds_sbml_file = os.path.join(data_dir, 'seeds_workflow.sbml')

    logger.info("Uncompressing test data to " + outdir)
    # The context manager closes the archive even if the extraction fails.
    with tarfile.open(genome_file, "r:gz") as tar:
        tar.extractall(outdir)

    logger.info("Launching workflow on test data")
    input_genome = outdir + '/workflow_genomes'
    # Same positional order as the "workflow" subcommand: genomes, out, cpu,
    # clean, seeds, noorphan, padmet, modelhost.
    main_workflow(input_genome, outdir, cpu, None,
                  seeds_sbml_file, None, True, None)


def check_sbml(inpt, outdir, folder = True):
    """Return a path to SBML input in level 2, converting it when needed.

    In folder mode, the files of ``inpt`` with an ``xml`` or ``sbml`` extension
    are inspected. If at least one of them is not level 2, all of them are
    placed in ``<outdir>/new_sbml/``: converted to level 2 when needed, copied
    otherwise. In file mode, a single file is checked and, if needed, converted
    to ``<outdir>/<basename>_lvl2.sbml``.

    The process exits with status 1 if the file is not valid (file mode) or if
    the new directory cannot be used (folder mode).

    Args:
        inpt (str): SBML files directory, or SBML file when ``folder`` is False
        outdir (str): Results directory
        folder (bool): Defaults to True. Tells whether ``inpt`` is a folder or a file

    Returns:
        str: filepath of file or directory, same as input if all SBMLs are level2
    """
    if not folder:
        # Single-file mode.
        if not utils.is_valid_file(inpt):
            logger.critical(inpt + " is not a correct filepath")
            sys.exit(1)
        if sbml_management.get_sbml_level(inpt) == 2:
            return inpt
        converted_file = outdir + '/' + utils.get_basename(inpt) + "_lvl2.sbml"
        logger.warning(inpt + " was not in a suitable level for analysis. A converted file is created in " + converted_file)
        sbml_management.transform_sbml_lvl(inpt, converted_file, 2)
        return converted_file

    # Folder mode: keep regular files with an SBML-like extension.
    sbml_names = []
    for entry in os.listdir(inpt):
        entry_path = os.path.join(inpt, entry)
        if not os.path.isfile(entry_path):
            continue
        if utils.get_extension(entry_path).lower() in ["xml", "sbml"]:
            sbml_names.append(entry)

    level_of = {}
    for name in sbml_names:
        level_of[name] = sbml_management.get_sbml_level(os.path.join(inpt, name))

    # Nothing to do when every network is already level 2.
    if all(level == 2 for level in level_of.values()):
        return inpt

    converted_dir = outdir + "/new_sbml/"
    logger.warning(
        "At least one SBML has not a suitable level for the tools. They will be transformed and created in "
        + converted_dir + ". The others will be copied in this directory")
    if not utils.is_valid_dir(converted_dir):
        logger.critical("Impossible to write in output directory")
        sys.exit(1)

    for name in sbml_names:
        source = os.path.join(inpt, name)
        destination = os.path.join(converted_dir, name)
        if level_of[name] == 2:
            # Already suitable: plain copy next to the converted files.
            copyfile(source, destination)
        else:
            # Not level 2: write a level 2 version in the new directory.
            sbml_management.transform_sbml_lvl(source, destination, 2)
    return converted_dir


# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()