#!/usr/bin/env python
# Copyright (c) 2019, Clemence Frioux <clemence.frioux@inria.fr>
#
# This file is part of metage2metabo.
#
# metage2metabo is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# metage2metabo is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with metage2metabo.  If not, see <http://www.gnu.org/licenses/>.
# -*- coding: utf-8 -*-

import csv
import json
import miscoto
import networkx as nx
import os
import sys
import time
import subprocess
import logging
import zipfile

from ete3 import NCBITaxa
from itertools import combinations
from metage2metabo import utils, sbml_management

logger = logging.getLogger(__name__)
logging.getLogger("miscoto").setLevel(logging.CRITICAL)


def file_or_folder(variable_folder_file):
    """Check if the variable is a file or a folder

    Args:
        variable_folder_file (str): path to a file or a folder

    Returns:
        dict: {name of input file (without extension): path to input file};
            empty if the path is neither an existing file nor a folder
    """
    file_folder_paths = {}

    if os.path.isfile(variable_folder_file):
        filename = os.path.splitext(os.path.basename(variable_folder_file))[0]
        file_folder_paths[filename] = variable_folder_file

    # For a folder, iterate through all files inside the folder.
    elif os.path.isdir(variable_folder_file):
        for file_from_folder in os.listdir(variable_folder_file):
            filename = os.path.splitext(os.path.basename(file_from_folder))[0]
            file_folder_paths[filename] = os.path.join(variable_folder_file, file_from_folder)

    return file_folder_paths


def run_analysis_workflow(sbml_folder, target_folder_file, seed_file, output_dir, taxon_file, oog_jar, host_file=None):
    """Run the whole m2m_analysis workflow: enumeration, graph creation, compression.

    Args:
        sbml_folder (str): sbml directory
        target_folder_file (str): targets file or folder containing multiple sbmls
        seed_file (str): seeds file
        output_dir (str): results directory
        taxon_file (str): mpwt taxon file for species in sbml folder
        oog_jar (str): path to OOG jar file
        host_file (str): metabolic network file for host
    """
    start = time.time()

    # Step 1: enumerate minimal communities with miscoto (json results).
    json_results = enumeration_analysis(sbml_folder, target_folder_file, seed_file, output_dir, host_file)

    # Step 2: build solution graphs (gml files) from the enumeration results.
    gml_results = graph_analysis(json_results, target_folder_file, output_dir, taxon_file)

    # Step 3: compress the graphs into power graphs and render pictures.
    powergraph_analysis(gml_results, output_dir, oog_jar)

    logger.info(
        "--- m2m_analysis runtime %.2f seconds ---\n" % (time.time() - start))

def enumeration(sbml_folder, target_file, seed_file, output_json, host_file):
    """Run miscoto enumeration on one target file

    Args:
        sbml_folder (str): sbml directory
        target_file (str): targets file
        seed_file (str): seeds file
        output_json (str): path to json output
        host_file (str): metabolic network file for host

    Returns:
        str: path to output json
    """
    mincom_results = miscoto.run_mincom(
        option="soup", bacteria_dir=sbml_folder,
        targets_file=target_file, seeds_file=seed_file,
        host_file=host_file, intersection=True,
        enumeration=True, union=True,
        optsol=True, output_json=output_json)

    # Keystone species: union of all minimal communities.
    union_species = mincom_results['union_bacteria']
    logger.info('######### Keystone species: Union of minimal communities #########')
    logger.info("# Bacteria occurring in at least one minimal community enabling the producibility of the target metabolites given as inputs")
    logger.info("Keystone species = " + str(len(union_species)))
    logger.info("\n".join(union_species))

    # Essential symbionts: intersection of all minimal communities.
    inter_species = mincom_results['inter_bacteria']
    logger.info('######### Essential symbionts: Intersection of minimal communities #########')
    logger.info("# Bacteria occurring in ALL minimal community enabling the producibility of the target metabolites given as inputs")
    logger.info("Essential symbionts = " + str(len(inter_species)))
    logger.info("\n".join(inter_species))

    # Alternative symbionts: in the union but not in the intersection.
    alt_species = list(set(union_species) - set(inter_species))
    logger.info('######### Alternative symbionts: Difference between Union and Intersection #########')
    logger.info("# Bacteria occurring in at least one minimal community but not all minimal community enabling the producibility of the target metabolites given as inputs")
    logger.info("Alternative symbionts = " + str(len(alt_species)))
    logger.info("\n".join(alt_species))

    return output_json


def enumeration_analysis(sbml_folder, target_folder_file, seed_file, output_dir, host_file=None):
    """Run miscoto enumeration on input data

    Args:
        sbml_folder (str): sbml directory
        target_folder_file (str): targets file or folder containing multiple sbmls
        seed_file (str): seeds file
        output_dir (str): results directory
        host_file (str): metabolic network file for host

    Returns:
        str: path to the directory containing one json result per target file
    """
    starttime = time.time()

    target_paths = file_or_folder(target_folder_file)

    output_jsons = output_dir + "/" + "json" + "/"
    if not utils.is_valid_dir(output_jsons):
        logger.critical("Impossible to access/create output directory")
        sys.exit(1)

    # One miscoto enumeration (and one json result) per target file.
    for target_path in target_paths:
        logger.info('######### Enumeration of solution for: '+ target_path + ' #########')
        target_pathname = target_paths[target_path]
        output_json = output_jsons + target_path + ".json"
        enumeration(sbml_folder, target_pathname, seed_file, output_json, host_file)

    logger.info(
        "--- Enumeration runtime %.2f seconds ---\n" % (time.time() - starttime))

    # Downstream steps (graph_analysis) expect the json directory path,
    # which they expand again with file_or_folder.
    return output_jsons


def stat_analysis(json_file_folder, output_dir, taxon_file=None):
    """Compute and write statistics from miscoto enumeration json results.

    Args:
        json_file_folder (str): json file or folder containing multiple jsons
        output_dir (str): results directory
        taxon_file (str): mpwt taxon file for species in sbml folder
    """
    starttime = time.time()

    json_paths = file_or_folder(json_file_folder)

    miscoto_stat_output = output_dir + "/" + "miscoto_stats.txt"
    key_species_stats_output = output_dir + "/" + "keystone_species_stats.tsv"
    key_species_supdata_output = output_dir + "/" + "keystone_species_supdata.tsv"

    # Taxonomy is optional: without it, stats are not split by phylum.
    if taxon_file:
        phylum_output_file = output_dir + "/taxon_phylum.tsv"
        tree_output_file = output_dir + "/taxon_tree.txt"
        extract_taxa(taxon_file, phylum_output_file, tree_output_file)
        phylum_species, all_phylums = get_phylum(phylum_output_file)
    else:
        phylum_species, all_phylums = None, None

    with open(key_species_stats_output, "w") as key_stats_file, \
            open(key_species_supdata_output, "w") as key_sup_file, \
            open(miscoto_stat_output, "w") as stats_output:
        key_stats_writer = csv.writer(key_stats_file, delimiter="\t")
        # Header: one column per phylum when taxonomy is available.
        if all_phylums:
            key_stats_writer.writerow(["target_categories", "keystones_group", *sorted(all_phylums), "Sum"])
        else:
            key_stats_writer.writerow(["target_categories", "keystones_group", "data", "Sum"])
        key_sup_writer = csv.writer(key_sup_file, delimiter="\t")
        statswriter = csv.writer(stats_output, delimiter="\t")
        statswriter.writerow(["categories", "nb_target", "size_min_sol", "size_union", "size_intersection", "size_enum"])
        for json_path in json_paths:
            with open(json_paths[json_path]) as json_data:
                json_elements = json.load(json_data)
            create_stat_species(json_path, json_elements, key_stats_writer, key_sup_writer, phylum_species, all_phylums)
            nb_targets = len(json_elements["newly_prod"]) + len(json_elements["still_unprod"])
            statswriter.writerow([json_path, str(nb_targets),
                                  str(len(json_elements["bacteria"])), str(len(json_elements["union_bacteria"])),
                                  str(len(json_elements["inter_bacteria"])), str(len(json_elements["enum_bacteria"]))])

    logger.info(
        "--- Stats runtime %.2f seconds ---\n" % (time.time() - starttime))


def graph_analysis(json_file_folder, target_folder_file, output_dir, taxon_file):
    """Create solution graphs (gml files) from miscoto enumeration output.

    Args:
        json_file_folder (str): json file or folder containing multiple jsons
        target_folder_file (str): targets file or folder containing multiple sbmls
        output_dir (str): results directory
        taxon_file (str): mpwt taxon file for species in sbml folder

    Returns:
        str: path to folder containing gml results
    """
    starttime = time.time()

    json_paths = file_or_folder(json_file_folder)
    target_paths = file_or_folder(target_folder_file)

    gml_output = output_dir + "/" + "gml" + "/"

    # Reuse the phylum file if a previous step already extracted taxonomy.
    phylum_output_file = None
    if taxon_file:
        phylum_output_file = output_dir + "/taxon_phylum.tsv"
        tree_output_file = output_dir + "/taxon_tree.txt"
        if not os.path.exists(phylum_output_file):
            extract_taxa(taxon_file, phylum_output_file, tree_output_file)

    create_gml(json_paths, target_paths, output_dir, phylum_output_file)

    logger.info(
        "--- Graph runtime %.2f seconds ---\n" % (time.time() - starttime))

    return gml_output


def powergraph_analysis(gml_file_folder, output_dir, oog_jar):
    """Compress solution graphs into power graphs and render pictures.

    Args:
        gml_file_folder (str): gml directory or gml file
        output_dir (str): results directory
        oog_jar (str): path to OOG jar file
    """
    starttime = time.time()

    gml_paths = file_or_folder(gml_file_folder)

    bbl_path = output_dir + "/" + "bbl" + "/"
    svg_path = output_dir + "/" + "svg" + "/"

    # Both result directories must be writable before any work starts.
    for result_dir in (bbl_path, svg_path):
        if not utils.is_valid_dir(result_dir):
            logger.critical("Impossible to access/create output directory")
            sys.exit(1)

    for gml_name, gml_input in gml_paths.items():
        bbl_output = bbl_path + gml_name + ".bbl"
        logger.info('######### Graph compression: ' + gml_name + ' #########')
        compression(gml_input, bbl_output)
        logger.info('######### PowerGraph visualization: ' + gml_name + ' #########')
        bbl_to_svg(oog_jar, bbl_output, svg_path)

    logger.info(
        "--- Powergraph runtime %.2f seconds ---\n" % (time.time() - starttime))


def extract_taxa(mpwt_taxon_file, taxon_output_file, tree_output_file):
    """From NCBI taxon ID, extract taxonomy rank and create a tree file

    Args:
        mpwt_taxon_file (str): mpwt taxon file for species in sbml folder
        taxon_output_file (str): path to phylum output file
        tree_output_file (str): path to tree output file

    """
    ncbi = NCBITaxa()

    taxon_ids = []

    # Per-phylum counter used to build short unique names like "Firm1", "Firm2".
    phylum_count = {}
    with open(taxon_output_file, "w") as phylum_file:
        csvwriter = csv.writer(phylum_file, delimiter="\t")
        # NOTE(review): "species" appears twice in the header — presumably the
        # first is the organism name from the mpwt file and the last the
        # taxonomic species rank; confirm before changing.
        csvwriter.writerow(["species", "taxid", "phylum_number", "phylum", "class", "order", "family", "genus", "species"])
        with open(mpwt_taxon_file, "r") as taxon_file:
            csvfile = csv.reader(taxon_file, delimiter="\t")
            for line in csvfile:
                # Skip the header row (its second column contains "taxon").
                if "taxon" not in line[1]:
                    taxon_ids.append(line[1])
                    # Resolve the full NCBI lineage, then map rank name -> taxon name.
                    lineage = ncbi.get_lineage(line[1])
                    lineage2ranks = ncbi.get_rank(lineage)
                    names = ncbi.get_taxid_translator(lineage)
                    ranks2lineage = dict((rank, names[taxid]) for (taxid, rank) in lineage2ranks.items())
                    ranks = [ranks2lineage.get(rank, "no_information") for rank in ["phylum", "class", "order", "family", "genus", "species"]]
                    # Short phylum code = first 4 letters of the phylum name.
                    if ranks[0] != "no_information":
                        phylum = ranks[0][:4]
                    else:
                        phylum = "no_information"
                    # NOTE(review): for "no_information" the counter is 1 on first
                    # sight and "" afterwards, yielding "no_information1" then
                    # "no_information" — looks unintended; confirm before changing.
                    if phylum not in phylum_count:
                        phylum_count[phylum] = 1
                    elif phylum == "no_information":
                        phylum_count[phylum] = ""
                    else:
                        phylum_count[phylum] += 1
                    row = ([line[0], line[1]] + [phylum + str(phylum_count[phylum])] + ranks)
                    csvwriter.writerow(row)

    # ASCII taxonomy tree of all input species, annotated with name and rank.
    tree = ncbi.get_topology(taxon_ids)

    with open(tree_output_file, "w") as tree_file:
        tree_file.write(tree.get_ascii(attributes=["sci_name", "rank"]))


def detect_phylum_species(phylum_named_species, all_phylums):
    """Group species IDs by the 4-letter phylum code of their phylum-based name.

    Args:
        phylum_named_species (dict): {species_named_after_phylum: species_ID}
        all_phylums (list): all phylum codes in the dataset

    Returns:
        dict: {Phylum: [species_1, species_2]}
    """
    phylum_species = {}
    # The first 4 characters of the phylum-based name identify the phylum.
    for named_species, species_id in phylum_named_species.items():
        phylum_species.setdefault(named_species[:4], []).append(species_id)

    # Guarantee an (empty) entry for every known phylum.
    for phylum in all_phylums:
        if phylum not in phylum_species:
            phylum_species[phylum[:4]] = []

    return phylum_species


def detect_keystone_species(json_elements, all_phylums, phylum_named_species=None):
    """Detect keystone species (essential and alternative symbionts) from the miscoto json

    Args:
        json_elements (dict): miscoto results in a json dictionary
        all_phylums (list): all phylum in the dataset
        phylum_named_species (dict): {species_ID: species_named_after_phylum}

    Returns:
        key_stone_species (dict): {Phylum: [species_1, species_2]}
        essential_symbionts (dict): {Phylum: [species_1, species_2]}
        alternative_symbionts (dict): {Phylum: [species_1, species_2]}
    """
    if phylum_named_species:
        # Rename each species after its phylum before grouping by phylum.
        unions = {phylum_named_species[sp]: sp for sp in json_elements["union_bacteria"]}
        intersections = {phylum_named_species[sp]: sp for sp in json_elements["inter_bacteria"]}

        key_stone_species = detect_phylum_species(unions, all_phylums)
        essential_symbionts = detect_phylum_species(intersections, all_phylums)
        # Alternative symbionts: in the union but not in the intersection.
        alternative_symbionts = {
            phylum: list(set(key_stone_species[phylum]) - set(essential_symbionts[phylum]))
            for phylum in key_stone_species
        }
        for phylum in all_phylums:
            if phylum not in alternative_symbionts:
                alternative_symbionts[phylum] = []
    else:
        # No taxonomy available: everything goes under a single "data" key.
        unions = json_elements["union_bacteria"]
        intersections = json_elements["inter_bacteria"]
        key_stone_species = {"data": unions}
        essential_symbionts = {"data": intersections}
        alternative_symbionts = {"data": list(set(unions) - set(intersections))}

    return key_stone_species, essential_symbionts, alternative_symbionts


def create_stat_species(target_category, json_elements, keystone_stats_writer, keystone_sup_writer, phylum_named_species=None, all_phylums=None):
    """Write stats on keystone species (essential and alternative symbionts) from the miscoto json

    Args:
        target_category (str): name of a target file (without extension)
        json_elements (dict): miscoto results in a json dictionary
        keystone_stats_writer (csv.writer): writer for stats of each group (keystone species) in each phylum
        keystone_sup_writer (csv.writer): writer for all keystone species in each phylum
        phylum_named_species (dict): {species_ID: species_named_after_phylum}
        all_phylums (list): all phylum in the dataset
    """
    key_stone_species, essential_symbionts, alternative_symbionts = detect_keystone_species(
        json_elements, all_phylums, phylum_named_species)

    # One stats row per group: per-phylum counts (sorted) followed by the total.
    for group_name, group in (("key_stone_species", key_stone_species),
                              ("essential_symbionts", essential_symbionts),
                              ("alternative_symbionts", alternative_symbionts)):
        counts = [len(group[phylum]) for phylum in sorted(group)]
        keystone_stats_writer.writerow([target_category, group_name] + counts + [sum(counts)])

    # Supplementary rows list the actual species of each group per phylum.
    if all_phylums:
        for phylum in sorted(all_phylums):
            keystone_sup_writer.writerow([target_category, "key_stone_species", phylum] + key_stone_species[phylum])
            keystone_sup_writer.writerow([target_category, "essential_symbionts", phylum] + essential_symbionts[phylum])
            keystone_sup_writer.writerow([target_category, "alternative_symbionts", phylum] + alternative_symbionts[phylum])
    else:
        keystone_sup_writer.writerow([target_category, "key_stone_species", "data"] + key_stone_species["data"])
        keystone_sup_writer.writerow([target_category, "essential_symbionts", "data"] + essential_symbionts["data"])
        keystone_sup_writer.writerow([target_category, "alternative_symbionts", "data"] + alternative_symbionts["data"])


def get_phylum(phylum_file):
    """From the phylum file (created by extract_taxa) create a dictionary and a list linking phylum and species

    Args:
        phylum_file (str): path to the phylum_file

    Returns:
        tuple:
            dict: {species_ID: species_named_after_phylum}
            list: 4-letter phylum codes found in the file
    """
    phylum_named_species = {}
    all_phylums = []
    # Use a distinct name for the handle so the parameter is not shadowed.
    with open(phylum_file, "r") as phylum_fd:
        phylum_reader = csv.reader(phylum_fd, delimiter="\t", quotechar="|")
        for row in phylum_reader:
            phylum_named_species[row[0]] = row[2]
            phylum_code = row[2][:4]
            if phylum_code not in all_phylums:
                # Skip the header row ("phylum_number") and unresolved phyla.
                if "no_information" not in row[2] and "phylum_number" not in row[2]:
                    all_phylums.append(phylum_code)

    return phylum_named_species, all_phylums


def create_gml(json_paths, target_paths, output_dir, taxon_file=None):
    """Create solution graph from miscoto output and compute stats

    Args:
        json_paths (dict): {target: path_to_corresponding_json}
        target_paths (dict): {target: path_to_corresponding_sbml}
        output_dir (str): results directory
        taxon_file (str): phylum file (created by extract_taxa) for species in sbml folder
    """
    miscoto_stat_output = output_dir + "/" + "miscoto_stats.txt"
    key_species_stats_output = output_dir + "/" + "keystone_species_stats.tsv"
    key_species_supdata_output = output_dir + "/" + "keystone_species_supdata.tsv"

    gml_output = output_dir + "/" + "gml" + "/"

    if not utils.is_valid_dir(gml_output):
        logger.critical("Impossible to access/create output directory")
        sys.exit(1)

    len_min_sol = {}
    len_union = {}
    len_intersection = {}
    len_solution = {}
    len_target = {}

    # Target metabolites of each category, read from the corresponding sbml.
    target_categories = {}
    for target in target_paths:
        target_categories[target] = sbml_management.get_compounds(target_paths[target])

    if taxon_file:
        phylum_named_species, all_phylums = get_phylum(taxon_file)
    else:
        phylum_named_species = None
        all_phylums = None

    with open(key_species_stats_output, "w") as key_stats_file, open(key_species_supdata_output, "w") as key_sup_file, open(miscoto_stat_output, "w") as stats_output:
        keystone_stats_writer = csv.writer(key_stats_file, delimiter="\t")
        if all_phylums:
            keystone_stats_writer.writerow(["target_categories", "keystones_group", *sorted(all_phylums), "Sum"])
        else:
            keystone_stats_writer.writerow(["target_categories", "keystones_group", "data", "Sum"])
        keystone_sup_writer = csv.writer(key_sup_file, delimiter="\t")
        # Bug fix: create the stats writer and write its header once, instead of
        # once per target category (the header row used to be duplicated).
        statswriter = csv.writer(stats_output, delimiter="\t")
        statswriter.writerow(["categories", "nb_target", "size_min_sol", "size_union", "size_intersection", "size_enum"])
        for target_category in target_categories:
            with open(json_paths[target_category]) as json_data:
                dicti = json.load(json_data)
            create_stat_species(target_category, dicti, keystone_stats_writer, keystone_sup_writer, phylum_named_species, all_phylums)
            G = nx.Graph()
            added_node = []
            species_weight = {}
            if dicti["still_unprod"] != []:
                print("ERROR ", dicti["still_unprod"], " is unproducible")
            len_target[target_category] = len(dicti["newly_prod"]) + len(dicti["still_unprod"])
            len_min_sol[target_category] = len(dicti["bacteria"])
            len_union[target_category] = len(dicti["union_bacteria"])
            len_intersection[target_category] = len(dicti["inter_bacteria"])
            len_solution[target_category] = len(dicti["enum_bacteria"])
            # Co-occurrence graph: one edge per pair of species appearing in the
            # same minimal community; the weight counts co-occurrences.
            for sol in dicti["enum_bacteria"]:
                for species_1, species_2 in combinations(
                    dicti["enum_bacteria"][sol], 2
                ):
                    if species_1 not in added_node:
                        if taxon_file:
                            G.add_node(phylum_named_species[species_1])
                        else:
                            G.add_node(species_1)
                        added_node.append(species_1)
                    if species_2 not in added_node:
                        if taxon_file:
                            G.add_node(phylum_named_species[species_2])
                        else:
                            G.add_node(species_2)
                        added_node.append(species_2)
                    combination_species = "_".join(sorted([species_1, species_2]))
                    if combination_species not in species_weight:
                        species_weight[combination_species] = 1
                    else:
                        species_weight[combination_species] += 1
                    if taxon_file:
                        G.add_edge(phylum_named_species[species_1], phylum_named_species[species_2], weight=species_weight[combination_species])
                    else:
                        G.add_edge(species_1, species_2, weight=species_weight[combination_species])

            statswriter.writerow([target_category, str(len_target[target_category]), str(len_min_sol[target_category]),
                                    str(len_union[target_category]), str(len_intersection[target_category]),
                                    str(len_solution[target_category])])
            logger.info('######### Graph of ' + target_category + ' #########')
            logger.info('Number of nodes: ' + str(G.number_of_nodes()))
            logger.info('Number of edges: ' + str(G.number_of_edges()))
            # gml_output already ends with "/"; avoid a doubled separator.
            nx.write_gml(G, gml_output + target_category + ".gml")


def compression(gml_input, bbl_output):
    """Solution graph compression into a power graph (bbl file)

    Args:
        gml_input (str): gml file
        bbl_output (str): bbl output file
    """
    starttime = time.time()
    # powergrasp reads its options from a file in the working directory.
    with open('powergrasp.cfg', 'w') as config_file:
        config_file.write("[powergrasp options]\n")
        config_file.write("SHOW_STORY = no\n")

    # Imported after writing the config file (original ordering preserved —
    # powergrasp presumably reads the file at import time; TODO confirm).
    import powergrasp
    from bubbletools import BubbleTree

    try:
        with open(bbl_output, "w") as fd:
            for line in powergrasp.compress_by_cc(gml_input):
                fd.write(line + "\n")

        tree = BubbleTree.from_bubble_file(bbl_output)
        logger.info('Number of powernodes: ' + str(sum(1 for _ in tree.powernodes())))
        logger.info('Number of poweredges: ' + str(tree.edge_number()))
    finally:
        # Always remove the temporary config file, even if compression fails.
        os.remove('powergrasp.cfg')

    logger.info(
        "Compression runtime %.2f seconds ---\n" % (time.time() - starttime))


def check_oog_jar_file(oog_jar):
    """Check that the Oog jar file looks usable.

    Args:
        oog_jar (str): path to oog jar file

    Returns:
        bool: True if the jar contains a manifest (with or without the
            expected Oog class)

    Raises:
        SystemExit: if the archive contains no manifest at all
    """
    oog_class = False
    manifest_jar = False

    with zipfile.ZipFile(oog_jar, "r") as jarfile:
        for filename in jarfile.namelist():
            # The pointless .rstrip("/") calls on these literals were removed:
            # neither string has a trailing slash.
            if filename.startswith("org/mattlab/eaglevista/Oog.class"):
                oog_class = True
            if filename.startswith("META-INF/MANIFEST.MF"):
                manifest_jar = True

    if oog_class and manifest_jar:
        return True
    elif manifest_jar:
        # Keep the original permissive behavior: warn but still accept the jar.
        logger.info('No correct Oog.class in jar file.')
        return True
    else:
        sys.exit('Not a correct jar file.')


def bbl_to_svg(oog_jar, bbl_input, svg_output):
    """Render a power graph picture (svg) from a bbl file with Oog.

    Args:
        oog_jar (str): path to oog jar file
        bbl_input (str): bbl input file
        svg_output (str): svg output directory
    """
    # Validate the jar before spending time on the java call.
    if check_oog_jar_file(oog_jar):
        subprocess.call([
            "java", "-jar", oog_jar,
            "-inputfiles=" + bbl_input,
            "-img",
            "-outputdir=" + svg_output,
        ])