https://github.com/nrnb/mirna-pathway-finder
Raw File
Tip revision: 2a75f42dd8d65b42c60e38bffbab2e96f520a401 authored by Alexander Pico on 09 March 2018, 01:18:03 UTC
Create README.md
Tip revision: 2a75f42
mirna_pathway_finder.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv
# import numpy as np
import os
import pandas as pd
import pystache
import re
import urllib

current_path = os.path.dirname(os.path.abspath(__file__))


def MirnaPathwayFinder(
        mappings_path=None,
        query_values=None,
        query_value_list_column_index=0,
        node_type='rna',
        output_dir='.',
        cache=True,
        debug=False,
        results_limit=20):
    """Rank pathways by how many query identifiers they involve.

    Reads a tab-separated pathway/miRNA mapping table, keeps the rows in
    which one of the identifier columns (of the node itself or of its
    targeter) matches a query value, counts the matches per pathway, keeps
    the ``results_limit`` best pathways and renders them through
    ``table-template.html`` into ``<output_dir>/pathways.html``.

    Args:
        mappings_path: Path of the mapping TSV. Defaults to
            ``../wp-mir-table-hs-20160715.tsv`` relative to this module.
        query_values: Path of a tab-separated file holding the query
            values, a single query value, or an iterable of query values.
            A leading ``http://identifiers.org/<namespace>/`` prefix
            (hgnc, ncbigene, mirbase, mirbase.mature) is stripped.
        query_value_list_column_index: Column of the query file to read.
        node_type: Accepted for interface compatibility; not used here.
        output_dir: Existing directory that receives ``pathways.html``.
        cache: Accepted for interface compatibility; not used here.
        debug: When true, progress messages are printed.
        results_limit: Maximum number of pathways kept in the output.

    Raises:
        TypeError: If ``query_values`` is None.
    """
    import sys

    # ``quote`` moved between Python 2 and 3; resolve it locally so this
    # function does not depend on which interpreter imported the module.
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3

    # (str, unicode) on Python 2, (str, str) on Python 3.
    string_types = (str, type(u''))

    # Identifier columns describing the node a row is about ...
    node_columns = (
        'stem_loop_name',
        'mature_name',
        'mirbase',
        'mirbase.mature',
        'ncbigene',
        'hgnc',
    )
    # ... and the ones describing whatever targets that node.
    targeter_columns = (
        'targeter_stem_loop_name',
        'targeter_mature_name',
        'targeter_mirbase',
        'targeter_mirbase.mature',
        'targeter_ncbigene',
        'targeter_hgnc',
    )

    def print_debug(message):
        if debug:
            print(message)

    def generate_widget_uri(mapping, highlight_string):
        return ''.join([
            'http://www.wikipathways.org/wpi/PathwayWidget.php?id=',
            mapping['identifier'],
            '&',
            highlight_string,
        ])

    def generate_pathways_table(html_template_input, query_value_list):
        # TODO handle the case where the query values are NOT display names
        highlight_values = [
            'label[]=' + quote(query_value)
            for query_value in query_value_list
        ]
        highlight_string = '&'.join(highlight_values) + '&colors=red'

        template_path = os.path.join(current_path, 'table-template.html')
        with open(template_path, 'r') as template_file:
            table_template = template_file.read()

        # The widget initially shows the first (best ranked) pathway. With
        # no matching pathway there is nothing to show, so leave the URI
        # empty instead of failing on the missing first element.
        if html_template_input:
            widget_uri = generate_widget_uri(
                html_template_input[0], highlight_string)
        else:
            widget_uri = ''

        initial_html_string = pystache.render(
            table_template, html_template_input)
        html_string_with_widget_url = initial_html_string.replace(
            'widget_uri',
            widget_uri
        )

        update_widget_path = os.path.join(current_path, 'update-widget.js')
        with open(update_widget_path, 'r') as update_widget:
            update_widget_string = ''.join([
                'var highlightString = \'',
                highlight_string,
                '\';\n',
                update_widget.read()
            ])

        html_string_with_update_widget = html_string_with_widget_url.replace(
            'update_widget_string',
            update_widget_string
        )

        # Close the output explicitly so the page is flushed to disk before
        # this function returns.
        output_path = os.path.join(output_dir, 'pathways.html')
        with open(output_path, 'w') as output_file:
            output_file.write(html_string_with_update_widget)
        print_debug('Wrote ' + output_path)
        return html_string_with_update_widget

    if query_values is None:
        raise TypeError(
            'query_values must be a file path, a single query value or an '
            'iterable of query values, not None')

    # Only strings can be file paths; testing the type first keeps lists and
    # sets away from os.path.isfile (which rejects them) and keeps a single
    # string from being treated as an iterable of characters.
    raw_query_values = set()
    if isinstance(query_values, string_types):
        if os.path.isfile(query_values):
            # The csv module wants binary mode on Python 2 and text mode
            # with newline='' on Python 3.
            if sys.version_info[0] >= 3:
                open_args = {'mode': 'r', 'newline': ''}
            else:
                open_args = {'mode': 'rb'}
            with open(query_values, **open_args) as csvfile:
                query_value_list_reader = csv.reader(
                    csvfile, delimiter='\t', quotechar='|')
                for row in query_value_list_reader:
                    if not row:
                        # Blank line: nothing to read from it.
                        continue
                    raw_query_values.add(row[query_value_list_column_index])
        else:
            raw_query_values.add(query_values)
    elif hasattr(query_values, '__iter__'):
        raw_query_values = set(query_values)
    else:
        raw_query_values.add(query_values)

    identifiers_org_prefix = re.compile(
        r'^http://identifiers\.org/'
        r'(hgnc|ncbigene|mirbase|mirbase\.mature)/')
    query_value_list = [
        identifiers_org_prefix.sub('', value) for value in raw_query_values
    ]
    # Matching only needs membership, so de-duplicate once up front.
    query_lookup = list(set(query_value_list))

    if mappings_path is None:
        # TODO remove the date part of the file name
        mappings_path = os.path.join(
            current_path, '..', 'wp-mir-table-hs-20160715.tsv')

    # TODO handle the case where the mapping data is provided as a Python
    #      object (a parsed row or a list of parsed rows) instead of a path.
    wp_mirna = pd.read_csv(mappings_path,
                           sep='\t',
                           dtype=str)
    print_debug('Loaded %d mapping rows from %s' % (
        len(wp_mirna), mappings_path))

    def rows_matching(frame, columns):
        # True for every row in which at least one of ``columns`` holds a
        # query value. Empty cells are NaN and therefore never match.
        return frame[list(columns)].isin(query_lookup).any(axis=1)

    # Rows whose node, or whose node's targeter, is one of the query values.
    with_targeter = wp_mirna[
        rows_matching(wp_mirna, node_columns + targeter_columns)]
    with_targeter_by_pwy = with_targeter.groupby('wikipathways')

    # get targeter count by pathway
    targeter_n_by_pwy = with_targeter_by_pwy[
        'shown_or_inferred_mature_name'].nunique()

    # get shown_targeter count by pathway
    shown_targeter_n_by_pwy = with_targeter_by_pwy[
        'mature_name'].nunique()

    # get target count by pathway: only rows matched through a targeter
    # column count as targets.
    with_target = with_targeter[
        rows_matching(with_targeter, targeter_columns)]
    target_n_by_pwy = with_target.groupby('wikipathways')[
        'ncbigene'].nunique()

    # All three series are indexed by pathway id, so the frame aligns on
    # that index; pathways absent from one series get a count of 0.
    wp_counts = pd.DataFrame(
        data={
            'targeter_count': targeter_n_by_pwy,
            'shown_targeter_count': shown_targeter_n_by_pwy,
            'target_count': target_n_by_pwy,
        }
        ).fillna(
            value=0
        ).nlargest(
            results_limit,
            ['shown_targeter_count', 'target_count', 'targeter_count'])

    # Rows of pathways outside the top ``results_limit`` get NaN counts
    # from the left join.
    results = with_targeter.join(wp_counts,
                                 on='wikipathways',
                                 how='left',
                                 lsuffix='',
                                 rsuffix='_r',
                                 sort=False)

    pathway_level_columns = [
        'shown_targeter_count',
        'target_count',
        'targeter_count',
        'wikipathways',
        'link',
        'pathway_name'
    ]
    results_sorted = results.sort_values(by=pathway_level_columns,
                                         axis=0,
                                         ascending=False,
                                         inplace=False,
                                         kind='quicksort',
                                         na_position='last')

    results_by_pwy = results_sorted.groupby(pathway_level_columns, sort=False)

    html_template_input = []
    for name_by_pwy, group_by_pwy in results_by_pwy:
        # The group key is a tuple ordered like pathway_level_columns.
        pathway = dict(zip(pathway_level_columns, name_by_pwy))

        shown_targeters = [
            {'name': mature_name}
            for mature_name in group_by_pwy['mature_name']
            if isinstance(mature_name, string_types)
        ]

        targets_by_targeters = []
        by_targeter = group_by_pwy.groupby('targeter_mature_name')
        for name_by_targeter, group_by_targeter in by_targeter:
            # Drop missing gene ids: str.join only accepts strings.
            target_ids = group_by_targeter[
                'ncbigene'].dropna().unique().tolist()
            targets_by_targeters.append({
                'targeter': name_by_targeter,
                'targets': ', '.join(target_ids)
            })

        html_template_input.append({
            'id': pathway['link'],
            'identifier': pathway['wikipathways'],
            'name': pathway['pathway_name'],
            'targets_by_targeters': targets_by_targeters,
            'shown_targeters': shown_targeters,
            'shown_targeter_count': pathway['shown_targeter_count'],
            'target_count': pathway['target_count'],
            'targeter_count': pathway['targeter_count']
        })

    generate_pathways_table(html_template_input, query_value_list)
back to top