Content - d7f887c7a65ad3ec21d412225fedb2232a2be440 - 34a2968/tools/stats/gsummary.py

visit type:
Tip revision: 00022427d5b83a0e3172b8bf24743d9fae334276 authored by mvdbeek on 17 November 2020, 20:02:41 UTC
Update version to 20.09
Tip revision: 0002242
gsummary.py
#!/usr/bin/env python
from __future__ import print_function

import re
import sys
import tempfile
try:
    from rpy2.rpy_classic import BASIC_CONVERSION, NO_CONVERSION, r, RException, set_default_mode
except ImportError:
    # RPy isn't maintained, and doesn't work with R>3.0, use it as a fallback
    from rpy import BASIC_CONVERSION, NO_CONVERSION, r, RException, set_default_mode


def stop_err(msg):
    """Report a fatal error and terminate the script.

    Writes ``msg`` to standard error exactly as given (no newline is
    appended) and then exits the interpreter with status 1.
    """
    sys.stderr.write(msg)
    # Equivalent to sys.exit(1): both raise SystemExit carrying the status.
    raise SystemExit(1)


def S3_METHODS(all="key"):
    """Return the R names that may appear in a user-supplied expression.

    With the default selector ``"key"`` the result is a dict with two
    entries: ``'Math'`` lists the permitted function names and ``'Ops'`` the
    permitted operator and punctuation tokens.  Any other selector yields
    ``None``.
    """
    if all != "key":
        return None

    return {
        'Math': [
            "abs", "sign", "sqrt", "floor", "ceiling", "trunc", "round", "signif",
            "exp", "log", "cos", "sin", "tan", "acos", "asin", "atan",
            "cosh", "sinh", "tanh", "acosh", "asinh", "atanh",
            "lgamma", "gamma", "gammaCody", "digamma", "trigamma",
            "cumsum", "cumprod", "cummax", "cummin", "c",
        ],
        'Ops': [
            "+", "-", "*", "/", "^", "%%", "%/%",
            "&", "|", "!",
            "==", "!=", "<", "<=", ">=", ">",
            "(", ")", "~", ",",
        ],
    }


def main():
    """Compute summary statistics for a column expression of a tabular file.

    Reads three command-line arguments: the tab-separated input file, the
    output file name and an expression over columns written as ``c1``,
    ``c2``, ... (for example ``log(c5)``).  The expression may only use the
    function names and operators returned by ``S3_METHODS``.

    The referenced columns of every non-blank, non-``#`` input line are
    copied into a temporary table, provided each of them parses as a float;
    other lines are skipped and counted.  The table is loaded into R, the
    expression is evaluated there, and its sum, mean, standard deviation and
    quantiles are written to the output file as a ``#``-prefixed heading line
    followed by one line of tab-separated values.

    Exits through ``stop_err`` (status 1) on bad usage, a disallowed
    expression, a ``c0`` column reference, an input without any usable data
    line, or an R evaluation error.
    """
    try:
        datafile = sys.argv[1]
        outfile_name = sys.argv[2]
        expression = sys.argv[3]
    except IndexError:
        stop_err('Usage: python gsummary.py input_file ouput_file expression')

    allowed = S3_METHODS()
    math_allowed = allowed['Math']
    ops_allowed = allowed['Ops']

    # Check for invalid expressions
    for word in re.findall('[a-zA-Z]+', expression):
        if word not in math_allowed:
            stop_err("Invalid expression '%s': term '%s' is not recognized or allowed" % (expression, word))
    symbols = set()
    # A run of adjacent symbols is checked as ONE token, so combinations that
    # are not themselves listed (e.g. R's assignment "<-") are rejected even
    # though "<" and "-" are individually allowed.  Keep it that way: the
    # expression is handed to R for evaluation further down.
    for symbol in re.findall(r'[^a-z0-9\s]+', expression):
        if symbol not in ops_allowed:
            stop_err("Invalid expression '%s': operator '%s' is not recognized or allowed" % (expression, symbol))
        symbols.add(symbol)
    if len(symbols) == 1 and ',' in symbols:
        # User may have entered a comma-separated list of columns
        stop_err("Invalid columns '%s': this tool requires a single column or expression" % expression)

    # Find all column references in the expression (1-based in the
    # expression, 0-based as indexes into the split input line).
    column_refs = re.findall('c[0-9]+', expression)
    cols = [int(ref[1:]) - 1 for ref in column_refs]
    if any(col < 0 for col in cols):
        # "c0" would become index -1 and silently read the LAST column.
        stop_err("Invalid expression '%s': column numbers start at 1" % expression)

    skipped_lines = 0
    first_invalid_line = 0
    valid_lines = 0
    # The temporary file must stay open (it is deleted on close) until R has
    # read it, so everything up to writing the result lives inside this block.
    with tempfile.NamedTemporaryFile('w+') as tmp_file:
        # Write the R header row to the temporary file
        tmp_file.write("%s\n" % "\t".join("c%d" % (col + 1) for col in cols))
        with open(datafile) as data:
            for line_number, line in enumerate(data, 1):
                line = line.rstrip('\r\n')
                if not line or line.startswith('#'):
                    continue
                fields = line.split('\t')
                try:
                    values = [fields[col] for col in cols]
                    for value in values:
                        float(value)
                except (IndexError, ValueError):
                    # Missing column or non-numeric value: skip the line.
                    skipped_lines += 1
                    if not first_invalid_line:
                        first_invalid_line = line_number
                    continue
                # Write the R data row to the temporary file
                tmp_file.write("%s\n" % "\t".join(values))
                valid_lines += 1
        # Make sure R sees the complete table when it opens the file by name.
        tmp_file.flush()

        # Counting usable rows (rather than comparing the skip count with the
        # physical line count) also catches inputs that contain comment or
        # blank lines next to only-invalid data, and empty inputs.
        if not valid_lines:
            stop_err("Invalid column or column data values invalid for computation.  See tool tips and syntax for data requirements.")

        # summary function and return labels
        set_default_mode(NO_CONVERSION)
        summary_func = r("function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }")
        headings = ['sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%']

        r_data_frame = r.read_table(tmp_file.name, header=True, sep="\t")

        # The output file is opened before the computation, as before, so it
        # is created/truncated even when R reports an error.
        with open(outfile_name, 'w') as outfile:
            # Expose every referenced column to R under its "cN" name.
            for ref in column_refs:
                r.assign(ref, r["$"](r_data_frame, ref))
            try:
                summary = summary_func(r(expression))
            except RException as s:
                stop_err("Computation resulted in the following error: %s" % str(s))
            summary = summary.as_py(BASIC_CONVERSION)
            outfile.write("#%s\n" % "\t".join(headings))
            if isinstance(summary, dict):
                # using rpy
                stats = [summary[k] for k in headings]
            else:
                # using rpy2
                stats = summary
            outfile.write("%s\n" % "\t".join("%g" % value for value in stats))

    if skipped_lines:
        print("Skipped %d invalid lines beginning with line #%d.  See tool tips for data requirements." % (skipped_lines, first_invalid_line))


# Run the tool only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Browse the archive

https://github.com/galaxyproject/galaxy