<!--
  Revision 42b3c6ebb50ecaabf92e3d168da1be05a640f9f2 authored by davelopez on 11 May 2022, 14:31:46 UTC, committed by davelopez on 11 May 2022, 14:31:46 UTC
  1 parent 06d37d9
  Raw File
  grouping.xml
-->
<tool id="Grouping1" name="Group" version="2.1.4">
  <description>data by a column and perform aggregate operations on other columns.</description>
  <requirements>
    <!-- Packages the tool declares as dependencies; both versions are pinned. -->
    <!-- NOTE(review): numpy is presumably used by grouping.py for the numeric aggregates; confirm against the script. -->
    <requirement type="package" version="1.15.4">numpy</requirement>
    <!-- NOTE(review): coreutils presumably provides a command-line utility that grouping.py shells out to; confirm against the script. -->
    <requirement type="package" version="8.31">coreutils</requirement>
  </requirements>
  <!--
    Runs grouping.py with positional arguments in this order: output path,
    input path, group-by column, ignore-case flag, the selected "ignore lines"
    values, then one comma-joined 'optype,opcol,opround,opdefault' argument per
    configured operation. Job failure is detected from the exit code
    (detect_errors="exit_code"). The template is kept in CDATA so that it never
    needs XML escaping.
  -->
  <command detect_errors="exit_code"><![CDATA[
  python '$__tool_directory__/grouping.py'
      '${out_file1}'
      '${input1}'
      '${groupcol}'
      '${ignorecase}'
      '${ignorelines}'
      #for $op in $operations
        '${op.optype},${op.opcol},${op.opround},${op.opdefault}'
      #end for
  ]]></command>
  <inputs>
    <!-- Tabular dataset to be grouped and aggregated. -->
    <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/>
    <!-- Column whose values define the groups; the column choices are tied to input1 through data_ref. -->
    <param name="groupcol" label="Group by column" type="data_column" data_ref="input1" />
    <!-- Yields "1" when checked and "0" otherwise; the command line passes this value to grouping.py as '${ignorecase}'. -->
    <param name="ignorecase" type="boolean" label="Ignore case while grouping?" truevalue="1" falsevalue="0"/>
    <!--
      Each option's value is the decimal ASCII code of the character shown as its
      label (for example 62 for ">" and 35 for "#"). The checked values are handed
      to grouping.py through the '${ignorelines}' argument of the command line.
    -->
    <param name="ignorelines"
           type="select"
           display="checkboxes"
           multiple="true"
           label="Ignore lines beginning with these characters"
           help="lines beginning with these are not grouped">
      <option value="62">&gt;</option>
      <option value="64">@</option>
      <option value="43">+</option>
      <option value="60">&lt;</option>
      <option value="42">*</option>
      <option value="45">-</option>
      <option value="61">=</option>
      <option value="124">|</option>
      <option value="63">?</option>
      <option value="36">$</option>
      <option value="46">.</option>
      <option value="58">:</option>
      <option value="38">&amp;</option>
      <option value="37">%</option>
      <option value="94">^</option>
      <option value="35">&#35;</option>
    </param>
    <!--
      One aggregate per repeat instance. The command line serialises every
      instance as a single 'optype,opcol,opround,opdefault' argument.
    -->
    <repeat name="operations" title="Operation">
      <!-- Aggregate to compute; the option value is the keyword sent to grouping.py. -->
      <param name="optype" type="select" label="Type">
        <option value="mean">Mean</option>
        <option value="median">Median</option>
        <option value="mode">Mode</option>
        <option value="max">Maximum</option>
        <option value="min">Minimum</option>
        <option value="sum">Sum</option>
        <option value="length">Count</option>
        <option value="unique">Count Distinct</option>
        <option value="cat">Concatenate</option>
        <option value="cat_uniq">Concatenate Distinct</option>
        <option value="random">Randomly pick</option>
        <option value="std">Standard deviation</option>
      </param>
      <!-- Column the aggregate is applied to; choices are tied to input1 through data_ref. -->
      <param name="opcol" label="On column" type="data_column" data_ref="input1"/>
      <param name="opround" type="select" label="Round result to nearest integer?">
        <option value="no">NO</option>
        <option value="yes">YES</option>
      </param>
      <!-- Optional replacement for non-numeric cells; per the help text, leaving it empty means no replacement. -->
      <param name="opdefault" type="float" optional="true" label="Replace non numeric data" help="leave empty for no replacements. Will replace, e.g., empty cells and text cells."/>
    </repeat>
  </inputs>
  <outputs>
    <!-- Single tabular result; its path is the first argument given to grouping.py ('${out_file1}'). -->
    <data format="tabular" name="out_file1" />
  </outputs>
  <tests>
    <!-- Test valid data: case-insensitive grouping on column 1 with the mean of column 2. -->
    <test>
      <param name="input1" value="1.bed"/>
      <param name="groupcol" value="1"/>
      <param name="ignorecase" value="true"/>
      <repeat name="operations">
        <param name="optype" value="mean"/>
        <param name="opcol" value="2"/>
        <param name="opround" value="no"/>
      </repeat>
      <output name="out_file1" file="groupby_out1.dat"/>
    </test>
    <!-- Test with an empty first field: Concatenate Distinct applied to the grouping column itself (column 1). -->
    <test>
      <param name="input1" value="groupby_in_1.tabular"/>
      <param name="groupcol" value="1"/>
      <param name="ignorecase" value="true"/>
      <repeat name="operations">
        <param name="optype" value="cat_uniq"/>
        <param name="opcol" value="1"/>
        <param name="opround" value="no"/>
      </repeat>
      <output name="out_file1" file="groupby_out2.dat"/>
    </test>
    <!--
      Exercises every operation type in a single run, with case-sensitive
      grouping on column 1.
      NOTE(review): compare="sim_size" is presumably used because the "random"
      operation makes the exact output non-deterministic; confirm.
    -->
    <test>
      <param name="input1" value="1.bed"/>
      <param name="groupcol" value="1"/>
      <param name="ignorecase" value="false"/>
      <repeat name="operations">
        <param name="optype" value="mean"/>
        <param name="opcol" value="2"/>
        <param name="opround" value="no"/>
      </repeat>
      <repeat name="operations">
        <param name="optype" value="median"/>
        <param name="opcol" value="2"/>
        <param name="opround" value="no"/>
      </repeat>
      <repeat name="operations">
        <param name="optype" value="mode"/>
        <param name="opcol" value="6"/>
        <param name="opround" value="no"/>
      </repeat>
      <repeat name="operations">
        <param name="optype" value="max"/>
        <param name="opcol" value="2"/>
        <param name="opround" value="no"/>
      </repeat>
      <repeat name="operations">
        <param name="optype" value="min"/>
        <param name="opcol" value="2"/>
        <param name="opround" value="no"/>
      </repeat>
      <repeat name="operations">
        <param name="optype" value="sum"/>
        <param name="opcol" value="2"/>
        <param name="opround" value="no"/>
      </repeat>
      <repeat name="operations">
        <param name="optype" value="length"/>
        <param name="opcol" value="1"/>
        <param name="opround" value="no"/>
      </repeat>
      <repeat name="operations">
        <param name="optype" value="unique"/>
        <param name="opcol" value="1"/>
        <param name="opround" value="no"/>
      </repeat>
      <repeat name="operations">
        <param name="optype" value="cat"/>
        <param name="opcol" value="1"/>
        <param name="opround" value="no"/>
      </repeat>
      <repeat name="operations">
        <param name="optype" value="cat_uniq"/>
        <param name="opcol" value="6"/>
        <param name="opround" value="no"/>
      </repeat>
      <repeat name="operations">
        <param name="optype" value="random"/>
        <param name="opcol" value="2"/>
        <param name="opround" value="no"/>
      </repeat>
      <repeat name="operations">
        <param name="optype" value="std"/>
        <param name="opcol" value="2"/>
        <param name="opround" value="no"/>
      </repeat>
      <output name="out_file1" file="groupby_out3.tabular" compare="sim_size"/>
    </test>
    <!--
      Test failure for non-numeric data: mean of column 2 with no "opdefault"
      replacement supplied. The job is expected to fail with exit code 1 and to
      report the offending value on stderr.
      NOTE(review): the asserted text (including the spelling "colum") presumably
      mirrors the message printed by grouping.py; confirm before changing either.
    -->
    <test expect_exit_code="1" expect_failure="true">
      <param name="input1" value="1.tabular"/>
      <param name="groupcol" value="1"/>
      <param name="ignorecase" value="true"/>
      <repeat name="operations">
        <param name="optype" value="mean"/>
        <param name="opcol" value="2"/>
        <param name="opround" value="no"/>
      </repeat>
      <assert_stderr>
          <has_text text="non float value 'hap' found in colum 2"/>
      </assert_stderr>
    </test>
    <!-- Mean of column 2 with "opdefault" set to 0: non-numeric cells are expected to be replaced by that value instead of causing a failure. -->
    <test>
      <param name="input1" value="groupby_in_1.tabular"/>
      <param name="groupcol" value="1"/>
      <param name="ignorecase" value="true"/>
      <repeat name="operations">
        <param name="optype" value="mean"/>
        <param name="opcol" value="2"/>
        <param name="opround" value="no"/>
        <param name="opdefault" value="0"/>
      </repeat>
      <output name="out_file1" file="groupby_out4.tabular"/>
    </test>
  </tests>
  <help>

.. class:: infomark

**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*

-----

**Syntax**

This tool allows you to group the input dataset by a particular column and perform aggregate functions on any column(s): Mean, Median, Mode, Maximum, Minimum, Sum, Count, Count Distinct, Concatenate, Concatenate Distinct, Randomly pick, and Standard deviation.

The Concatenate function will take, for each group, each item in the specified column and build a comma-delimited list. Concatenate Distinct will do the same but will build a list of unique items with no repetition.

Count and Count Distinct are equivalent to Concatenate and Concatenate Distinct, but will only count the number of items and will return an integer.

- If multiple modes are present, all are reported.

-----

**Example**

- For the following input::

   chr22  1000  1003  TTT
   chr22  2000  2003  aaa
   chr10  2200  2203  TTT
   chr10  1200  1203  ttt
   chr22  1600  1603  AAA

- **Grouping on column 4** while ignoring case, and performing operation **Count on column 1** will return::

   AAA    2
   TTT    3
   
- **Grouping on column 4** while not ignoring case, and performing operation **Count on column 1** will return::

   aaa    1
   AAA    1
   ttt    1
   TTT    2
  </help>
</tool>
<!-- back to top -->