https://github.com/teerjk/VarSifter
Raw File
Tip revision: 7e57e5857b08f5253f28e96477fc211f67a0ffea authored by Jamie K. Teer on 27 April 2020, 14:42:41 UTC
-Documentation updates to point to github.
Tip revision: 7e57e58
VarData.java
import java.io.*;
import java.util.regex.*;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;
import java.util.HashSet;
import java.util.Set;
import java.util.ArrayList;
import java.util.List;


/**
*   The VarData class handles interaction with the data.  It loads the data, filters, and returns results.
*   @author Jamie K. Teer
*/
public class VarData {
    
    //TODO:DONE alter this
    protected int S_FIELDS = 3;   //Number of columns for each sample

    final int AFF_NORM_PAIR = 0;
    final int CASE = 1;
    final int CONTROL = 2;
    final int MIN_MPG = 3;
    final int MIN_MPG_COV = 4;

    protected int genScoreThresh;
    
    final String[] geneDataHeaders = {"Gene_name", "Var Count"};
    final String[] ALLELES = {"A", "C", "G", "T"};
    final String[] requiredHeaders = {"Chr",
                                      "LeftFlank",
                                      "RightFlank",
                                      "Gene_name",
                                      "type",
                                      "muttype",
                                      "ref_allele",
                                      "var_allele"
                                     };

    final static int INTEGER = 0;
    final static int FLOAT = 1;
    final static int STRING = 2;
    final static int MULTISTRING = 3;

    protected String[] dataNamesOrig = {""};     // All data names, for writing purposes
    protected String[] dataNames = {""};
    protected String[] sampleNamesOrig = {""};   // All sample names, for writing purposes
    protected String[] sampleNames = {""};
    protected String[] sampleValueName = {""};

    //data fields
    protected int[][] data;           // Fields: [line][var_annotation col]
    protected int[][] outData;        // Gets returned (can be filtered)
    protected int[][][] samples;      // Fields: [line][sampleName][genotype:MPGscore:coverage]
    protected int[][][] outSamples;   // Gets returned (can be filtered)
    protected int[] classList = null;
    protected List<String> commentList = new ArrayList<String>(); //comment stored here for printing
    protected List<AbstractMapper> annotMapperBuilder = new ArrayList<AbstractMapper>();  //Build an array of AbstractMappers for annotations
    protected AbstractMapper[] annotMapper = new AbstractMapper[0];                  //the annotation AbstractMapper array
    //TODO:DONE alter this
    //ORIG: protected AbstractMapper[] sampleMapper = new AbstractMapper[0];      //The SampleMapper array
    protected AbstractMapper[] sampleMapper = new AbstractMapper[S_FIELDS];      //The SampleMapper array

    protected int[] compHetFields;

    protected final static Pattern vcf = Pattern.compile("^##fileformat=VCF");
    protected final static Pattern fDigits = VarSifter.fDigits;
    protected final static Pattern digits = VarTableModel.digits;
    protected final static Pattern comment = Pattern.compile("^#");
    protected final static Pattern floatNaN = Pattern.compile("^-?nan?$", Pattern.CASE_INSENSITIVE);
    protected final static Pattern colSuffixPat = Pattern.compile("_([a-z])$");

    protected BitSet dataIsIncluded;      // A mask used to filter data, samples
    protected BitSet dataIsEditable = new BitSet();      // Which data elements can be edited

    protected BitSet colMask;   // A mask used to load (and thus display) annotation columns (load if true)
    
    protected Map<String, Integer> dataTypeAt = new HashMap<String, Integer>();

    protected int[] affAt = new int[0];
    protected int[] normAt = new int[0];
    protected int[] caseAt = new int[0];
    protected int[] controlAt = new int[0];
    protected VarData parentVarData = null;
    protected int numCols = 0;         // Number of columns.  Set from first line, used to check subseq. lines
    protected String customQuery = "";
    protected BitSet[] bitSets;
    protected String dataFile;

    /**    
    *    Constructor reads in the file specified by full path in String inFile.
    *    It first reads through the file just to count lines for first dimension
    *     of data[][] and samples[][][].  Then, it reads again to fill in the array.
    *
    *   @param inFile Absolute path to VS file to load.
    */
    public VarData(String inFile) {
        dataFile = inFile;
    
        try {
            BufferedReader br = new BufferedReader(new FileReader(dataFile));
            String line = br.readLine();
            br.close();
            
            if (vcf.matcher(line).find()) { //VCF file
                VarSifter.showError("This looks like a VCF file - please append .vcf to filename and load again.");
                System.exit(1);
            }
            else {  //VarSifter file
                loadVSFile(inFile);
            }
        }
        catch (IOException ioe) {
            VarSifter.showError(ioe.toString());
            System.out.println(ioe);
            System.exit(1);
        }

        //indices for CompHet view
        compHetFields = new int[5];
        compHetFields[0] = (dataTypeAt.containsKey("Gene_name")) ? dataTypeAt.get("Gene_name") : -1;  //Gene name
        compHetFields[1] = dataTypeAt.get("Chr");    //chrom
        compHetFields[2] = dataTypeAt.get("LeftFlank"); //left flank
        compHetFields[3] = (dataTypeAt.containsKey("CDPred_score")) ? dataTypeAt.get("CDPred_score") : -1; //cdPred
        compHetFields[4] = dataTypeAt.get("type"); //variant type

        resetOutput();  //Initialize outData and outSamples
        
        //TESTING System.out.println("File Read finished: " + (System.currentTimeMillis() - time));
        /* TESTING
        for (int i =0; i<dataNames.length; i++) {
            System.out.println(i + " " + classList[i] + " " + annotMapper[i].getDataType() + " " + annotMapper[i].getLength());
        }

        for (int i=dataNames.length; i<classList.length; i++) {
            System.out.println(i + " " + classList[i] + " " + sampleMapper[(i-dataNames.length) % S_FIELDS].getDataType() + " " + sampleMapper[(i-dataNames.length) % S_FIELDS].getLength());
        }
        */
                
    }

    /** 
    *   Constructor for making subsetted copies using 
    *   the factory method returnSubVarData
    *  
    */
    private VarData(int[][] dataIn,
                    String[] dataNamesOrigIn,
                    String[] dataNamesIn,
                    int[][][] samplesIn,
                    String[] sampleNamesOrigIn,
                    String[] sampleNamesIn,
                    String[] sampleValueNameIn,
                    BitSet dataIsEditableIn,
                    Map<String, Integer> dataTypeAtIn,
                    int[] affAtIn,
                    int[] normAtIn,
                    int[] caseAtIn,
                    int[] controlAtIn,
                    VarData parentVarDataIn,
                    AbstractMapper[] annotMapperIn,
                    AbstractMapper[] sampleMapperIn,
                    List<String> commentListIn
                    ) {
        data = dataIn;
        dataNamesOrig = dataNamesOrigIn;
        dataNames = dataNamesIn;
        samples = samplesIn;
        sampleNamesOrig = sampleNamesOrigIn;
        sampleNames = sampleNamesIn;
        sampleValueName = sampleValueNameIn;
        dataIsEditable = (BitSet)dataIsEditableIn.clone();
        dataTypeAt = new HashMap<String, Integer>(dataTypeAtIn);
        affAt = affAtIn;
        normAt = normAtIn;
        caseAt = caseAtIn;
        controlAt = controlAtIn;
        parentVarData = parentVarDataIn;
        annotMapper = annotMapperIn;
        sampleMapper = sampleMapperIn;
        commentList = commentListIn;

        S_FIELDS = sampleValueName.length;

        dataIsIncluded = new BitSet(data.length);

        resetOutput();

    }

    
    /**
    *   Empty constructor - Do not use!! (
    */
    protected VarData() {
    }


    /**
    *   Load data structures by parsing a VarSifter file
    *
    *   @param inFile Absolute path to VarSifter file
    */
    private void loadVSFile(String inFile) {
        final int header_lines = 1;
        String line = "";
        int lineCount = 0;
        long time = System.currentTimeMillis();
        boolean first = true;
        boolean noSamples = false;
        int sampleCount = 0;
        boolean loadAll = false;
        final Pattern samPat = Pattern.compile("\\.NA(?:\\.\\w+)?$");
        final Pattern samLeadPat = Pattern.compile("\\.NA$");
        final Pattern samPostPat = Pattern.compile("\\.NA\\.(\\w+)$");
        final Pattern edPat = Pattern.compile("Comments");
        final Pattern samAff = Pattern.compile("aff");
        final Pattern samNorm = Pattern.compile("norm");
        final Pattern casePat = Pattern.compile("case");
        final Pattern controlPat = Pattern.compile("control");
        
        try {
            BufferedReader br = new BufferedReader(new FileReader(inFile));
            while ((line = br.readLine()) != null) {
                if (comment.matcher(line).find()) {
                    commentList.add(line);
                    continue;
                }

                lineCount++;
                String[] temp = line.split("\t", 0);
                if (first) {

                    List<String> dataT = new ArrayList<String>();
                    // Allow user to select columns for loading/viewing
                    for (int i=0; i<temp.length; i++) {
                        if ((samPat.matcher(temp[i])).find() ) {
                            sampleCount++;
                        }
                        else {
                            dataT.add(temp[i]);
                        }
                    }
                    numCols = temp.length;

                    if (VarSifter.emptyPat.matcher(inFile).find()) {
                        int emptyL = (VarSifter.emptyHeader.split("\t")).length;
                        colMask = new BitSet(emptyL);
                        colMask.set(0, emptyL);
                    }
                    else {
                        ColumnSelectionDialog csd = new ColumnSelectionDialog(dataT.toArray(new String[dataT.size()]), 
                                                                              requiredHeaders);
                        colMask = csd.runDialog();
                        csd = null;
                    }

                    if ((colMask.cardinality() + sampleCount) == temp.length) {
                        loadAll = true;
                    }
                    else {
                        //trim temp
                        temp = maskLine(temp, sampleCount);
                    }
                    
                    classList = new int[temp.length];
                    for (int i=0; i<classList.length; i++) {
                        classList[i] = INTEGER;
                    }
                    first = false;
                    continue;
                }
                    
                if (temp.length != numCols) {
                    VarSifter.showError("*** Input file appears to be malformed - column number not same as header! " +
                        "Line: " + (lineCount) + " ***");
                    System.out.println("*** Input file appears to be malformed - column number not same as header! " +
                        "Line: " + (lineCount) + " ***");
                    System.exit(1);
                }
                
                //Determine class of each column, change if not int; for now, do NOT set MULTISTRING here
                if (! loadAll) {
                    temp = maskLine(temp, sampleCount);
                }
                for (int i=0; i<temp.length; i++) {
                    if (classList[i] == STRING || classList[i] == MULTISTRING) {
                        continue;
                    }
                        if (fDigits.matcher(temp[i]).matches()) {
                            classList[i] = FLOAT;
                        }
                        else if (!digits.matcher(temp[i]).matches()) {
                            classList[i] = STRING;
                        }
                }

                if (lineCount % 1000 == 0) {
                    System.out.print(".");
                }

            }
            data = new int[lineCount - header_lines][];
            samples = new int[lineCount - header_lines][][];
            dataIsIncluded = new BitSet(lineCount - header_lines);
            System.out.println();
            System.out.println("File Parsing completed - loading file");

            br.close();
            lineCount = 0;
        }
        catch (IOException ioe) {
            VarSifter.showError(ioe.toString());
            System.out.println(ioe);
            System.exit(1);
        }
        first = true;
        //TESTING System.out.println("Parse finished: " + (System.currentTimeMillis() - time));

        try {
            BufferedReader br = new BufferedReader(new FileReader(inFile));
            
            while ((line = br.readLine()) != null) {
                
                if (comment.matcher(line).find()) {
                    continue;
                }
                
                String temp[] = line.split("\t", 0);
                List<String> sampleTemp = new ArrayList<String>();
                List<String> sampleTempOrig = new ArrayList<String>();
                List<String> sampleTempValueName = new ArrayList<String>(); // the sample value types, in order
                List<String> dataTemp = new ArrayList<String>();
                List<Integer> affPos = new ArrayList<Integer>();
                List<Integer> normPos = new ArrayList<Integer>();
                List<Integer> casePos = new ArrayList<Integer>();
                List<Integer> controlPos = new ArrayList<Integer>();

                long startT = System.currentTimeMillis();
                long annotT;
                long sampleT;
                
                if (! loadAll) {
                    temp = maskLine(temp, sampleCount);
                }

                //Handle the Header
                if (first) {
                    
                    int dataCount = 0;
                    sampleCount = 0;
                    int sampleValueCount = 0;

                    for (int i=0; i < temp.length; i++) {
                        // Is column a sample?
                        if ((samPat.matcher(temp[i])).find()) {

                            /* TODO:DONE match by a "name" tag, determine samPos using "samFieldCount" instead of
                             * S_FIELDS/
                             */
                            if ((samLeadPat.matcher(temp[i])).find()) {  //Sample name, not score, cov, etc
                                int samPos = sampleTemp.size();
                                sampleTemp.add(temp[i]);
                                sampleValueCount = 0;

                                //start building the sampleValueName order only if this is the first sample
                                if (sampleTemp.size() == 1) {
                                    sampleTempValueName.add("Genotype");
                                }
                                
                                if ((samAff.matcher(temp[i])).find()) {
                                    affPos.add(samPos);
                                }
                                else if ((samNorm.matcher(temp[i])).find()) {
                                    normPos.add(samPos);
                                }

                                if ((casePat.matcher(temp[i])).find()) {
                                    casePos.add(samPos);
                                }
                                else if ((controlPat.matcher(temp[i])).find()) {
                                    controlPos.add(samPos);
                                }
                            }
                            else {  //other custom sample fields
                                sampleValueCount++;
                                Matcher samPostPatMat = samPostPat.matcher(temp[i]);
                                String samType = null;
                                if (samPostPatMat.find()) {
                                    samType = samPostPatMat.group(1);
                                }
                                
                                if (samType != null) {
                                    //add to the sampleValueName order only if this is the first sample
                                    if (sampleTemp.size() == 1) {
                                        sampleTempValueName.add(samType);
                                    }
                                    else {
                                        if (sampleValueCount >= sampleTempValueName.size() ||
                                                !(sampleTempValueName.get(sampleValueCount)).equals(samType)) {
                                            VarSifter.showError("Sample columns must be same type and order"
                                                + " across all samples!");
                                            System.exit(1);
                                        }
                                    }
                                    
                                }
                                else {
                                    VarSifter.showError("Cannot parse sample headers.");
                                    System.exit(1);
                                }
                            /* TODO:DONE
                             * else if {} //Get the postfix, use as a new "sample" data element
                             */
                             
                            }
                            sampleCount++;
                            sampleTempOrig.add(temp[i]);
                        }

                        //Is column an annotation?
                        else {
                            
                            //Handle legacy files: refseq -> Gene_name
                            if (temp[i].equals("refseq")) {
                                temp[i] = "Gene_name";
                            }
                            else if (temp[i].equals("RS#")) {
                                temp[i] = "dbID";
                            }

                            // May want to read a flag from header - then can make checkboxes from this.
                            if (dataTypeAt.containsKey(temp[i])) {
                                Matcher m = colSuffixPat.matcher(temp[i]);

                                if (m.find()) {
                                    char suffix = m.group(1).toCharArray()[0];
                                    if (suffix == 'z') {
                                        VarSifter.showError("<html>Multiple columns have the same name, "
                                            + "which is not allowed.<p>Program unable to create a unique name.<p>"
                                            + "Please rename column #" + (i+1) + ": \"" + temp[i] + "\"</html>");
                                            System.exit(1);
                                    }
                                    else {
                                        suffix++;
                                        temp[i] = temp[i].substring(0, temp[i].length() - 2);
                                        temp[i] = (temp[i] + "_" + suffix);
                                        if (dataTypeAt.containsKey(temp[i])) {
                                            VarSifter.showError("<html>Multiple columns have the same name, "
                                                + "which is not allowed.<p>Program unable to create a unique name.<p>"
                                                + "Please rename column #" + (i+1) + ": \"" + temp[i] + "\"</html>");
                                            System.exit(1);
                                        }
                                    }
                                }
                                else {
                                    temp[i] += "_a";
                                    if (dataTypeAt.containsKey(temp[i])) {
                                        VarSifter.showError("<html>Multiple columns have the same name, "
                                            + "which is not allowed.<p>Program unable to create a unique name.<p>"
                                            + "Please rename column #" + (i+1) + ": \"" + temp[i] + "\"</html>");
                                        System.exit(1);
                                    }
                                }


                                VarSifter.showError("<html>Multiple columns have the same name, which is not allowed."
                                    + "<p>To fix this, the column name has been appended with a unique identifier:<p>"
                                    + temp[i] + "</html>");
                            }
                            dataTemp.add(temp[i]);
                            dataTypeAt.put(temp[i], i);

                            if ((edPat.matcher(temp[i])).find()) {
                                dataIsEditable.set(i);
                            }

                            //For now, only "type" field can be MULTISTRING
                            if (temp[i].equals("type")) {
                                classList[i] = VarData.MULTISTRING;
                            }
                            
                            switch (classList[i]) {
                                case INTEGER:
                                    annotMapperBuilder.add(new IntMapper());
                                    break;
                                case FLOAT:
                                    annotMapperBuilder.add(new FloatMapper());
                                    break;
                                case STRING:
                                    annotMapperBuilder.add(new StringMapper());
                                    break;
                                case MULTISTRING:
                                    annotMapperBuilder.add(new MultiStringMapper(";"));
                                    break;
                            }

                            dataCount++;
                        }
                    }
                    
                    //Ensure required headers present
                    checkReqHeaders();

                    if (sampleCount == 0) {
                        noSamples = true;
                    }
                    else {

                        S_FIELDS = sampleTempValueName.size();
                        sampleMapper = new AbstractMapper[S_FIELDS];

                        //Genotype
                        sampleMapper[0] = new StringMapper();
                        
                        /* TODO:DONE make a general data loader for these fields, instead of hard coded parsing
                         * of score, coverage. Will likely have to make a map, so order is consistent even 
                         * if columns are out of order.
                         */
                        //Map other sample values
                        for (int j=1; j < S_FIELDS; j++) {
                            int thisClass = -1;
                            for (int k=dataCount+j; k < temp.length; k+=S_FIELDS) {
                                if (thisClass < 0) {
                                    thisClass = classList[k];
                                }                            
                                //else if (classList[k] != thisClass && thisClass <= FLOAT && classList[k] <= FLOAT) {
                                //    VarSifter.showMessage("<html>Sample value columns have different numeric "
                                //        + "types<p>Initial type: " + thisClass + " Other type: " + classList[k]
                                //        + "<p>Row: " + lineCount + " Col: " + (k+1) + "/" 
                                //        + sampleTempValueName.get(j)
                                //        + "<p>VarSifter will continue to load, but you may want to check your "
                                //        + "data file");
                                //    thisClass = Math.max(thisClass, classList[k]);
                                //}
                                else if (classList[k] != thisClass) {
                                    VarSifter.showError("<html>Sample value columns have different data types<p>"
                                        + "Initial type: " + thisClass + " Other type: " + classList[k] + "<p>"
                                        + "Row: " + lineCount + " Col: " + (k+1) + "/" 
                                        + sampleTempValueName.get(j));
                                    System.exit(1);
                                }

                                //Assume second, third entries are score, coverage
                                if (j == 1 && (thisClass != INTEGER && thisClass != FLOAT)) {
                                    VarSifter.showError("<html> It looks like you have a non-integer, non-floating point value"
                                        + "<p>in the genotype score column! Row: " + lineCount + " Col: " 
                                        + (k+1) + "/" + sampleTempValueName.get(j));
                                    System.out.println("Error: non-integer, non-floating point number in genotype score"
                                        + "column, exiting!");
                                    System.exit(1);
                                }
                                if (j == 2 && thisClass != INTEGER) {
                                    VarSifter.showError("<html>It looks like you have a non-integer value in the genotype " 
                                        + "<p>coverage column! Row: " 
                                        + lineCount + " Col: " + (k+1) + "/" + sampleTempValueName.get(j));
                                    System.out.println("Error: non-integer in genotype coverage column, exiting!");
                                    System.exit(1);
                                }
                            }
                            
                            //Assign class type - for now, no MULTISTRINGMAPPER
                            switch (thisClass) {
                                case INTEGER:
                                    sampleMapper[j] = new IntMapper();
                                    break;
                                case FLOAT:
                                    sampleMapper[j] = new FloatMapper();
                                    break;
                                case STRING:
                                    sampleMapper[j] = new StringMapper();
                                    break;
                            }

                        }
                    }
                    

                    ////Genotype Qual (Integer or Float?)
                    //boolean isFloat = false;
                    //for (int j=dataCount+1; j <= temp.length; j+=3) {
                    //    if (classList[j] == FLOAT) {
                    //        isFloat = true;
                    //    }
                    //    else if (classList[j] == INTEGER) {
                    //        //OK, do nothing
                    //    }
                    //    else {
                    //        VarSifter.showError("<html> It looks like you have a non-integer, non-floating point value"
                    //            + "<p>in the genotype score column! Row: " + lineCount + " Col: " + (j+1));
                    //        System.out.println("Error: non-integer, non-floating point number in genotype score"
                    //            + "column, exiting!");
                    //        System.exit(1);
                    //    }
                    //}
                    //sampleMapper[1] = (isFloat) ? new FloatMapper() : new IntMapper();

                    ////Genotype coverage
                    //for (int j=dataCount+2; j <= temp.length; j+=3) {
                    //    if (classList[j] != INTEGER) {
                    //        VarSifter.showError("<html>It looks like you have a non-integer value in the genotype " 
                    //            + "<p>coverage column! Row: " 
                    //            + lineCount + " Col: " + (j+1));
                    //        System.out.println("Error: non-integer in genotype coverage column, exiting!");
                    //        System.exit(1);
                    //    }
                    //}
                    //sampleMapper[2] = new IntMapper();

                    //TODO:DONE may not need to do this if no longer hard coded
                    if (noSamples) {
                        sampleNames = new String[] {"NA"};
                        sampleNamesOrig = new String[] {"NA","NA","NA"};
                        sampleMapper = new AbstractMapper[3];
                        sampleMapper[0] = new StringMapper();
                        sampleMapper[0].addData("NA");
                        sampleMapper[1] = new IntMapper();
                        sampleMapper[2] = new IntMapper();
                        sampleValueName = new String[] {"NA", "NA", "NA", "NA"};
                    }
                    else {
                        sampleNames = sampleTemp.toArray(new String[sampleTemp.size()]);
                        sampleNamesOrig = sampleTempOrig.toArray(new String[sampleTempOrig.size()]);
                        sampleValueName = sampleTempValueName.toArray(new String[sampleTempValueName.size()]);
                    }
                    
                    dataNames = dataTemp.toArray(new String[dataTemp.size()]);
                    dataNamesOrig = dataNames; //Will have to change this when not all data included
                    annotMapper = annotMapperBuilder.toArray(new AbstractMapper[annotMapperBuilder.size()]);
                    
                    if (affPos.size() > 0 && normPos.size() > 0) {
                        affAt = new int[affPos.size()];
                        normAt = new int[normPos.size()];
                        for (int i=0; i < affAt.length; i++) { //Only works with norm/aff pairs...
                            affAt[i] = affPos.get(i);
                            normAt[i] = normPos.get(i);
                        }
                    }

                    if (casePos.size() > 0) {
                        caseAt = new int[casePos.size()];
                        for (int i=0; i < caseAt.length; i++) {
                            caseAt[i] = casePos.get(i);
                        }
                    }
                    if (controlPos.size() > 0) {
                        controlAt = new int[controlPos.size()];
                        for (int i=0; i< controlAt.length; i++) {
                            controlAt[i] = controlPos.get(i);
                        }
                    }

                    //Handle map file if available
                    File f = new File(inFile + ".map");
                    if (f.exists()) {
                        String mapLine = "";
                        Map<String, String> mapHash = new HashMap<String, String>();
                        BufferedReader mapReader = new BufferedReader(new FileReader(f));
                        while ((mapLine = mapReader.readLine()) != null) {
                            if (mapLine.contains("=")) {
                                String[] mapTemp = mapLine.split("=" , 2);
                                mapHash.put(mapTemp[0], mapTemp[1]);
                            }
                        }
                        mapReader.close();

                        for (int i=0; i < sampleNames.length; i++) {
                            String newName;
                            if ((newName = mapHash.get(sampleNames[i])) != null) {
                                sampleNames[i] = newName;
                            }
                        }

                    }


                    first = false;
                    continue;
                }

                
                //Fill data array (annotations)
                data[lineCount] = new int[dataNames.length];
                for (int i=0; i < dataNames.length; i++) {
                    switch (classList[i]) {
                        case INTEGER:
                            data[lineCount][i] = Integer.parseInt(temp[i]);
                            break;
                        case FLOAT:
                            float f = Float.parseFloat(temp[i]);
                            data[lineCount][i] = annotMapper[i].addData(f); 
                            break;
                        case STRING:
                            data[lineCount][i] = annotMapper[i].addData(temp[i]);
                            break;
                        case MULTISTRING:
                            data[lineCount][i] = annotMapper[i].addData(temp[i]);
                            break;
                    }
                }

                annotT = (System.currentTimeMillis() - startT);

                
                //Fill samples array (genotypes)
                samples[lineCount] = new int[sampleNames.length][S_FIELDS];
                
                if (noSamples) {
                    //TODO:DONE may not have to handle this
                    samples[lineCount][0][0] = sampleMapper[0].getIndexOf("NA");
                    samples[lineCount][0][1] = -1;
                    samples[lineCount][0][2] = -1;
                }
                else {
                    //TODO:DONE load sample info, based on included fields (no longer hard coded)
                    for (int i = 0; i < sampleNames.length; i++) {
                        for (int j=0; j<S_FIELDS; j++) {
                            int dataIndex = dataNames.length + (i * S_FIELDS) + j;
                            switch(classList[dataIndex]) {
                                case INTEGER:
                                    samples[lineCount][i][j] = Integer.parseInt(temp[dataIndex]);
                                    break;
                                case FLOAT:
                                    float f = Float.parseFloat(temp[dataIndex]);
                                    samples[lineCount][i][j] = sampleMapper[j].addData(f);
                                    break;
                                case STRING:
                                    samples[lineCount][i][j] = sampleMapper[j].addData(temp[dataIndex]);
                                    break;
                            }
                        }
                    }
                }

                sampleT = (System.currentTimeMillis() - startT - annotT);
                
                lineCount++;
                if (lineCount % 1000 == 0) {
                    //TESTING System.out.println(lineCount + " Annot: " + annotT + " Sample: " + sampleT );
                    System.out.print(".");
                }
            }
            br.close();
            System.out.println();
        }
        catch (IOException ioe) {
            VarSifter.showError(ioe.toString());
            System.out.println(ioe);
            System.exit(1);
        }
        catch (Exception e) {
            VarSifter.showError("<html>Ooops - VarSifter encountered an unexpected error when loading your " 
                + "VS file.<p>Check the terminal output for full details:<p>" + e.toString());
            e.printStackTrace();
            System.exit(1);
        }
   
    }


    /**
    *   Checks for required headers.  If any are missing, warn, close program.
    */
    protected void checkReqHeaders() {
        for (String s : requiredHeaders) {
            if ( !dataTypeAt.containsKey(s) ) {
                VarSifter.showError("Required column \"" + s + "\" missing! Please review the VS file format "
                    + "in the User Guide. Exiting.");
                System.exit(1);
            }
        }
    }


    /** 
    *   Returns 2d array of all data
    *  
    *   @return A two-dimension array of the data (1st Dimension is row, 2nd is column)
    */
    public int[][] dataDump() {
        boolean first = true;
        int[][] out = new int[data.length][dataNamesOrig.length + sampleNamesOrig.length];
        
        for (int i=0; i < data.length; i++) {
            System.arraycopy(data[i], 0, out[i], 0, dataNamesOrig.length);
            for (int j=0; j < sampleNames.length; j++) {
                System.arraycopy(samples[i][j], 0, out[i], (dataNamesOrig.length + (j * S_FIELDS)), S_FIELDS);
            }
        }
        return out;
    }
    
    /**
    *   Filter the data
    *
    *   @param df DataFilter object with the filtering options
    */
    /* 
    *   Filter mutation type
    *
    *   To add new filters, must do the following:
    *   -Add new JCheckBox
    *   -Add entry to JCheckBox[] VarSifter.cBox
    *   -Display new JCheckBox
    *   -change this.mask indices (so that correct bit is being read - same order as cbox)
    *   -change this.filterSet indices (so that correct bitset is being used)
    *   -increment TOTAL_FILTERS
    *   -add a test block with correct this.mask index
    *  
    */
    public void filterData(DataFilter df) {
        BitSet[] mask = df.getMask();
        String geneFile = df.getGeneFile();
        String bedFile = df.getBedFile();
        int[] spinnerData = df.getSpinnerData();
        String geneQuery = df.getGeneQuery();
        int minMPG = df.getMinMPG();
        float minMPGCovRatio = df.getMinMPGCovRatio();
        genScoreThresh = df.getGenScoreThresh();
        String geneDelim = df.getGeneDelim();

        dataIsIncluded.set(0,data.length);
        final int TOTAL_FILTERS = 11 + 1; //Number of non-type filters plus 1 (all type filters)
        BitSet[] filterSet = new BitSet[TOTAL_FILTERS];
        BitSet geneFilter = new BitSet(data.length);
        geneFilter.set(0, data.length);
        BitSet qualFilter = new BitSet(data.length);
        qualFilter.set(0, data.length);
        Pattern geneQueryPat = null;
        
        int typeIndex = dataTypeAt.get("type");
        int refAlleleIndex = dataTypeAt.get("ref_allele");
        int varAlleleIndex = dataTypeAt.get("var_allele");
        int dbSNPIndex = (dataTypeAt.containsKey("dbID")) ? dataTypeAt.get("dbID") : -1;
        int mendRecIndex = (dataTypeAt.containsKey("MendHomRec")) ? dataTypeAt.get("MendHomRec") : -1;
        int mendHetRecIndex = (dataTypeAt.containsKey("MendHetRec")) ? dataTypeAt.get("MendHetRec") : -1;
        int mendDomIndex = (dataTypeAt.containsKey("MendDom")) ? dataTypeAt.get("MendDom") : -1;
        int mendBadIndex = (dataTypeAt.containsKey("MendInconsis")) ? dataTypeAt.get("MendInconsis") : -1;
        int geneIndex = dataTypeAt.get("Gene_name");
        int chrIndex = dataTypeAt.get("Chr");
        int lfIndex = dataTypeAt.get("LeftFlank");
        int notMendHetRec = -1;
        Set<String> geneSet = new HashSet<String>();
        Map[] bedHash = null;   //<String, List<Integer>>

        //Set up type filters (filterSet[0], as all types are folded into one filter)
        filterSet[0] = new BitSet(data.length + 1);
        for (int i=0; i < mask[0].size(); i++) {
            if (mask[0].get(i)) {
                filterSet[0].set(data.length + 1);
                break;
            }
        }
        
        //Set up remaining filters (filterSet[x>0])
        for (int i=1; i < TOTAL_FILTERS; i++) {
            filterSet[i] = new BitSet(data.length + 1);
            if (mask[1].get(i - 1)) {  //must have -1 since mask is 0-based
                filterSet[i].set(data.length + 1);
            }
        }
        
        //Prepare certain tests

        // Type filters
        String[] typeNames = annotMapper[typeIndex].getSortedEntries();
        int[] types = new int[typeNames.length];
        for (int i=0; i < typeNames.length; i++) {
            types[i] = annotMapper[typeIndex].getIndexOf(typeNames[i]);
        }

        //dbSNP
        int nodbSNP = -1;
        if (dbSNPIndex > -1) {
            nodbSNP = annotMapper[dbSNPIndex].getIndexOf("-");
        }

        //menHetRec
        if (mask[1].get(VarSifter.MENDHETREC)) {
            notMendHetRec = annotMapper[mendHetRecIndex].getIndexOf("0,");
        }

        //aff/norm
        int naInt = sampleMapper[0].getIndexOf("NA");


        //filterFile
        if (mask[1].get(7) || mask[1].get(8)) {
            if (geneFile != null) {
                geneSet = returnGeneSet(geneFile);
            }
            else {
                VarSifter.showError("!!! geneFile not defined, so can't use it to filter !!!");
                System.out.println("!!! geneFile not defined, so can't use it to filter !!!");
            }
        }

        //bedFilterFile
        if (mask[1].get(9)) {
            if (bedFile != null) {
                bedHash = returnBedHash(bedFile);
            }
            else {
                VarSifter.showError("!!! bedFile not defined, so nothing to filter with !!!");
                System.out.println("!!! bedFile not defined, so nothing to filter with !!!");
            }
        }

        //Gene name filter
        if (geneQuery != null) {
            geneQueryPat = Pattern.compile(geneQuery, Pattern.CASE_INSENSITIVE);
            geneFilter.clear();
        }
        

        //Start filtering!
        
        for (int i = 0; i < data.length; i++) {
            String[] tempGeno = { annotMapper[refAlleleIndex].getString(data[i][refAlleleIndex]), 
                                  annotMapper[varAlleleIndex].getString(data[i][varAlleleIndex]) 
                                };
            String homNonRefGen = (tempGeno[1] + tempGeno[1]);
            java.util.Arrays.sort(tempGeno);
            String hetNonRefGen = "";
            for (String s : tempGeno) {
                hetNonRefGen += s;
            }
           
           
            // variant type
            for (int j=0; j < types.length; j++) {
                if (mask[0].get(j) && (data[i][typeIndex] & (int)Math.pow(2, types[j])) > 0 ) {

                    filterSet[0].set(i);
                    break;
                }
            }

            //dbSNP
            if ( dbSNPIndex > -1 
                && mask[1].get(0) 
                && ( annotMapper[dbSNPIndex].getString(data[i][dbSNPIndex]).matches("^0|-$") )
                ) {
                
                filterSet[1].set(i);
            }
            
            //Mendelian recessive (Hom recessive)
            if (mask[1].get(1) && data[i][mendRecIndex] == 1) {
                filterSet[2].set(i);
            }
            
            //Mendelian Dominant
            if (mask[1].get(2) && data[i][mendDomIndex] == 1) {
                filterSet[3].set(i);
            }

            //Mendelian Inconsistant
            if (mask[1].get(3) && data[i][mendBadIndex] == 1) {
                filterSet[4].set(i);
            }
                
            //Mendelian Compound Het (Het Recessive)
            if (mask[1].get(VarSifter.MENDHETREC) && data[i][mendHetRecIndex] != notMendHetRec) {
                filterSet[5].set(i);
            }

            //TODO: may need to adjust sample filtering
            //Affected different from Normal
            if (mask[1].get(5)) {
                int count = 0;
                for (int j=0; j < affAt.length; j++) {
                    int affTemp = samples[i][affAt[j]][0];
                    int normTemp = samples[i][normAt[j]][0];
                    if (affTemp != normTemp &&
                        affTemp != naInt &&
                        normTemp != naInt &&
                        ((sampleMapper[1].getDataType() == INTEGER && 
                          samples[i][affAt[j]][1] >= genScoreThresh &&
                          samples[i][normAt[j]][1] >= genScoreThresh) ||
                         (sampleMapper[1].getDataType() == FLOAT &&
                          sampleMapper[1].getFloat(samples[i][affAt[j]][1]) >= genScoreThresh &&
                          sampleMapper[1].getFloat(samples[i][normAt[j]][1]) >= genScoreThresh)
                        )
                       ) {

                        count++;
                    }
                }
                if (count >= spinnerData[AFF_NORM_PAIR]) {
                    filterSet[6].set(i);
                }
            }

            //TODO: may need to adjust sample filtering
            // Variant allele in >=x cases, <=y controls
            if (mask[1].get(6)) {
                int caseCount = 0;
                int controlCount = 0;
                for (int j=0; j < caseAt.length; j++) {
                    String caseTemp = sampleMapper[0].getString(samples[i][caseAt[j]][0]).replaceAll(":", "");
                    if ( (caseTemp.equals(hetNonRefGen) || caseTemp.equals(homNonRefGen)) &&
                        ((sampleMapper[1].getDataType() == INTEGER &&
                          samples[i][caseAt[j]][1] >= genScoreThresh) ||
                         (sampleMapper[1].getDataType() == FLOAT &&
                          sampleMapper[1].getFloat(samples[i][caseAt[j]][1]) >= genScoreThresh)
                        )
                       ) {

                        caseCount++;
                    }
                }
                for (int j=0; j < controlAt.length; j++) {
                    String controlTemp = sampleMapper[0].getString(samples[i][controlAt[j]][0]).replaceAll(":","");
                    if ( (controlTemp.equals(hetNonRefGen) || controlTemp.equals(homNonRefGen)) &&
                        ((sampleMapper[1].getDataType() == INTEGER &&
                          samples[i][controlAt[j]][1] >= genScoreThresh) ||
                         (sampleMapper[1].getDataType() == FLOAT &&
                          sampleMapper[1].getFloat(samples[i][controlAt[j]][1]) >= genScoreThresh)
                        )
                       ) {
                        controlCount++;
                    }
                }
                if (caseCount >= spinnerData[CASE] && controlCount <= spinnerData[CONTROL]) {
                    filterSet[7].set(i);
                }
            }

            //Gene Filter File (include, exclude)                        
            if (mask[1].get(7)) {
                String[] dataGenes = 
                    annotMapper[geneIndex].getString(data[i][geneIndex]).toLowerCase().split(geneDelim);
                
                for (String dG : dataGenes) {
                    if (geneSet.contains(dG)) {
                        filterSet[8].set(i);
                        break;
                    }
                }
            }
            
            //Gene Filter File (exclude)
            if (mask[1].get(8)) {
                String[] dataGenes = 
                    annotMapper[geneIndex].getString(data[i][geneIndex]).toLowerCase().split(geneDelim);
                boolean foundDG = false;
                for (String dG : dataGenes) {
                    if (geneSet.contains(dG)) {
                        foundDG = true;
                        break;
                    }
                }
                if (!foundDG) {
                    filterSet[9].set(i);
                }
            }
            
            //Bed Filter File (include)
            if (mask[1].get(9)) {
                String chrString = annotMapper[chrIndex].getString(data[i][chrIndex]);
                if (bedHash[0].get(chrString) != null) {

                    @SuppressWarnings("unchecked")
                    Object[] starts = ((Map<String, List<Integer>>)bedHash[0]).get(chrString).toArray();
                    @SuppressWarnings("unchecked")
                    Object[] ends = ((Map<String, List<Integer>>)bedHash[1]).get(chrString).toArray();
                    int pos = data[i][lfIndex] + 1;

                    for (int j=0; j<starts.length;j++) {
                        if (pos < (Integer)starts[j]) {
                            continue;
                        }

                        if (pos <= (Integer)ends[j]) {
                            filterSet[10].set(i);
                            break;
                        }
                    }
                }
            }

            // Gene name Filter (TextArea)
            if (geneQuery != null) {
                if ((geneQueryPat.matcher(annotMapper[geneIndex].getString(data[i][geneIndex]))).find()) {
                    geneFilter.set(i);
                }
            }

            //TODO: may need to adjust sample filtering
            // Qual filters
            if (minMPG != 0 || minMPGCovRatio != 0) {
                int minMPGCount = 0;
                int minMPGCovCount = 0;
                for (int j=0; j < sampleNames.length; j++) {
                    if ((sampleMapper[1].getDataType() == INTEGER &&
                          samples[i][j][1] >= minMPG) ||
                        (sampleMapper[1].getDataType() == FLOAT &&
                          sampleMapper[1].getFloat(samples[i][j][1]) >=minMPG)
                       ) {
                        minMPGCount++;
                    }
                    if ( samples[i][j][2] != 0 &&
                         ((sampleMapper[1].getDataType() == INTEGER &&
                           ((float)samples[i][j][1] / (float)samples[i][j][2]) >= minMPGCovRatio) ||
                          (sampleMapper[1].getDataType() == FLOAT &&
                           (sampleMapper[1].getFloat(samples[i][j][1]) / samples[i][j][2]) >= minMPGCovRatio)
                         )
                       ) {
                        minMPGCovCount++;
                    }
                }
                if (minMPGCount < spinnerData[MIN_MPG] || minMPGCovCount < spinnerData[MIN_MPG_COV]) {
                    qualFilter.clear(i);
                }
            }
                
                        
        }

        //TODO: may need to adjust sample filtering
        //Custom Query - outside data loop (it will loop by itself

        if (mask[1].get(10)) {
            try {
                CompileCustomQuery c = new CompileCustomQuery();
                if ( c.compileCustom(customQuery) ) {
                    filterSet[11] = c.run(this);
                    filterSet[11].set(data.length + 1);
                }
                else {
                    VarSifter.showError("Error with custom query - not applied!!");
                }
            }
            catch (NoClassDefFoundError e) {
                VarSifter.showError("<html>Couldn't find a class needed for custom querying - most likely you are"
                    + "<p>not running Java JDK 1.6 or greater.  See console for more details.");
                System.out.println(e.toString());
            }
        }

        
        //Apply all filters; intersection if that filter was used
        for (BitSet fs : filterSet) {
            if (fs.get(data.length + 1)) {
                dataIsIncluded.and(fs);
            }
        }

        dataIsIncluded.and(geneFilter);
        dataIsIncluded.and(qualFilter);
        filterOutput();
    }

    /** 
    *   Handle the filtering
    *  
    */
    protected void filterOutput() {
         
        if (dataIsIncluded.cardinality() == data.length) {
            outData = data;
            outSamples = samples;
        }
        else {
            outData = new int[dataIsIncluded.cardinality()][];
            outSamples = new int[dataIsIncluded.cardinality()][][];
            int j = 0;
            for (int i=0; i < data.length; i++) {
                if (dataIsIncluded.get(i)) {
                    outData[j] = data[i];
                    outSamples[j] = samples[i];
                    j++;
                }
            }
        }
    }


    /** 
    *   Returns true if column is editable
    *  
    *   @param col The number of the column to determine editable status
    *   @return True if editable
    */
    public boolean isEditable(int col) {
        return dataIsEditable.get(col);
    }


    /** 
    *   Returns number of requested sample type
    *  
    *   @param sType AFF_NORM_PAIR, CASE, or CONTROL
    *   @return The number of samples of the requested type
    */
    public int countSampleType(int sType) {
        switch (sType) {
            case AFF_NORM_PAIR: 
                return (affAt != null && normAt != null) ? affAt.length : 0;
            case CASE: 
                return (caseAt != null) ? caseAt.length : 0;
            case CONTROL: 
                return (controlAt != null) ? controlAt.length : 0;
            case MIN_MPG:
            case MIN_MPG_COV:
                return (sampleNames != null) ? sampleNames.length : 0;
            default:
                return 0;
        }
    }


    /** 
    *   Remove all filtering
    *  
    */
    public void resetOutput() {
        dataIsIncluded.set(0, data.length);
        filterOutput();
    }


    /**
    *   Remove elements of array not matching mask
    *
    *   @param inLine The araays of strings to mask
    *   @param unmaskedCount The number of elements (from right side) excluded from mask - usually sample cols
    *   @return The newly masked string array
    */
    protected String[] maskLine(String[] inLine, int unmaskedCount) {
        int colTot = colMask.cardinality();
        String[] outArray = new String[colTot + unmaskedCount];
        int annotCount = inLine.length - unmaskedCount;
        int colCount = 0;
        for (int i=0; i<annotCount; i++) {
            if (colMask.get(i)) {
                outArray[colCount] = inLine[i];
                colCount++;
            }
        }
        System.arraycopy(inLine, annotCount, outArray, colCount, unmaskedCount);

        return outArray;
    }
    

    /** 
    *   Return a hash of Lists containing start positions in a bedfile
    *  
    *   @param inFile The Bedfile to load
    *   @return An array of hashmaps.  Element 0: key=chrom, value = vector of starts.
    *                                  Element 1: key=chrom, value = vector of ends.
    */
    protected Map[] returnBedHash(String inFile) {
        Map<String, List<Integer>> outStart = new HashMap<String, List<Integer>>();
        Map<String, List<Integer>> outEnd   = new HashMap<String, List<Integer>>();
        Map[] outHash = new Map[2];

        try {
            String line = "";
            Pattern chr = Pattern.compile("^chr");
            BufferedReader br = new BufferedReader(new FileReader(inFile));
            while ((line = br.readLine()) != null) {
                if ( (chr.matcher(line)).find() ) {
                    String[] lineArray = line.split("\\s+");
                    if (outStart.get(lineArray[0]) == null) {
                        outStart.put(lineArray[0], new ArrayList<Integer>());
                        outEnd.put(lineArray[0], new ArrayList<Integer>());
                    }

                    outStart.get(lineArray[0]).add(Integer.valueOf(lineArray[1]) + 1);
                    outEnd.get(lineArray[0]).add(Integer.valueOf(lineArray[2]));
                }
            }
            br.close();
        }
        catch (IOException ioe) {
            System.out.println(ioe);
            System.exit(1);
        }

        outHash[0] = outStart;
        outHash[1] = outEnd;
        return outHash;
    }


    /**
    *   Return array of comment lines
    *   @return array of comment lines, one row per line
    */
    public String[] returnCommentList() {
        return commentList.toArray(new String[commentList.size()]);
    }


    /**
    *   Return array of indices used in CompHetView
    *
    *   @return Array of annotation indices used in CompHetView
    */
    public int[] returnCompHetFields() {
        return compHetFields;
    }
        

    /** 
    *   Return all annotation data
    *  
    *   @return Returns all the annotaion data as a 2d array: [line][annotation column]
    */
    public int[][] returnData() {
        return data;
    }

    /**
    *   Return filtered annotation data
    *
    *   @return Returns on the annotation data for variants (rows) passing filter. [line][annotation column]
    */
    public int[][] returnOutData() {
        return outData;
    }


    /**
    *   Return all sample data
    *
    *   @return Returns all sample data as a 3d array: [line][sample][gen:score:cov]
    */
    public int[][][] returnSamples() {
        return samples;
    }

    /**
    *   Return filtered sample data
    *
    *   @return Returns filtered sample data as 3d array: [line][sample][gen:score:cov]
    */
    public int[][][] returnOutSamples() {
        return outSamples;
    }


    /**
    *   Return the Annotation Column Names
    *   @return Returns an array of the column header names
    */
    public String[] returnDataNames() {
        return dataNames;
    }


    /**
    *   Return annotation lookup map
    *   @return array of lookup maps (in order of annotation column)
    */
    public AbstractMapper[] returnAnnotMap() {
        return annotMapper;
    }


    /**
    *   Return sample lookup map
    *   @return array of sample lookup maps (entry for each unique sample field)
    */
    public AbstractMapper[] returnSampleMap() {
        return sampleMapper;
    }


    /** 
    *   Return a clone of dataTypeAt
    *  
    *   @return A Map: key = Annotation column name  value: column number (0-based)
    */
    public Map<String, Integer> returnDataTypeAt() {
        return new HashMap<String, Integer>(dataTypeAt);
    }


    /** 
    *   Return the data value at a given row, column 
    *  
    *   @param row The row with the desired data (in the VarData object - not necessarily in the VarSifter view)
    *   @param colType The header name of the column with the desired data
    *   @return The data at the given position, or null if colType doesn't exist
    */
    public String returnDataValueAt(int row, String colType) {
        if (dataTypeAt.containsKey(colType)) {
            int index = dataTypeAt.get(colType);
            return annotMapper[index].getString(outData[row][index]);
        }
        else {
            return null;
        }
    }

    
    /** 
    *   Return the parent VarData or null if this is not a copy
    *  
    *   @return The parent VarData object of this object, or null if none
    */
    public VarData returnParent() {
        return parentVarData;
    }


    /** 
    *   Return data collapsed on gene name
    *  
    *   @return Data in a 2d array: [gene name key][number of variants]
    */
    public int[][] returnGeneData() {
        int[][] tempGeneData;
        Map<Integer, Integer> tempGeneHash = new HashMap<Integer, Integer>();

        for (int i=0; i<outData.length; i++) {
            int geneName = outData[i][dataTypeAt.get("Gene_name")];
            if (tempGeneHash.containsKey(geneName)) {
                tempGeneHash.put(geneName, tempGeneHash.get(geneName) + 1);
            }
            else {
                tempGeneHash.put(geneName, 1);
            }
        }

        tempGeneData = new int[tempGeneHash.size()][geneDataHeaders.length];
        int i = 0;
        for (Integer j : tempGeneHash.keySet()) {
            tempGeneData[i][0] = j.intValue();
            tempGeneData[i][1] = tempGeneHash.get(j).intValue();
            i++;
        }
        return tempGeneData;
    }

    /**
    *   Return column name for the Gene view
    *   @return Array of column names
    */
    public String[] returnGeneNames() {
        return geneDataHeaders;
    }




    /** 
    *   Get a Set from a file of gene names
    *  
    *   @param inFile The gene file to read (one gene per line)
    *   @return A Hashset of the gene names
    */
    protected Set<String> returnGeneSet(String inFile) {
        Set<String> outSet = new HashSet<String>();
        try {
            String line = "";
            BufferedReader br = new BufferedReader(new FileReader(inFile));
            while ((line = br.readLine()) != null) {
                outSet.add(line.toLowerCase());
            }
            br.close();
        }
        catch (IOException ioe) {
            System.out.println(ioe);
            System.exit(1);
        }
        return outSet;
    }

    
    /** 
    *   Return pairs of positions based on index
    *  
    *   @param inPair An array of indices where the others are in CompoundHet status with the first
    *   @param isSamples True if sample data is to be inlcluded in view
    *   @return A 2-d array with the data [pair][columns]
    */
    public int[][] returnIndexPairs(String[] inPair, boolean isSamples) {
        
        Set<Integer> inSet = new HashSet<Integer>();    //List of indices from MendHetRec columns
        int[][] out;
        int[][] eachPair;
        int indexIndex = (dataTypeAt.containsKey("Index")) ? dataTypeAt.get("Index") : -1;
        int pairCount = 0;
        int firstInPair; 
        List<Integer> rowSet = new ArrayList<Integer>();   //List of outData row indices.

        try {
            firstInPair = Integer.parseInt(inPair[0]);
            for (int i=0; i<inPair.length; i++) {
                inSet.add(new Integer(inPair[i]));
            }
        }
        catch (NumberFormatException nfe) {
            VarSifter.showError("MendHetRec field formatting is incorrect - must be comma-separated integers!");
            return null;
        }

        // store matches in outData (not necessarily inPair.length!)
        // ensure inPair[0] is in rowSet[0]!! Later, will return 0 1, 0 2, 0 3 etc pairs
        for (int i=0; i<outData.length; i++) {
            if (inSet.contains(outData[i][indexIndex])) {
                if (outData[i][indexIndex] == firstInPair) {
                    rowSet.add(0, i);   //Add rowIndex for line with mendhetrec Index inPair[0]
                }
                else {
                    rowSet.add(i);
                }
            }
        }

        eachPair = new int[rowSet.size()][];
        out = new int[rowSet.size() - 1][];

        for (Integer it: rowSet) {
            int i = it.intValue();
           
            if (isSamples) {
                eachPair[pairCount] = new int[compHetFields.length + (sampleNames.length * S_FIELDS)];
                for (int j = compHetFields.length; j < eachPair[pairCount].length; j+=S_FIELDS) {
                    int sampleIndex = (j - compHetFields.length) / S_FIELDS;
                    eachPair[pairCount][j]   = outSamples[i][sampleIndex][0];
                    eachPair[pairCount][j+1] = outSamples[i][sampleIndex][1];
                    eachPair[pairCount][j+2] = outSamples[i][sampleIndex][2];
                }
            }
            else {
                eachPair[pairCount] = new int[5];
            }
            for (int j=0; j<compHetFields.length; j++) {
                if (compHetFields[j] == -1) { //Field isn't in file
                    eachPair[pairCount][j] = -1; //Hold place, show "-" in CompHetTableModel
                }
                else {
                    eachPair[pairCount][j] = outData[i][compHetFields[j]];
                }
            }

            pairCount++;
        }

        for (int i=0; i<out.length; i++) {
            out[i] = new int[ ((eachPair[0].length * 2) - 2) ];
            System.arraycopy( eachPair[0], 0, out[i], 0, eachPair[0].length );
            System.arraycopy( eachPair[i+1], 2, out[i], eachPair[0].length, (eachPair[i+1].length - 2) );
        }
        return out;

    }



    /** 
    *   Return samples
    *  
    *   @param i The row (in this VarData object) for which to display sample information
    *   @return A 2-d array of sample data indices [sample_index][data type(SampleName, Genotype, MPG score, Coverage)]
    */
    public int[][] returnSample(int i) {
        int[][] tempOutSamples;
        
        if (outSamples.length == 0) {
            tempOutSamples = new int[0][];
        }
        else {
            //TODO:DONE:OK since I'm using S_FIELDS: adjust output of data - no hard coding of fields
            tempOutSamples = new int[sampleNames.length][S_FIELDS+1];
            for (int j = 0; j < sampleNames.length; j++) {
                for (int k = 0; k < S_FIELDS; k++) {
                    tempOutSamples[j][k+1] = outSamples[i][j][k];
                }
                tempOutSamples[j][0] = j;
            }
        }
        return tempOutSamples;
    }
    
    /**
    *   Return Samples Names
    *   @return Array of Sample Names
    */
    public String[] returnSampleNames() {
        return sampleNames;
    }

    
    /**
    *   Return Original sample column headers (not just names, but what was in the original file)
    *   @return An array of original sample column headers
    */
    public String[] returnSampleNamesOrig() {
        return sampleNamesOrig;
    }

    /**
    *   Return sample value names (the names of the column headers, including "genotype" for first column)
    *   @return An array of sample value names (labels after .NA.)
    */
    public String[] returnSampleValueNames() {
        return sampleValueName;
    }


    /** 
    *   Returns a new Object with a subset of the data
    *  
    *   @param vdatIn The VarData object to use as a basis for a Sub VarData object
    *   @param isInSubset BitSet where set bits determine which rows to include
    *   @return A child VarData object (usually with a subset of data) that knows its parent
    */
    public VarData returnSubVarData(VarData vdatIn, BitSet isInSubset) {
        if (isInSubset == null) {
            isInSubset = dataIsIncluded;
        }
        int[][] subsetData = new int[isInSubset.cardinality()][data[0].length];
        int[][][] subsetSamples = new int[subsetData.length][][];
        int lastPos = 0;
        for (int i=0; i < data.length; i++) {
            if (isInSubset.get(i)) {
                System.arraycopy(data[i], 0, subsetData[lastPos], 0, data[i].length);
                subsetSamples[lastPos] = samples[i];
                lastPos++;
            }
        }
        return new VarData(subsetData,
                           dataNamesOrig,
                           dataNames,
                           subsetSamples,
                           sampleNamesOrig,
                           sampleNames,
                           sampleValueName,
                           dataIsEditable,
                           dataTypeAt,
                           affAt,
                           normAt,
                           caseAt,
                           controlAt,
                           vdatIn,
                           annotMapper,
                           sampleMapper,
                           commentList
                           );
    }

    /**
    *   Set the array of BitSets to use for a custom query
    *
    *   @param inBS An array of BitSets to be used in custom querying
    */
    public void setCustomBitSet(BitSet[] inBS) {
        if (inBS == null) {
            bitSets = new BitSet[0];
        }
        else {
            bitSets = inBS;
        }
    }


    /**
    *   returns the array of BitSets to use for a custom query
    *
    *   @return An array of BitSets to interrogate for custom querying
    */
    public BitSet[] getCustomBitSet() {
        return bitSets;
    }


    /** 
    *   Set the customized part of a custom query
    *  
    *   @param in The logical statement to use as a filter in the custom compiled QueryModule object
    */
    public void setCustomQuery(String in) {
        customQuery = in;
    }


    /** 
    *   Overwrite field in data[][] (comments for now)
    *  
    *   @param row The row (in VarData.data rows that are set in dataIsIncluded) to modify
    *   @param col The column (in VarData.data) to modify
    *   @param newData The data to supplant to old data
    */
    public void setData(int row, int col, String newData) {
        int lastIndex = 0;
        for (int i = 0; i <= row; i++) {
            lastIndex = ( dataIsIncluded.nextSetBit(lastIndex) + 1 );
        }
        data[lastIndex - 1][col] = annotMapper[col].addData(newData);
    }


    public static void main(String args[]) {
        /*
        String input = "test_data.txt";
        if (args.length > 0) {
            input = args[0];
        }
        VarData vdat = new VarData(input);
        String[][] outData = vdat.returnOutData();
        String[] outNames = vdat.returnDataNames();
        
        for (String title : outNames) {
            System.out.print( title + "\t");
        }
        System.out.println();

        for (int i = 0; i < outData.length; i++) {
            System.out.print((i+1) + "\t");
            for (int j = 0; j < outData[i].length; j++) {
                if (outData[i][j].equals("")) {
                    System.out.print( "Err" + "\t");
                }
                else {
                    System.out.print(outData[i][j] + "\t");
                }
            }
            System.out.println();
        }
        */

        //Test - unique and not NA between first 2 samples
        //for (int i = 0; i < vdat.samples.length; i++) {
        //    if (!vdat.samples[i][0][0].equals(vdat.samples[i][1][0]) && (!vdat.samples[i][0][0].equals("NA") 
        //        && !vdat.samples[i][1][0].equals("NA"))) {

        //        StringBuilder out = new StringBuilder();
        //        for (String s : vdat.data[i]) {
        //            out.append(s + "\t");
        //        }
        //        for (String[] s : vdat.samples[i]) {
        //            out.append((String)s[0] + "\t");
        //        }
        //        out.delete((out.length()-1), (out.length()));
        //        System.out.println(out.toString());
        //    }
        //}

    }
}
back to top