https://github.com/teerjk/VarSifter
Tip revision: 7e57e5857b08f5253f28e96477fc211f67a0ffea authored by Jamie K. Teer on 27 April 2020, 14:42:41 UTC
-Documentation updates to point to github.
-Documentation updates to point to github.
Tip revision: 7e57e58
VCFVarData.java
import java.io.*;
import java.net.URL;
import javax.swing.*;
import java.text.NumberFormat;
import java.util.regex.*;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;
import java.util.ArrayList;
import java.util.List;
import java.util.Collections;
/**
* A VarData subclass for loading VCF files
* @author Jamie K. Teer
*/
public class VCFVarData extends VarData {
/** Parsed ##INFO header lines, keyed by ID; each value maps a tag name (e.g. "Description") to its text. */
private Map<String, Map<String, String>> infoMetaVCF = new HashMap<String, Map<String, String>>();
/** Parsed ##FORMAT header lines, keyed by ID; same layout as the INFO metadata map. */
private Map<String, Map<String, String>> formatMetaVCF = new HashMap<String, Map<String, String>>();
/** Custom annotation format picked by the user while loading; stays null when none is picked. */
private CustomAnnotation ca = null;
/** Read-only lookup from the "Type" string of a VCF header line to the column type constant. */
private static final Map<String, Integer> VCFTypeMap;
static {
    // Parallel arrays: vcfTypeNames[i] is stored with columnTypes[i].
    // "Flag" is held as an integer column and "Character" as a string column.
    final String[] vcfTypeNames = { "Integer", "Float", "Flag", "Character", "String" };
    final int[] columnTypes = { INTEGER, FLOAT, INTEGER, STRING, STRING };
    Map<String, Integer> lookup = new HashMap<String, Integer>();
    for (int idx = 0; idx < vcfTypeNames.length; idx++) {
        lookup.put(vcfTypeNames[idx], columnTypes[idx]);
    }
    VCFTypeMap = Collections.unmodifiableMap(lookup);
}
/**
* Interpret VCF file - load VarData data structures.
* <p>
* Only the first line of the file is inspected here: when it matches the <code>vcf</code>
* pattern the file is handed to <code>loadVCFFile</code>, otherwise an error dialog is shown.
* An empty file (no first line) is reported the same way as a file with an unexpected header.
* An IOException is reported and then terminates the program with exit status 1.
* <code>resetOutput()</code> is called in every case that does not exit.
*
* @param inFile Absolute path of VCF file to load
*/
public VCFVarData(String inFile) {
    dataFile = inFile;
    try {
        BufferedReader br = new BufferedReader(new FileReader(inFile));
        String line;
        try {
            line = br.readLine();
        }
        finally {
            // Release the file handle even when reading the first line fails.
            br.close();
        }
        // readLine() returns null for an empty file; treat that as "not a VCF"
        // instead of letting the matcher throw a NullPointerException.
        if (line != null && vcf.matcher(line).find()) {
            loadVCFFile(inFile);
        }
        else {
            VarSifter.showError("VCF file doesn't look like a VCF - first header line not as expected.");
        }
    }
    catch (IOException ioe) {
        VarSifter.showError(ioe.toString());
        System.out.println(ioe);
        System.exit(1);
    }
    resetOutput();
}
/**
* Parses a VCF file to fill in data structures.
* <p>
* The file is read twice. The first pass parses the ##INFO and ##FORMAT header lines,
* and on the #CHROM line asks the user (through dialogs) which custom annotation format,
* gene-name/type columns and INFO columns to use, then sets up the column names, column
* classes, mappers and sample names. It also counts the data rows, one row per ALT allele,
* to size the first dimension of data[][] and samples[][][].
* The second pass reads the data lines again and fills in those arrays.
* <p>
* Any detected format problem, IOException or unexpected exception is reported and then
* terminates the program through System.exit(1).
*
* @param inFile Absolute path to VCF file name
*/
private void loadVCFFile(String inFile) {
final Pattern info_pat = Pattern.compile("^##INFO");
final Pattern format_pat = Pattern.compile("^##FORMAT");
final Pattern head_pat = Pattern.compile("^#CHROM");
// NOTE(review): each group matches exactly one digit and the pattern is applied with find(),
// so a genotype such as "10/11" would be read as "0/1" - confirm whether >9 ALT alleles can occur.
final Pattern genoSep_pat = Pattern.compile("([0-9])[/\\|]([0-9])");
// Names of the columns that are always present, in column order.
final String[] fixedNames = { "Chr",
"LeftFlank",
"RightFlank",
"Gene_name",
"type",
"muttype",
"dbID",
"ref_allele",
"var_allele",
"QUAL",
"FILTER"
};
// Column class for each entry of fixedNames (same order).
final int[] fixedClassList = { STRING,
INTEGER,
INTEGER,
STRING,
MULTISTRING,
STRING,
STRING,
STRING,
STRING,
FLOAT,
STRING
};
// Per-sample values that always get a slot, even when the header has no FORMAT line for them.
final String[] fixedSampleValueNames = { "GT",
"GQ",
"DP"
};
final int[] fixedSampleValueClassList = { STRING,
INTEGER,
INTEGER
};
if (fixedNames.length != fixedClassList.length) {
System.out.println("fixedName size different from fixed class list size! Tell developer!!");
System.exit(1);
}
//Fill dataTypeAt with fixed names
for (int i=0; i<fixedNames.length; i++) {
dataTypeAt.put(fixedNames[i], i);
}
String line = "";
boolean indel;
boolean noSamples = false;
boolean loadAll = false;
int lineCount = 0;
int infoCount = 0;
int headCount = 0;
int sampleCount = 0;
// INFO IDs chosen by the user as the source of Gene_name / type ("" when none is chosen).
String geneNameKey = "";
String typeKey = "";
String typeDelim = "/";
// Number of tab-separated columns required on the #CHROM line and on every data line.
final int annotCount = 8;
// INFO IDs in header order; replaced by the user-selected subset after column selection.
List<String> tempNames = new ArrayList<String>();
// First pass: headers, user dialogs, structure set-up and row counting.
try {
BufferedReader br = new BufferedReader(new FileReader(inFile));
while ((line = br.readLine()) != null) {
String tempLine[] = line.split("\t", 0);
if (info_pat.matcher(line).find()) {
// ##INFO line: register its Description as a column name after the fixed columns.
int pos = fixedNames.length + infoCount;
String key = updateVCFMetaHash(line, infoMetaVCF);
Map<String, String> tempMeta = infoMetaVCF.get(key);
String descTemp = tempMeta.get("Description");
if (dataTypeAt.containsKey(descTemp)) {
// Description clashes with an existing column name: try "<Description>_<ID>" instead.
descTemp = (descTemp + "_" + tempMeta.get("ID"));
if (dataTypeAt.containsKey(descTemp)) {
VarSifter.showError("<html>INFO column has already been seen or has the same "
+ "Description as a reserved name in VarSifter,<p>"
+ "As VarSifter uses the Description to identify the column, this will not work.<p><p>"
+ "Failed to add a unique identifier, so please change the following:"
+ "ID=" + tempMeta.get("ID") + " Description="
+ tempMeta.get("Description") + "</html>");
System.exit(1);
}
VarSifter.showMessage("<html>INFO column has the same Description as a reserved name in VarSifter,<p>"
+ "or as the INFO description of another INFO field.<p>"
+ "As VarSifter uses the Description to identify the column, this will not work.<p><p>"
+ "To fix this, the column name has been appended with a unique identifier<p>"
+ "ID=" + tempMeta.get("ID") + " Description=" + tempMeta.get("Description")
+ "<p>New Description: " + descTemp
+ "</html>");
tempMeta.put("Description", descTemp);
}
dataTypeAt.put(tempMeta.get("Description"), pos);
tempNames.add(key);
infoCount++;
}
else if (format_pat.matcher(line).find()) {
// ##FORMAT line: only the side effect on formatMetaVCF is needed.
// NOTE(review): the returned key is never read.
String key = updateVCFMetaHash(line, formatMetaVCF);
}
else if (head_pat.matcher(line).find()) {
// #CHROM line: all header metadata has been seen, so configure everything now.
if (tempLine.length < annotCount) {
VarSifter.showError("<html>Header line (#CHROM...) column count is less than required."
+ "<p>Check the file format, and make sure the text file is tab-delimited!");
System.exit(1);
}
// Load any Custom Annotation JSON files.
CustomAnnotation[] caGroup = loadCustomAnnotation();
List<String> annotFormats = new ArrayList<String>();
// Offer only the formats whose column key is an INFO ID present in this file.
for (CustomAnnotation cTemp : caGroup) {
if (infoMetaVCF.containsKey(cTemp.columnKey)) {
annotFormats.add(cTemp.format);
}
}
// display formats available, have user choose one.
if (annotFormats.size() > 0) {
String annotChoice = (String)JOptionPane.showInputDialog(
null,
"<html>The following special annotation formats were detected.<p>"
+ "If you want VarSifter to interpret one of these, select it and click \"OK\".<p>"
+ "Otherwise, click \"Cancel\".</html>",
"Choose Custom Annotation Format",
JOptionPane.QUESTION_MESSAGE,
null,
annotFormats.toArray(new String[annotFormats.size()]),
null);
if (annotChoice != null && annotChoice.length() > 0) {
// No break: if several entries share the chosen format name, the last one wins.
for (int cIndex=0; cIndex < caGroup.length; cIndex++) {
if (annotChoice.equals(caGroup[cIndex].format)) {
ca = caGroup[cIndex];
}
}
}
}
// Ask User to give more info about data
InputTableDialog itd = new InputTableDialog(infoMetaVCF, inFile);
infoMetaVCF = itd.runDialog();
itd = null;
// determine custom Gene_name, type infoMetaVCF key (if any)
// A selected custom annotation (ca != null) takes precedence over both selections.
for (int i=0; i<tempNames.size(); i++) {
if ( Boolean.parseBoolean(infoMetaVCF.get(tempNames.get(i)).get("Gene_Name_Field"))
&& ca == null) {
if ( Boolean.parseBoolean(infoMetaVCF.get(tempNames.get(i)).get("Type_Field")) ) {
VarSifter.showError("<html>You cannot use the same column for both \"Gene Name\" and \"Type\""
+ ".<p>Please restart the program, and select distinct columns.</html>");
System.exit(1);
}
geneNameKey = tempNames.get(i);
}
else if ( Boolean.parseBoolean(infoMetaVCF.get(tempNames.get(i)).get("Gene_Name_Field"))
&& ca != null) {
VarSifter.showMessage("<html>You have indicated the special " + ca.format + " annotations "
+ "should be used by VarSifter.<p>Therefore, the \"Gene_Name_Field\" selection will "
+ "be ignored.<html>");
}
if ( Boolean.parseBoolean(infoMetaVCF.get(tempNames.get(i)).get("Type_Field"))
&& ca == null) {
typeKey = tempNames.get(i);
typeDelim = infoMetaVCF.get(tempNames.get(i)).get("Sub-delimiter");
if (typeDelim == null || typeDelim.equals("")) {
typeDelim = "/";
}
}
else if ( Boolean.parseBoolean(infoMetaVCF.get(tempNames.get(i)).get("Type_Field"))
&& ca != null) {
VarSifter.showMessage("<html>You have indicated the special " + ca.format + " annotations "
+ "should be used by VarSifter.<p>Therefore, the \"Type_Field\" selection will "
+ "be ignored.<html>");
}
}
// Allow user to select columns for loading / viewing
// toKeep is handed to the dialog; presumably these columns cannot be deselected - confirm in ColumnSelectionDialog.
String[] toKeep = {geneNameKey, typeKey};
if (ca != null) {
toKeep = new String[] {ca.columnKey};
ca.setMultiAllelic( Boolean.parseBoolean(infoMetaVCF.get(ca.columnKey).get("MultiAllele")) );
}
ColumnSelectionDialog csd = new ColumnSelectionDialog(
tempNames.toArray(new String[tempNames.size()]),
toKeep);
colMask = csd.runDialog();
csd = null;
// Every column kept and no gene-name column to fold away: nothing needs to be re-indexed.
if ( colMask.cardinality() == tempNames.size() && geneNameKey.equals("") ) {
loadAll = true;
}
else {
// Rebuild the name list and the name-to-index map for the selected columns only.
List<String> maskedTempNames = new ArrayList<String>();
int colCount = fixedNames.length;
Map<String, Integer> maskedDataTypeAt =
new HashMap<String, Integer>(colMask.cardinality() + colCount);
//Fill maskedDataTypeAt with fixed names
for (int i=0; i<fixedNames.length; i++) {
maskedDataTypeAt.put(fixedNames[i], i);
}
// Remove undesired columns
// (except Gene_name: remove from temp fields, but preserve info for fixed fields)
// (Do NOT remove field assigned to "type" - we will include both to preserve order,
// which is not preserved in the MULTISTRING type field)
for (int i=0; i<tempNames.size(); i++) {
if ( ! colMask.get(i) ) {
infoMetaVCF.remove(tempNames.get(i));
}
else if ( geneNameKey.equals(tempNames.get(i)) ) {
// Do nothing: don't remove from infoMetaVCF, but don't load to tempNames, dataTypeAt
// (This will be the Gene_name field, and will have that dataTypeAt.)
}
else {
maskedTempNames.add(tempNames.get(i));
maskedDataTypeAt.put(infoMetaVCF.get(tempNames.get(i)).get("Description"), colCount);
colCount++;
}
}
dataTypeAt = maskedDataTypeAt;
tempNames = maskedTempNames;
}
// Columns after the first annotCount + 1 (the ninth column is read as FORMAT below) are samples.
sampleCount = tempLine.length - (annotCount + 1);
//!!! Below TODO items may need to be done in the sampleCount test just below !!!
//TODO:DONE need to initialize sampleMapper based on total fields
//TODO:DONE populate String[] sampleValueName: GT first, then others (GQ, DP if present?)
//TODO:DONE fill sampleMapper based on FORMAT tags
classList = new int[ fixedNames.length + tempNames.size() ];
dataNames = new String[fixedNames.length + tempNames.size()];
if (sampleCount <= 0) {
// No sample columns: install a single placeholder sample named "NA".
noSamples = true;
sampleCount = 0;
sampleNames = new String[] {"NA"};
sampleNamesOrig = new String[] {"NA","NA","NA"};
sampleValueName = sampleNamesOrig;
//Populate sampleMapper with defaults, overwrite below if FORMAT tag exists
// NOTE(review): sampleMapper and S_FIELDS are not assigned in this branch; this presumably
// relies on values already set up by VarData (at least 3 slots) - confirm.
for (int i=0; i < fixedSampleValueClassList.length; i++) {
switch (fixedSampleValueClassList[i]) {
case INTEGER:
sampleMapper[i] = new IntMapper();
break;
case FLOAT:
sampleMapper[i] = new FloatMapper();
break;
case STRING:
sampleMapper[i] = new StringMapper();
break;
}
}
sampleMapper[0].addData("NA");
}
else {
//OK, we have samples, so let's set everything up
// First, determine sampleValue count, including fixed fields (even if not in header)
int sampleValueSize = formatMetaVCF.size();
for (String tag: fixedSampleValueNames) {
if (! formatMetaVCF.containsKey(tag) ) {
sampleValueSize++;
}
}
S_FIELDS = sampleValueSize;
sampleMapper = new AbstractMapper[S_FIELDS];
sampleValueName = new String[S_FIELDS];
//Populate sampleValueName, fixed fields first, then additional
System.arraycopy(fixedSampleValueNames, 0, sampleValueName, 0, fixedSampleValueNames.length);
//Populate sampleMapper with defaults, overwrite below if FORMAT tag exists
for (int i=0; i < fixedSampleValueClassList.length; i++) {
switch (fixedSampleValueClassList[i]) {
case INTEGER:
sampleMapper[i] = new IntMapper();
break;
case FLOAT:
sampleMapper[i] = new FloatMapper();
break;
case STRING:
sampleMapper[i] = new StringMapper();
break;
}
}
// Non-fixed tags are placed after the fixed slots. svnIndex counts every tag visited, so
// svnOffset is decremented for each fixed tag to keep the non-fixed slots contiguous.
int svnOffset = fixedSampleValueNames.length;
int svnIndex = 0;
for (String tag: formatMetaVCF.keySet()) {
int realIndex = svnIndex + svnOffset; //assume tag not in fixed list
boolean isFixed = false;
for (int i=0; i < fixedSampleValueNames.length; i++) {
if (tag.equals(fixedSampleValueNames[i])) {
isFixed = true;
realIndex = i; //Ok, tag IS in fixed list - set realIndex
}
}
if (isFixed) {
svnOffset--;
}
else {
sampleValueName[realIndex] = tag;
}
// NOTE(review): a Type string missing from VCFTypeMap yields null here and the unboxing
// would throw a NullPointerException.
int type = VCFTypeMap.get(formatMetaVCF.get(tag).get("Type"));
String number = formatMetaVCF.get(tag).get("Number");
// !!! Will need to explicitly handle R, A (probably not G, .)
// Anything that is not a single value is kept as text, except GQ and DP which keep their declared type.
if (! number.equals("1") && ! tag.equals("GQ") && ! tag.equals("DP") ) {
type = STRING;
}
//TESTING
//System.out.println("tag:type:number " + tag + " " + type + " " + number);
switch (type) {
case INTEGER:
sampleMapper[realIndex] = new IntMapper();
break;
case FLOAT:
sampleMapper[realIndex] = new FloatMapper();
break;
case STRING:
sampleMapper[realIndex] = new StringMapper();
break;
}
svnIndex++;
}
sampleNames = new String[sampleCount];
sampleNamesOrig = new String[sampleCount*S_FIELDS];
}
// Sample names come from the #CHROM line; sampleNamesOrig gets one "<sample>.<value name>" entry per value.
// (Skipped entirely when there are no samples, because sampleCount is 0.)
for (int i=0; i < sampleCount; i++) {
//TODO:DONE alter this
sampleNames[i] = tempLine[i + annotCount + 1];
for (int j=0; j < sampleValueName.length; j++) {
sampleNamesOrig[ (i * S_FIELDS + j) ]
= sampleNames[i] + "." + sampleValueName[j];
}
}
//Fill dataNames
System.arraycopy(fixedNames, 0, dataNames, 0, fixedNames.length);
System.arraycopy(fixedClassList, 0, classList, 0, fixedClassList.length);
for (int i=0; i<tempNames.size(); i++) {
dataNames[i + fixedNames.length] = infoMetaVCF.get(tempNames.get(i)).get("Description");
//TESTING
//if (infoMetaVCF.get(tempNames.get(i)).get("MultiAllele").equals("true")) {;
// System.out.println(infoMetaVCF.get(tempNames.get(i)).get("ID")
// + " " + infoMetaVCF.get(tempNames.get(i)).get("Type") + " is multi-allelic.");
//}
String tempType = infoMetaVCF.get(tempNames.get(i)).get("Type");
String tempNum = infoMetaVCF.get(tempNames.get(i)).get("Number");
boolean multiAllelic = Boolean.parseBoolean(infoMetaVCF.get(tempNames.get(i)).get("MultiAllele"));
String subdelim = infoMetaVCF.get(tempNames.get(i)).get("Sub-delimiter");
//assign class
//TODO: use VCFTypeMap instead of this hard coding
// NOTE(review): subdelim is dereferenced without a null check; presumably InputTableDialog
// always sets "Sub-delimiter" - confirm.
if ( !subdelim.equals("") ) {
classList[i + fixedClassList.length] = STRING;
}
else if ( (tempType.equals("Integer")
&& (tempNum.equals("1") || multiAllelic))
|| tempType.equals("Flag")) {
classList[i + fixedClassList.length] = INTEGER;
}
else if (tempType.equals("Float")
&& (tempNum.equals("1") || multiAllelic)) {
classList[i + fixedClassList.length] = FLOAT;
}
else {
classList[i + fixedClassList.length] = STRING;
}
}
dataNamesOrig = dataNames;
//TODO:DONE - REMOVE - may not need this anymore?
//change genotype qual mapper to float if needed
//if (formatMetaVCF.containsKey("GQ")) {
// String tempType = formatMetaVCF.get("GQ").get("Type");
// String tempNum = formatMetaVCF.get("GQ").get("Number");
// //assign class
// if (tempType.equals("Float") && tempNum.equals("1")) {
// sampleMapper[1] = new FloatMapper();
// }
//}
//TESTING
//for (int i=0; i<dataNames.length; i++) {
// String in = "_";
// String type = "_";
// if (i >= fixedClassList.length) {
// in = tempNames.get(i-fixedClassList.length);
// type = infoMetaVCF.get(in).get("Type");
// }
// System.out.println(dataNames[i] + " " + in + " "
// + type + " "
// + classList[i]);
//}
//Fill annotMapper, sampleMapper
annotMapper = new AbstractMapper[classList.length];
for (int i=0; i<classList.length; i++) {
switch (classList[i]) {
case INTEGER:
annotMapper[i] = new IntMapper();
break;
case FLOAT:
annotMapper[i] = new FloatMapper();
break;
case STRING:
annotMapper[i] = new StringMapper();
break;
case MULTISTRING:
annotMapper[i] = new MultiStringMapper(typeDelim);
break;
}
}
}
else if (! comment.matcher(line).find()) {
// Data line: count one output row per ALT allele (number of commas in ALT, plus one).
if (tempLine.length < annotCount) {
VarSifter.showError("<html>Data line column count is less than required."
+ "<p>Check the file format, and make sure the text file is tab-delimited!");
System.exit(1);
}
//include multiple lines
String varAllele = tempLine[4];
if (varAllele.contains(",")) {
for (int i=0; i<varAllele.length(); i++) {
if (varAllele.charAt(i) == ',') {
lineCount++;
}
}
}
lineCount++;
}
// Progress indicator.
// NOTE(review): lineCount is still 0 while header lines are read, so a dot is printed for each of them.
if (lineCount % 1000 == 0) {
System.out.print(".");
}
}
// Row count is now known: allocate the first dimension of the result arrays.
data = new int[lineCount][];
samples = new int[lineCount][][];
dataIsIncluded = new BitSet(lineCount);
br.close();
//Ensure required columns are present (hopefully, as they are filled in by this class).
checkReqHeaders();
//TESTING FORMAT fields
//System.out.println("S_FIELDS: " + S_FIELDS);
//System.out.print("sampleValueName_type:");
//for (int s = 0; s < sampleValueName.length; s++) {
// System.out.print(" " + sampleValueName[s] + "_" + sampleMapper[s].getDataType());
//}
//System.out.println();
//System.out.println("sampleMapper count: " + sampleMapper.length);
//System.out.println(lineCount); //TESTING
// Reused in the second pass as the index of the next row to fill.
lineCount = 0;
System.out.println();
System.out.println("File parsing completed - loading file");
}
catch (IOException ioe) {
System.out.println(ioe);
VarSifter.showError(ioe.toString());
System.exit(1);
}
//Open again - fill data
try {
BufferedReader br = new BufferedReader(new FileReader(inFile));
while ((line = br.readLine()) != null) {
if (! comment.matcher(line).find()) {
String tempLine[] = line.split("\t", 0);
// REF followed by every ALT allele of this line; genotype indices are looked up in this list.
List<String> alleles = new ArrayList<String>();
//Check for multiallelic line
String varAllele = tempLine[4];
int altAlleleCount = 1;
if (varAllele.contains(",")) {
for (int i=0; i<varAllele.length(); i++) {
if (varAllele.charAt(i) == ',') {
altAlleleCount++;
}
}
}
// First, load INFO fields to hash (so they are available for parsing)
String[] infoTemp = tempLine[7].split(";");
Map<String, String> infoHash = new HashMap<String, String>(tempNames.size() + 2);
for (String s : infoTemp) {
String[] pairs = s.split("=",2);
if (pairs.length == 2) {
infoHash.put(pairs[0], pairs[1]);
}
else if (pairs.length == 1) {
// Key without a value (a flag): stored as "1".
infoHash.put(pairs[0], "1");
}
}
// Load Custom Annotation data string to object
if (ca != null) {
if (infoHash.containsKey(ca.columnKey)) {
ca.loadAnnot(infoHash.get(ca.columnKey), altAlleleCount);
}
else {
ca.loadAnnot("", altAlleleCount);
}
}
//Run loop once for each alt allele
for (int altI = 0; altI < altAlleleCount; altI++) {
int tempLineCount = lineCount + altI;
data[tempLineCount] = new int[dataNames.length];
//Chr
// Prefix "chr" unless the name already contains it anywhere.
if ( !tempLine[0].contains("chr") ) {
tempLine[0] = "chr" + tempLine[0];
}
data[tempLineCount][0] = annotMapper[0].addData(tempLine[0]);
//LeftFlank / RightFlank
// POS - 1 and POS + length of REF.
data[tempLineCount][1] = Integer.parseInt(tempLine[1]) - 1;
data[tempLineCount][2] = Integer.parseInt(tempLine[1]) + tempLine[3].length();
//Gene_name
// Source priority: custom annotation, then the user-chosen INFO field, else "-".
if (ca != null) {
data[tempLineCount][3] = annotMapper[3].addData(ca.getGeneName(altI));
}
else if ( !geneNameKey.equals("") ) {
if ( infoHash.get(geneNameKey) != null ) {
data[tempLineCount][3] = annotMapper[3].addData(
infoHash.get(geneNameKey));
}
else {
data[tempLineCount][3] = annotMapper[3].addData("-");
}
}
else {
data[tempLineCount][3] = annotMapper[3].addData("-");
}
//type
// Same source priority as Gene_name.
if (ca != null) {
// This is now split based on allele
data[tempLineCount][4] = annotMapper[4].addData(ca.getType(altI));
}
else if ( !typeKey.equals("") ) {
if ( infoHash.get(typeKey) != null ) {
if (infoMetaVCF.get(typeKey).get("MultiAllele").equals("true")) {
//split values, enter correct one for this allele
String[] multiValues = infoHash.get(typeKey).split(",",0);
String s = "-";
if (altI < multiValues.length) {
s = multiValues[altI];
}
data[tempLineCount][4] = annotMapper[4].addData(s);
}
else {
//Not multiallele, so add complete value
data[tempLineCount][4] = annotMapper[4].addData(
infoHash.get(typeKey));
}
}
else {
data[tempLineCount][4] = annotMapper[4].addData("-");
}
}
else {
data[tempLineCount][4] = annotMapper[4].addData("-");
}
//dbID
if (tempLine[2].equals(".")) {
tempLine[2] = "-";
}
data[tempLineCount][6] = annotMapper[6].addData(tempLine[2]);
//ref_allele
data[tempLineCount][7] = annotMapper[7].addData(tempLine[3]);
// NOTE(review): REF is appended on every altI iteration while the ALT alleles are appended
// only when altI == 0, so the list grows to REF, ALT..., REF, ... on multi-allelic lines;
// lookups by genotype index still hit the leading entries.
alleles.add(tempLine[3]);
//var_allele
String[] varTemp = tempLine[4].split(",", 0);
data[tempLineCount][8] = annotMapper[8].addData(varTemp[altI]);
//muttype and assignment of INDEL (and further parsing of var_allele)
// INDEL when REF is not exactly one character, or REF and this ALT differ in length.
indel = (tempLine[3].length() != 1) ? true : false;
if (tempLine[3].length() != varTemp[altI].length() ) {
indel = true;
}
// Uncommenting below lines breaks things - SNVs should be 1,2 char.
//else if (tempLine[3].length() == varTemp[altI].length() ) {
// indel = false;
//}
if (altI == 0) { //only load alleles once!
for (int i=0; i<varTemp.length; i++) {
alleles.add(varTemp[i]);
}
}
int index;
if (indel) {
index = annotMapper[5].addData("INDEL");
}
else {
index = annotMapper[5].addData("SNP");
}
data[tempLineCount][5] = index;
//QUAL
// A missing QUAL (".") is stored as NaN.
if (tempLine[5].equals(".")) {
tempLine[5] = "NaN";
}
data[tempLineCount][9] = annotMapper[9].addData(Float.parseFloat(tempLine[5]));
//FILTER
data[tempLineCount][10] = annotMapper[10].addData(tempLine[6]);
//INFO field
for (int i=0; i<tempNames.size(); i++) {
int pos = i + fixedNames.length;
String key = tempNames.get(i);
if (infoMetaVCF.get(key).get("MultiAllele").equals("true")) {
//split these values, enter correct value for alt allele (or 0/- if no value)
String[] multiValues = {""};
if (infoHash.containsKey(key)) {
multiValues = infoHash.get(key).split(",",0);
}
//TESTING System.out.println(key + " " + classList[pos] + " " + " " + pos + " " + infoMetaVCF.get(key).get("Type"));
switch (classList[pos]) {
case INTEGER:
// Integer values are written into data directly rather than through annotMapper.
if (infoHash.containsKey(key) && altI < multiValues.length) {
data[tempLineCount][pos] = Integer.parseInt(multiValues[altI]);
}
else {
data[tempLineCount][pos] = 0;
}
break;
case FLOAT:
float f = 0f;
if (infoHash.containsKey(key) && altI < multiValues.length) {
// NOTE(review): the NaN check runs on the whole comma-joined value, not on
// multiValues[altI] - confirm against the floatNaN pattern whether that is intended.
if (floatNaN.matcher(infoHash.get(key)).find()) {
f = Float.parseFloat("NaN");
}
else {
f = Float.parseFloat(multiValues[altI]);
}
}
data[tempLineCount][pos] = annotMapper[pos].addData(f);
break;
case STRING:
String s = "-";
if (infoHash.containsKey(key) && altI < multiValues.length) {
s = multiValues[altI];
}
data[tempLineCount][pos] = annotMapper[pos].addData(s);
break;
}
}
else {
//Not multiallele, so add complete value to each line
switch (classList[pos]) {
case INTEGER:
if (infoHash.containsKey(key)) {
data[tempLineCount][pos] = Integer.parseInt(infoHash.get(key));
}
else {
data[tempLineCount][pos] = 0;
}
break;
case FLOAT:
float f = 0f;
if (infoHash.containsKey(key)) {
if (floatNaN.matcher(infoHash.get(key)).find()) {
f = Float.parseFloat("NaN");
}
else {
f = Float.parseFloat(infoHash.get(key));
}
}
data[tempLineCount][pos] = annotMapper[pos].addData(f);
break;
case STRING:
String s = "-";
if (infoHash.containsKey(key)) {
s = infoHash.get(key);
}
data[tempLineCount][pos] = annotMapper[pos].addData(s);
break;
}
}
}
// Handle Samples
samples[tempLineCount] = new int[sampleNames.length][S_FIELDS];
if (noSamples) {
// Placeholder sample: genotype "NA", remaining values 0 (or NaN for a float mapper).
samples[tempLineCount][0][0] = sampleMapper[0].getIndexOf("NA");
samples[tempLineCount][0][1] = (sampleMapper[1].getDataType() == FLOAT)
? sampleMapper[1].addData(Float.parseFloat("NaN")) : 0;
samples[tempLineCount][0][2] = 0;
}
else {
// The FORMAT column gives the position of each tag within every sample column.
String[] sampTemp = tempLine[8].split(":");
Map<String, Integer> sampHash = new HashMap<String,Integer>(7);
for (int i=0; i < sampTemp.length; i++) {
sampHash.put(sampTemp[i], i);
}
if ( (tempLine.length - (annotCount+1)) != sampleNames.length) {
System.out.println("INTERNAL ERROR: inconsistent sample counting at dataline "
+ tempLineCount);
System.exit(1);
}
for (int i = annotCount + 1; i < tempLine.length; i++) {
sampTemp = tempLine[i].split(":");
// NOTE(review): a FORMAT column without "GT" makes sampHash.get("GT") null and the unboxing
// throws; that would land in the generic catch below.
String geno = sampTemp[sampHash.get("GT")];
Matcher m = genoSep_pat.matcher(geno);
// Genotype
// !!! Will need to fix this for "normalized" VCF (from vt)
if (geno.contains(".")) {
geno = "NA";
}
else if (m.find()) {
// Translate the two allele indices to allele strings and sort them so the
// resulting genotype text does not depend on their order in the file.
String[] genoTemp = { alleles.get(Integer.parseInt(m.group(1))),
alleles.get(Integer.parseInt(m.group(2)))
};
java.util.Arrays.sort(genoTemp);
// DIV handling
// Indel genotypes are joined with ":"; others are concatenated directly.
if (indel) {
geno = genoTemp[0] + ":" + genoTemp[1];
}
else {
geno = genoTemp[0] + genoTemp[1];
}
}
else {
// No separator: treat the genotype as a single allele index.
try {
geno = alleles.get(Integer.parseInt(geno));
}
catch (NumberFormatException nfe) {
System.out.println("Malformed genotype on line " + (tempLineCount + 1) + ": " + geno );
}
}
samples[tempLineCount][i - (annotCount + 1)][0] = sampleMapper[0].addData(geno);
//TODO:DONE - REMOVE - may not need separate "Qual score", "coverage" loaders
// Qual score
//if (sampHash.get("GQ") == null || sampTemp.length <= sampHash.get("GQ") || sampTemp[sampHash.get("GQ")].equals(".")) {
// samples[tempLineCount][i - (annotCount + 1)][1]
// = (sampleMapper[1].getDataType() == FLOAT)
// ? sampleMapper[1].addData(Float.parseFloat("NaN"))
// : sampleMapper[1].addData(0);
//}
//else {
// samples[tempLineCount][i - (annotCount + 1)][1]
// = (sampleMapper[1].getDataType() == FLOAT)
// ? sampleMapper[1].addData(Float.parseFloat(sampTemp[sampHash.get("GQ")]))
// : sampleMapper[1].addData(Integer.parseInt(sampTemp[sampHash.get("GQ")]));
//}
//// Read depth
//if (sampHash.get("DP") == null || sampTemp.length <= sampHash.get("DP") || sampTemp[sampHash.get("DP")].equals(".")) {
// samples[tempLineCount][i - (annotCount + 1)][2] = 0;
//}
//else {
// samples[tempLineCount][i - (annotCount + 1)][2]
// = Integer.parseInt(sampTemp[sampHash.get("DP")]);
//}
//TODO:DONE Load other sample fields
// Start at index 1, as 0 is GT (handled above)
// A value counts as present when its tag is in FORMAT, the sample column is long enough
// and the text is not "."; otherwise the default is stored (0, NaN or CustomAnnotation.EMPTY).
for (int j = 1; j < S_FIELDS; j++) {
String tag = sampleValueName[j];
switch (sampleMapper[j].getDataType()) {
case INTEGER:
samples[tempLineCount][i - (annotCount + 1)][j]
= (sampHash.get(tag) != null
&& sampTemp.length > sampHash.get(tag)
&& !sampTemp[sampHash.get(tag)].equals(".") )
? sampleMapper[j].addData(Integer.parseInt(sampTemp[sampHash.get(tag)]))
: sampleMapper[j].addData(0);
break;
case FLOAT:
samples[tempLineCount][i - (annotCount + 1)][j]
= (sampHash.get(tag) != null
&& sampTemp.length > sampHash.get(tag)
&& !sampTemp[sampHash.get(tag)].equals(".") )
? sampleMapper[j].addData(Float.parseFloat(sampTemp[sampHash.get(tag)]))
: sampleMapper[j].addData(Float.parseFloat("NaN"));
break;
case STRING:
samples[tempLineCount][i - (annotCount + 1)][j]
= (sampHash.get(tag) != null
&& sampTemp.length > sampHash.get(tag)
&& !sampTemp[sampHash.get(tag)].equals(".") )
? sampleMapper[j].addData(sampTemp[sampHash.get(tag)])
: sampleMapper[j].addData(CustomAnnotation.EMPTY);
break;
}
}
}
}
}
// Advance past all rows produced by this line.
lineCount += altAlleleCount;
}
// Progress indicator; a multiple of 1000 can be stepped over when a multi-allelic line adds several rows.
if (lineCount % 1000 == 0) {
System.out.print(".");
}
}
br.close();
System.out.println();
}
catch (IOException ioe) {
System.out.println(ioe);
VarSifter.showError(ioe.toString());
System.exit(1);
}
catch (Exception e) {
// Catch-all for anything thrown while filling the arrays (e.g. number parsing, array indexing).
VarSifter.showError("<html>Ooops - VarSifter encountered an unexpected error when loading your "
+ "VCF file.<p>Check the terminal output for full details:<p>" + e.toString());
e.printStackTrace();
System.exit(1);
}
}
/**
* Add to a hash of VCF metadata values.
* <p>
* The text between the first "&lt;" and the last "&gt;" of the line is split on commas into
* at most four tags, so commas inside the fourth tag (normally the Description) are kept.
* Each tag is split into name and value at its FIRST "=" only, so a value that itself
* contains "=" is stored whole. Double quotes are stripped from stored values.
* The resulting tag map is stored in mHash under the value of the ID tag ("" if there is none).
* A line without "&lt;...&gt;" terminates the program with exit status 1.
*
* @param line VCF metadata line
* @param mHash VCF metadata hash
* @return Metadata key
*/
private String updateVCFMetaHash(String line, Map<String, Map<String, String>> mHash) {
    String key = "";
    Map<String, String> temp = new HashMap<String, String>(3);
    Pattern p = Pattern.compile("<(.*)>");
    Matcher m = p.matcher(line);
    String sub = "";
    if (m.find()) {
        sub = m.group(1);
    }
    else {
        System.out.println("VCF file may have malformed Headers: no \"<>\"");
        System.exit(1);
    }
    // Limit of 4: everything after the third comma stays in one piece.
    String[] tags = sub.split(",", 4);
    for (String s : tags) {
        // Limit of 2: split at the first "=" only, so "=" inside a value is not lost and an
        // empty value ("Description=") still yields two elements.
        String[] pairs = s.split("=", 2);
        if (pairs.length < 2) {
            // A tag without "=" has no value to store; report it instead of failing with
            // an ArrayIndexOutOfBoundsException.
            System.out.println("VCF header tag without \"=\" ignored: " + s);
            continue;
        }
        if (pairs[0].equals("ID")) {
            key = pairs[1];
        }
        temp.put(pairs[0], pairs[1].replace("\"", ""));
    }
    mHash.put(key, temp);
    return key;
}
/**
* Detect any custom annotation JSON files (names ending in ".vs.json") in the directory
* this class was loaded from (the directory itself, or the parent directory when the code
* source is a file such as a jar), and return one object per file.
* <p>
* Returns an empty array when that directory cannot be determined or listed.
*
* @return array of CustomAnnotation objects representing available custom annotation descriptions
*/
private CustomAnnotation[] loadCustomAnnotation() {
    URL cDir = getClass().getProtectionDomain().getCodeSource().getLocation();
    File file;
    try {
        // URL.getFile() keeps percent-escapes (e.g. "%20" for a space), which would point at a
        // path that does not exist; going through a URI decodes them.
        file = new File(cDir.toURI());
    }
    catch (java.net.URISyntaxException use) {
        // Location is not a strictly valid URI (e.g. contains a raw space): use the raw path.
        file = new File(cDir.getFile());
    }
    catch (IllegalArgumentException iae) {
        // Location is not a plain hierarchical file: URI: use the raw path.
        file = new File(cDir.getFile());
    }
    if ( ! file.isDirectory() ) {
        file = file.getParentFile();
    }
    List<CustomAnnotation> out = new ArrayList<CustomAnnotation>();
    // getParentFile() may return null, and list() returns null when the path is not a
    // readable directory; in both cases there is nothing to load.
    String[] files = (file == null) ? null : file.list();
    if (files != null) {
        for (String s : files) {
            if (s.endsWith(".vs.json")) {
                out.add(new CustomAnnotation(file + "/" + s));
            }
        }
    }
    return out.toArray(new CustomAnnotation[out.size()]);
}
}