Revision 152ada7da67b08f3e04ac95b284e45999c90341c authored by jferna10 on 15 July 2021, 07:18:18 UTC, committed by GitHub on 15 July 2021, 07:18:18 UTC
1 parent 033e4b3
countDMScodons.java
import java.io.*;
import java.lang.*;
import java.util.*;
/**
 * Tallies codon and amino-acid usage per codon position of a reference region
 * from a paired-end SAM file.
 *
 * <p>The two records of a pair must sit on consecutive lines and share the same
 * read name (i.e. the SAM must be name-sorted). For every pair that spans the
 * region, the codons starting at {@code start}, {@code start + 3}, ... are
 * assembled from the reference-aligned reads and counted. Results are written to
 * {@code <input>_<start>_<stop>_codons.tab} and {@code <input>_<start>_<stop>_aa.tab}.
 */
public class countDMScodons
{
    /** {@code codoncount[p][c]}: pairs showing {@code CODONS[c]} at codon position {@code p}. */
    public int[][] codoncount;
    /** {@code aacount[p][a]}: pairs showing amino acid {@code AACIDS[a]} at codon position {@code p}. */
    public int[][] aacount;

    /**
     * Column order of {@link #codoncount}. The order is base-4 with digits
     * A=0, G=1, C=2, T=3 (first base most significant); countCodon relies on it.
     */
    public String[] CODONS = new String[] {"AAA", "AAG", "AAC", "AAT",
        "AGA", "AGG", "AGC", "AGT",
        "ACA", "ACG", "ACC", "ACT",
        "ATA", "ATG", "ATC", "ATT",
        "GAA", "GAG", "GAC", "GAT",
        "GGA", "GGG", "GGC", "GGT",
        "GCA", "GCG", "GCC", "GCT",
        "GTA", "GTG", "GTC", "GTT",
        "CAA", "CAG", "CAC", "CAT",
        "CGA", "CGG", "CGC", "CGT",
        "CCA", "CCG", "CCC", "CCT",
        "CTA", "CTG", "CTC", "CTT",
        "TAA", "TAG", "TAC", "TAT",
        "TGA", "TGG", "TGC", "TGT",
        "TCA", "TCG", "TCC", "TCT",
        "TTA", "TTG", "TTC", "TTT"};

    /** Column order of {@link #aacount}; {@code *} is the stop codon column. */
    public String[] AACIDS = new String[] {"K", "N", "R", "S", "T", "I", "M", "E", "D", "G", "A", "V", "Q", "H", "P", "L", "*", "Y", "W", "C", "F"};

    /** Base digits used to turn a codon into its index in {@link #CODONS}. */
    private static final String BASE_ORDER = "AGCT";

    /** For each index in {@link #CODONS}, the matching index in {@link #AACIDS}. */
    private static final int[] CODON_TO_AA = {
        0, 0, 1, 1,     // AAA AAG AAC AAT -> K K N N
        2, 2, 3, 3,     // AGA AGG AGC AGT -> R R S S
        4, 4, 4, 4,     // ACA ACG ACC ACT -> T
        5, 6, 5, 5,     // ATA ATG ATC ATT -> I M I I
        7, 7, 8, 8,     // GAA GAG GAC GAT -> E E D D
        9, 9, 9, 9,     // GGA GGG GGC GGT -> G
        10, 10, 10, 10, // GCA GCG GCC GCT -> A
        11, 11, 11, 11, // GTA GTG GTC GTT -> V
        12, 12, 13, 13, // CAA CAG CAC CAT -> Q Q H H
        2, 2, 2, 2,     // CGA CGG CGC CGT -> R
        14, 14, 14, 14, // CCA CCG CCC CCT -> P
        15, 15, 15, 15, // CTA CTG CTC CTT -> L
        16, 16, 17, 17, // TAA TAG TAC TAT -> * * Y Y
        16, 18, 19, 19, // TGA TGG TGC TGT -> * W C C
        3, 3, 3, 3,     // TCA TCG TCC TCT -> S
        15, 15, 20, 20  // TTA TTG TTC TTT -> L L F F
    };

    /**
     * Command-line entry point.
     *
     * @param args {@code <file.sam> <start> <stop>}
     */
    public static void main(String args[])
    {
        if (args.length < 3)
        {
            System.err.println("Usage: java countDMScodons <file.sam> <start> <stop>");
            System.exit(1);
        }
        try
        {
            new countDMScodons(args[0], Integer.parseInt(args[1]), Integer.parseInt(args[2]));
        }
        catch (IllegalArgumentException e)
        {
            // Covers both non-numeric coordinates and stop < start.
            System.err.println("Invalid arguments: " + e.getMessage());
            System.err.println("Usage: java countDMScodons <file.sam> <start> <stop>");
            System.exit(1);
        }
    }

    /**
     * Allocates the count tables, reads the SAM file and writes both output tables.
     *
     * @param filename path of the SAM file
     * @param start    coordinate of the first base of the first codon
     * @param stop     end coordinate of the region; must not be smaller than {@code start}
     * @throws IllegalArgumentException if {@code stop < start}
     */
    public countDMScodons(String filename, int start, int stop)
    {
        if (stop < start)
        {
            throw new IllegalArgumentException("stop (" + stop + ") must not be smaller than start (" + start + ")");
        }
        int positions = (stop - start) / 3 + 1;
        // Java zero-initialises int arrays, so no explicit clearing is needed.
        codoncount = new int[positions][CODONS.length];
        aacount = new int[positions][AACIDS.length];
        readSAM(filename, start, stop);
    }

    /**
     * Reads the SAM file pair by pair, counts the codons of every pair and then
     * writes the codon and amino-acid tables. The tables are written even if
     * reading fails part-way, so they may then be incomplete.
     *
     * @param filename path of the SAM file
     * @param start    coordinate of the first base of the first codon
     * @param stop     end coordinate of the region
     */
    public void readSAM(String filename, int start, int stop)
    {
        try (BufferedReader br = new BufferedReader(new FileReader(new File(filename))))
        {
            String temp = br.readLine();
            while (temp != null)
            {
                StringTokenizer tk1 = new StringTokenizer(temp, "\t");
                if (temp.isEmpty() || temp.charAt(0) == '@' || !tk1.hasMoreTokens())
                {
                    temp = br.readLine(); // header or blank line
                    continue;
                }
                String name1 = tk1.nextToken();
                temp = br.readLine(); // next record, expected to be the mate
                StringTokenizer tk2 = (temp == null) ? null : new StringTokenizer(temp, "\t");
                String name2 = "";
                if (tk2 != null && tk2.hasMoreTokens())
                {
                    name2 = tk2.nextToken();
                }
                if (name2.equals(name1))
                {
                    try
                    {
                        SAMString one = parseRecord(name1, tk1);
                        SAMString two = parseRecord(name2, tk2);
                        countCodons(start, stop, new SAMJoin(one, two));
                    }
                    catch (RuntimeException e)
                    {
                        // A single malformed pair must not abort the whole file.
                        System.out.println(name1 + " could not be parsed and was skipped: " + e);
                    }
                    temp = br.readLine();
                }
                else
                {
                    // temp already holds the next record (or null at end of file),
                    // so the loop re-examines it as a potential first mate.
                    System.out.println(name1 + " has no mate or SAM needs to be sorted");
                }
            }
        }
        catch (Exception e)
        {
            System.out.println("crap");
            e.printStackTrace();
        }
        writeCodons(filename, start, stop);
        writeAA(filename, start, stop);
    }

    /**
     * Builds a record from the ten mandatory SAM columns that follow the read name
     * (FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL).
     */
    private SAMString parseRecord(String name, StringTokenizer fields)
    {
        String flag = fields.nextToken();
        String reference = fields.nextToken();
        int position = Integer.parseInt(fields.nextToken());
        int mapq = Integer.parseInt(fields.nextToken());
        String cigar = fields.nextToken();
        String mateReference = fields.nextToken();
        int matePosition = Integer.parseInt(fields.nextToken());
        int templateLength = Integer.parseInt(fields.nextToken());
        String bases = fields.nextToken();
        String quals = fields.nextToken();
        return new SAMString(name, flag, reference, position, mapq, cigar, mateReference, matePosition, templateLength, bases, quals);
    }

    /**
     * Counts the codons of one read pair.
     *
     * <p>The pair is used only when it starts before {@code start} and ends after
     * {@code stop}. Each codon is stored under its own position ({@code i / 3}), so a
     * codon that cannot be assembled leaves a hole instead of shifting later codons.
     * If any assembled codon contains an insertion ({@code ~}) or deletion
     * ({@code -}) marker, the whole pair is discarded.
     *
     * @param start   coordinate of the first base of the first codon
     * @param stop    end coordinate of the region
     * @param cluster the read pair
     */
    public void countCodons(int start, int stop, SAMJoin cluster)
    {
        SAMString mate1 = cluster.getMate1();
        SAMString mate2 = cluster.getMate2();
        // NOTE(review): getEnd() is start + TLEN of the first record, so pairs whose
        // first record carries a non-positive TLEN never pass this test.
        if (start <= cluster.getStart() || stop >= cluster.getEnd())
        {
            return;
        }

        String[] codons = new String[codoncount.length];
        for (int i = 0; start + i + 3 < stop; i += 3)
        {
            try
            {
                String codon = assembleCodon(start, i, mate1, mate2);
                if (codon != null && i / 3 < codons.length)
                {
                    codons[i / 3] = codon;
                }
            }
            catch (Exception e)
            {
                System.out.println("Error parsing cigar");
                e.printStackTrace();
                System.out.println(mate1.getStart() + " " + mate2.getStart() + " " + i + " " + mate1.getName() + " " + mate1.getaRead());
            }
        }

        for (String codon : codons)
        {
            if (codon != null && (codon.contains("~") || codon.contains("-")))
            {
                return; // indel inside the region: ignore the pair
            }
        }
        for (int position = 0; position < codons.length; position++)
        {
            if (codons[position] != null)
            {
                countCodon(codons[position], position);
            }
        }
    }

    /**
     * Assembles the codon that starts {@code i} bases after {@code start}.
     * Uses mate 1 alone before mate 2 begins, the higher-quality base of the two
     * where both cover the codon, and mate 2 alone afterwards.
     *
     * @return the three-character codon, or {@code null} if neither mate covers it
     */
    private String assembleCodon(int start, int i, SAMString mate1, SAMString mate2)
    {
        String read1 = mate1.getaRead();
        String read2 = mate2.getaRead();
        int off1 = start - mate1.getStart() + i; // codon offset inside aligned mate 1
        int off2 = start - mate2.getStart() + i; // codon offset inside aligned mate 2
        // The strict comparisons are deliberately kept from the original code; they skip a
        // codon that ends exactly on the last base of a read or starts on mate 2's first base.
        boolean inMate1 = off1 + 3 < read1.length();
        boolean inMate2 = off2 > 0 && off2 + 3 < read2.length();

        if (inMate1 && i < mate2.getStart() - mate1.getStart())
        {
            return read1.substring(off1, off1 + 3);
        }
        if (inMate1 && inMate2)
        {
            String qual1 = mate1.getaQual();
            String qual2 = mate2.getaQual();
            StringBuilder best = new StringBuilder(3);
            for (int b = 0; b < 3; b++)
            {
                // Ties go to mate 2, as in the original comparison.
                if (qual1.charAt(off1 + b) > qual2.charAt(off2 + b))
                {
                    best.append(read1.charAt(off1 + b));
                }
                else
                {
                    best.append(read2.charAt(off2 + b));
                }
            }
            return best.toString();
        }
        if (inMate2)
        {
            return read2.substring(off2, off2 + 3);
        }
        return null;
    }

    /** Writes {@link #codoncount} to {@code <input>_<start>_<stop>_codons.tab}. */
    private void writeCodons(String filename, int start, int stop)
    {
        writeTable(outputBase(filename) + "_" + start + "_" + stop + "_codons.tab", CODONS, codoncount);
    }

    /** Writes {@link #aacount} to {@code <input>_<start>_<stop>_aa.tab}. */
    private void writeAA(String filename, int start, int stop)
    {
        writeTable(outputBase(filename) + "_" + start + "_" + stop + "_aa.tab", AACIDS, aacount);
    }

    /**
     * Returns the input path cut at the first dot of its file name. Dots in
     * directory names (e.g. {@code ./run.1/x.sam}) are left alone, and a leading dot
     * of the file name is not treated as an extension separator.
     */
    private String outputBase(String filename)
    {
        int nameStart = Math.max(filename.lastIndexOf('/'), filename.lastIndexOf(File.separatorChar)) + 1;
        int dot = filename.indexOf('.', nameStart + 1);
        return dot < 0 ? filename : filename.substring(0, dot);
    }

    /**
     * Writes one tab-separated table: a header of column labels plus {@code SUM},
     * then one line per codon position with the counts and their row total.
     */
    private void writeTable(String path, String[] labels, int[][] counts)
    {
        try (BufferedWriter out = new BufferedWriter(new FileWriter(new File(path))))
        {
            StringBuilder line = new StringBuilder();
            for (String label : labels)
            {
                line.append('\t').append(label);
            }
            line.append("\tSUM\n");
            out.write(line.toString());

            for (int row = 0; row < counts.length; row++)
            {
                line.setLength(0);
                line.append(row).append('\t');
                int sum = 0;
                for (int col = 0; col < counts[row].length; col++)
                {
                    line.append(counts[row][col]).append('\t');
                    sum += counts[row][col];
                }
                line.append(sum).append('\n');
                out.write(line.toString());
            }
        }
        catch (IOException e)
        {
            System.out.println("Boo");
            e.printStackTrace();
        }
    }

    /**
     * Increments the codon and amino-acid counters for {@code codon} at codon
     * position {@code i}. Anything that is not exactly three of A, G, C, T
     * (for example a codon containing N) is ignored.
     */
    private void countCodon(String codon, int i)
    {
        if (codon.length() != 3)
        {
            return;
        }
        int index = 0;
        for (int b = 0; b < 3; b++)
        {
            int digit = BASE_ORDER.indexOf(codon.charAt(b));
            if (digit < 0)
            {
                return;
            }
            index = index * 4 + digit;
        }
        codoncount[i][index]++;
        aacount[i][CODON_TO_AA[index]]++;
    }

    /** One SAM alignment record together with its reference-aligned read and qualities. */
    private static final class SAMString
    {
        private final String name;  // read name (cluster ID)
        private final String flag;  // SAM FLAG column
        private final String gname; // reference name (RNAME)
        private final int start;    // mapping start (POS)
        private final int mq;       // mapping quality
        private final String cigar; // CIGAR string
        private final String mate;  // RNEXT column
        private final int mpos;     // PNEXT column
        private final int tlen;     // TLEN column
        private final String read;  // read bases as stored in the file
        private final String qual;  // base qualities as stored in the file
        private final String aligned_read;
        private final String aligned_qual;

        public SAMString(String n, String f, String g, int s, int mpq, String cig, String mname, int mp, int t, String r, String q)
        {
            name = n;
            flag = f;
            gname = g;
            start = s;
            mq = mpq;
            cigar = cig;
            mate = mname;
            mpos = mp;
            tlen = t;
            read = r;
            qual = q;
            aligned_read = parseCIGAR();
            // "*" means no qualities were stored; treat every base as lowest quality.
            aligned_qual = project("*".equals(qual) ? "" : qual, '!', false);
        }

        /** Returns the read projected onto reference coordinates (see {@link #parseCIGAR()}). */
        public String getaRead()
        {
            return aligned_read;
        }

        /**
         * Returns the qualities projected onto reference coordinates, index-compatible
         * with {@link #getaRead()}. Deleted positions carry {@code '!'}.
         */
        public String getaQual()
        {
            return aligned_qual;
        }

        /**
         * Projects the read onto reference coordinates so that index {@code k} is the
         * base aligned to reference position {@code start + k}. Deleted or skipped
         * reference bases become {@code '-'}; soft-clipped and inserted read bases are
         * removed; the aligned base next to an insertion is overwritten with {@code '~'}
         * so that the insertion stays detectable without shifting coordinates.
         *
         * @throws IllegalArgumentException if the CIGAR string is malformed
         */
        public String parseCIGAR()
        {
            return project(read, '-', true);
        }

        /**
         * Walks the CIGAR string over a per-base string (bases or qualities).
         *
         * @param perBase        one character per read base
         * @param filler         character emitted for reference positions without a read base
         * @param flagInsertions whether insertions are marked with {@code '~'}
         */
        private String project(String perBase, char filler, boolean flagInsertions)
        {
            if (cigar.equals("*"))
            {
                return perBase; // no alignment available
            }
            StringBuilder out = new StringBuilder(perBase.length());
            StringTokenizer ops = new StringTokenizer(cigar, "MIDNSHPX=", true);
            int next = 0;                // index of the next unconsumed character of perBase
            boolean flagPending = false; // insertion seen before any aligned base was emitted
            while (ops.hasMoreTokens())
            {
                String lengthToken = ops.nextToken();
                if (!ops.hasMoreTokens())
                {
                    throw new IllegalArgumentException("Malformed CIGAR '" + cigar + "' for read " + name);
                }
                int len = Integer.parseInt(lengthToken);
                char op = ops.nextToken().charAt(0);
                switch (op)
                {
                    case 'M':
                    case '=':
                    case 'X':
                        for (int n = 0; n < len; n++)
                        {
                            char c = next < perBase.length() ? perBase.charAt(next) : filler;
                            next++;
                            if (flagPending)
                            {
                                c = '~';
                                flagPending = false;
                            }
                            out.append(c);
                        }
                        break;
                    case 'D':
                    case 'N':
                        for (int n = 0; n < len; n++)
                        {
                            out.append(filler);
                        }
                        break;
                    case 'I':
                        next += len;
                        if (flagInsertions)
                        {
                            if (out.length() > 0)
                            {
                                out.setCharAt(out.length() - 1, '~');
                            }
                            else
                            {
                                flagPending = true;
                            }
                        }
                        break;
                    case 'S':
                        next += len;
                        break;
                    default:
                        // H and P consume neither read nor reference.
                        break;
                }
            }
            return out.toString();
        }

        public String getName()
        {
            return name;
        }

        /** Returns the read bases exactly as stored in the SAM record. */
        public String getRead()
        {
            return read;
        }

        /** Returns the quality string exactly as stored in the SAM record. */
        public String getQual()
        {
            return qual;
        }

        public int getTLength()
        {
            return tlen;
        }

        public int getStart()
        {
            return start;
        }

        /** Returns the PNEXT column (mate position), not the end of this read. */
        public int getEnd()
        {
            return mpos;
        }

        /** Returns the reverse complement of the stored read; characters other than A, C, G, T are kept. */
        private String reverseComplementRead()
        {
            StringBuilder rc = new StringBuilder(read.length());
            for (int p = read.length() - 1; p >= 0; p--)
            {
                char c = read.charAt(p);
                switch (c)
                {
                    case 'A': rc.append('T'); break;
                    case 'T': rc.append('A'); break;
                    case 'G': rc.append('C'); break;
                    case 'C': rc.append('G'); break;
                    default: rc.append(c); break;
                }
            }
            return rc.toString();
        }

        public String getCig()
        {
            return cigar;
        }
    }

    /** A read pair; start, end and name are taken from the first record. */
    private static final class SAMJoin
    {
        private final SAMString mate1;
        private final SAMString mate2;
        private final int mstart;
        private final String name;

        public SAMJoin(SAMString m1, SAMString m2)
        {
            mate1 = m1;
            mate2 = m2;
            mstart = m1.getStart();
            name = m1.getName();
        }

        public SAMString getMate1()
        {
            return mate1;
        }

        public SAMString getMate2()
        {
            return mate2;
        }

        public String getName()
        {
            return name;
        }

        /** Returns the start of mate 1 plus its TLEN column. */
        public int getEnd()
        {
            return (mstart + mate1.getTLength());
        }

        public int getStart()
        {
            return mstart;
        }
    }
}
Computing file changes ...