https://github.com/redayounsi/2kplus2
Raw File
Tip revision: 03943ab8b1e073bccf9b484f99358f5311c085a1 authored by Reda Younsi on 19 September 2018, 09:51:17 UTC
Update README.md
Tip revision: 03943ab
command.sh
#!/bin/sh
# Batch job that derives a labelled edi0 dataset from BLAST hits and an SNP
# list. The wall-clock time is printed when the job starts and when it ends.
#
# Scheduler directives follow. To the shell they are ordinary comments; the
# dollar-prefixed form is the Grid Engine convention and the PBS-prefixed form
# the PBS one (NOTE(review): confirm which scheduler actually runs this job).
#$ -q short.q
#$ -cwd 
#$ -S /bin/sh 
#PBS -l walltime=12:15:00

# Print the start timestamp followed by a marker line.
date
echo "-- start time --"

# Stage 1: build the inputs used to train a classifier by tagging every
# retained BLAST row with Y (it matches a retained SNP-list row) or N.

# Keep BLAST rows whose column 4 exceeds 20 and whose column 2 is neither
# "mitochondria" nor "chloroplast"; only columns 1, 2 and 9 are carried on.
awk '$4 > 20 && $2 != "mitochondria" && $2 != "chloroplast" { print $1, $2, $9 }' \
  ~/Desktop/blast/blastedi0.txt > ~/Desktop/blast/clean_blastedi0.txt

# Keep SNP-list rows whose column 3 is 0.
awk '$3 == 0' ~/Desktop/snplists/edi_0.v7c.sdi > ~/Desktop/snplists/edi0_snp_list.txt

# Drop the first three characters of every line.
# NOTE(review): presumably a fixed three-letter prefix on column 1 - confirm.
awk '{ print substr($0, 4) }' ~/Desktop/snplists/edi0_snp_list.txt \
  > ~/Desktop/snplists/clean_edi0_snp_list.txt

# Join: index the cleaned BLAST rows by columns 2 and 3 glued together, then
# print each SNP row whose columns 1 and 2 (glued the same way) hit the index,
# with the stored BLAST columns 1+2 appended after a tab.
# NOTE(review): the keys are concatenated without a separator, so distinct
# column pairs could in principle produce the same key - confirm the data
# cannot collide.
awk '
  NR == FNR   { hit[$2 $3] = $1 $2; next }
  hit[$1 $2]  { print $0 "\t" hit[$1 $2] }
' ~/Desktop/blast/clean_blastedi0.txt ~/Desktop/snplists/clean_edi0_snp_list.txt \
  > edi0_machlist.txt

# Trim the final character of every joined line.
# NOTE(review): this turns the appended "column1 column2" value back into
# column 1 only when BLAST column 2 is a single character - confirm.
awk '{ print substr($0, 1, length($0) - 1) }' edi0_machlist.txt > edi0_machlist_clean.txt

# Tag each cleaned BLAST row: Y when its column 1 appears as column 10 of the
# trimmed join output, N otherwise. The tag is separated from the row by a tab.
awk '
  NR == FNR { matched[$10] = $1; next }
  { print (($1 in matched) ? "Y" : "N") "\t" $0 }
' edi0_machlist_clean.txt ~/Desktop/blast/clean_blastedi0.txt > yn_edi0.txt


# Stage 2: split the tagged rows into a "yes" (Y) and a "no" (N) set and run
# the same clean-up chain on each. Every intermediate file is kept on disk.
for group in yes no; do
  case "$group" in
    yes) label=Y ;;
    no) label=N ;;
  esac

  # Rows carrying this tag in column 1.
  awk -v label="$label" '$1 == label' yn_edi0.txt > "${group}all.txt"

  # First occurrence of each column-2 value only.
  awk '!seen[$2]++' "${group}all.txt" > "${group}allnodup.txt"

  # Strip the ">branch0" and then the ">branch1" marker from column 2. The two
  # passes stay separate so the second one re-splits the output of the first.
  awk '{ gsub(">branch0", "", $2) } 1' "${group}allnodup.txt" > "${group}allnodup1.txt"
  awk '{ gsub(">branch1", "", $2) } 1' "${group}allnodup1.txt" > "${group}allnodup2.txt"

  # Names that differed only by the marker are now equal: de-duplicate again.
  awk '!seen[$2]++' "${group}allnodup2.txt" > "${group}allnodup3.txt"
done

# Keep only the "no" rows whose column 2 never appears in the "yes" set.
awk 'NR == FNR { positive[$2]; next } !($2 in positive)' \
  yesallnodup3.txt noallnodup3.txt > noallnodup4.txt

# Stage 3: prefix every row of the edi0 data table with a synthetic label
# (snp0, snp1, ...) so later stages can select rows by name.
# NOTE(review): numbering starts at 0; confirm it matches the identifiers used
# in column 2 of the yes/no lists.

# Number of rows in the data table; also echoed to the job log.
run=$(awk 'END { print NR }' ~/Desktop/datasets/edi0_data.txt)

echo "$run"

# Emit exactly one label per data row. The bound is strict (i < run): one more
# label would make paste append a trailing row that has a label but no data.
# The labels are generated in BEGIN, so the data file is not read a second time.
awk -v var="$run" 'BEGIN { for (i = 0; i < var; i++) print "snp" i }' \
  > ~/Desktop/datasets/edi0_data_index.txt

# Glue label and data row together, tab-separated.
paste ~/Desktop/datasets/edi0_data_index.txt ~/Desktop/datasets/edi0_data.txt \
  > ~/Desktop/datasets/edi0_data_indexed.txt


# Stage 4: pull the labelled data rows that belong to each set.

# Negative set: column 2 of the filtered "no" list names the wanted rows; a
# row of the indexed table is kept when its column 1 is one of those names.
awk '{ print $2 }' noallnodup4.txt > noallsnps.txt
awk 'NR == FNR { wanted[$1] = $1; next } wanted[$1]' \
  noallsnps.txt ~/Desktop/datasets/edi0_data_indexed.txt > neg_dataset_edi0.txt

# Positive set: same selection driven by the de-duplicated "yes" list.
awk '{ print $2 }' yesallnodup3.txt > yesallsnps.txt
awk 'NR == FNR { wanted[$1] = $1; next } wanted[$1]' \
  yesallsnps.txt ~/Desktop/datasets/edi0_data_indexed.txt > pos_dataset_edi0.txt



# Stage 5: partition rows on whether column 270 equals 44 ("col" files) or
# not ("mix" files), for the positive set and for the full data table.
# NOTE(review): rows of pos_dataset_edi0.txt start with the snp label while
# rows of edi0_data.txt do not, so column 270 may address a different data
# column in each file - confirm this is intended.
awk '$270 == 44' pos_dataset_edi0.txt > positives_col_edi0.txt
awk '$270 != 44' pos_dataset_edi0.txt > positives_mix_edi0.txt

awk '$270 == 44' ~/Desktop/datasets/edi0_data.txt > bub_col.txt
awk '$270 != 44' ~/Desktop/datasets/edi0_data.txt > bub_mix.txt

# Report the row count of each partition, one number per line, in this order.
for table in bub_col.txt bub_mix.txt positives_col_edi0.txt positives_mix_edi0.txt; do
  awk 'END { print NR }' "$table"
done




# Stage 6: label the rows, stack them under the header and export as CSV.

# Append the class label as a final space-separated column.
awk '{ print $0 " N" }' neg_dataset_edi0.txt > negative_edi0.txt
awk '{ print $0 " Y" }' positives_mix_edi0.txt > positive_edi0.txt

# Positives first, then negatives; the header file goes on top.
cat positive_edi0.txt negative_edi0.txt > dataset_edi0.txt
cat ~/Desktop/datacrea/header.txt dataset_edi0.txt > dataheader_edi0.txt

# Re-join the whitespace-separated fields with commas. Assigning a field to
# itself forces awk to rebuild the record with OFS; lines without any field
# are passed through untouched.
awk -v OFS="," 'NF { $1 = $1 } 1' dataheader_edi0.txt > dataheadercsv_edi0.csv

# Drop the first CSV column and publish the result.
cut -d, -f2- dataheadercsv_edi0.csv > ~/Desktop/csvdatasets/datasetheader_edi0.csv


# Print a marker line followed by the end timestamp.
echo "-- end time --"
date

# Disabled alternative that filters the comma-separated table on column 270:
#awk -F"," -v OFS="," '{ if ($270 != 44)  print $0}' dataheader.csv > datasetcom.csv
back to top