https://github.com/redayounsi/2kplus2
Tip revision: 03943ab8b1e073bccf9b484f99358f5311c085a1 authored by Reda Younsi on 19 September 2018, 09:51:17 UTC
Update README.md
Update README.md
Tip revision: 03943ab
command.sh
#!/bin/sh
# Batch-job header for the dataset-building script.
# The three option lines below use the Grid Engine embedded-option prefix:
# submit to queue short.q, run in the current working directory, and use
# /bin/sh as the job shell.
# NOTE(review): a PBS-style directive is present as well; presumably each
# scheduler only honours its own lines -- confirm which scheduler is in use.
#$ -q short.q
#$ -cwd
#$ -S /bin/sh
#date
# PBS-style resource request: wall-clock limit of 12 h 15 min.
#PBS -l walltime=12:15:00
#pwd
# Write the current date, then a start marker, to the job's stdout.
date
echo -- start time --
# Awk steps that create the datasets used to train a classifier.
# This first stage produces edi0_machlist_clean.txt: the SNP rows that share
# their first two fields with a filtered BLAST row.

# join_snps_to_hits HITS SNPS
#   HITS: three-field lines (as written by step 1 below).
#   SNPS: lines whose fields 1 and 2 are compared with HITS fields 2 and 3.
#   Prints every SNPS line whose (field 1, field 2) pair occurs in HITS,
#   followed by a tab and the HITS line's field 1 concatenated with field 2.
#   The pair is stored as a two-part array key; joining the two fields
#   without a separator would let "1","23" collide with "12","3".
join_snps_to_hits() {
  awk '
    FILENAME == ARGV[1] { hit[$2, $3] = $1 $2; next }
    ($1, $2) in hit     { print $0 "\t" hit[$1, $2] }
  ' "$1" "$2"
}

# 1. Keep BLAST rows whose 4th field exceeds 20 and whose 2nd field is neither
#    "mitochondria" nor "chloroplast"; retain fields 1, 2 and 9 only.
awk '$4 > 20 && $2 != "mitochondria" && $2 != "chloroplast" { print $1, $2, $9 }' \
  ~/Desktop/blast/blastedi0.txt > ~/Desktop/blast/clean_blastedi0.txt

# 2. Keep SNP rows whose 3rd field equals 0.
awk '$3 == 0' ~/Desktop/snplists/edi_0.v7c.sdi > ~/Desktop/snplists/edi0_snp_list.txt

# 3. Drop the first three characters of every line.
#    NOTE(review): presumably this strips a 3-character prefix from the
#    first field so it compares equal to BLAST field 2 -- confirm.
awk '{ print substr($0, 4) }' ~/Desktop/snplists/edi0_snp_list.txt \
  > ~/Desktop/snplists/clean_edi0_snp_list.txt

# 4. Attach the matching BLAST id to each SNP row.
join_snps_to_hits ~/Desktop/blast/clean_blastedi0.txt \
  ~/Desktop/snplists/clean_edi0_snp_list.txt > edi0_machlist.txt

# 5. Remove the final character of every line.
#    NOTE(review): presumably this cuts a one-character BLAST field 2 off the
#    end of the value appended in step 4, leaving the bare id -- confirm.
awk '{ print substr($0, 1, length($0) - 1) }' edi0_machlist.txt > edi0_machlist_clean.txt
# label_hits MATCHED HITS
#   Prefixes every HITS line with "Y<TAB>" when its first field occurs as the
#   10th field of some MATCHED line, and with "N<TAB>" otherwise.
#   The first file is recognised by name instead of with NR==FNR, so an empty
#   MATCHED file yields all-"N" output rather than no output at all.
label_hits() {
  awk '
    FILENAME == ARGV[1] { matched[$10] = $1; next }
    { print (($1 in matched) ? "Y" : "N") "\t" $0 }
  ' "$1" "$2"
}

# Flag each filtered BLAST row according to whether it matched a SNP.
label_hits edi0_machlist_clean.txt ~/Desktop/blast/clean_blastedi0.txt > yn_edi0.txt

# Split the flagged rows into one file per label.
awk '$1 == "Y"' yn_edi0.txt > yesall.txt
awk '$1 == "N"' yn_edi0.txt > noall.txt
# first_per_id FILE
#   Prints only the first line seen for each distinct value of field 2.
first_per_id() {
  awk '!seen[$2]++' "$1"
}

# strip_from_id TAG FILE
#   Deletes every occurrence of TAG from field 2 and prints each line.
strip_from_id() {
  awk -v tag="$1" '{ gsub(tag, "", $2) } 1' "$2"
}

# rows_missing_from KNOWN CANDIDATES
#   Prints the CANDIDATES lines whose field 2 is not field 2 of any KNOWN line.
#   KNOWN is recognised by name instead of with FNR==NR, so an empty KNOWN file
#   lets every candidate through instead of silently producing no output.
rows_missing_from() {
  awk '
    FILENAME == ARGV[1] { known[$2] = 1; next }
    !($2 in known)
  ' "$1" "$2"
}

# Apply the same clean-up to the "no" and "yes" lists: de-duplicate on field 2,
# remove the ">branch0" and ">branch1" tags from it, then de-duplicate again
# because stripping the tags can make two ids identical.
for label in no yes; do
  first_per_id "${label}all.txt" > "${label}allnodup.txt"
  strip_from_id ">branch0" "${label}allnodup.txt" > "${label}allnodup1.txt"
  strip_from_id ">branch1" "${label}allnodup1.txt" > "${label}allnodup2.txt"
  first_per_id "${label}allnodup2.txt" > "${label}allnodup3.txt"
done

# Keep only the "no" rows whose id does not also occur among the "yes" rows.
rows_missing_from yesallnodup3.txt noallnodup3.txt > noallnodup4.txt
# snp_labels FILE
#   Prints "snp0", "snp1", ... -- exactly one label per line of FILE.
#   Labels are derived from the record number while reading, so the label
#   count always equals the row count (a 0..N inclusive counter would emit
#   N+1 labels and make paste append a label-only line).
snp_labels() {
  awk '{ print "snp" (NR - 1) }' "$1"
}

# Report the number of data rows on stdout.
run=$(awk 'END { print NR }' ~/Desktop/datasets/edi0_data.txt)
echo "$run"

# Write the label column, then glue it in front of the data rows (tab-separated).
snp_labels ~/Desktop/datasets/edi0_data.txt > ~/Desktop/datasets/edi0_data_index.txt
paste ~/Desktop/datasets/edi0_data_index.txt ~/Desktop/datasets/edi0_data.txt \
  > ~/Desktop/datasets/edi0_data_indexed.txt
# Extract the id column (field 2) of the cleaned "no" and "yes" lists.
awk '{ print $2 }' noallnodup4.txt > noallsnps.txt
awk '{ print $2 }' yesallnodup3.txt > yesallsnps.txt

# Pull the indexed data rows whose first field is one of those ids:
# the "no" ids give the negative set, the "yes" ids the positive set.
awk 'NR == FNR { wanted[$1] = $1; next } wanted[$1]' \
  noallsnps.txt ~/Desktop/datasets/edi0_data_indexed.txt > neg_dataset_edi0.txt
awk 'NR == FNR { wanted[$1] = $1; next } wanted[$1]' \
  yesallsnps.txt ~/Desktop/datasets/edi0_data_indexed.txt > pos_dataset_edi0.txt
# Partition rows on whether field 270 equals 44 ("col") or not ("mix"),
# first for the positive set and then for the complete data file.
# NOTE(review): pos_dataset_edi0.txt rows come from the indexed file and so
# carry a leading snp<N> label that edi0_data.txt lacks; field 270 is therefore
# not the same data column in the two inputs -- confirm this is intended.
awk '$270 == 44' pos_dataset_edi0.txt > positives_col_edi0.txt
awk '$270 != 44' pos_dataset_edi0.txt > positives_mix_edi0.txt
awk '$270 == 44' ~/Desktop/datasets/edi0_data.txt > bub_col.txt
awk '$270 != 44' ~/Desktop/datasets/edi0_data.txt > bub_mix.txt

# Print the row count of each partition, one number per line.
for part in bub_col.txt bub_mix.txt positives_col_edi0.txt positives_mix_edi0.txt; do
  awk 'END { print NR }' "$part"
done
# Append the class label as a final space-separated field:
# "N" for negative rows, "Y" for the positive "mix" rows.
awk -v class=N '{ print $0 " " class }' neg_dataset_edi0.txt > negative_edi0.txt
awk -v class=Y '{ print $0 " " class }' positives_mix_edi0.txt > positive_edi0.txt

# Positives first, then negatives; afterwards put the header file on top.
cat positive_edi0.txt negative_edi0.txt > dataset_edi0.txt
cat ~/Desktop/datacrea/header.txt dataset_edi0.txt > dataheader_edi0.txt

# Convert to CSV: touching every field makes awk rebuild the line with "," as
# the output separator.
awk 'BEGIN { OFS = "," }
     { for (col = 1; col <= NF; col++) gsub(/^ */, "", $col); print }' \
  dataheader_edi0.txt > dataheadercsv_edi0.csv

# Drop the first comma-separated column (the snp<N> label on data rows).
cut -d ',' -f 2- dataheadercsv_edi0.csv > ~/Desktop/csvdatasets/datasetheader_edi0.csv

# Write an end marker, then the current date, to the job's stdout.
printf '%s\n' '-- end time --'
date
#awk -F"," -v OFS="," '{ if ($270 != 44) print $0}' dataheader.csv > datasetcom.csv