https://github.com/Danko-Lab/F1_8Organs
Tip revision: 8093a6c2ca1b15e869608c55bbef48dc539dad38 authored by shaopei on 10 May 2021, 03:46:04 UTC
updates
updates
Tip revision: 8093a6c
Genetics_or_imprinting_v2.bsh
###
cd /workdir/sc2457/F1_Tissues/GeneAlleleSpecificity_imprinting
# mouse mm10_GRCm38
# make bed from gtf from GENCODE
# gtf2bed < gencode.vM20.annotation.gtf > gencode.vM20.annotation.bed
cat gencode.vM20.annotation.bed |awk 'BEGIN {OFS="\t"} ($8=="transcript"){print $0}' | grep protein_coding |gzip > gencode.vM20.annotation_transcript_protein_coding.bed.gz
for Head in BN HT SK SP KD LV GI ST
do
echo $Head
# keep transcript annotations that are overlap with dREG sites
intersectBed -u -a <(zcat gencode.vM20.annotation_transcript_protein_coding.bed.gz | cut -f 1-6) -b <(zcat /workdir/sc2457/F1_Tissues/dREG/Browser/${Head}_all.dREG.peak.score.bed.gz) | gzip > ${Head}_gencode.vM20.annotation_transcript_protein_coding.bed.gz
# avoid multiple count of transcript share similarl location, need to merge bed of overlaping transcript as follows:
# -s Force strandedness
bedtools merge -i <(zcat ${Head}_gencode.vM20.annotation_transcript_protein_coding.bed.gz) -s -o distinct -c 4,5,6 |awk 'BEGIN {OFS="\t"; a="111"} {print $1,$2,$3,$4,a,$6}'|LC_ALL=C sort --temporary-directory=/workdir/sc2457/tmp/ --parallel=10 -V | grep -v chrX | grep -v chrY > ${Head}_gencode.vM20.annotation_transcript_protein_coding_Merged.bed
cat ${Head}_gencode.vM20.annotation_transcript_protein_coding_Merged.bed | awk 'BEGIN {OFS="\t"} ($6=="+"){print $0}'> ${Head}_gencode.vM20.annotation_transcript_protein_coding_Merged_plus.bed
cat ${Head}_gencode.vM20.annotation_transcript_protein_coding_Merged.bed | awk 'BEGIN {OFS="\t"} ($6=="-"){print $0}'> ${Head}_gencode.vM20.annotation_transcript_protein_coding_Merged_minus.bed
done
# Update: DO NOT merge the transcripts
for Head in BN HT SK SP KD LV GI ST
do
echo $Head
# keep transcript annotations that are overlap with dREG sites
intersectBed -u -a <(zcat gencode.vM20.annotation_transcript_protein_coding.bed.gz | cut -f 1-6) -b <(zcat /workdir/sc2457/F1_Tissues/dREG/Browser/${Head}_all.dREG.peak.score.bed.gz) |LC_ALL=C sort --temporary-directory=/workdir/sc2457/tmp/ --parallel=10 -V| grep -v chrX | grep -v chrY | gzip > ${Head}_gencode.vM20.annotation_transcript_protein_coding.bed.gz
zcat ${Head}_gencode.vM20.annotation_transcript_protein_coding.bed.gz | awk 'BEGIN {OFS="\t"} ($6=="+"){print $0}'> ${Head}_gencode.vM20.annotation_transcript_protein_coding_plus.bed
zcat ${Head}_gencode.vM20.annotation_transcript_protein_coding.bed.gz | awk 'BEGIN {OFS="\t"} ($6=="-"){print $0}'> ${Head}_gencode.vM20.annotation_transcript_protein_coding_minus.bed
done
# use tunits that DO overlap with the gene transcript from above as another set of gene transcription
for Head in BN HT SK SP KD LV GI ST
do
echo $Head
ln -s /workdir/sc2457/F1_Tissues/bigWig/${Head}_all_h5.preds.full.bed .
intersectBed -u -s -a ${Head}_all_h5.preds.full.bed -b ${Head}_gencode.vM20.annotation_transcript_protein_coding_Merged.bed |cut -f 1-6| LC_ALL=C sort --temporary-directory=/workdir/sc2457/tmp/ --parallel=10 -V |grep -v chrX |grep -v chrY > ${Head}_all_h5.preds.full_inProtein_coding.bed
cat ${Head}_all_h5.preds.full_inProtein_coding.bed | awk 'BEGIN {OFS="\t"} ($6=="+"){print $0}' > ${Head}_all_h5.preds.full_inProtein_coding_plus.bed
cat ${Head}_all_h5.preds.full_inProtein_coding.bed | awk 'BEGIN {OFS="\t"} ($6=="-"){print $0}' > ${Head}_all_h5.preds.full_inProtein_coding_minus.bed
done
### calculate the numbe of ncRNA in imprinting
# use tunits that do not overlap with the gene transcript from above
for Head in BN HT SK SP KD LV GI ST
do
echo $Head
#ln -s /workdir/sc2457/F1_Tissues/bigWig/${Head}_all_h5.preds.full.bed .
# -v Only report those entries in A that have no overlap in B. Restricted by -f and -r.
intersectBed -s -v -a ${Head}_all_h5.preds.full.bed -b ${Head}_gencode.vM20.annotation_transcript_protein_coding_Merged.bed |cut -f 1-6| LC_ALL=C sort --temporary-directory=/workdir/sc2457/tmp/ --parallel=10 -V |grep -v chrX |grep -v chrY > ${Head}_all_h5.preds.full_outProtein_coding.bed
cat ${Head}_all_h5.preds.full_outProtein_coding.bed | awk 'BEGIN {OFS="\t"} ($6=="+"){print $0}' > ${Head}_all_h5.preds.full_outProtein_coding_plus.bed
cat ${Head}_all_h5.preds.full_outProtein_coding.bed | awk 'BEGIN {OFS="\t"} ($6=="-"){print $0}' > ${Head}_all_h5.preds.full_outProtein_coding_minus.bed
done
// # use dREG sites to represent ncRNA not in tunits? (did not work. very few allelic biase)
// for Head in BN HT SK SP KD LV GI ST
// do
// echo $Head
// # keep dREG sites that are NOT overlap with tanscript and tunits above
// intersectBed -v -a <(zcat /workdir/sc2457/F1_Tissues/dREG/Browser/${Head}_all.dREG.peak.score.bed.gz) -b <(cat ${Head}_all_h5.preds.full.bed ${Head}_gencode.vM20.annotation_transcript_protein_coding_Merged.bed |cut -f 1-6) | LC_ALL=C sort --temporary-directory=/workdir/sc2457/tmp/ --parallel=10 -V > ${Head}_dREG_outProtein_outTunit.bed
// cat ${Head}_dREG_outProtein_outTunit.bed | awk 'BEGIN {OFS="\t"} {print $0, "111", "+"}'> ${Head}_dREG_outProtein_outTunit_plus.bed
// cat ${Head}_dREG_outProtein_outTunit.bed | awk 'BEGIN {OFS="\t"} {print $0, "111", "-"}'> ${Head}_dREG_outProtein_outTunit_minus.bed
// done
### Do the calculation here
wait_a_second() {
joblist=($(jobs -p))
while (( ${#joblist[*]} >= 10 ))
do
sleep 1
joblist=($(jobs -p))
done
}
PL=/workdir/sc2457/alleleDB/alleledb_pipeline_mouse
MAPS=/workdir/sc2457/mouse_AlleleSpecific/mouse_genome.sanger.ac.uk/REL-1505-SNPs_Indels/PersonalGenome_P.CAST_M.B6_indelsNsnps_CAST.bam/%s_P.CAST.EiJ_M.C57BL.6J.map
FDR_SIMS=10
FDR_CUTOFF=0.1
BinomialTest(){
# BinomialTest $f $j $MAT_READ_BED $PAT_READ_BED
f=$1
j=$2
MAT_READ_BED=$3
PAT_READ_BED=$4
bedtools coverage -s -a $f -b ${MAT_READ_BED} -sorted| awk 'BEGIN {OFS="\t"; t="_"} {print $1t$2t$3, $1,$2,$3,$7,$8,$9,$10}' |LC_ALL=C sort -k1,1V -k2,2n > ${j}.mat_cov.bed &
bedtools coverage -s -a $f -b ${PAT_READ_BED} -sorted| awk 'BEGIN {OFS="\t"; t="_"} {print $1t$2t$3, $1,$2,$3,$7,$8,$9,$10}' |LC_ALL=C sort -k1,1V -k2,2n > ${j}.pat_cov.bed &
#bedtools coverage -s -a $f -b ${IDENTICAL_READ_BED} -sorted| awk 'BEGIN {OFS="\t"; t="_"} {print $1t$2t$3, $1,$2,$3,$7,$8,$9,$10}' |LC_ALL=C sort -k1,1V -k2,2n > ${j}.iden_cov.bed &
wait
# filter the block and only keep block with at lease 1 allele-specific read
LC_ALL=C join -t $'\t' -j 1 -o 1.1,1.2,1.3,1.4,1.5,2.5 ${j}.mat_cov.bed ${j}.pat_cov.bed | awk 'BEGIN{OFS="\t"} ($5+$6 >0) {print $0}' | \
awk 'BEGIN{OFS="\t"; t=","} ($5>$6) {print $2, $3, $4, "M"t$5t$6, $5, $6, "-" } ($5<$6) {print $2, $3, $4, "P"t$5t$6, $5, $6, "-"} ($5==$6) {print $2, $3, $4, "S"t$5t$6, $5, $6, "-"} ' \
| LC_ALL=C sort -k1,1V -k2,2n --parallel=30 > ${j}.merged_cov.bed
mv ${j}.mat_cov.bed ${j}.pat_cov.bed toremove
# output of BinomialTestFor_merged_cov.bed.py:(hmm+BinomialTest) if p-value <= 0.05, remain what it got from hmm (can ne M,P, or S), otherwise S.
python ${PL}/BinomialTestFor_merged_cov.bed.py ${j}.merged_cov.bed ${j}_binomtest.bed
mv ${j}.merged_cov.bed toremove
python ${PL}/FalsePosFor_merged_cov.bed.py ${j}_binomtest.bed ${FDR_SIMS} ${FDR_CUTOFF} > ${j}_binomtest_FDR${FDR_CUTOFF}.txt
awk 'NR==1 { print $0 } NR>1 && ($9+0) <= thresh { print $0 }' thresh=$(awk 'END {print $6}' ${j}_binomtest_FDR${FDR_CUTOFF}.txt) < ${j}_binomtest.bed > ${j}_interestingHets.bed &
}
## map reads from each individual sample from the same tissue and cross to the combined region of interest (protein-coding or ncRNA)
#
mkdir BinomialTest
cd BinomialTest
ln -s /workdir/sc2457/F1_Tissues/3rd_batch/map2ref/map2ref_bed/ .
mkdir toremove
for Head in BN HT SK SP KD LV GI ST
do
echo $Head
bed_dir=map2ref_bed
for f in ${bed_dir}/${Head}_MB6_all_R1.mat.bowtie.gz_AMBremoved_sorted_identical.map2ref.sorted.bed.gz #each samples from MB6 of the same tissue
do PREFIX=`echo $f|cut -d . -f 1`
echo $PREFIX
MAT_READ_BED=${PREFIX}.mat.bowtie.gz_AMBremoved_sorted_specific.map2ref.sorted.bed.gz
PAT_READ_BED=${PREFIX}.pat.bowtie.gz_AMBremoved_sorted_specific.map2ref.sorted.bed.gz
#IDENTICAL_READ_BED=${PREFIX}.mat.bowtie.gz_AMBremoved_sorted_identical.map2ref.sorted.bed.gz
P=`echo $PREFIX| cut -d / -f 2`
ln -s ../${Head}_all_h5.preds.full_outProtein_coding_plus.bed ${P}_ncRNA_plus.bed
ln -s ../${Head}_all_h5.preds.full_outProtein_coding_minus.bed ${P}_ncRNA_minus.bed
ln -s ../${Head}_all_h5.preds.full_inProtein_coding_plus.bed ${P}_tunitGene_plus.bed
ln -s ../${Head}_all_h5.preds.full_inProtein_coding_minus.bed ${P}_tunitGene_minus.bed
ln -s ../${Head}_gencode.vM20.annotation_transcript_protein_coding_plus.bed ${P}_ProteinCodingGene_plus.bed
ln -s ../${Head}_gencode.vM20.annotation_transcript_protein_coding_minus.bed ${P}_ProteinCodingGene_minus.bed
BinomialTest ${P}_ncRNA_plus.bed ${P}_ncRNA_plus ${MAT_READ_BED} ${PAT_READ_BED} &
BinomialTest ${P}_ncRNA_minus.bed ${P}_ncRNA_minus ${MAT_READ_BED} ${PAT_READ_BED} &
BinomialTest ${P}_tunitGene_plus.bed ${P}_tunitGene_plus ${MAT_READ_BED} ${PAT_READ_BED} &
BinomialTest ${P}_tunitGene_minus.bed ${P}_tunitGene_minus ${MAT_READ_BED} ${PAT_READ_BED} &
BinomialTest ${P}_ProteinCodingGene_plus.bed ${P}_ProteinCodingGene_plus ${MAT_READ_BED} ${PAT_READ_BED} &
BinomialTest ${P}_ProteinCodingGene_minus.bed ${P}_ProteinCodingGene_minus ${MAT_READ_BED} ${PAT_READ_BED} &
wait_a_second
done
for f in ${bed_dir}/${Head}_PB6_all_R1.mat.bowtie.gz_AMBremoved_sorted_identical.map2ref.sorted.bed.gz #each samples from PB6 of the same tissue
do PREFIX=`echo $f|cut -d . -f 1`
echo $PREFIX
# switch -m and -p for PB6
PAT_READ_BED=${PREFIX}.mat.bowtie.gz_AMBremoved_sorted_specific.map2ref.sorted.bed.gz
MAT_READ_BED=${PREFIX}.pat.bowtie.gz_AMBremoved_sorted_specific.map2ref.sorted.bed.gz
#IDENTICAL_READ_BED=${PREFIX}.mat.bowtie.gz_AMBremoved_sorted_identical.map2ref.sorted.bed.gz
P=`echo $PREFIX| cut -d / -f 2`
ln -s ../${Head}_all_h5.preds.full_outProtein_coding_plus.bed ${P}_ncRNA_plus.bed
ln -s ../${Head}_all_h5.preds.full_outProtein_coding_minus.bed ${P}_ncRNA_minus.bed
ln -s ../${Head}_all_h5.preds.full_inProtein_coding_plus.bed ${P}_tunitGene_plus.bed
ln -s ../${Head}_all_h5.preds.full_inProtein_coding_minus.bed ${P}_tunitGene_minus.bed
ln -s ../${Head}_gencode.vM20.annotation_transcript_protein_coding_plus.bed ${P}_ProteinCodingGene_plus.bed
ln -s ../${Head}_gencode.vM20.annotation_transcript_protein_coding_minus.bed ${P}_ProteinCodingGene_minus.bed
BinomialTest ${P}_ncRNA_plus.bed ${P}_ncRNA_plus ${MAT_READ_BED} ${PAT_READ_BED} &
BinomialTest ${P}_ncRNA_minus.bed ${P}_ncRNA_minus ${MAT_READ_BED} ${PAT_READ_BED} &
BinomialTest ${P}_tunitGene_plus.bed ${P}_tunitGene_plus ${MAT_READ_BED} ${PAT_READ_BED} &
BinomialTest ${P}_tunitGene_minus.bed ${P}_tunitGene_minus ${MAT_READ_BED} ${PAT_READ_BED} &
BinomialTest ${P}_ProteinCodingGene_plus.bed ${P}_ProteinCodingGene_plus ${MAT_READ_BED} ${PAT_READ_BED} &
BinomialTest ${P}_ProteinCodingGene_minus.bed ${P}_ProteinCodingGene_minus ${MAT_READ_BED} ${PAT_READ_BED} &
wait_a_second
done
done
# check if all the files were run sucessfully
for Head in BN HT SK SP KD LV GI ST
do
tail -n 1 ${Head}*all_R1_*binomtest_FDR0.1.txt
done
### identify imprinting and strain effect WITHOUT the strict filter of consistency
mkdir AllelicBiase_useAll_ignore_Consistency/
cd AllelicBiase_useAll_ignore_Consistency/
ln -s ../*all_R1_*_interestingHets.bed . #*/
join_AB(){
file1=$1
file2=$2
file3=$3
cat ${file1} ${file2} |grep -v "#" | awk 'BEGIN{t="_"} {print $1t$2t$3}' |sort -k1,1|uniq > ${file1}.tmp1
join -a 1 -t $'\t' -e S -j 1 -o 1.1,2.5 ${file1}.tmp1 <(<${file1} awk 'BEGIN{OFS="\t"; t="_"} {print $1t$2t$3,$0}' | sort -k1,1) > ${file1}.tmp2
join -a 1 -t $'\t' -e S -j 1 -o 1.1,1.2,2.5 ${file1}.tmp2 <(<${file2} awk 'BEGIN{OFS="\t"; t="_"} {print $1t$2t$3,$0}' | sort -k1,1) | awk 'BEGIN{OFS="\t"} {split($1,a,"_"); print a[1],a[2],a[3], $2, $3}' > ${file3}
rm ${file1}.tmp*
}
# make a table with Allelic Bias (AB) at MB6 and PB6 and use that to identify strain-effect, imprinting, and others (one S, one M|P)
for body in all_R1_tunitGene all_R1_ncRNA all_R1_ProteinCodingGene
do
#KD_PB6_all_R1_ncRNA_plus_interestingHets.bed
for f in *_${body}_plus_interestingHets.bed
do t=`echo $f|cut -d _ -f 1`
for s in plus minus
do
join_in1=${t}_MB6_${body}_${s}_interestingHets.bed
join_in2=${t}_PB6_${body}_${s}_interestingHets.bed
join_out=${t}_${body}_${s}_interestingHets.bed
h=${t}_${body}_${s}_fdr0.1_effect
join_AB ${join_in1} ${join_in2} ${join_out}
grep S ${join_out} > ${h}_NA.bed
im=${h}_imprinting.bed
st=${h}_strain.bed
rm ${im} ${st}
grep -v S ${join_out} | awk -v im=${im} -v st=${st} 'BEGIN{OFS="\t"} substr($4,1,1)==substr($5,1,1) {print $0 >> im} substr($4,1,1)!=substr($5,1,1) {print $0 >> st}'
sed -i '1i #chrm\tchrmStart\tchrmEnd\tMB6_winP_count_all_samples\tPB6_winP_count_all_samples' ${join_out}
done
done
done
rm temp_note
for body in all_R1_tunitGene all_R1_ncRNA all_R1_ProteinCodingGene
do
for head in BN SP HT SK KD ST GI LV
do for s in plus minus
do
wc -l ${head}_${body}_${s}_fdr0.1_effect* |paste - - - - >> temp_note
done
done
done
#HERE
#CombineT8 (overlap blocks NOT combined)
for effect in imprinting strain NA
do
echo ${effect}
for body in ProteinCodingGene tunitGene ncRNA
do echo $body
rm T8_${body}_fdr0.1_effect_${effect}.bed
for f in *_${body}_*_fdr0.1_effect_${effect}.bed #BN_all_R1_ncRNA_plus_fdr0.1_effect_imprinting.bed
do t=`echo $f|cut -d "_" -f 1`
s=`echo $f|rev| cut -d "_" -f 4|rev`
cat $f |awk -v t=${t} -v s=${s} 'BEGIN{OFS="\t"} (s=="plus"){print $1,$2,$3 , t, "111", "+" } (s=="minus"){print $1,$2,$3 , t, "111", "-" }' >> T8_${body}_fdr0.1_effect_${effect}.bed
done
# report the number of blocks CombineT8 (overlap blocks NOT combined)
cat T8_${body}_fdr0.1_effect_${effect}.bed | awk 'BEGIN{OFS="\t"} {print $1,$2,$3,$6}' |sort |uniq -c |wc -l
done
done
#CombineT8 (overlap blocks DO combined)
for effect in strain imprinting NA
do
for body in ProteinCodingGene tunitGene ncRNA
do echo $body
#bedtools merge -i <(sort-bed T8_${body}_fdr0.1_effect_${effect}.bed) -c 4 -o distinct -d 0 > T8_${body}_fdr0.1_effect_${effect}_merged.bed
bedtools merge -s -i <(sort-bed T8_${body}_fdr0.1_effect_${effect}.bed) -c 4,5,6 -o distinct -d 0 > T8_${body}_fdr0.1_effect_${effect}_s_merged.bed
done
done