https://github.com/Danko-Lab/F1_8Organs
Tip revision: 8093a6c2ca1b15e869608c55bbef48dc539dad38 authored by shaopei on 10 May 2021, 03:46:04 UTC
updates
updates
Tip revision: 8093a6c
PolyA_Allele-specific-manuscript_2021updates.bsh
cd /workdir/sc2457/F1_Tissues/PolyA_Allele-specific-manuscript
## identify allele specific termination
# intersect annotation (protein-coding genes) and AlleleHMM (native) blocks
ln -s ../Find_consistent_blocks/HMM_bed . # AlleleHMM blocks
mkdir annotations
cd annotations
ln -s /workdir/sc2457/mouse_AlleleSpecific/mouse_genome.sanger.ac.uk/REL-1505-SNPs_Indels/PersonalGenome_P.CAST_M.B6_indelsNsnps_CAST.bam/gencode.vM25.annotation.gtf .
# mouse mm10_GRCm38
# make bed from gtf from GENCODE
cat gencode.vM25.annotation.gtf | grep protein_coding| awk 'BEGIN{OFS="\t"} $3=="transcript" {split($12,a,"\""); split($16,b,"\""); print $1, $4-1, $5,a[2], b[2],$7}' | gzip > gencode.vM25.annotation_transcript_protein_coding.bed.gz
for Head in BN HT SK SP KD LV GI ST
do
echo $Head
# keep transcript annotations that are overlap with dREG sites
# use sort-bed
intersectBed -u -a <(zcat gencode.vM25.annotation_transcript_protein_coding.bed.gz | cut -f 1-6) -b <(zcat /workdir/sc2457/F1_Tissues/dREG/Browser/${Head}_all.dREG.peak.score.bed.gz) |sort-bed - | grep -v chrX | grep -v chrY | gzip > ${Head}_gencode.vM25.annotation_transcript_protein_coding.bed.gz
zcat ${Head}_gencode.vM25.annotation_transcript_protein_coding.bed.gz | awk 'BEGIN {OFS="\t"} ($6=="+"){print $0}'| gzip > ${Head}_gencode.vM25.annotation_transcript_protein_coding_plus.bed.gz
zcat ${Head}_gencode.vM25.annotation_transcript_protein_coding.bed.gz | awk 'BEGIN {OFS="\t"} ($6=="-"){print $0}'| gzip > ${Head}_gencode.vM25.annotation_transcript_protein_coding_minus.bed.gz
done
cd ..
# use tunits that DO overlap with the gene transcript with dREG sites from above as gene transcription
# exclude chrX and chrY
mkdir tunit_protein_coding
cd tunit_protein_coding
ln -s /workdir/sc2457/F1_Tissues/bigWig/tunit .
for Head in BN HT SK SP KD LV GI ST
do
echo $Head
intersectBed -u -s -a tunit/${Head}_all_h5.preds.full.bed -b <(zcat ../annotations/${Head}_gencode.vM25.annotation_transcript_protein_coding.bed.gz) |grep -v chrX |grep -v chrY \
> ${Head}_all_h5.preds.full_inProtein_coding.bed
done
// wc -l *
// 19201 BN_all_h5.preds.full_inProtein_coding.bed
// 21398 GI_all_h5.preds.full_inProtein_coding.bed
// 18334 HT_all_h5.preds.full_inProtein_coding.bed
// 21801 KD_all_h5.preds.full_inProtein_coding.bed
// 21374 LV_all_h5.preds.full_inProtein_coding.bed
// 14017 SK_all_h5.preds.full_inProtein_coding.bed
// 18127 SP_all_h5.preds.full_inProtein_coding.bed
// 21208 ST_all_h5.preds.full_inProtein_coding.bed
cd ..
for Head in BN HT SK SP KD LV GI ST
do
echo $Head
# use first bp of HMM
# tunits that overlaps with 1st bp of AlleleHMM blocks
# -u Write original A entry once if any overlaps found in B
intersectBed -s -u -a <(cat tunit_protein_coding/${Head}_all_h5.preds.full_inProtein_coding.bed |cut -f 1-6| grep -v chrX | grep -v chrY) \
-b <(cat HMM_bed/${Head}_*.bed | grep -v '#'| awk 'BEGIN {OFS="\t"} ($6=="+") {print $1, $2, $2+1, $4, NR,$6} ($6=="-") {print $1, $3-1, $3, $4, NR,$6}') \
> ${Head}_AT_0tunitIntersectNativeHMM_tunits.bed #AT is AllelicTermination
# the 1bp where 1st bp of AlleleHMM blocks overlap with tunits
intersectBed -s -wo -b <(cat tunit_protein_coding/${Head}_all_h5.preds.full_inProtein_coding.bed |cut -f 1-6| grep -v chrX | grep -v chrY) \
-a <(cat HMM_bed/${Head}_*.bed | grep -v '#'| awk 'BEGIN {OFS="\t"} ($6=="+") {print $1, $2, $2+1, $4, NR,$6} ($6=="-") {print $1, $3-1, $3, $4, NR,$6}') \
|cut -f 1-12 > ${Head}_wb_tmp.bed
# AlleleHMM blocks whose 1st bp overlap with tunits col1-6, col7-12 the tunits that overlap with AlleleHMM
# keep AlleleHMM blocks where the 1st bp were inside Tunits
intersectBed -s -wo -a <(cat HMM_bed/${Head}_*.bed | grep -v '#' | awk 'BEGIN {OFS="\t"} {print $1, $2, $3, $4, NR,$6}' ) -b ${Head}_wb_tmp.bed \
| awk 'BEGIN {OFS="\t"}($5==$11){print $1,$2,$3,$4,$5,$6, $13, $14, $15, $16, $17, $18}' > ${Head}_AT_0tunitIntersectNativeHMM_AlleleHMM.bed
rm ${Head}_wb_tmp.bed
## examine the pair tunits and AlleleHMM blocks to remove internal enhancers
# from the blocks that 1st bp within tunits
# keep the ones whose last bp outside 0-90% of tunit length
rm ${Head}_pair_tmp -r
mkdir ${Head}_pair_tmp
cat ${Head}_AT_0tunitIntersectNativeHMM_AlleleHMM.bed | awk -v d=${Head}_pair_tmp 'BEGIN {OFS="\t"} {print $0 >> d"/"$10".bed"}'
rm ${Head}_AT_2tunitIntersectNativeHMM_tmp.bed
for f in ${Head}_pair_tmp/*.bed #*/
do
#last bp of AlleleHMM blocks
cat ${f} |awk 'BEGIN {OFS="\t"} ($6=="+") {print $1, $3-1, $3, $4, $5,$6} ($6=="-") {print $1, $2, $2+1, $4, $5,$6}' > temp_a
# first 90% length of tunits
cat ${f} | cut -f 7- |uniq | awk '{OFS="\t"}($6=="+"){s=$2; e=$3; print $1,$2, e-((e-s)/10),$4,$5,$6} ($6=="-"){s=$2; e=$3; print $1, s+((e-s)/10), $3,$4,$5,$6}' > temp_b
# keep the ones whose last bp outside 0-90% of tunit length
intersectBed -s -v -a temp_a -b temp_b >> ${Head}_AT_2tunitIntersectNativeHMM_tmp.bed
done
rm ${Head}_pair_tmp -r
# Recover AlleleHMM blocks location (instead of only last bp)
intersectBed -s -wo -a ${Head}_AT_0tunitIntersectNativeHMM_AlleleHMM.bed -b ${Head}_AT_2tunitIntersectNativeHMM_tmp.bed \
| awk 'BEGIN {OFS="\t"}($5==$17){print $0}'|cut -f 1-12 |uniq |sort-bed - |uniq > ${Head}_AT_2tunitIntersectNativeHMM_AlleleHMM.bed
cat ${Head}_AT_2tunitIntersectNativeHMM_AlleleHMM.bed |cut -f 7- |uniq |sort-bed - |uniq > ${Head}_AT_2tunitIntersectNativeHMM_tunit.bed
done
# some tunits have two AlleleHMM blocks, one from MB6 the other from PB6
1091 BN_AT_2tunitIntersectNativeHMM_AlleleHMM.bed
879 BN_AT_2tunitIntersectNativeHMM_tunit.bed
904 GI_AT_2tunitIntersectNativeHMM_AlleleHMM.bed
787 GI_AT_2tunitIntersectNativeHMM_tunit.bed
591 HT_AT_2tunitIntersectNativeHMM_AlleleHMM.bed
559 HT_AT_2tunitIntersectNativeHMM_tunit.bed
998 KD_AT_2tunitIntersectNativeHMM_AlleleHMM.bed
864 KD_AT_2tunitIntersectNativeHMM_tunit.bed
1537 LV_AT_2tunitIntersectNativeHMM_AlleleHMM.bed
1230 LV_AT_2tunitIntersectNativeHMM_tunit.bed
1098 SK_AT_2tunitIntersectNativeHMM_AlleleHMM.bed
992 SK_AT_2tunitIntersectNativeHMM_tunit.bed
1190 SP_AT_2tunitIntersectNativeHMM_AlleleHMM.bed
1005 SP_AT_2tunitIntersectNativeHMM_tunit.bed
408 ST_AT_2tunitIntersectNativeHMM_AlleleHMM.bed
360 ST_AT_2tunitIntersectNativeHMM_tunit.bed
14493 total
# the regions where tunit overlaps alleleHMM
# tunits might have duplicates corresponding to AlleleHMM blocks from both MB6 and PB6
for Head in BN HT SK SP KD LV GI ST
do
echo $Head
intersectBed -s -wb -a ${Head}_AT_2tunitIntersectNativeHMM_AlleleHMM.bed -b ${Head}_AT_2tunitIntersectNativeHMM_tunit.bed > ${Head}_temp
cat ${Head}_temp |awk 'BEGIN{OFS="\t"} ($10==$16) {print $0}'| cut -f 1-6 > ${Head}_AT_3tunitIntersectNativeHMM_intersectRegion.bed
cat ${Head}_temp |awk 'BEGIN{OFS="\t"} ($10==$16) {print $0}'| cut -f 13- > ${Head}_AT_3tunitIntersectNativeHMM_tunits.bed
done
# Length distribution of AlleleHMM blocks that contains allelic termination
for Head in BN HT SK SP KD LV GI ST
do R --vanilla --slave --args $(pwd) ${Head}_AT_2tunitIntersectNativeHMM_AlleleHMM.bed ${Head}_AT_2tunitIntersectNativeHMM_tunit.bed ${Head} < getLengthHist.R &
done
# Length distribution of intersect of AlleleHMM blocks and the paired tunits
for Head in BN HT SK SP KD LV GI ST
do R --vanilla --slave --args $(pwd) ${Head}_AT_3tunitIntersectNativeHMM_intersectRegion.bed ${Head}_AT_3tunitIntersectNativeHMM_tunits.bed ${Head} < getLengthHist.R &
done
# some lenght of Allelic termination windows (AT window) are more than 50% the length of tunits, which doesnot make biological sense
# So we add another filter and only keep AT windows that are less than or equal to 50% the length of tunits
# change to 50% instead
for Head in BN HT SK SP KD LV GI ST
do R --vanilla --slave --args $(pwd) ${Head}_AT_3tunitIntersectNativeHMM_intersectRegion.bed ${Head}_AT_3tunitIntersectNativeHMM_tunits.bed ${Head}_AT_4tunitIntersectNativeHMM_intersectRegion.bed ${Head}_AT_4tunitIntersectNativeHMM_tunits.bed 0.5 < Keep_AT_less_than_Xratio_tunits.R
done
# Length distribution of interect of AlleleHMM blocks and the paired tunits in each organ
for Head in BN HT SK SP KD LV GI ST
do R --vanilla --slave --args $(pwd) ${Head}_AT_4tunitIntersectNativeHMM_intersectRegion.bed ${Head}_AT_4tunitIntersectNativeHMM_tunits.bed ${Head} < getLengthHist.R &
done
# identify the orignal AlleleHMM blocks
for Head in BN HT SK SP KD LV GI ST
do
intersectBed -s -wao -a ${Head}_AT_4tunitIntersectNativeHMM_intersectRegion.bed -b ${Head}_AT_2tunitIntersectNativeHMM_AlleleHMM.bed \
| awk 'BEGIN {OFS="\t"}($4==$10){print $0}'|cut -f 7-18 |uniq |sort-bed - |uniq > ${Head}_AT_4tunitIntersectNativeHMM_AlleleHMM.bed
done
wc -l *_4tunitIntersectNativeHMM_AlleleHMM.bed
917 BN_AT_4tunitIntersectNativeHMM_AlleleHMM.bed
643 GI_AT_4tunitIntersectNativeHMM_AlleleHMM.bed
409 HT_AT_4tunitIntersectNativeHMM_AlleleHMM.bed
745 KD_AT_4tunitIntersectNativeHMM_AlleleHMM.bed
1150 LV_AT_4tunitIntersectNativeHMM_AlleleHMM.bed
869 SK_AT_4tunitIntersectNativeHMM_AlleleHMM.bed
844 SP_AT_4tunitIntersectNativeHMM_AlleleHMM.bed
360 ST_AT_4tunitIntersectNativeHMM_AlleleHMM.bed
5937 total
# how many tunits with AT in each organ and total (exclude overlapped)
wc -l *_4tunitIntersectNativeHMM_tunits.bed
917 BN_AT_4tunitIntersectNativeHMM_tunits.bed
643 GI_AT_4tunitIntersectNativeHMM_tunits.bed
409 HT_AT_4tunitIntersectNativeHMM_tunits.bed
745 KD_AT_4tunitIntersectNativeHMM_tunits.bed
1150 LV_AT_4tunitIntersectNativeHMM_tunits.bed
869 SK_AT_4tunitIntersectNativeHMM_tunits.bed
844 SP_AT_4tunitIntersectNativeHMM_tunits.bed
360 ST_AT_4tunitIntersectNativeHMM_tunits.bed
5937 total
rm temp.txt
for Head in BN HT SK SP KD LV GI ST
do
echo ${Head}_AT_4tunitIntersectNativeHMM_tunits.bed >> temp.txt
sort-bed ${Head}_AT_4tunitIntersectNativeHMM_tunits.bed |uniq |wc -l >> temp.txt
done
cat temp.txt |paste - -
BN_AT_4tunitIntersectNativeHMM_tunits.bed 742
HT_AT_4tunitIntersectNativeHMM_tunits.bed 390
SK_AT_4tunitIntersectNativeHMM_tunits.bed 791
SP_AT_4tunitIntersectNativeHMM_tunits.bed 723
KD_AT_4tunitIntersectNativeHMM_tunits.bed 651
LV_AT_4tunitIntersectNativeHMM_tunits.bed 929
GI_AT_4tunitIntersectNativeHMM_tunits.bed 569
ST_AT_4tunitIntersectNativeHMM_tunits.bed 317
# merge the tunits from 8 organs with strandness
coreBody=AT_4tunitIntersectNativeHMM_tunits.bed
rm T8_temp
for head in BN SP HT SK KD ST GI LV
do
cat ${head}_${coreBody} >> T8_temp
done
wc -l T8_temp
# merge overlapped and/or book-ended (-d 0). Force strandedness (-s)
bedtools merge -s -i <(sort-bed T8_temp) -d 0 > T8_${coreBody}
wc -l T8_${coreBody}
3451 T8_AT_4tunitIntersectNativeHMM_tunits.bed
# legnth distribution of AT windows. combined all AT windows from all organs WITHOUT removing duplicates
rm T8_AT_4tunitIntersectNativeHMM_intersectRegion.bed
for Head in BN HT SK SP KD LV GI ST
do
cat ${Head}_AT_4tunitIntersectNativeHMM_intersectRegion.bed >> T8_AT_4tunitIntersectNativeHMM_intersectRegion.bed
done
## scripts for panels
poly-termination-analysis-figures.R # analysis for figure 5B, 5C, sup_fig 5A, 5Bchosen