https://github.com/qqwang-berkeley/JUM
Raw File
Tip revision: 029d1589601782bb4b950b581a660451ad210322 authored by Qingqing Wang on 06 February 2019, 23:21:26 UTC
Update README.md
Tip revision: 029d158
JUM_A.sh
#!/bin/bash
set -e
# $1=directory for JUM scripts; $2=threshold for junction reads; $3=file_number; $4=read # threshold for intron_exon boundary reads; $5=read length; $6=thread number for sam/bam processing;
################################# 
#folder="$1";
#threshold="$2";
#file_num="$3";
#IR_threshold="$4";
#read_length="$5";
#thread_num="$6";
##################################

ARGUMENT_LIST=(
    "Folder"
    "JuncThreshold"
    "Condition1_fileNum_threshold"
    "Condition2_fileNum_threshold"
    "IRthreshold"
    "Readlength"
    "Thread"
    "Condition1SampleName"
    "Condition2SampleName"
)


# read arguments
opts=$(getopt \
    --longoptions "$(printf "%s:," "${ARGUMENT_LIST[@]}")" \
    --name "$(basename "$0")" \
    --options "" \
    -- "$@"
)

eval set --$opts

while [[ $# -gt 0 ]]; do
    case "$1" in
        --Folder)
            folder=$2
            shift 2
            ;;

        --JuncThreshold)
            threshold=$2
            shift 2
            ;;

        --Condition1_fileNum_threshold)
            file_num_1=$2
            shift 2
            ;;
       
	--Condition2_fileNum_threshold)
            file_num_2=$2
            shift 2
            ;;

        --IRthreshold)
	    IR_threshold=$2
            shift 2
	    ;;
      
        --Readlength)
            read_length=$2
            shift 2
	    ;;
        	    
        --Thread)
            thread_num=$2
            shift 2
            ;;

        --Condition1SampleName)
            condition_1=$2
            shift 2
            ;;

        --Condition2SampleName)
            condition_2=$2
            shift 2
            ;;

        *)
            break
            ;;
    esac
done


start=$(date +%s)

bash $folder/JUM_2-1.sh;

if [ -d "con1_1" ]; then
rm -r con1_1;
fi

mkdir con1_1;

if [ -d "con2_1" ]; then
rm -r con2_1;
fi

mkdir con2_1;

echo "Sample names are:";
IFS=',' read -r -a array1 <<< "$condition_1"

for element in "${array1[@]}"
do
    cp "$element"SJ.out.tab_strand_symbol_scaled con1_1/;
    echo "$element";
done

IFS=',' read -r -a array2 <<< "$condition_2"

for element in "${array2[@]}"
do
    cp "$element"SJ.out.tab_strand_symbol_scaled con2_1/;
    echo "$element";
done

cp UNION_junc_coor_with_junction_ID.txt con1_1/;
cp UNION_junc_coor_with_junction_ID.txt con2_1/;

cd con1_1/;
bash $folder/JUM_2-2.sh --Folder $folder --Threshold $threshold --Filenum $file_num_1  --Condition "condition1"
cd ..

cd con2_1/;
bash $folder/JUM_2-2.sh --Folder $folder --Threshold $threshold --Filenum $file_num_2  --Condition "condition2"
cd ..

cp con1_1/*junction_counts.txt .
cp con2_1/*junction_counts.txt .

cp con1_1/*formatted.txt .
cp con2_1/*formatted.txt .

file_num=0;
if [ "$file_num_1" -ge "$file_num_2" ]; then
    file_num="$file_num_1"
fi

if [ "$file_num_1" -lt "$file_num_2" ]; then
    file_num="$file_num_2"
fi

#echo "$file_num";

cat *more_than_"$threshold"_read_in_at_least_*_samples_formatted.txt | sort -u > UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples_formatted.txt;

awk '{print $1 "\t" $3 "\t" $4 "\t" $2 "\t" $5}' UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples_formatted.txt > UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples.txt;

awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $4-$3 "\t" $5}' UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples_formatted.txt > UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples_formatted_with_junction_length.txt;

less UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples_formatted.txt | cut -f5 | sort > UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples_formatted_junction_list.txt;

less UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples_formatted.txt | cut -f1,2,3 | sort | uniq -d > more_than_"$threshold"_union_junc_coor_5_prime_ss_list_with_alternative_3_ss.txt;

perl $folder/profile_AS_events_sharing_5_prime_ss.pl UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples_formatted.txt more_than_"$threshold"_union_junc_coor_5_prime_ss_list_with_alternative_3_ss.txt > more_than_"$threshold"_profiled_5_ss_and_corresponding_alternative_3_ss_junction_list.txt;

less UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples_formatted.txt | cut -f1,2,4 | sort | uniq -d > more_than_"$threshold"_union_junc_coor_3_prime_ss_list_with_alternative_5_ss.txt;

perl $folder/profile_AS_events_sharing_3_prime_ss.pl UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples_formatted.txt more_than_"$threshold"_union_junc_coor_3_prime_ss_list_with_alternative_5_ss.txt > more_than_"$threshold"_profiled_3_ss_and_corresponding_alternative_5_ss_junction_list.txt;

cat more_than_"$threshold"_profiled_5_ss_and_corresponding_alternative_3_ss_junction_list.txt more_than_"$threshold"_profiled_3_ss_and_corresponding_alternative_5_ss_junction_list.txt > more_than_"$threshold"_profiled_total_AS_event_junction.txt;

perl $folder/first_processing_generate_JUM_reference_for_profiled_total_alternative_splicing_event_junction.pl more_than_"$threshold"_profiled_total_AS_event_junction.txt more_than_"$threshold"_profiled_total_AS_event_junction_first_processing_for_JUM_reference_building.txt;

perl $folder/second_processing_generate_JUM_reference_for_profiled_total_alternative_splicing_event_junction.pl more_than_"$threshold"_profiled_total_AS_event_junction.txt more_than_"$threshold"_profiled_total_AS_event_junction_second_processing_for_JUM_reference_building.txt;

perl $folder/generate_JUM_format_annotation_file_for_profiled_alternative_splicing_event_junction.pl more_than_"$threshold"_profiled_total_AS_event_junction_second_processing_for_JUM_reference_building.txt more_than_"$threshold"_profiled_total_AS_event_junction_first_processing_for_JUM_reference_building.txt more_than_"$threshold"_profiled_total_AS_event_junction_JUM_annotation.gff;


for file in *junction_counts.txt;
do
    iden=${file%_junction_counts.txt}"_"fn_count.txt;
    perl $folder/making_junction_count_file_matching_annotation_for_JUM.pl more_than_"$threshold"_profiled_total_AS_event_junction_first_processing_for_JUM_reference_building.txt $file $iden;
done


echo Preparing for intron retention AS events analyses...

process_sam_func() {
    file=$1;
    name=${file%Aligned.out.sam}"_"temp1;
    header=${file%Aligned.out.sam}"_"sam_header;
    juncSam=${file%Aligned.out.sam}"_"Aligned.out.spanning_junction_reads.sam;
    juncBam=${file%Aligned.out.sam}"_"Aligned.out.spanning_junction_reads.bam;
    juncBed=${file%Aligned.out.sam}"_"Aligned.out.spanning_junction_reads.bed;
    overhang1=profiled"_"${file%Aligned.out.sam}"_"Aligned.out.spanning_junction_reads_junction_overhang_mapped_num.txt;
    overhang2=${file%Aligned.out.sam}"_"junction_counts_more_than_"$threshold"_in_all_samples_with_both_overhangs.txt;
    trash=${file%Aligned.out.sam}"_"trash_out;
    title=${file%Aligned.out.sam};
    juncount=${file%Aligned.out.sam}"_"junction_counts.txt;
    juncountAll=${file%Aligned.out.sam}"_"junction_counts_more_than_"$threshold"_in_all_samples.txt;
    samtools view -@ $thread_num -S $file | awk '($6 ~ /N/)' > $name;
    samtools view -H $file > $header;
    cat $header $name > $juncSam;
    samtools view -@ $thread_num -bS $juncSam > $juncBam;
    bedtools bamtobed -bed12 -i $juncBam > $juncBed;
    perl $folder/profile_overhang_from_STAR_output_step_1.pl $juncBed $overhang1 $read_length;
    awk 'FNR==NR {arr[$0];next} ($5 in arr)' UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples_formatted_junction_list.txt $juncount > $juncountAll;
    perl $folder/profile_overhang_from_STAR_output_step_2.pl $juncountAll $overhang1 $overhang2 $trash;
        if [ -s $trash ]
        then 
           echo "Error:$trash file should be empty!";
        else
           echo "Processing junction overhangs for Sample $title... Success."; 
        fi
}

for file in *Aligned.out.sam;
do
	process_sam_func $file &

done

wait

perl $folder/extract_intron_retention_event_coordinate_from_samples_conditional_union.pl UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples_formatted_with_junction_length.txt *junction_counts_more_than_"$threshold"_in_all_samples_with_both_overhangs.txt UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples_formatted_with_junction_length_and_overhang_union_from_all_samples.txt;


perl $folder/extract_intron_retention_event_coordinate_conditional_union_splicing_junction_gff.pl UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples_formatted_with_junction_length_and_overhang_union_from_all_samples.txt output_long_intron.gff output_short_intron.gff $read_length;

sort -k1,1 -k4,4n output_long_intron.gff > output_long_intron_sorted.gff;

awk -v readlen="$read_length" '$5-$4 >= readlen' output_short_intron.gff > output_short_intron1.gff;
awk -v readlen="$read_length" '$5-$4 < readlen' output_short_intron.gff > output_short_intron2.gff;
sort -k1,1 -k4,4n output_short_intron1.gff > output_short_intron1_sorted.gff;
sort -k1,1 -k4,4n output_short_intron2.gff > output_short_intron2_sorted.gff;

awk -v readlen="$read_length" '{ gsub("junc_id=","",$9); print $1 "\t" $4 "\t" $5-readlen+1 "\t" $9 "\t" "1" "\t" $7}' output_long_intron.gff | sort -k1,1 -k2,2n > output_long_intron_adjusted_range_sorted.bed;
awk -v readlen="$read_length" '{ gsub("junc_id=","",$9); print $1 "\t" $4 "\t" $5-readlen+1 "\t" $9 "\t" "1" "\t" $7}' output_short_intron1.gff | sort -k1,1 -k2,2n > output_short_intron_1_adjusted_range_sorted.bed;
awk -v readlen="$read_length" '{ gsub("junc_id=","",$9); print $1 "\t" $5-readlen "\t" $4+1 "\t" $9 "\t" "1" "\t" $7}' output_short_intron2.gff | sort -k1,1 -k2,2n > output_short_intron_2_adjusted_range_sorted.bed;


process_sortedbam () {
    sortfile=$1;
    bamfiletobed=${sortfile%_sorted.bam}.bed;
    bamsortedbed=${sortfile%_sorted.bam}.sorted.bed;
    intersect_bam_long_intron=${sortfile%_sorted.bam}"_"intersect_long_intron.txt;
    intersect_bam_short_intron1=${sortfile%_sorted.bam}"_"intersect_short_intron1.txt;
    intersect_bam_short_intron2=${sortfile%_sorted.bam}"_"intersect_short_intron2.txt;
    bedtools bamtobed -i $sortfile > $bamfiletobed;
    sort -k1,1 -k2,2n $bamfiletobed > $bamsortedbed;
    intersectBed -a $bamsortedbed -b output_long_intron_sorted.gff -sorted -wa -u -f 1 > $intersect_bam_long_intron 
    intersectBed -a $bamsortedbed -b output_short_intron1_sorted.gff -sorted -wa -u -f 1 > $intersect_bam_short_intron1 
    intersectBed -a $bamsortedbed -b output_short_intron2_sorted.gff -sorted -wa -u -F 1 > $intersect_bam_short_intron2 
}    

for sortfile in *Aligned.out_sorted.bam;
do
    process_sortedbam $sortfile &
done    

wait

coverage_bamfile () {
    bamfile=$1;
    coveragefile=${bamfile%_sorted.bam}"_"coverage.bed;
    genomeCoverageBed -ibam $bamfile -bga -split > $coveragefile;
}

for bamfile in *Aligned.out_sorted.bam;
do
    coverage_bamfile $bamfile &
done

wait

process_inter_long_intron() {
    intersect_long_intron=$1;
    intersect_long_intron_coor=${intersect_long_intron%.txt}"_"chr_coor.txt
    output_long_intron=${intersect_long_intron%.txt}"_"count_table
    awk '{print $1 "\t" $2+1 "\t" $2+2 "\t" $4 "\t" $5 "\t" $6}' $intersect_long_intron | sort -k1,1 -k2,2n > $intersect_long_intron_coor;
    intersectBed -a output_long_intron_adjusted_range_sorted.bed -b $intersect_long_intron_coor -wa -c -sorted | sort -V -k4,4 | awk '{print $4 "\t" $7}' > $output_long_intron
}

for intersect_long_intron in *intersect_long_intron.txt;
do
	process_inter_long_intron $intersect_long_intron &
done

wait

process_inter_short_intron() {
    intersect_short_intron1=$1;
    intersect_short_intron2=${intersect_short_intron1%_intron1.txt}"_"intron2.txt;
    intersect_short_intron2_edit=${intersect_short_intron1%_intron1.txt}"_"intron2_edited.txt;
    intersect_short_intron1_coor=${intersect_short_intron1%.txt}"_"chr_coor.txt;
    output_short_intron1=${intersect_short_intron1%.txt}"_"count_table;
    intersect_short_intron2_coor=${intersect_short_intron2%.txt}"_"chr_coor.txt;
    output_short_intron2=${intersect_short_intron2%.txt}"_"count_table;
    output_short_intron=${intersect_short_intron1%_intron1.txt}"_"intron_count_table;
    awk '{print $1 "\t" $2+1 "\t" $2+2 "\t" $4 "\t" $5 "\t" $6}' $intersect_short_intron1 | sort -k1,1 -k2,2n > $intersect_short_intron1_coor;
    intersectBed -a output_short_intron_1_adjusted_range_sorted.bed -b $intersect_short_intron1_coor -wa -c -sorted | sort -V -k4,4 | awk '{print $4 "\t" $7}' > $output_short_intron1;
    awk -v readlen="$read_length" '$3-$2 <= readlen' $intersect_short_intron2 > $intersect_short_intron2_edit;
    awk '{print $1 "\t" $2+1 "\t" $2+2 "\t" $4 "\t" $5 "\t" $6}' $intersect_short_intron2_edit | sort -k1,1 -k2,2n > $intersect_short_intron2_coor;
    intersectBed -a output_short_intron_2_adjusted_range_sorted.bed -b $intersect_short_intron2_coor -wa -c -sorted | sort -V -k4,4 | awk '{print $4 "\t" $7}' > $output_short_intron2;   
    cat $output_short_intron1 $output_short_intron2 > $output_short_intron;
}

for intersect_short_intron1 in *intersect_short_intron1.txt;
do
	process_inter_short_intron $intersect_short_intron1 &
done

wait


for file in *_intersect_long_intron_count_table;
do
    file_short=${file%_long_intron_count_table}"_"short_intron_count_table;
    name=${file%Aligned.out_intersect_long_intron_count_table}"_"junction_counts_more_than_"$threshold"_in_all_samples.txt;
    name_long_left_count=${file%Aligned.out_intersect_long_intron_count_table}"_"junction_counts_more_than_"$threshold"_in_all_samples_long_intron_retention_left_count.txt;
    name_long_left_count_list=${file%Aligned.out_intersect_long_intron_count_table}"_"junction_counts_more_than_"$threshold"_in_all_samples_long_intron_retention_left_count_list.txt;
    name_long_right_count=${file%Aligned.out_intersect_long_intron_count_table}"_"junction_counts_more_than_"$threshold"_in_all_samples_long_intron_retention_right_count.txt;
    name_long_right_count_list=${file%Aligned.out_intersect_long_intron_count_table}"_"junction_counts_more_than_"$threshold"_in_all_samples_long_intron_retention_right_count_list.txt;
    name_short_count=${file%Aligned.out_intersect_long_intron_count_table}"_"junction_counts_more_than_"$threshold"_in_all_samples_short_intron_retention.txt;
    name_short_count_list=${file%Aligned.out_intersect_long_intron_count_table}"_"junction_counts_more_than_"$threshold"_in_all_samples_short_intron_retention_list.txt;
    name_combined_list=${file%Aligned.out_intersect_long_intron_count_table}"_"junction_counts_combined_intron_retention_event_list.txt;
    name_combined=${file%Aligned.out_intersect_long_intron_count_table}"_"junction_counts_combined_intron_retention_event.txt;
    perl $folder/preparing_intron_retention_count_table_long_intron.pl $file $name $name_long_left_count $name_long_left_count_list $name_long_right_count $name_long_right_count_list $IR_threshold;
    perl $folder/preparing_intron_retention_count_table_short_intron.pl $file_short $name $name_short_count $name_short_count_list $IR_threshold;
    cat $name_long_left_count_list $name_long_right_count_list $name_short_count_list > $name_combined_list;
    cat $name_long_left_count $name_long_right_count $name_short_count > $name_combined;
done

if [ -d "con1_2" ]; then
rm -r con1_2;
fi

mkdir con1_2;

if [ -d "con2_2" ]; then
rm -r con2_2;
fi

mkdir con2_2;

for element in "${array1[@]}"
do
    cp "$element"_junction_counts_combined_intron_retention_event_list.txt con1_2/;
    echo "$element";
done

for element in "${array2[@]}"
do
    cp "$element"_junction_counts_combined_intron_retention_event_list.txt con2_2/;
    echo "$element";
done

cd con1_2/;

perl $folder/Identify_intron_retention_event_exist_in_all_samples.pl *junction_counts_combined_intron_retention_event_list.txt condition1_junction_counts_intron_retention_in_all_samples_list.txt $file_num_1;

cd ..

cd con2_2/;
perl $folder/Identify_intron_retention_event_exist_in_all_samples.pl *junction_counts_combined_intron_retention_event_list.txt condition2_junction_counts_intron_retention_in_all_samples_list.txt $file_num_2;

cd ..

cat con1_2/condition1_junction_counts_intron_retention_in_all_samples_list.txt con2_2/condition2_junction_counts_intron_retention_in_all_samples_list.txt | sort -u > All_junction_counts_intron_retention_in_all_samples_sorted_list.txt;

perl $folder/preparing_intron_retention_event_for_JUM_combine.pl All_junction_counts_intron_retention_in_all_samples_sorted_list.txt output_long_intron.gff output_short_intron.gff UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples.txt JUM_intron_retention_splicing_event.txt;

perl $folder/first_processing_generate_JUM_reference_for_profiled_total_alternative_splicing_event_junction.pl JUM_intron_retention_splicing_event.txt output_intron_retention_splicing_event_first_processing_for_JUM_reference_building.txt; 

perl $folder/second_processing_generate_JUM_reference_for_profiled_total_alternative_splicing_event_junction.pl JUM_intron_retention_splicing_event.txt output_intron_retention_splicing_event_second_processing_for_JUM_reference_building.txt;

perl $folder/generate_JUM_format_annotation_file_for_profiled_alternative_splicing_event_junction.pl output_intron_retention_splicing_event_second_processing_for_JUM_reference_building.txt output_intron_retention_splicing_event_first_processing_for_JUM_reference_building.txt output_intron_retention_splicing_event_JUM_annotation.gff;

cat more_than_"$threshold"_profiled_total_AS_event_junction_JUM_annotation.gff output_intron_retention_splicing_event_JUM_annotation.gff > combined_AS_JUM.gff;

for files in *junction_counts_combined_intron_retention_event.txt
do
   refom=${files%_combined_intron_retention_event.txt}"_"intron_retention_counts_exist_in_all_samples.txt;
   ori_no_AS=${files%_junction_counts_combined_intron_retention_event.txt}"_"fn_count.txt;
   combined_all_AS=${files%_junction_counts_combined_intron_retention_event.txt}"_"combined_count.txt;
   perl $folder/making_junction_count_file_matching_annotation_for_JUM_in_intron_retention.pl output_intron_retention_splicing_event_first_processing_for_JUM_reference_building.txt $files $refom;
   cat $ori_no_AS $refom > $combined_all_AS;
done

echo ready to performing differential AS analyses...

end=$(date +%s)

runtime=$((end-start))

echo "JUM_A.sh core execution finished in $runtime seconds."
echo "Command is:"
echo "bash JUM_A.sh --Folder $folder --JuncThreshold $threshold --Condition1_fileNum_threshold $file_num_1 --Condition2_fileNum_threshold $file_num_2 --IRthreshold $IR_threshold --Readlength $read_length --Thread $thread_num --Condition1SampleName $condition_1 --Condition2SampleName $condition_2"


if [ -d "temp_JUM_A_run" ]; then
rm -r temp_JUM_A_run;
fi

mkdir temp_JUM_A_run;

ls | grep -v temp_JUM_A_run | xargs mv -t temp_JUM_A_run;

if [ -d "JUM_diff" ]; then
rm -r JUM_diff;
fi

mkdir JUM_diff;

mv temp_JUM_A_run/*combined_count.txt JUM_diff/
mv temp_JUM_A_run/combined_AS_JUM.gff JUM_diff/
mv temp_JUM_A_run/*Aligned.out_coverage.bed JUM_diff/
mv temp_JUM_A_run/*profiled_total_AS_event_junction_first_processing_for_JUM_reference_building.txt JUM_diff/
mv temp_JUM_A_run/UNION_junc_coor_with_junction_ID_more_than_"$threshold"_read_in_at_least_"$file_num"_samples.txt JUM_diff/

mv temp_JUM_A_run/*Aligned.out.sam .
mv temp_JUM_A_run/*Aligned.out_sorted.bam* .
mv temp_JUM_A_run/*formatted.txt .
mv temp_JUM_A_run/*SJ.out.tab .

echo "JUM_A.sh completed. Ready for next step! Feel free to delete the temporary file folder temp_JUM_A_run once satisfied."
back to top