Content - 2bd271585c0bca2fdda95f7327a55597acbff4b7 - 3db3d5c/SL-quant.sh

visit type:
Tip revision: 76117b8023dc47a5aa82b663d18cb68b961e9614 authored by Carlo Yague on 27 August 2019, 14:29:17 UTC
Fix EOF warning sort issue in -sp mode
Tip revision: 76117b8
SL-quant.sh
#!/bin/bash

# In single-end mode, SL-quant identifies trans-splicing events as unmapped reads containing a splice leader (SL) sequence at the 5’ end of the read. 
# In paired-end mode, SL-quant identifies trans-splicing events as read pairs with one end unmapped and starting with a SL sequence while the other end is mapped. 
# After trimming of those SL sequences, the reads/pairs are re-mapped on the genome and counted at the gene level.
# instructions on https://github.com/cyaguesa/SL-quant

# REQUIREMENTS:	            VERSION USED 

# - blastn                    2.6.0
# - samtools                    1.5
# - picard                    2.9.0
# - featureCounts             1.5.0              
# - hisat2                    2.0.5             
# - cutadapt                   1.14              
# - bedtools                 2.26.0              


# by Carlo Yague-Sanz, 2017

#––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
# USAGE
#––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––

usage=$(cat <<EOF

SL-quant identifies trans-splicing events as unmapped reads containing a splice leader (SL) sequence at the 5’ end of the read. In paired-end mode, the unmapped reads are pre-filtered based on the status of their mate (it must be mapped). In sensitive mode, the criteria to identify SL sequences are less stringent, increasing sensitivity.

Detailed instructions on github.com/cyaguesa/SL-quant

USAGE: ./SL-quant.sh [-p -m mapped.bam] [-s] unmapped.bam output_base

Required arguments:
  unmapped.bam
				file containing unmapped reads.
  output_base
				base name (+path) for the ouput.

Optional arguments:
  --mapped mapped.bam, -m mapped.bam
				file containing mapped reads.
  --paired, -p
				run SL-quant in paired-end mode. 
				Requires -m argument.
  --sensitive, -s
				run SL-quant in sensitive mode.
  --help, -h
				show this help message and exit.


EOF
)

#––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
# PARAMETERS
#––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––



set -e                                 # stops script at the first error.
SL_db="data/blast_db/SL.fasta"         # path to SL sequence database (for blast).
gene_annotation="data/genes.gtf"       # annotation file
index="data/ce10_hisat2_index/genome"  # genome index file (only required on single-end mode).
paired_orientation="FR"                # ignored in single-end mode. Value={"FR" (default), "RF", "unstranded"}
single_orientation="R"                 # ignored in paired-end mode. Value={"F" (stranded), "R" (reversely stranded), "unstranded"}
threads=4                              # number of threads to use.
send=22                                # End of alignment in 'subject' threshold for blast.
align_length=5                         # length of the cutadapt alignment (default = 5). (paired-end mode only)


PARAMS=""
SINGLE="single"                        # set to "single" for single-end mode. Any other value for paired-end mode.
METHOD="specific"                      # method used to detect SL-containing reads. "specific" (default, with blast) or "sensitive" (with cutadapt)

while (( "$#" )); do
  case "$1" in
    -p|--paired)
      SINGLE="paired"
      shift
      ;;
    -m|--mapped)
      PARAMS="$PARAMS $2"
      shift 2
      ;;
    -s|--sensitive)
      METHOD="sensitive"
      shift
      ;;
    -h|--help)
      echo "$usage"
      exit 1
      ;;
    --) # end argument parsing
      shift
      break
      ;;
    -*|--*=) # unsupported flags
      echo "Error: Unsupported flag $1" >&2
      echo "$usage"
      exit 1
      ;;
    *) # preserve positional arguments
      PARAMS="$PARAMS $1"
      shift
      ;;
  esac
done

eval set -- "$PARAMS"


#––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
# PAIRED-END MODE
#––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––

if [ "$SINGLE" != "single" ]; then
  if [ $# != 3 ]; then
     echo "${usage}"
     exit 1

  else
    echo ""
    echo "SL-quant.sh: PAIRED-END MODE [$METHOD]"
    echo "   [mapped.bam] = $1"
    echo "   [unmapped.bam] = $2"
    echo "   [ouput_dir/base] = $3"
    echo "   [SL blast database] = $SL_db"
    echo "   [gene annotation] = $gene_annotation"
    echo "   [read orientation] = $paired_orientation";echo ""

    mkdir -p "${3%/*}"

    echo "   fetch read pairs with one end unmapped..."
    
    if [ "$paired_orientation" == "FR" ]; then
 
      echo "      [1/2] get unmapped R2 reads with mate mapped and convert to fastq"
      samtools view -u -f 133 -F 8 ${2} > ${3}_oneEnd_unmapped.bam

      echo "      [2/2] get primary alignments of mapped R1 reads with mate unmapped"
      samtools view -u -f 73 -F 260 ${1}  > ${3}_oneEndMapped.bam

      featureCounts_S=2                    # set R1 read orientation for featureCounts (reverse)
      strand="plus"                        # set blast database strandness (normal)

    elif [ "$paired_orientation" == "RF" ]; then

      echo "      [1/2] get R1 reads unmapped with mate mapped and convert to fastq"
      samtools view -f 69 -F 8 ${2} > ${3}_oneEnd_unmapped.bam

      echo "      [2/2] get primary alignments of R2 reads mapped with mate unmapped"
      samtools view -u -f 137 -F 260 ${1}  > ${3}_oneEndMapped.bam

      featureCounts_S=2                    # set R2 read orientation for featureCounts (reverse)
      strand="minus"                       # set blast database strandness (reverse)

    else

      echo "      [1/2] get reads unmapped with mate mapped and convert to fastq"
      samtools view -f 5 -F 8 ${2} > ${3}_oneEnd_unmapped.bam

      echo "      [2/2] get reads mapped with mate unmapped"
      samtools view -u -f 9 -F 260 ${1}  > ${3}_oneEndMapped.bam

      featureCounts_S=0                   # set read orientation for featureCounts (unstranded)
      strand="both"                       # set blast database strandness (unstranded)

    fi

    if [ "$METHOD" != "sensitive" ]; then

      echo "   done... blast unmapped reads on SL sequences..."
      samtools view ${3}_oneEnd_unmapped.bam | awk '{OFS="\t"; print ">"$1"\n"$10}'  > ${3}_oneEnd_unmapped.fasta
      blastn -query ${3}_oneEnd_unmapped.fasta -task blastn -db $SL_db  -outfmt 6 -max_target_seqs 1 -num_threads $threads -word_size 8 -strand $strand > ${3}_blasted.txt 2>${3}_log.txt

      echo "   done... filter significant alignment with qstart==1..."
      awk '$7 == 1 && $10 >= "'"$send"'" && $11 < "0.05" {print $0}' ${3}_blasted.txt | grep "SL1" > ${3}_blasted_SL1.txt
      awk '$7 == 1 && $10 >= "'"$send"'" && $11 < "0.05" {print $0}' ${3}_blasted.txt | grep "SL2" > ${3}_blasted_SL2.txt
      
      echo "   done... trim SL sequences and convert back into fastq..."

      cut -f 1 ${3}_blasted_SL1.txt > ${3}_SL1_IDs.txt
      picard FilterSamReads SORT_ORDER=unsorted FILTER=includeReadList COMPRESSION_LEVEL=0 READ_LIST_FILE=${3}_SL1_IDs.txt I=${3}_oneEnd_unmapped.bam O=${3}_SL1.sam 2>> ${3}_log.txt
      samtools sort -n ${3}_SL1.sam > ${3}_SL1_Nsorted.sam
      paste <(samtools view ${3}_SL1_Nsorted.sam | cut -f 1-11) <(cat ${3}_blasted_SL1.txt) | awk '{OFS="\t"; print $1,"4",$3,$4,$5,$6,$7,$8,$9, substr($10, $19+1),  substr($11, $19+1)}' | picard SamToFastq VALIDATION_STRINGENCY=SILENT QUIET=TRUE I=/dev/stdin FASTQ=${3}_SL1_trimmed.fq

      cut -f 1 ${3}_blasted_SL2.txt > ${3}_SL2_IDs.txt
      picard FilterSamReads SORT_ORDER=unsorted FILTER=includeReadList COMPRESSION_LEVEL=0 READ_LIST_FILE=${3}_SL2_IDs.txt I=${3}_oneEnd_unmapped.bam O=${3}_SL2.sam 2>> ${3}_log.txt
      samtools sort -n ${3}_SL2.sam > ${3}_SL2_Nsorted.sam
      paste <(samtools view ${3}_SL2_Nsorted.sam | cut -f 1-11) <(cat ${3}_blasted_SL2.txt) | awk '{OFS="\t"; print $1,"4",$3,$4,$5,$6,$7,$8,$9, substr($10, $19+1),  substr($11, $19+1)}' | picard SamToFastq VALIDATION_STRINGENCY=SILENT QUIET=TRUE I=/dev/stdin FASTQ=${3}_SL2_trimmed.fq


    else

      echo "   done... find & cut SL sequences..."

      samtools fixmate ${3}_oneEnd_unmapped.bam - | picard SamToFastq VALIDATION_STRINGENCY=SILENT QUIET=TRUE I=/dev/stdin FASTQ=${3}_oneEnd_unmapped.fq
      cutadapt -g file:$SL_db -O $align_length -m 15 -o ${3}{name}.fq --discard-untrimmed ${3}_oneEnd_unmapped.fq 2>> ${3}_log.txt

      cat ${3}*_SL2_splice_leader*.fq | paste - - - - | sort -n -k2,2 -t. | tr "\t" "\n" > ${3}_SL2_trimmed.fq
      cat ${3}*_SL1_splice_leader*.fq | paste - - - - | sort -n -k2,2 -t. | tr "\t" "\n" > ${3}_SL1_trimmed.fq
      rm ${3}*_SL2_splice_leader*.fq
      rm ${3}*_SL1_splice_leader*.fq

      awk 'NR%4==1 {print substr($1,2); }' ${3}_SL1_trimmed.fq > ${3}_SL1_IDs.txt
      awk 'NR%4==1 {print substr($1,2); }' ${3}_SL2_trimmed.fq > ${3}_SL2_IDs.txt

    fi
    echo "   done... retrieve mapped mates..."

    echo "      [1/2] of SL1-containing reads"
    picard FilterSamReads SORT_ORDER=unsorted FILTER=includeReadList READ_LIST_FILE=${3}_SL1_IDs.txt I=${3}_oneEndMapped.bam O=${3}_SL1_mates.bam 2>> ${3}_log.txt
    samtools sort -n ${3}_SL1_mates.bam > ${3}_SL1_mates_Nsorted.bam 
    bedtools bamtofastq -i ${3}_SL1_mates_Nsorted.bam -fq ${3}_SL1_mates.fq

    echo "      [2/2] of SL2-containing reads"
    picard FilterSamReads SORT_ORDER=unsorted FILTER=includeReadList READ_LIST_FILE=${3}_SL2_IDs.txt I=${3}_oneEndMapped.bam O=${3}_SL2_mates.bam 2>> ${3}_log.txt
    samtools sort -n ${3}_SL2_mates.bam > ${3}_SL2_mates_Nsorted.bam 
    bedtools bamtofastq -i ${3}_SL2_mates_Nsorted.bam -fq ${3}_SL2_mates.fq

    echo "   done... remap with hisat2"
    hisat2 -p $threads --no-discordant --no-softclip --min-intronlen 20 --max-intronlen 5000 --rna-strandness $paired_orientation -x $index -1 ${3}_SL1_mates.fq -2 ${3}_SL1_trimmed.fq | samtools view -b -F 260 -f 2 > ${3}_SL1_remapped.bam
    hisat2 -p $threads --no-discordant --no-softclip --min-intronlen 20 --max-intronlen 5000 --rna-strandness $paired_orientation -x $index -1 ${3}_SL2_mates.fq -2 ${3}_SL2_trimmed.fq | samtools view -b -F 260 -f 2 > ${3}_SL2_remapped.bam

    echo "   done... summarize..."
    featureCounts -p -s $featureCounts_S -g gene_id -T 4 -a $gene_annotation -o ${3}_counts.txt ${3}_SL1_remapped.bam ${3}_SL2_remapped.bam 2>> ${3}_log.txt

  fi


#––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
# SINGLE-END MODE
#––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––

else
  if [ $# != 2 ]; then
     echo "${usage}"
     exit 1

  else
    echo ""
    echo "SL-quant.sh: SINGLE-END MODE [$METHOD]"
    echo "   [unmapped.bam] = $1"
    echo "   [ouput_dir/base] = $2"
    echo "   [SL blast database] = $SL_db"
    echo "   [gene annotation] = $gene_annotation"
    echo "   [read orientation] = $single_orientation";echo ""

    mkdir -p "${2%/*}"
 
    if [ "$single_orientation" == "R" ]; then
      strand="plus"                                 # set blast database strandness (normal)
      featureCounts_S=1                             # set featureCounts strandness (stranded)    

    elif [ "$single_orientation" == "F" ]; then
      strand="minus"                                # set blast database strandness (reverse)
      featureCounts_S=1                             # set featureCounts strandness (stranded) 

    else
      strand="both"                                 # set blast database strandness (unstranded)
      featureCounts_S=0                             # set featureCounts strandness (unstranded)
    fi
    
    paired_entries=$(samtools view -c -f 1 $1)
    
    if [ $paired_entries -gt 0 ]; then
      echo ""; echo "WARNING : there are ${paired_entries} 'paired in sequencing' reads in ${1}. Consider running the script in paired-end mode. Only R2 reads will be considered in single-end mode."; echo ""
      samtools view -b -f 128 ${1} > ${1}_R2.bam
      input="${1}_R2.bam"
    
    else
      input="${1}"
    fi


    if [ "$METHOD" != "sensitive" ]; then

      echo "   blast unmapped reads on SL sequences..."
      samtools view -f 4 $input | awk '{OFS="\t"; print ">"$1"\n"$10}' | blastn -db $SL_db -outfmt 6 -max_target_seqs 1 -num_threads 4 -word_size 8 -strand $strand > ${2}_blasted.txt 2>${2}_log.txt
    

      echo "   done... filter..."
      awk '$7 == 1 && $10 >= "'"$send"'" && $11 < "0.05" {print $0}' ${2}_blasted.txt | grep "SL1" > ${2}_blasted_SL1.txt
      awk '$7 == 1 && $10 >= "'"$send"'" && $11 < "0.05" {print $0}' ${2}_blasted.txt | grep "SL2" > ${2}_blasted_SL2.txt

      echo "   done... retrieve SL-containing reads"

      echo "      [1/2] of SL1-containing reads"
      cut -f 1 ${2}_blasted_SL1.txt > ${2}_SL1_IDs.txt
      picard FilterSamReads SORT_ORDER=unsorted FILTER=includeReadList VALIDATION_STRINGENCY=SILENT COMPRESSION_LEVEL=0 QUIET=TRUE READ_LIST_FILE=${2}_SL1_IDs.txt I=$input O=${2}_SL1.sam


      echo "      [2/2] of SL2-containing reads"
      cut -f 1 ${2}_blasted_SL2.txt > ${2}_SL2_IDs.txt
      picard FilterSamReads SORT_ORDER=unsorted FILTER=includeReadList VALIDATION_STRINGENCY=SILENT COMPRESSION_LEVEL=0 QUIET=TRUE READ_LIST_FILE=${2}_SL2_IDs.txt I=$input O=${2}_SL2.sam

      echo "   done... trim SL-containing reads and convert to fastq"
      paste <(samtools view ${2}_SL1.sam | cut -f 1-11) <(cat ${2}_blasted_SL1.txt) | awk '{OFS="\t"; print $1,"4",$3,$4,$5,$6,$7,$8,$9, substr($10, $19+1),  substr($11, $19+1)}' | picard SamToFastq VALIDATION_STRINGENCY=SILENT QUIET=TRUE I=/dev/stdin FASTQ=${2}_SL1_trimmed.fq
      paste <(samtools view ${2}_SL2.sam | cut -f 1-11) <(cat ${2}_blasted_SL2.txt) | awk '{OFS="\t"; print $1,"4",$3,$4,$5,$6,$7,$8,$9, substr($10, $19+1),  substr($11, $19+1)}' | picard SamToFastq VALIDATION_STRINGENCY=SILENT QUIET=TRUE I=/dev/stdin FASTQ=${2}_SL2_trimmed.fq


    else

      #SL_db="data/blast_db/SL_RV.fa"         # path to SL sequence database (for cutadapt).

      echo "   convert to fastq..."

      bedtools bamtofastq -i ${input} -fq ${2}_unmapped.fq

      echo "   done...find & cut SL sequences..."
      
      cutadapt -g file:$SL_db -O $align_length -m 15 -o ${2}{name}.fq --discard-untrimmed ${2}_unmapped.fq 2>> ${2}_log.txt

      cat ${2}*_SL2_splice_leader*.fq | paste - - - - | sort -n -k2,1 -t. | tr "\t" "\n" > ${2}_SL2_trimmed.fq
      cat ${2}*_SL1_splice_leader*.fq | paste - - - - | sort -n -k2,1 -t. | tr "\t" "\n" > ${2}_SL1_trimmed.fq
      rm ${2}*_SL2_splice_leader*.fq
      rm ${2}*_SL1_splice_leader*.fq

    fi
 
    echo "   done... map with hisat2"
    hisat2 -p $threads --no-discordant --no-softclip --min-intronlen 20 --max-intronlen 5000 --rna-strandness $single_orientation -x $index -U ${2}_SL1_trimmed.fq | samtools view -b -F 260 > ${2}_SL1_remapped.bam
    hisat2 -p $threads --no-discordant --no-softclip --min-intronlen 20 --max-intronlen 5000 --rna-strandness $single_orientation -x $index -U ${2}_SL2_trimmed.fq | samtools view -b -F 260 > ${2}_SL2_remapped.bam

    echo "   done... summarize..."
    featureCounts -s $featureCounts_S -g gene_id -T 4 -a $gene_annotation -o ${2}_counts.txt ${2}_SL1_remapped.bam ${2}_SL2_remapped.bam 2>> ${2}_log.txt
  fi

fi
echo "   All done ! Have a nice day :)"
echo ""