https://github.com/mengwu1002/Multi-taxon_analysis_pipeline
Tip revision: fac437a7d35ecfd53600ff4dc667563dfb251d25 authored by Meng Wu on 15 October 2015, 16:36:34 UTC
Launch of the multi-taxon INSeq analysis pipeline
Launch of the multi-taxon INSeq analysis pipeline
Tip revision: fac437a
Multitaxon_INSeq_pipeline.pl
#!/usr/bin/perl -w
#--------------------------------------------------------------------------#
# Author: Meng Wu, the Gordon Lab, Washington University in St. Louis #
# #
# File: Multitaxon_INSeq_pipeline.pl #
# Description: This is the analysis pipeline script
# Date: 2014-09-05 #
# Version: 1.30 #
# #
# Usage: #
# see Usage in the code #
# #
# Contact: mengwu@wustl.edu #
#--------------------------------------------------------------------------#
use strict;
use warnings;
use Getopt::Long;
use IO::File;
use FindBin qw($Bin);
use IO::Uncompress::AnyUncompress; # can read RFC 1950, RFC 1951, gzip, zip, bzip2, lzop, lzf
if (!@ARGV){
print "Usage: perl $0 -i <the raw reads file> -m <Barcodes mapping file> -s <indexed genome name> [-index <index seq reads file> -d <length_disrupt_percent (max=1)> [-operon -c <operon_probability_cutoff (max=1)>] [-arrayed] [\n";
print "Required argument: ";
print "-i gives the input raw reads file, -m gives the mapping file with the barcode sequence and name for each sample in the format as <barcode>\t<Sample name>, -s gives the name of the indexed genome that the reads should map to";
print "Optional argument: \n";
print "-d gives the region of the gene in which insertions are expected to disrupt gene function. The default is 1, which means when insertion falls anywhere in the gene (100%), the gene function will be disrupted. Setting the -d argument to 0.9, for example, would exclude insertions in the distal 10% of the gene when calculating the total number of reads/insertions for that gene. -operon (no argument) specifies that putative downstream (polar) insertions should be calculated based on a user-provided operon probability file. -c is the cutoff for operon probability, default is 1, which means only when the probability of two genes being in an operon is equal to 1 (100%), the polar effect will be considered for the downstream gene. Setting the -c argument to 0.8, for example, will calculate a polar effect for the downstream gene if the probability of the genes being in an operon is at least 0.8 (80%). -arrayed (no argument) is the option for the arrayed library. Refer to README for more information\n";
exit;
}
my $datasource; #### The raw reads file which will be analyzed
my $mismatch1=0; #### Mismatches allowed in finding the transposon sequence
my $mismatch2=0; #### Mismatches allowed when mapping the sequences to reference using bowtie
my $mapping; #### Mapping file name
my $poolname="INSEQ_experiment"; #### The prefix of all the analyzed files
my $index; #### The name of the index fold in the indexes
my $operon=0; #### The option if analyze the sequence using operon information
my $cutoff=100; #### The cutoff for operon probability, default is 100
my $disruption=100; #### The disruptable percentage of genes. Only when transposon was inserted into this proximal region of gene, the gene is considered interrupted since the insertion in the distal region of a gene may not affect the function of gene at all. The default is 100.
my $arrayed=0; #### The option if the data is from an arrayed library
my $path=$Bin; #### Get the path for this analysis package
my $indexfile;
GetOptions (
"index=s"=>\$indexfile,
"i|input=s"=>\$datasource,
"s|species=s"=>\$index,
"m|mappingfile=s"=> \$mapping,
"operon"=>\$operon,
"d|disruption=s"=>\$disruption,
"c|cutoff=i"=>\$cutoff,
"arrayed"=>\$arrayed,
"mismatch1=i"=>\$mismatch1,
"mismatch2=i"=>\$mismatch2,
);
print "test $mismatch1\n";
my $bowtie_path="";
open BT,"$path/config.txt" || die "Error, please check configuration file as config.txt in the package";
while (<BT>){
chomp;
if ($_=~m/\=\"(.*)\"/){
$bowtie_path=$1;
if ($1 eq ""){
die "please specify where the bowtie directory is\n";
}
}
}
my $outputdir=`pwd`;
chomp $outputdir;
#print "$outputdir \n";
my $input=$poolname.".scarf";
`rm $input`;
`ln -s $datasource $input`;
mkdir "bcsortedseqs";
mkdir "results";
# Step 1
# Use the mapping file to assign each read to a barcode and store the output file as inputfile_assigned.txt", and store the statistics in the log file
my $barcode_assigned=$input."_assigned.txt";
my $logfile=$input.".log";
open CODES,$mapping || die "Error: can't open the mapping file, check the README for more details about mapping file\n";
my @codes;
my %codes_hash; #$codes_hash{$code} = sampleID
my %codes_number; #$codes_number{$code} = sample number (within a given sequencing lane)
my $sample_count=0;
my %has_barcode; #$has_barcode{$code} = number of reads with barcode $code
my $total=0;
my $total_mapped=0;
my $code_length;
my $unbarcoded="INSEQ_without_BC.txt";
my $libraries;
while (my $line=<CODES>){
chomp $line;
my @codes_array = split (/\s+/, $line);
$code_length=length ($codes_array[0]);
$codes_hash{$codes_array[0]}=$codes_array[1];
$codes_number{$codes_array[0]}=$sample_count;
$sample_count++;
@{$libraries->{$codes_array[1]}}=split (/\,/,$codes_array[2]);
}
close CODES;
my $nonbc_count=0;
open NONBC,">$unbarcoded";
my $fh_input=new IO::Uncompress::AnyUncompress $input or die "can't open $input: $!\n";
my $fh_index=new IO::Uncompress::AnyUncompress $indexfile or die "can't open $indexfile: $!\n";
my $seq_length;
open OUT,">$barcode_assigned";
while (my $lineF=<$fh_input>) {
chomp $lineF;
$total++;
my $seq1=<$fh_input>;
chomp $seq1;
my $qualN=<$fh_input>;
my $qualS=<$fh_input>;
<$fh_index>;
my $indexseq=<$fh_index>;
chomp $indexseq;
<$fh_index>;
<$fh_index>;
my $seq_barcode = substr ($seq1, 0, $code_length,"");
if (exists $codes_hash{$seq_barcode}){ #if first bases of read are one of the barcodes
$total_mapped++;
unless (exists $has_barcode{$seq_barcode}) {
$has_barcode{$seq_barcode}=1;
} else {
$has_barcode{$seq_barcode}++;
}
$seq_length=length ($seq1);
my $seq = $seq1.$indexseq;
print OUT ">".$codes_hash{$seq_barcode}.":".$seq_barcode."\n".$seq."\n";
}else{
$nonbc_count++;
my $header="seq_".$nonbc_count;
print NONBC ">$header\n$seq1\n";
}
}
close OUT;
my $index_position=$seq_length+1;
my $filehandlehash;
foreach my $a (keys %codes_hash) {
if ($has_barcode{$a}){
foreach my $library (@{$libraries->{$codes_hash{$a}}}){
my $fh = IO::File->new(">bcsortedseqs\/$input\_$codes_hash{$a}\_$library\.fas");
$filehandlehash->{$codes_hash{$a}}->{$library} = $fh;
}
}
}
open LOG,">$logfile";
print LOG "Number of total reads in $input is $total\n";
print LOG "Total mapped $total_mapped\t".100*$total_mapped/$total." %\n";
print LOG "Barcode\tSample\tReads\tPercent\n";
foreach my $key (sort {$codes_number{$a} <=> $codes_number{$b}} keys %codes_number){
if ($has_barcode{$key}){
my $pct = 100*$has_barcode{$key}/$total;
print LOG "$key\t$codes_hash{$key}\t$has_barcode{$key}\t$pct\n";
}
}
#Step 2: Trimmed reads to remove transposon, append 16bp reads with a 5'N
my $species;
my $position;
$species->{"ATCG"}="Bt7330";
$species->{"TCGA"}="Bt7330";
$species->{"ACGT"}="Bvulgatus";
$species->{"TACG"}="Bvulgatus";
$species->{"CGAT"}="Bovatus";
$species->{"GATC"}="Bovatus";
$species->{"GTAC"}="Buniformis";
$species->{"CGTA"}="Buniformis";
$species->{"TCAG"}="BWH2";
$species->{"ACTG"}="BWH2";
$species->{"TCGT"}="BtVPI";
$position->{"ATCG"}="Forward";
$position->{"TCGA"}="Reverse";
$position->{"ACGT"}="Forward";
$position->{"TACG"}="Reverse";
$position->{"CGAT"}="Forward";
$position->{"GATC"}="Reverse";
$position->{"GTAC"}="Forward";
$position->{"CGTA"}="Reverse";
$position->{"TCAG"}="Forward";
$position->{"ACTG"}="Reverse";
$position->{"TCGT"}="Both";
open IN,$barcode_assigned;
my @tn_array=split (//,"ACAGGTTG");
$total=0;
my $count=0;
my $exact_match_count=0;
my $mismatch1_count=0;
my $mismatch2_count=0;
my $header;
my %tn_match; #$tn_match{sampleID} = number of reads that have TN sequence at allowable #mismatches
my @header;
my $bc;
my $number;
my $length;
my $NON_TN="INSEQ_nonTN";
my $count_nonTN=0;
my $innerbarcode;
open NONTN,">$NON_TN";
my $bag;
my $inner_count;
while (my $line =<IN>){ #go through each line
chomp $line;
if ($line =~m/^>/){ #if a header row
$header = $line;
@header = split (/:/,$line);
$bc=$header[0];
$bc=~s/>//;
unless (exists $tn_match{$header[0]}){
$tn_match{$header[0]}=0;
}
} else { #if a sequence row
my $seq = $line;
my $pos1_match=0;
my $pos2_match=0;
#does read contain transposon at bp 20 or 21?
my $tn_pos1=substr($seq, 16, 8);
my $tn_pos2=substr($seq, 17, 8);
if ($tn_pos1 eq 'ACAGGTTG') {
$pos1_match=1;
$exact_match_count++;
$count++;
} else {
if ($tn_pos2 eq 'ACAGGTTG') {
$pos2_match=1;
$exact_match_count++;
$count++;
}
}
if ($mismatch1 == 1) { #if 1bp tn mismatches allowed
my $pos1_score=0;
my $pos2_score=0;
my @pos1_array = split (//,$tn_pos1);
my @pos2_array = split (//,$tn_pos2);
for (my $n=0; $n<8;$n++){
if ($pos1_array[$n] eq $tn_array[$n]) {
$pos1_score++;
}
if ($pos2_array[$n] eq $tn_array[$n]) {
$pos2_score++;
}
}
if ($pos1_score == 7) {
$pos1_match=1;
$mismatch1_count++;
$count++;
}
if ($pos2_score == 7) {
$pos2_match=1;
$mismatch1_count++;
$count++;
}
}
if ($mismatch1 == 2) { #if 2bp tn mismatches allowed
my $pos1_score=0;
my $pos2_score=0;
my @pos1_array = split (//,$tn_pos1);
my @pos2_array = split (//,$tn_pos2);
for (my $n=0; $n<8;$n++){
if ($pos1_array[$n] eq $tn_array[$n]) {
$pos1_score++;
}
if ($pos2_array[$n] eq $tn_array[$n]) {
$pos2_score++;
}
}
if ($pos1_score == 7) {
$pos1_match=1;
$mismatch1_count++;
$count++;
}
if ($pos2_score == 7) {
$pos2_match=1;
$mismatch1_count++;
$count++;
}
if ($pos1_score == 6) {
$pos1_match=1;
$mismatch2_count++;
$count++;
}
if ($pos2_score == 6) {
$pos2_match=1;
$mismatch2_count++;
$count++;
}
}
#if match found
if ($pos1_match==1){
#trim transposon sequence
my $trimmed_seq = substr ($seq, 0, 16);
$tn_match{$header[0]}++;
$innerbarcode=substr($seq,$index_position,4);
# print "$bc\t$innerbarcode\t$species->{$innerbarcode}\t$position->{$innerbarcode}\n";
if ($species->{$innerbarcode}){
push @{$bag->{$bc}->{$species->{$innerbarcode}}},$trimmed_seq;
$inner_count->{$bc}->{$species->{$innerbarcode}}->{$position->{$innerbarcode}}++;
}
}
elsif ($pos2_match==1){
#trim transposon sequence
my $trimmed_seq = substr ($seq, 0, 17);
$tn_match{$header[0]}++;
$innerbarcode=substr($seq,$index_position,4);
if ($species->{$innerbarcode}){
push @{$bag->{$bc}->{$species->{$innerbarcode}}},$trimmed_seq;
$inner_count->{$bc}->{$species->{$innerbarcode}}->{$position->{$innerbarcode}}++;
}
}else{
$count_nonTN++;
$header=$header."_".$count_nonTN;
print NONTN "$header\n$seq\n";
}
}
}
close OUT;
foreach my $sample (sort keys %$bag){
foreach my $library (@{$libraries->{$sample}}){
my $fh=$filehandlehash->{$sample}->{$library};
foreach my $read (@{$bag->{$sample}->{$library}}){
$header=$sample.":".$library;
print $fh ">$header\n";
print $fh "$read\n";
}
}
}
my $non_TN_percent=sprintf ("%.1f",$count_nonTN/$total_mapped*100);
print LOG "In the trimming process, $count_nonTN ($non_TN_percent%) are not trimmed, Here is the statistics for each sample\n";
print LOG "Sample\tTrimmed\tPercentage\n";
foreach my $key (sort {$codes_number{$a} <=> $codes_number{$b}} keys %codes_number){
if ($has_barcode{$key}){
my $trimkey=">".$codes_hash{$key};
my $percentage=sprintf("%.1f",$tn_match{$trimkey}/$has_barcode{$key}*100);
print LOG "$codes_hash{$key}\t$has_barcode{$key}\t$tn_match{$trimkey}\t$percentage\n";
}
}
foreach my $sample (sort keys %{$bag}){
foreach my $code (sort keys %$species){
if ($inner_count->{$sample}->{$species->{$code}}->{$position->{$code}}){
print LOG "$sample\t$species->{$code}\t$position->{$code}\t$inner_count->{$sample}->{$species->{$code}}->{$position->{$code}}\n";
}
}
}
close LOG;
my $clean_up="clean_up.sh";
open CL,">$clean_up";
print CL "rm $barcode_assigned\n",
"rm mappingjobs_*\n",
"rm -r bcsortedseqs\n";
#Create job files for each sample
chdir "bcsortedseqs";
my @sorted_seq=`ls *.fas`;
chdir $outputdir;
foreach my $sorted_seq (@sorted_seq) {
chomp $sorted_seq;
if ($sorted_seq=~m/$input\_(\w+)\_(\w+)\.fas/){
my $sample=$1;
my $index=$2;
open (TASKSFORBC, ">$outputdir\/mappingjobs\_$sorted_seq\.job") || die "Error: Can't create $outputdir\/mappingjobs\_$sorted_seq\.job\n\n";
my @ptt=`ls $path/indexes/$index/*.ptt`;
print TASKSFORBC "$bowtie_path/bowtie -m 1 --best --strata -a --fullref -n $mismatch2 -l 17 $path\/indexes\/$index\/$index -f bcsortedseqs\/$sorted_seq results\/$sorted_seq\.bowtiemap\n";
print TASKSFORBC "perl $path/scripts/process_bowtie_output.pl results\/$sorted_seq\.bowtiemap \n";
foreach my $ptt (@ptt){
chomp $ptt;
if ($ptt =~m/$path\/indexes\/$index\/(.*)\.ptt/){
print TASKSFORBC "perl $path/scripts/normalize_processed_filter.pl results\/$sorted_seq\.bowtiemap_processed.txt_$1\n";
if (!$arrayed){
if ($operon==0){
print TASKSFORBC "perl $path/scripts/map_genes.pl $ptt results\/$sorted_seq\.bowtiemap_processed.txt_$1_filter_cpm.txt $disruption\n";
}elsif ($operon==1){
print TASKSFORBC "perl $path/scripts/map_genes_operon.pl $ptt $path/indexes\/$index\/$1.operons $disruption $cutoff results\/$sorted_seq\.bowtiemap_processed.txt_$1_filter_cpm.txt \n";
}
}else{
}
}
}
close TASKSFORBC;
}
system("qsub -l h_vmem=4G mappingjobs\_$sorted_seq\.job");
}