https://github.com/AlexanderDilthey/MHC-PRG
Raw File
Tip revision: e59943adb8855532573a6c276651efad1e18a6b1 authored by Alexander Dilthey on 18 December 2018, 10:20:48 UTC
Update HLA-PRG.md
Tip revision: e59943a
reduce_coverage_hlaBAM_targetedByAmplification.pl
#!/usr/bin/perl

# Always assume paired-end reads!

use strict;
use Data::Dumper;
use Getopt::Long;   

my $samtools_bin = find_present_alternative('/opt/sw/software/Bio/samtools/0.1.19/bin/samtools', '/home/dilthey/samtools-0.1.18/samtools', '/apps/well/samtools/0.1.19/bin/samtools');


my $maximum_number_starting_reads = 250;
#my $input_BAM = '/gpfs1/well/gsk_hla/temp_mapping_2/AM_WTSI19556/merged.bam';
#my $output_BAM = '/gpfs1/well/gsk_hla/Alexander_Mentzer_BAMs_downsampled/AM_WTSI19556.bam';

my $input_BAM;
my $output_BAM;


GetOptions (
 'input_BAM:s' => \$input_BAM,  
 'output_BAM:s' => \$output_BAM,  
);

die "Please specify --input_BAM" unless($input_BAM);
die "Please specify --output_BAM" unless($output_BAM);
die "Please specify an existing --input_BAM" unless(-e $input_BAM);
die unless($input_BAM ne $output_BAM);

print "\nTarget (maximum) rate of reads starting per position: ", $maximum_number_starting_reads, "\n\n";

my $tF = getTempFile();

my $cmd_extract = qq($samtools_bin view -h $input_BAM > $tF);

system($cmd_extract) and die "Samtools extraction command failed: $cmd_extract";

print "Extracted SAM into $tF\n";

my $fn_samout = $tF . '.filtered';

my $examined_reads = 0;
my $printed_reads = 0;

open(SAMOUT, '>', $fn_samout) or die "Cannot open $fn_samout";

my %seen_read_IDs;
my %read_IDs_decisions;

my $current_start_position;
my @current_start_position_reads;


my $process_open_reads = sub {

	if(not defined $current_start_position)
	{
		return;
	}

	$current_start_position = undef;	
	return unless(scalar(@current_start_position_reads));
	
	my %read_IDs;
	my $positionID;
	foreach my $read (@current_start_position_reads)
	{
		my $readID = $read->[0];
		$read_IDs{$readID}++;
		
		my $thisRead_posID = $read->[2] . ':' . $read->[3];
		unless(defined $positionID)
		{
			$positionID = $thisRead_posID;
		}
		
		die unless($thisRead_posID eq $positionID);
	}
	
	my %take_read_IDs;
	my $take_reads_rate = $maximum_number_starting_reads / scalar(keys %read_IDs);
	
	if($positionID eq '*:0')
	{
		$take_reads_rate = 1;
	}
	
	if($take_reads_rate >= 1)
	{
		%take_read_IDs = map {$_ => 1} (keys %read_IDs);
	}
	else
	{
		foreach my $readID (keys %read_IDs)
		{
			my $r = rand(1);
			die unless(($r >= 0) and ($r <= 1));
			$take_read_IDs{$readID} = ($r <= $take_reads_rate);
		}
	}
	
	# print "Position ${positionID}, original reads ", scalar(keys %read_IDs), ", take rate ${take_reads_rate}, taken reads ", scalar(grep {$take_read_IDs{$_}} keys %take_read_IDs), "\n";
	
	foreach my $read (@current_start_position_reads)
	{
		my $readID = $read->[0];
		die unless(defined $take_read_IDs{$readID});
		if($take_read_IDs{$readID})
		{
			print SAMOUT join("\t", @$read), "\n";
			$printed_reads++;
		}
	}
	
	foreach my $readID (keys %take_read_IDs)
	{
		die if(defined $read_IDs_decisions{$readID});
		$read_IDs_decisions{$readID} = $take_read_IDs{$readID};
	}
	
	@current_start_position_reads = ();
};	

open(SAMIN, '<', $tF) or die "Cannot open $tF";
while(<SAMIN>)
{	
	my $line = $_;
	chomp($line);
	next unless($line);
	
	if(substr($line, 0, 1) eq '@')
	{
		print SAMOUT $line, "\n";
	}
	else
	{
		$examined_reads++;
		
		my @line_fields = split(/\t/, $line);
		my $readName = $line_fields[0];
		my $reference = $line_fields[2];
		my $position = $line_fields[3];
		my $posID = $reference . ':' . $position;
		
		if($posID ne $current_start_position)
		{
			$process_open_reads->();
		}
		
		$current_start_position = $posID;
		
		$seen_read_IDs{$readName}++;
		if(exists $read_IDs_decisions{$readName})
		{	
			if($read_IDs_decisions{$readName})
			{
				print SAMOUT $line, "\n";
				$printed_reads++;
			}
		}
		else
		{
			push(@current_start_position_reads, \@line_fields);
		}
	}
}

$process_open_reads->();

close(SAMOUT);
close(SAMIN);

print "\nProcessed reads: ${examined_reads}; taken: $printed_reads - rate ", sprintf("%.2f", $printed_reads / $examined_reads ), "\n";

my $lost_read_IDs = 0;
my $split_read_IDs = 0;
foreach my $readID (keys %seen_read_IDs)
{
	my $c = $seen_read_IDs{$readID};
	if($c <= 1)
	{
		$lost_read_IDs++;
	}
	elsif($c >= 3)
	{
		$split_read_IDs++;
	}
}

print "\nRead ID statistics:\n";
print "\tTotal: ", scalar(keys %seen_read_IDs), "\n";
print "\tApparently non-paired: ", $lost_read_IDs, "\n";
print "\tApparently split: ", $split_read_IDs, "\n";

my $cmd_transform_BAM = qq(cat $fn_samout | ${samtools_bin} view -Sb - > ${output_BAM});
system($cmd_transform_BAM) and "BAM conversion command $cmd_transform_BAM failed";

my $cmd_index_BAM = qq(${samtools_bin} index ${output_BAM});
system($cmd_index_BAM) and "BAM conversion command $cmd_index_BAM failed";

print "\nProduced BAM: $output_BAM \n\n";

unlink($tF) or warn "Could not delete $tF";
unlink($fn_samout) or warn "Could not delete $fn_samout";

sub getTempFile
{
	my $tmp_dir = "../tmp";
	die unless(-e $tmp_dir);
	die unless(-d $tmp_dir);
	
	my $fnTmp;
	do {
		my $k = genkey();
		$fnTmp = $tmp_dir . '/' . $k;		
	} while (-e $fnTmp);
	
	return $fnTmp;
}
sub find_present_alternative
{
        foreach my $a (@_)
        {
                if(-e $a)
                {
                        return $a;
                }
        }
        die "Could not find a present alternative from list:\n".join("\n", @_);
}

sub genkey
{
        my $key;
        my $num = $_[0] ? $_[0] : 25;
        for (my $i = 0; $i <= $num; $i++)
        {
                my $val =  int(rand(9)+1);
                $val = ($val > 9) ? 9 : $val;
                $key .= $val;
        }
        return $key;
}


back to top