https://github.com/AlexanderDilthey/MHC-PRG
Tip revision: e59943adb8855532573a6c276651efad1e18a6b1 authored by Alexander Dilthey on 18 December 2018, 10:20:48 UTC
Update HLA-PRG.md
Update HLA-PRG.md
Tip revision: e59943a
downsample_hla_reads.pl
#!/usr/bin/perl
use strict;
use Data::Dumper;
use List::MoreUtils qw/all/;
use File::Path;
my @coverage_targets = (40, 30, 20);
my $iterations = 3;
my %samples = (
'I1_AA02O9Q_Z2' => 53.4,
# 'I3_NA12878' => 60.7,
);
open(CMD, '>', '_downsample_inference_commands.txt') or die;
foreach my $sample (keys %samples)
{
my $current_coverage = $samples{$sample};
die "Coverage problem" unless(all {$_ < $current_coverage} @coverage_targets);
my $reads_file_1 = '../tmp/hla/' . $sample . '/reads.p.n_1';
my $reads_file_2 = '../tmp/hla/' . $sample . '/reads.p.n_2';
die unless(-e $reads_file_1);
die unless(-e $reads_file_2);
my @sampleIDs;
my @sampleIDs_perTarget;
foreach my $target (@coverage_targets)
{
my @sampleIDs_thisTarget;
for(my $iteration = 1; $iteration <= $iterations; $iteration++)
{
my $downsampling_factor = $target / $current_coverage;
print "Sample $sample target ${target}x iteration $iteration downsampling factor ", sprintf("%.2f", $downsampling_factor), "\n";
my $new_sampleID = 'downsample_' . $sample . '_DSC' . $target . '_' . $iteration;
my $output_dir = '../tmp/hla/' . $new_sampleID;
if(-e $output_dir)
{
rmtree($output_dir);
}
mkdir($output_dir) or die "Cannot mkdir $output_dir";
my $output_file_1 = $output_dir . '/reads.p.n_1';
my $output_file_2 = $output_dir . '/reads.p.n_2';
my $fh_in_1;
my $fh_in_2;
open($fh_in_1, '<', $reads_file_1) or die "Cannot open $reads_file_1";
open($fh_in_2, '<', $reads_file_2) or die "Cannot open $reads_file_2";
print "\t$reads_file_1 => $output_file_1\n";
print "\t$reads_file_2 => $output_file_2\n";
my $reads_read = 0;
my $reads_printed = 0;
open(READSOUT1, '>', $output_file_1) or die "Cannot open $output_file_1";
open(READSOUT2, '>', $output_file_2) or die "Cannot open $output_file_2";
while(! eof($fh_in_1))
{
my @r = get_read_lines($fh_in_1, $fh_in_2);
$reads_read++;
if(rand() <= $downsampling_factor)
{
print READSOUT1 join('', @{$r[0]});
print READSOUT2 join('', @{$r[1]});
$reads_printed++;
}
}
close(READSOUT2);
close(READSOUT1);
my $achieved_downsampling_factor = ($reads_read != 0) ? ($reads_printed / $reads_read) : -1;
print "\t$reads_read reads read, $reads_printed reads printed, achieved factor: ", sprintf("%.2f", $achieved_downsampling_factor), "\n";
close($fh_in_1);
close($fh_in_2);
push(@sampleIDs, $new_sampleID);
push(@sampleIDs_thisTarget, $new_sampleID);
}
print "\n";
push(@sampleIDs_perTarget, \@sampleIDs_thisTarget);
}
print CMD qq(./HLAtypeinference.pl --actions ai --sampleIDs ).join(',', @sampleIDs)."\n";
foreach my $sampleList (@sampleIDs_perTarget)
{
print CMD qq(./HLAtypeinference.pl --actions v ---trueHLA /file/validation --sampleIDs ).join(',', @$sampleList)."\n";
}
print CMD "\n";
}
close(CMD);
sub get_read_lines
{
my $fh_1 = shift;
my $fh_2 = shift;
my @l1, my @l2;
for(my $i = 1; $i <= 4; $i++)
{
die if eof($fh_1);
die if eof($fh_2);
my $l1 = <$fh_1>;
my $l2 = <$fh_2>;
push(@l1, $l1);
push(@l2, $l2);
}
die unless(substr($l1[0], 0, 1) eq '@');
die unless(substr($l2[0], 0, 1) eq '@');
die unless(substr($l1[2], 0, 1) eq '+');
die unless(substr($l2[2], 0, 1) eq '+');
return (\@l1, \@l2);
}