https://github.com/AlexanderDilthey/MHC-PRG
Tip revision: e59943adb8855532573a6c276651efad1e18a6b1 authored by Alexander Dilthey on 18 December 2018, 10:20:48 UTC
Update HLA-PRG.md
Update HLA-PRG.md
Tip revision: e59943a
HLAtypeinference_copySimulations.pl
#!/usr/bin/perl -w
use strict;
use List::MoreUtils qw/all mesh any /;
use Data::Dumper;
use Getopt::Long;
use Sys::Hostname;
use File::Copy;
use File::Basename;
use Storable;
# input parameters
my $iteration = 1;
GetOptions ('iteration:s' => \$iteration,
);
my @dirs = glob('../tmp/hla/simulations/*');
foreach my $dir (@dirs)
{
die unless($dir =~ /.+\/(.+)/);
my $baseName_outer = $1;
# true HLA
my $trueHLA = $dir . '/trueHLA.txt';
my $trueHLA_out = '../tmp/hla/trueHLA_'.$iteration.'_'.$baseName_outer;
open(TRUE, '<', $trueHLA) or die "Cannot open $trueHLA";
open(TRUEOUT, '>', $trueHLA_out) or die "Cannot open $trueHLA_out";
my $headerLine = <TRUE>;
chomp($headerLine);
my @header_fields = split(/ /, $headerLine);
for(my $i = 1; $i <= $#header_fields; $i++)
{
$header_fields[$i] = 'HLA'.$header_fields[$i];
}
print TRUEOUT join("\t", @header_fields), "\n";
while(<TRUE>)
{
my $line = $_;
chomp($line);
my @fields = split(/ /, $line);
die unless($fields[0] =~ /S_(\d+)/);
my $sampleN = $1;
$fields[0] = 'simulations_'.$baseName_outer.'_sample'.$sampleN;
for(my $i = 1; $i <= $#fields; $i++)
{
my $v = $fields[$i];
my @p = split(/\//, $v);
die unless($#p == 1);
for(my $j = 0; $j <= $#p; $j++)
{
my @p2 = split(/\:/, $p[$j]);
if($#p2 >= 1)
{
$p[$j] = join(':', @p2[0, 1]);
}
}
$v = join('/', @p);
$fields[$i] = $v;
}
print TRUEOUT join("\t", @fields), "\n";
}
close(TRUE);
close(TRUEOUT);
print "Validation HLA file:\n$trueHLA_out\n\n";
# true haplotypes
my $trueHaplotypes = $dir . '/trueHaplotypes.txt';
my $trueHaplotypes_out = '../tmp/hla/trueHaplotypes'.$iteration.'_'.$baseName_outer;
open(TRUE, '<', $trueHaplotypes) or die "Cannot open $trueHaplotypes";
open(TRUEOUT, '>', $trueHaplotypes_out) or die "Cannot open $trueHaplotypes_out";
$headerLine = <TRUE>;
chomp($headerLine);
@header_fields = split(/ /, $headerLine);
for(my $i = 1; $i <= $#header_fields; $i++)
{
$header_fields[$i] = 'HLA'.$header_fields[$i];
}
print TRUEOUT join("\t", @header_fields), "\n";
while(<TRUE>)
{
my $line = $_;
chomp($line);
my @fields = split(/ /, $line);
die unless($fields[0] =~ /S_(\d+)/);
my $sampleN = $1;
$fields[0] = 'simulations_'.$baseName_outer.'_sample'.$sampleN;
for(my $i = 1; $i <= $#fields; $i++)
{
my $v = $fields[$i];
my @p = split(/\//, $v);
die unless($#p == 1);
for(my $j = 0; $j <= $#p; $j++)
{
my @p2 = split(/\:/, $p[$j]);
if($#p2 >= 1)
{
$p[$j] = join('', @p2[0, 1]);
}
}
$v = join('/', @p);
$fields[$i] = $v;
}
print TRUEOUT join("\t", @fields), "\n";
}
close(TRUE);
close(TRUEOUT);
print "Validation haplotypes file:\n$trueHaplotypes_out\n\n";
# perturbed positions
my $trueHaplotypesPerturbed = $dir . '/trueHaplotypes.txt.perturbed';
my $trueHaplotypesPerturbed_out = '../tmp/hla/trueHaplotypes'.$iteration.'_'.$baseName_outer.".perturbed";
open(TRUE, '<', $trueHaplotypesPerturbed) or die "Cannot open $trueHaplotypesPerturbed";
open(TRUEOUT, '>', $trueHaplotypesPerturbed_out) or die "Cannot open $trueHaplotypesPerturbed_out";
$headerLine = <TRUE>;
chomp($headerLine);
@header_fields = split(/ /, $headerLine);
for(my $i = 1; $i <= $#header_fields; $i++)
{
$header_fields[$i] = 'HLA'.$header_fields[$i];
}
print TRUEOUT join("\t", @header_fields), "\n";
while(<TRUE>)
{
my $line = $_;
chomp($line);
my @fields = split(/ /, $line);
die unless($fields[0] =~ /S_(\d+)/);
my $sampleN = $1;
$fields[0] = 'simulations_'.$baseName_outer.'_sample'.$sampleN;
for(my $i = 1; $i <= $#fields; $i++)
{
my $v = $fields[$i];
my @p = split(/\//, $v, -1);
die "Problem with file $trueHaplotypesPerturbed: missing slash in line $. for field $header_fields[$i] - field value '$v' " unless($#p == 1);
for(my $j = 0; $j <= $#p; $j++)
{
my @p2 = split(/\:/, $p[$j]);
if($#p2 >= 1)
{
$p[$j] = join('', @p2[0, 1]);
}
}
$v = join('/', @p);
$fields[$i] = $v;
}
print TRUEOUT join("\t", @fields), "\n";
}
close(TRUE);
close(TRUEOUT);
print "File with perturbed positions: $trueHaplotypesPerturbed_out\n";
# files
my @subfiles = glob($dir.'/*_1.fastq');
foreach my $subfile (@subfiles)
{
my $subfile_2 = $subfile;
$subfile_2 =~ s/_1\.fastq/_2.fastq/;
die unless(-e $subfile_2);
die unless($subfile =~ /.+\/(.+)/);
my $basename = $1;
die unless($basename =~ /S_(\d+)_/);
my $simulationNumber = $1;
#die Dumper($subfile_2, $basename, $simulationNumber);
my $targetDir = '../tmp/hla/I'.$iteration.'_simulations_'.$baseName_outer.'_sample'.$simulationNumber;
mkdir($targetDir);
die "Cannot mkdir $targetDir" unless(-e $targetDir);
my $targetfile_1 = $targetDir . '/reads.p_1';
my $targetfile_2 = $targetDir . '/reads.p_2';
print "Copying into $targetDir ...\n";
copy($subfile, $targetfile_1) or die "Cannot copy file";
copy($subfile_2, $targetfile_2) or die "Cannot copy file";
}
}