Content - 16cb454d1aa0706ed6f32c08e55232cbf05d031d - ec167a1/HLAtypeinference.pl

visit type:
Tip revision: e59943adb8855532573a6c276651efad1e18a6b1 authored by Alexander Dilthey on 18 December 2018, 10:20:48 UTC
Update HLA-PRG.md
Tip revision: e59943a
HLAtypeinference.pl
#!/usr/bin/perl -w

use strict;
use List::MoreUtils qw/all mesh any /;
use List::Util qw/sum/;
use Data::Dumper;
use Getopt::Long;   
use Sys::Hostname;
use File::Copy;
use File::Basename;
use Storable;
use simpleHLA;
use File::Basename;

my $kMer_size = 55;  

# my @testCases = (
	# [[qw/A A/], [qw/A A/]],
	# [[qw/? A/], [qw/A A/]],
	# [[qw/A ?/], [qw/A A/]],
	# [[qw/A T/], [qw/A A/]],
	# [[qw/A A/], [qw/T A/]],
	# [[qw/A C/], [qw/G T/]],
	# [[qw/A C/], [qw/? T/]],
	# [[qw/? C/], [qw/G T/]],
	# [[qw/? ?/], [qw/G T/]],
	# [[qw/? T/], [qw/? T/]],
	# [[qw/? C/], [qw/? T/]],
	
# );
# foreach my $testCase (@testCases)
# {
	# print join(' vs ', map {join('/', @$_)} @$testCase), "   ", join(' ', compatibleStringAlleles($testCase->[0], $testCase->[1])), "\n";
# }
# exit;

# input parameters

my $graph = 'hla';   
my $sampleIDs = '';
my $BAMs = '';
my $actions;
my $trueHLA;
my $trueHaplotypes;
#my $validation_round = 'R1';
my $T = 0;
my $minPropkMersCovered = 0;
my $minCoverage = 0;
my $all_2_dig = 0;
my $only_4_dig = 1;
my $reduce_to_4_dig = 0;
my $HiSeq250bp = 0;
my $MiSeq250bp = 0;

my $fastExtraction = 0;

my $fromPHLAT = 0;
my $fromHLAreporter = 0;

my $referenceGenome;
my $threads = 1;

my $no_fail = 0;
my $vP = '';

my @loci_for_check = qw/A B C DQA1 DQB1 DRB1/;

GetOptions ('graph:s' => \$graph,
 'sampleIDs:s' => \$sampleIDs, 
 'BAMs:s' => \$BAMs, 
 'actions:s' => \$actions, 
 'trueHLA:s' => \$trueHLA,
 'trueHaplotypes:s' => \$trueHaplotypes, 
 'referenceGenome:s' => \$referenceGenome, 
 #'validation_round:s' => \$validation_round,
 'T:s' => \$T,
 'minCoverage:s' => \$minCoverage,
 'minPropkMersCovered:s' => \$minPropkMersCovered,
 'all_2_dig:s' => \$all_2_dig,
 'only_4_dig:s' => \$only_4_dig,
 'HiSeq250bp:s' => \$HiSeq250bp, 
 'MiSeq250bp:s' => \$MiSeq250bp, 
 'fastExtraction:s' => \$fastExtraction, 
 'fromPHLAT:s' => \$fromPHLAT,
 'fromHLAreporter:s' => \$fromHLAreporter,
 'reduce_to_4_dig:s' => \$reduce_to_4_dig,
 'threads:s' => \$threads,
 'no_fail:s' => \$no_fail,
 'vP:s' => \$vP,
);         

die if($fromPHLAT and $fromHLAreporter);
my $fromMHCPRG = ((not $fromPHLAT) and (not $fromHLAreporter));

if($MiSeq250bp and $HiSeq250bp)
{
	die "You activated switches for both HiSeq and MiSeq 250bp data - it is either-or";
}

if($minCoverage)
{
	print "Minimum coverage threshold in place: $minCoverage\n";
}

if($minPropkMersCovered)
{
	print "Threshold for kMer coverage in place: $minPropkMersCovered \n";
}

if($fastExtraction)
{
	$HiSeq250bp = 1;
}

my $genome_graph_file = qq(../tmp2/GS_nextGen/hla/derived/Homo_sapiens.GRCh37.60.dna.chromosome.ALL.blockedHLAgraph_k25.ctx);
unless(-e $genome_graph_file)
{
	die "Please set variable \$genome_graph_file to an existing file - the current value $genome_graph_file is not accessible.";
}

my $expected_kMer_file = qq(../tmp2/GS_nextGen/${graph}/requiredkMers_graph.txt.kmers_25);
unless(-e $expected_kMer_file)
{
	die "Please set variable \$expected_kMer_file to an existing file - the current value $expected_kMer_file is not accessible.";
}

my $exon_folder = qq(../tmp2/GS_nextGen/${graph}/);
unless(-e $exon_folder)
{
	die "Please provide a kMerified graph -- exon folder not there!";
}

my $normal_bin = qq(../bin/MHC-PRG);
my $cluster3_bin = qq(../bin/MHC-PRG);
my $use_bin = ((hostname() =~ /cluster3/) or (hostname() =~ /^comp[AB]\d+$/)) ? $cluster3_bin : $normal_bin;
unless(-e $use_bin)
{
	die "Cannot find expected binary: $use_bin";
}
	
my @BAMs = split(/,/, $BAMs);
my @sampleIDs = split(/,/, $sampleIDs);
if(@sampleIDs)
{
	foreach my $sampleID (@sampleIDs)
	{
		unless($sampleID =~ /^[\w]+$/)
		{
			die "Please provide only sample IDs that consist of 'word' characters (regexp \\w+).";
		}
	}
}	

my $sample_IDs_abbr;
if($sampleIDs =~ /^allSimulations(_\w+)?/)
{
	my $addFilter = $1;
	my @dirs;
	if($addFilter)
	{
		@dirs = grep {$_ =~ /I\d+_simulations${addFilter}/} grep {-d $_} glob('../tmp/hla/*');
	}
	else
	{
		@dirs = grep {$_ =~ /I\d+_simulations/} grep {-d $_} glob('../tmp/hla/*');
	}
	
	@sampleIDs = map {die "Can't parse $_" unless($_ =~ /tmp\/hla\/(.+)/); $1} @dirs;
	
	if($sampleIDs =~ /^all_simulations_I(\d+)/i)
	{
		my $iteration = $1;
		@sampleIDs = grep {$_ =~ /^I${iteration}_/i} @sampleIDs;
	}
	
	my $debug = 1;
	if($debug)
	{
		@sampleIDs = grep {die unless($_ =~ /sample(\d+)$/); ($1 < 30)} @sampleIDs;
	}
	
	$sample_IDs_abbr = $sampleIDs;
}
elsif($sampleIDs =~ /^all/)
{
	my @dirs = grep {$_ !~ /simulations/} grep {-d $_} glob('../tmp/hla/*');
	@sampleIDs = map {die "Can't parse $_" unless($_ =~ /tmp\/hla\/(.+)/); $1} @dirs;
	
	if($sampleIDs =~ /^all_I(\d+)/i)
	{
		my $iteration = $1;
		@sampleIDs = grep {$_ =~ /^I${iteration}_/i} @sampleIDs;
	}
	else
	{
		die "Does this make sense?";
		@sampleIDs = grep {$_ =~ /^$sampleIDs/i} @sampleIDs;	
	}
	
	if($actions =~ /v|w/)
	{
		@sampleIDs = @sampleIDs[5 .. $#sampleIDs];
		@sampleIDs = grep {$_ !~ /AA02O9Q_Z2/} @sampleIDs;
		@sampleIDs = grep {$_ !~ /AA02O9R/} @sampleIDs;		
		
		# warn "\n\n\n\n!!!!!!!!!!!!!!!!!!!!!\n\nRemove first five and NA12878 and high-coverage sample for validation!\n\n!!!!!!!!!!!!!!!!!!!!!!!!!\n\n\n";
		# @sampleIDs = @sampleIDs[0 .. 4];
		
		die unless(($actions eq 'v') or ($actions eq 'w'));
	}
	
	$sample_IDs_abbr = $sampleIDs;
}
else
{
	$sample_IDs_abbr = join('_', @sampleIDs);
	if(length($sample_IDs_abbr) > 50)
	{
		$sample_IDs_abbr = substr($sample_IDs_abbr, 0, 50);
	}
}

if(scalar(@sampleIDs) > 5)
{
	#@sampleIDs = @sampleIDs[0 .. 4];
	#warn "\n\n\n\n!!!!!!!!!!!!!!!!!!!!!\n\nLimited samples!\n\n!!!!!!!!!!!!!!!!!!!!!!!!!\n\n\n";
}

#@sampleIDs = $sampleIDs[0]; # todo remove
#warn "\n\n\n\n!!!!!!!!!!!!!!!!!!!!!\n\nLimited samples:\n".join("\n", @sampleIDs)."\n\n!!!!!!!!!!!!!!!!!!!!!!!!!\n\n\n";

# die Dumper(\@sampleIDs, \@BAMs); 

if($actions =~ /p/)
{
	die "No positive (-p) and long-read-positive (-p) filtering at the same time" if($actions =~ /l/);

	unless(@BAMs)
	{
		die "Please provide --BAMs for positive filtering";
	}
	unless($#BAMs == $#sampleIDs)
	{
		die "Please provide an equal number of --BAMs and --sampleIDs";
	}
	
	for(my $bI = 0; $bI <= $#BAMs; $bI++)
	{
		my $BAM = $BAMs[$bI];
		my $sampleID = $sampleIDs[$bI];
		
		unless(-e $BAM)
		{
			die "Specified BAM $BAM (in --BAMs) does not exist!\n";
		}
		
		my $output_file = '../tmp/hla/'.$sampleID.'/reads.p';
		unless(-e '../tmp/hla/'.$sampleID)
		{
			mkdir('../tmp/hla/'.$sampleID) or die "Cannot mkdir ".'../tmp/hla/'.$sampleID;
		}
		
		my $command = qq($use_bin domode filterReads --input_BAM $BAM --positiveFilter $expected_kMer_file --output_FASTQ $output_file --threads $threads );
		
		if($referenceGenome)
		{
			$command .= qq( --referenceGenome $referenceGenome);
		}	
		else
		{
			die "Positive filtering should use reference genome, deactivate only if you know what you are doing!";
		}
		
		if($HiSeq250bp)
		{
			$command .= qq( --HiSeq250bp);
		}
		
		print "Now executing command:\n$command\n\n";
		
		my $ret = system($command);
		unless($ret == 0)
		{
			die "Command $command failed";
		}
	}
}

if($actions =~ /l1/)
{
	die "No positive (-p) and long-read-positive (-p) filtering at the same time" if($actions =~ /p/);
	
	unless(@BAMs)
	{
		die "Please provide --BAMs for positive filtering";
	}
	unless($#BAMs == $#sampleIDs)
	{
		die "Please provide an equal number of --BAMs and --sampleIDs";
	}
	
	for(my $bI = 0; $bI <= $#BAMs; $bI++)
	{
		my $BAM = $BAMs[$bI];
		my $sampleID = $sampleIDs[$bI];
		
		unless(-e $BAM)
		{
			die "Specified BAM $BAM (in --BAMs) does not exist!\n";
		}
		
		my $output_file = '../tmp/hla/'.$sampleID.'/reads.p';
		unless(-e '../tmp/hla/'.$sampleID)
		{
			mkdir('../tmp/hla/'.$sampleID) or die "Cannot mkdir ".'../tmp/hla/'.$sampleID;
		}
		
		my $command = qq($use_bin domode filterLongOverlappingReads --input_BAM $BAM --output_FASTQ $output_file --graphDir ../tmp2/GS_nextGen/${graph});
		
		if($referenceGenome)
		{
			$command .= qq( --referenceGenome $referenceGenome);
		}	
		
		print "Now executing command:\n$command\n\n";
		
		my $ret = system($command);
		unless($ret == 0)
		{
			die "Command $command failed";
		}		
	}
}


if($actions =~ /l2/)
{
	die "No positive (-p) and long-read-positive (-p) filtering at the same time" if($actions =~ /p/);
	
	unless(@BAMs)
	{
		die "Please provide --BAMs for positive filtering";
	}
	unless($#BAMs == $#sampleIDs)
	{
		die "Please provide an equal number of --BAMs and --sampleIDs";
	}
	
	for(my $bI = 0; $bI <= $#BAMs; $bI++)
	{
		my $BAM = $BAMs[$bI];
		my $sampleID = $sampleIDs[$bI];
		
		my $FASTQ = $BAM . '.fastq';
		
		unless(-e $BAM)
		{
			die "Specified BAM $BAM (in --BAMs) does not exist!\n";
		}

		
		unless(-e $FASTQ)
		{
			die "Specified FASTQ $FASTQ (from --BAMs + .fastq) does not exist!\n";
		}
		
		
		my $output_file = '../tmp/hla/'.$sampleID.'/reads.p';
		unless(-e '../tmp/hla/'.$sampleID)
		{
			mkdir('../tmp/hla/'.$sampleID) or die "Cannot mkdir ".'../tmp/hla/'.$sampleID;
		}
		
		my $command = qq($use_bin domode filterLongOverlappingReads2 --input_BAM $BAM --input_FASTQ $FASTQ --output_FASTQ $output_file --graphDir ../tmp2/GS_nextGen/${graph});
		
		if($referenceGenome)
		{
			$command .= qq( --referenceGenome $referenceGenome);
		}	
		
		print "Now executing command:\n$command\n\n";
		
		my $ret = system($command);
		unless($ret == 0)
		{
			die "Command $command failed";
		}		
	}
}


if($actions =~ /n/)
{
	unless(scalar(@sampleIDs))
	{
		die "Please provide some --sampleIDs for negative filtering.";
	}
		
	my @fastQ_files;
	my @output_files;
	foreach my $sampleID (@sampleIDs)
	{
		my $fastQ_file = '../tmp/hla/'.$sampleID.'/reads.p';
		my $fastQ_file_1 = $fastQ_file.'_1';
		my $fastQ_file_2 = $fastQ_file.'_2';
		unless(-e $fastQ_file_1)
		{
			die "Expected file $fastQ_file_1 not found";
		}
		unless(-e $fastQ_file_2)
		{
			die "Expected file $fastQ_file_2 not found";
		}		
		my $output_file = '../tmp/hla/'.$sampleID.'/reads.p.n';
		
		push(@fastQ_files, $fastQ_file);
		push(@output_files, $output_file);
	}
	
	my $fastQ_files = join(',', @fastQ_files);
	my $output_files = join(',', @output_files);
	
	my $command = qq($use_bin domode filterReads --input_FASTQ $fastQ_files --negativeFilter $genome_graph_file --output_FASTQ $output_files --negativePreserveUnique --uniqueness_base ${expected_kMer_file} --uniqueness_subtract ${genome_graph_file});
	
	print "Now executing command:\n$command\n\n";
	
	my $ret = system($command);
	unless($ret == 0)
	{
		die "Command $command failed";
	}	
}


if($actions =~ /a/)
{
	unless(scalar(@sampleIDs))
	{
		die "Please provide some --sampleIDs for alignment.";
	}
		
	my @fastQ_files;
	foreach my $sampleID (@sampleIDs)
	{
		my $fastQ_file = '../tmp/hla/'.$sampleID.'/reads.p.n';
		my $fastQ_file_1 = $fastQ_file.'_1';
		my $fastQ_file_2 = $fastQ_file.'_2';
		unless(-e $fastQ_file_1)
		{
			die "Expected file $fastQ_file_1 not found";
		}
		unless(-e $fastQ_file_2)
		{
			die "Expected file $fastQ_file_2 not found";
		}		
		my $output_file = '../tmp/hla/'.$sampleID.'/reads.p.n';
		
		push(@fastQ_files, $fastQ_file);
	}
	
	my $fastQ_files = join(',', @fastQ_files);
	
	my $pseudoReferenceGenome = qq(../tmp2/GS_nextGen/${graph}/pseudoReferenceGenome.txt);
	unless(-e $pseudoReferenceGenome)
	{
		die "Pseudo-reference file $pseudoReferenceGenome not existing.";
	}
	my $command = qq($use_bin domode alignShortReadsToHLAGraph --input_FASTQ $fastQ_files --graphDir ../tmp2/GS_nextGen/${graph} --referenceGenome ${pseudoReferenceGenome});
	
	if($MiSeq250bp)
	{
		$command .= ' --MiSeq250bp';
	}
	print "Now executing command:\n$command\n\n";
	
	my $ret = system($command);
	unless($ret == 0)
	{
		die "Command $command failed";
	}
}


if($actions =~ /u/)
{
	unless(scalar(@sampleIDs))
	{
		die "Please provide some --sampleIDs for alignment.";
	}
		
	my @fastQ_files;
	foreach my $sampleID (@sampleIDs)
	{
		my $fastQ_file = '../tmp/hla/'.$sampleID.'/reads.p';
		push(@fastQ_files, $fastQ_file);
	}
	
	my $fastQ_files = join(',', @fastQ_files);
	
	my $pseudoReferenceGenome = qq(../tmp2/GS_nextGen/${graph}/pseudoReferenceGenome.txt);
	unless(-e $pseudoReferenceGenome)
	{
		die "Pseudo-reference file $pseudoReferenceGenome not existing.";
	}
	my $command = qq($use_bin domode alignLongUnpairedReadsToHLAGraph --input_FASTQ $fastQ_files --graphDir ../tmp2/GS_nextGen/${graph} --referenceGenome ${pseudoReferenceGenome});
	
	print "Now executing command:\n$command\n\n";
	
	my $ret = system($command);
	unless($ret == 0)
	{
		die "Command $command failed";
	}	
}


if($actions =~ /i/)
{
	unless(scalar(@sampleIDs))
	{
		die "Please provide some --sampleIDs for HLA type inference.";
	}
		
	my @aligned_files;
	my @stdout_files;
	
	my $switch_long_reads;

	foreach my $sampleID (@sampleIDs)
	{
		my $local_switch_long_reads = '';
		my $aligned_file = '../tmp/hla/'.$sampleID.'/reads.p.n.aligned';
		unless(-e $aligned_file)
		{
			$aligned_file = '../tmp/hla/'.$sampleID.'/reads.p.aligned';
			$local_switch_long_reads = '--longUnpairedReads';
			unless(-e $aligned_file)
			{			
				die "Expected file $aligned_file not found";
			}
		}
		
		if(defined $switch_long_reads)
		{
			die unless($switch_long_reads eq $local_switch_long_reads);
		}
		else
		{
			$switch_long_reads = $local_switch_long_reads;
		}
	
		push(@aligned_files, $aligned_file);
		
		my $stdout_file = '../tmp/hla/'.$sampleID.'/inference.stdout';
		push(@stdout_files, $stdout_file);
		
		foreach my $validation_round (qw/R1 R2/)
		{
			my $bestguess_file = '../tmp/hla/'.$sampleID.'/'.$validation_round.'_bestguess.txt';
			if(-e $bestguess_file)
			{
				warn "Delete existing best-guess file $bestguess_file";
				unlink($bestguess_file) or die "Cannot delete $bestguess_file";
			}
		}
	}
	die unless(defined $switch_long_reads);
	
	open(FAILEDSAMPLES, '>', '_failedSampleID_inference.txt') or die;
	
	if($no_fail)
	{
		warn "--no_fail 1 active, will try to continue when inference for a sample fails.";
	}
	SAMPLE: for(my $sI = 0; $sI <= $#aligned_files; $sI++)
	{
		my $sampleID = $sampleIDs[$sI];
		my $aligned_file = $aligned_files[$sI];
		my $stdout_file = $stdout_files[$sI];
		
		
		my ($aligned_file_name, $aligned_file_path) = fileparse($aligned_file);
					
		my $command = qq($use_bin domode HLATypeInference --input_alignedReads $aligned_file --graphDir ../tmp2/GS_nextGen/${graph} ${switch_long_reads} --sampleID $sampleID);

		if($MiSeq250bp)
		{
			$command .= ' --MiSeq250bp ';
		}
			
		$command .= ' > ' . $stdout_file;
		
		print "Now executing command:\n$command\n\n";
						
		my $ret = system($command);			
		
		unless($ret == 0)
		{
			if($no_fail)
			{
				warn "When executing $command, got return code $ret";
				print FAILEDSAMPLES $sampleID, "\n";
				next SAMPLE;		
			}		
			else
			{
				die "When executing $command, got return code $ret";
			}
		}
		
		my $expected_bestguess_file = $aligned_file_path . '/' . 'R1_bestguess.txt';
		unless(-e $expected_bestguess_file)
		{
			die "File $expected_bestguess_file not existing!";			
		}
		
		my %l_counter;
		open(F, '<', $expected_bestguess_file) or die "Cannot open $expected_bestguess_file";
		<F>;
		while(<F>)
		{
			my $l = $_;
			die unless($l =~ /^(\w+)\t/);
			$l_counter{$1}++;
		}
		close(F);

		foreach my $locus (@loci_for_check)
		{
			unless($l_counter{$locus} == 2)
			{
				die Dumper("Wrong bestguess count", $locus, $expected_bestguess_file, \%l_counter);
			}
			
			my $bestguess_haplotypes_file =  $aligned_file_path . '/' . 'R2_haplotypes_bestguess_' . $locus . '.txt';
			unless(-e $bestguess_haplotypes_file)
			{
				#die "Expected best-guess haplotypes file cannot be found : ". $bestguess_haplotypes_file;
			}	
		}
	}
}  

if($actions =~ /v/)
{
	my $validation_round = 'R1';
	die "Please specify --trueHLA for validation" unless($trueHLA);
			
	# read reference dataset
	my %reference_data;
	open(REFERENCE, "<", $trueHLA) or die "Cannot open $trueHLA";
	my $headerLine = <REFERENCE>;
	chomp($headerLine);
	$headerLine =~ s/\n//g;
	$headerLine =~ s/\r//g;
	my @header_fields = split(/[\t ]/, $headerLine);
	@header_fields = map {if($_ =~ /HLAD((QA)|(QB)|(RB))$/){$_ .= '1';} $_} @header_fields;	
	while(<REFERENCE>)
	{
		my $line = $_;
		chomp($line);
		
		$line =~ s/\n//g;
		$line =~ s/\r//g;
		
		next unless($line);
		
		my @fields = split(/[\t ]/, $line);
		my %line = (mesh @header_fields, @fields);
		
		my $primary_key = $line{'IndividualID'};
		$reference_data{$primary_key} = \%line;
	}
	close(REFERENCE);
	
	# die Dumper(\%reference_data);
	
	my %imputed_HLA;
	my %imputed_HLA_Q;
	my %imputed_HLA_kMersCovered;
	my %imputed_HLA_avgCoverage;
	my %imputed_HLA_lowCoverage;
	my %imputed_HLA_minCoverage;
	
	my %sample_noI_toI;
	
	my $total_imputations = 0;
	
	my %missing_reference_data;
	
	my $summary_file = 'temp/summary_' . $sample_IDs_abbr . '.txt';	
	open(SUMMARY, '>', $summary_file) or die "Cannot open $summary_file";
	print SUMMARY $sample_IDs_abbr, "\n";
	print SUMMARY "\t", join("\t", qw/Locus N CallRate Accuracy/), "\n";
	foreach my $sampleID (@sampleIDs)
	{
		my $sampleID_noI = $sampleID;
		$sampleID_noI =~ s/^I\d+_//g;
		
		
		my $bestGuess_file;
		
		if($fromPHLAT)
		{
			$bestGuess_file = '/gpfs1/well/gsk_hla/PHLAT/'.$sampleID.'/'.$validation_round.'_bestguess.txt';	
			$bestGuess_file =~ s/I6_WTS/I6_AM_WTS/;
			unless(-e $bestGuess_file)
			{
				warn "Best-guess file $bestGuess_file not existing";
				next;
			}		
		}
		elsif($fromHLAreporter)
		{
			$bestGuess_file = '/gpfs1/well/gsk_hla/HLAreporter/results/'.$sampleID.'/'.$validation_round.'_bestguess.txt';	
			$bestGuess_file =~ s/I6_WTS/I6_AM_WTS/;
			
			unless(-e $bestGuess_file)
			{
				warn "Best-guess file $bestGuess_file not existing";
				next;
			}			
		}
		else
		{
			$bestGuess_file = '../tmp/hla/'.$sampleID.'/'.$validation_round.'_bestguess.txt';
			unless(-e $bestGuess_file)
			{
				warn "Best-guess file $bestGuess_file not existing";
				next;
			}		
		}
				  
		open(BESTGUESS, '<', $bestGuess_file) or die "Cannot open $bestGuess_file";
		my $bestguess_header_line = <BESTGUESS>;
		chomp($bestguess_header_line);
		my @bestguess_header_files = split(/\t/, $bestguess_header_line);
		while(<BESTGUESS>)
		{
			my $line = $_;
			chomp($line);
			my @line_fields = split(/\t/, $line);
			my %line_hash = (mesh @bestguess_header_files, @line_fields);
			
			my $Q = $line_hash{'Q1'};
			my $kMersCovered = $line_hash{'proportionkMersCovered'};
			$kMersCovered = 1 unless(defined $kMersCovered);
			
			$imputed_HLA_Q{$line_hash{'Locus'}}{$sampleID_noI}{$line_hash{'Chromosome'}} = $Q;
			$imputed_HLA_kMersCovered{$line_hash{'Locus'}}{$sampleID_noI}{$line_hash{'Chromosome'}} = $kMersCovered;
			
			if(($Q < $T) or ($kMersCovered < $minPropkMersCovered))
			{
				$imputed_HLA{$line_hash{'Locus'}}{$sampleID_noI}{$line_hash{'Chromosome'}} = '??:??';			
			}
			else
			{
				die unless(defined $line_hash{'Allele'});
				$imputed_HLA{$line_hash{'Locus'}}{$sampleID_noI}{$line_hash{'Chromosome'}} = $line_hash{'Allele'};			
			}
			
			$total_imputations++;
			
			
			if($line_hash{'Chromosome'} eq '1')
			{
				$imputed_HLA_avgCoverage{$line_hash{'Locus'}}{$sampleID_noI} = $line_hash{'AverageCoverage'};
				$imputed_HLA_lowCoverage{$line_hash{'Locus'}}{$sampleID_noI} = $line_hash{'CoverageFirstDecile'};
				$imputed_HLA_minCoverage{$line_hash{'Locus'}}{$sampleID_noI} = $line_hash{'MinimumCoverage'};
				
				if($minCoverage and ($line_hash{'MinimumCoverage'} < $minCoverage))
				{
					$imputed_HLA{$line_hash{'Locus'}}{$sampleID_noI}{$line_hash{'Chromosome'}} = '??:??';								
				}
			}
		}	
		close(BESTGUESS);
		
		die if(exists $sample_noI_toI{$sampleID_noI});
		$sample_noI_toI{$sampleID_noI} = $sampleID;
	}
	
	print "\nTotal imputations (for comparisons): ", $total_imputations, "\n";
		
	my $debug = 0;
	my $comparisons = 0;
	my $compare_problems = 0;
	my %locus_avgCoverages;
	my %locus_lowCoverages;
	my %locus_minCoverages;
	my @allLoci_allIndivs_avgCoverage;
	
	my %problem_locus_detail;
	my %problem_locus_examined;
	my %problem_haplo_counter;
	my %problem_haplo_detail;
	my %imputations_predictions;
	my %reference_predictions;
	my %imputed_HLA_Calls;
	my %quality_measures; # not used
	my $pileup_href = {};
	
	# die Dumper(\%imputed_HLA);
	
	my @loci = sort keys %imputed_HLA;
	
	# die Dumper(\@loci);
	
	my $process_quality_measures = sub {};
	
	my $PP_to_basket = sub {
		my $PP = shift;
		die unless(($PP >= 0) && ($PP <= 1));
		my $basket = int($PP * 10);
		$basket = 9 if($basket == 10);
		return $basket;			
	};
	
	
	my $fh_qualityMetricsRunning;
	if($fromMHCPRG)
	{
		open($fh_qualityMetricsRunning, '>>', '_qualityMetrics_per_imputation_running.txt') or die;
	}
		
	my %errors_per_sample;
	my %validated_per_sample;
	my %types_as_validated;
	foreach my $locus (@loci)
	{		
		my $arbitraty_indiv = (keys %reference_data)[0];
		next unless((defined $reference_data{$arbitraty_indiv}{'HLA'.$locus}));
		
		my %calibration_baskets;
		my %coverage_over_samples;
		my %coverage_over_samples_individualValues;
		my $coverage_over_samples_nSamples = 0;
		
	
		my $add_to_calibration_basket = sub {
			my $str_correct = shift;
			my $PP = shift;
			my $weight = shift;
			my $locus = shift;
			my $sampleID = shift;
			my $kMer_correct = shift;
			
			die unless(($str_correct eq 'correct') or ($str_correct eq 'incorrect'));
			die unless(defined $PP);
			die unless(defined $weight);
			die unless(defined $kMer_correct);
			
			push(@{$calibration_baskets{$PP_to_basket->($PP)}{$str_correct}}, {PP => $PP, weight => $weight});
			
			if($fromMHCPRG)
			{
				my $correct_numeric = ($str_correct eq 'correct') ? 1 : 0;
				print ${fh_qualityMetricsRunning} join("\t", $sampleID, $vP, $locus, $correct_numeric, $PP, $kMer_correct), "\n";
			}
		};	
		
		$problem_locus_examined{$locus} = 0;
		$problem_locus_detail{$locus} = 0;
		my @indivIDs = keys %{$imputed_HLA{$locus}};
		my $indivI_processedForEvaluation = -1;
		INDIV: foreach my $indivID (@indivIDs)
		{	
			$| = 1;
			
			# print "\t", $indivID, "\t", $locus, "\n";
						
			$debug = 0;
			
			my @imputed_hla_values = map { $imputed_HLA{$locus}{$indivID}{$_} } keys %{$imputed_HLA{$locus}{$indivID}};
			
			if(grep {simpleHLA::autoHLA_is2digit($_)} @imputed_hla_values)
			{
				die "Warning: 2-digit alleles detected in the inference set\n" . Dumper(\@imputed_hla_values); # 2-digit test
			}
		
			my @imputed_hla_values_propkMersCovered = map { my $r = $imputed_HLA_kMersCovered{$locus}{$indivID}{$_}; die unless(defined $r); $r } keys %{$imputed_HLA{$locus}{$indivID}};
			my @imputed_hla_values_q = map { my $r = $imputed_HLA_Q{$locus}{$indivID}{$_}; die unless(defined $r); $r } keys %{$imputed_HLA{$locus}{$indivID}};
			
			die "Undefined HLA ".join(', ', @imputed_hla_values) unless(scalar(grep {defined $_} @imputed_hla_values) == scalar(@imputed_hla_values));
					
			my @reference_hla_values;
			
			next INDIV unless($#imputed_hla_values == 1);
			
			my $reference_lookup_ID = $indivID;
			if($reference_lookup_ID =~ /^downsample_/)
			{
				$reference_lookup_ID =~ s/^downsample_(I\d+_)?//;
				$reference_lookup_ID =~ s/_DSC\d+_\d+//;				
			}
			if($reference_lookup_ID =~ /^C_Platinum_/)
			{
				$reference_lookup_ID =~ s/C_Platinum_//;
				
			}			
			unless(exists $reference_data{$reference_lookup_ID})
			{
				$missing_reference_data{$reference_lookup_ID}{$locus}++;
				# warn "No reference data for $locus $indivID";
			}
			next INDIV unless(exists $reference_data{$reference_lookup_ID});
			
			$reference_data{$reference_lookup_ID}{'HLA'.$locus} or die;
			
			if($reference_data{$reference_lookup_ID}{'HLA'.$locus})
			{
				@reference_hla_values = split(/\//, $reference_data{$reference_lookup_ID}{'HLA'.$locus});
			}
			
			$types_as_validated{$reference_lookup_ID}{$locus} = \@reference_hla_values;

			die Dumper("Weird", $reference_data{$reference_lookup_ID}, \@reference_hla_values) unless($#reference_hla_values == 1);				
						
			die "Undefined HLA ".join(', ', @reference_hla_values) unless(scalar(grep {defined $_} @reference_hla_values) == scalar(@reference_hla_values));			
			@reference_hla_values = grep {! &simpleHLA::modernHLA_is_missing($_)} @reference_hla_values;

			if($#reference_hla_values == -1)
			{
				$missing_reference_data{$reference_lookup_ID}{$locus}++;
				next;
			}
						
			if($only_4_dig)
			{
				next unless (&simpleHLA::HLA_is4digit($reference_hla_values[0]) and (($#reference_hla_values == 0) || (&simpleHLA::HLA_is4digit($reference_hla_values[1]))));
			}
			
			if($reduce_to_4_dig)
			{
				my @reference_hla_values_before = @reference_hla_values;
				@reference_hla_values = map {
					my @inputAlleles = split(/;/, $_);
					join(';', map {simpleHLA::HLA_4digit($_)} @inputAlleles)		
				} @reference_hla_values;
				
				print "Before:\n\t".join(' / ', @reference_hla_values_before), "\n";
				print "After:\n\t".join(' / ', @reference_hla_values), "\n\n";
			}
					
			$imputed_HLA_Calls{$locus}{sum} += scalar(@imputed_hla_values);		
			$indivI_processedForEvaluation++;
			
			my @imputed_present = map {(! &simpleHLA::is_missing($_)) ? 1 : 0} @imputed_hla_values;
			my @imputed_hla_values_q_new;
			my @imputed_hla_values_propkMersCovered_new;
			for(my $i = 0; $i <= $#imputed_hla_values; $i++)
			{
				my $Q = $imputed_hla_values_q[$i];
				my $kP = $imputed_hla_values_propkMersCovered[$i];
				
				die unless(defined $Q);
				die unless(defined $kP);
				die unless(defined $imputed_present[$i]);
				
				if($imputed_present[$i])
				{
					push(@imputed_hla_values_q_new, $Q);
					push(@imputed_hla_values_propkMersCovered_new, $kP);
				}
			}
			
			@imputed_hla_values = grep {! &simpleHLA::is_missing($_)} @imputed_hla_values;
			@imputed_hla_values_q = @imputed_hla_values_q_new;
			@imputed_hla_values_propkMersCovered = @imputed_hla_values_propkMersCovered_new;
			
			die unless($#imputed_hla_values == $#imputed_hla_values_q);
			die unless($#imputed_hla_values == $#imputed_hla_values_propkMersCovered);
			
			$imputed_HLA_Calls{$locus}{called} += scalar(@imputed_hla_values);
			
			if($all_2_dig)
			{
				@reference_hla_values = map {join(';', map {&simpleHLA::autoHLA_2digit($_)} split(/;/, $_))} @reference_hla_values;
				@imputed_hla_values = map {join(';', map {&simpleHLA::autoHLA_2digit($_)} split(/;/, $_))} @imputed_hla_values;
			}
					
			if($locus eq 'B')
			{
			#	print Dumper($indivID, \@reference_hla_values, \@imputed_hla_values), "\n";
			}
			my $comparisons_before = $comparisons;
			my $problem_locus_detail_before = $problem_locus_detail{$locus};
			my $problem_locus_examined_before = $problem_locus_examined{$locus};
			
			if($#imputed_hla_values == 1)
			{
				if($#reference_hla_values > -1)
				{
					if($#reference_hla_values == 0)
					{
						if(&compatibleAlleles_individual($locus, $reference_hla_values[0], $imputed_hla_values[0]) or &compatibleAlleles_individual($locus, $reference_hla_values[0], $imputed_hla_values[1]))
						{
							$comparisons++;
							$problem_locus_examined{$locus}++;
							
							if(&compatibleAlleles_individual($locus, $reference_hla_values[0], $imputed_hla_values[0]))
							{
								$reference_predictions{$locus}{$reference_hla_values[0]}{correct}++;
								$imputations_predictions{$locus}{$imputed_hla_values[0]}{correct}++;
								
								$add_to_calibration_basket->('correct', $imputed_hla_values_q[0], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[0]);														
							}
							elsif(&compatibleAlleles_individual($locus, $reference_hla_values[0], $imputed_hla_values[1]))
							{
								$reference_predictions{$locus}{$reference_hla_values[0]}{correct}++;
								$imputations_predictions{$locus}{$imputed_hla_values[1]}{correct}++;

								$add_to_calibration_basket->('correct', $imputed_hla_values_q[1], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[1]);								
							}
							
							$process_quality_measures->($locus, $quality_measures{$indivID}, 1, 1);
							
						}	
						else
						{
							$comparisons++;
							$compare_problems++;
							$problem_haplo_counter{$indivID}++;
							$problem_haplo_detail{$indivID}{$locus} = 'Reference: '.join('/', @reference_hla_values).' VS Imputed: '.join('/', @imputed_hla_values).'   (1 problem of 1)';
							$problem_locus_detail{$locus}++;
							$problem_locus_examined{$locus}++;
							
							$reference_predictions{$locus}{$reference_hla_values[0]}{incorrect}++;
							$imputations_predictions{$locus}{$imputed_hla_values[0]}{incorrect} += 0.5;
							$imputations_predictions{$locus}{$imputed_hla_values[1]}{incorrect} += 0.5;
							
							$add_to_calibration_basket->('incorrect', $imputed_hla_values_q[0], 0.5, $locus, $indivID, $imputed_hla_values_propkMersCovered[0]);
							$add_to_calibration_basket->('incorrect', $imputed_hla_values_q[1], 0.5, $locus, $indivID, $imputed_hla_values_propkMersCovered[1]);
							
							
							$process_quality_measures->($locus, $quality_measures{$indivID}, 0, 1);
						}
					}
					elsif($#reference_hla_values == 1)
					{
						if(&compatibleAlleles_individual($locus, $reference_hla_values[0], $imputed_hla_values[0]) and &compatibleAlleles_individual($locus, $reference_hla_values[1], $imputed_hla_values[1]))
						{
							$comparisons += 2;
							$problem_locus_examined{$locus} += 2;
							
							$reference_predictions{$locus}{$reference_hla_values[0]}{correct}++;
							$imputations_predictions{$locus}{$imputed_hla_values[0]}{correct}++;	
							$reference_predictions{$locus}{$reference_hla_values[1]}{correct}++;
							$imputations_predictions{$locus}{$imputed_hla_values[1]}{correct}++;	
							
							$process_quality_measures->($locus, $quality_measures{$indivID}, 2, 2);	
							
							$add_to_calibration_basket->('correct', $imputed_hla_values_q[0], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[0]);
							$add_to_calibration_basket->('correct', $imputed_hla_values_q[1], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[1]);
							
						}		
						elsif(&compatibleAlleles_individual($locus, $reference_hla_values[0], $imputed_hla_values[1]) and &compatibleAlleles_individual($locus, $reference_hla_values[1], $imputed_hla_values[0]))
						{
							$comparisons += 2;
							$problem_locus_examined{$locus} += 2;
							
							$reference_predictions{$locus}{$reference_hla_values[0]}{correct}++;
							$imputations_predictions{$locus}{$imputed_hla_values[0]}{correct}++;					
							$reference_predictions{$locus}{$reference_hla_values[1]}{correct}++;
							$imputations_predictions{$locus}{$imputed_hla_values[1]}{correct}++;			
							
							$process_quality_measures->($locus, $quality_measures{$indivID}, 2, 2);		

							$add_to_calibration_basket->('correct', $imputed_hla_values_q[0], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[0]);
							$add_to_calibration_basket->('correct', $imputed_hla_values_q[1], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[1]);
							
						}
						else
						{
							if(
								&compatibleAlleles_individual($locus, $reference_hla_values[0], $imputed_hla_values[0]) or &compatibleAlleles_individual($locus, $reference_hla_values[1], $imputed_hla_values[1]) or
								&compatibleAlleles_individual($locus, $reference_hla_values[1], $imputed_hla_values[0]) or &compatibleAlleles_individual($locus, $reference_hla_values[0], $imputed_hla_values[1])
							)
							{
								$comparisons += 2;
								$compare_problems++;
								$problem_haplo_counter{$indivID}++;
								$problem_haplo_detail{$indivID}{$locus} = 'Reference: '.join('/', @reference_hla_values).' VS Imputed: '.join('/', @imputed_hla_values).'   (1 problem of 2)';						
								$problem_locus_detail{$locus}++;
								$problem_locus_examined{$locus} += 2;
								
								if(&compatibleAlleles_individual($locus, $reference_hla_values[0], $imputed_hla_values[0]))
								{
									$reference_predictions{$locus}{$reference_hla_values[0]}{correct}++;
									$imputations_predictions{$locus}{$imputed_hla_values[0]}{correct}++;					
									$reference_predictions{$locus}{$reference_hla_values[1]}{incorrect}++;
									$imputations_predictions{$locus}{$imputed_hla_values[1]}{incorrect}++;		

									$add_to_calibration_basket->('correct', $imputed_hla_values_q[0], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[0]);
									$add_to_calibration_basket->('incorrect', $imputed_hla_values_q[1], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[1]);									
								}
								elsif(&compatibleAlleles_individual($locus, $reference_hla_values[1], $imputed_hla_values[1]))
								{
									$reference_predictions{$locus}{$reference_hla_values[1]}{correct}++;
									$imputations_predictions{$locus}{$imputed_hla_values[1]}{correct}++;					
									$reference_predictions{$locus}{$reference_hla_values[0]}{incorrect}++;
									$imputations_predictions{$locus}{$imputed_hla_values[0]}{incorrect}++;		
									
									$add_to_calibration_basket->('correct', $imputed_hla_values_q[1], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[1]);
									$add_to_calibration_basket->('incorrect', $imputed_hla_values_q[0], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[0]);	
								}
								elsif(&compatibleAlleles_individual($locus, $reference_hla_values[1], $imputed_hla_values[0]))
								{
									$reference_predictions{$locus}{$reference_hla_values[1]}{correct}++;
									$imputations_predictions{$locus}{$imputed_hla_values[0]}{correct}++;					
									$reference_predictions{$locus}{$reference_hla_values[0]}{incorrect}++;
									$imputations_predictions{$locus}{$imputed_hla_values[1]}{incorrect}++;	
									
									$add_to_calibration_basket->('correct', $imputed_hla_values_q[0], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[0]);
									$add_to_calibration_basket->('incorrect', $imputed_hla_values_q[1], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[1]);	
								}
								elsif(&compatibleAlleles_individual($locus, $reference_hla_values[0], $imputed_hla_values[1]))
								{
									$reference_predictions{$locus}{$reference_hla_values[0]}{correct}++;
									$imputations_predictions{$locus}{$imputed_hla_values[1]}{correct}++;					
									$reference_predictions{$locus}{$reference_hla_values[1]}{incorrect}++;
									$imputations_predictions{$locus}{$imputed_hla_values[0]}{incorrect}++;		

									$add_to_calibration_basket->('correct', $imputed_hla_values_q[1], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[1]);
									$add_to_calibration_basket->('incorrect', $imputed_hla_values_q[0], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[0]);										
								}
								
								$process_quality_measures->($locus, $quality_measures{$indivID}, 1, 2);
							}
							else
							{
								$comparisons += 2;
								$compare_problems += 2;
								$problem_haplo_counter{$indivID} += 2;
								$problem_haplo_detail{$indivID}{$locus} = 'Reference: '.join('/', @reference_hla_values).' VS Imputed: '.join('/', @imputed_hla_values).'   (2 problems of 2)';						
								$problem_locus_detail{$locus} += 2;
								$problem_locus_examined{$locus} += 2;
								
								$reference_predictions{$locus}{$reference_hla_values[0]}{incorrect}++;
								$imputations_predictions{$locus}{$imputed_hla_values[0]}{incorrect}++;					
								$reference_predictions{$locus}{$reference_hla_values[1]}{incorrect}++;
								$imputations_predictions{$locus}{$imputed_hla_values[1]}{incorrect}++;	
								
								$add_to_calibration_basket->('incorrect', $imputed_hla_values_q[0], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[0]);
								$add_to_calibration_basket->('incorrect', $imputed_hla_values_q[1], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[1]);									
								
								$process_quality_measures->($locus, $quality_measures{$indivID}, 0, 2);						
							}
						}																
					}
					else
					{
						die;
					}
				}
			}
			elsif($#imputed_hla_values == 0)
			{
				if($#reference_hla_values > -1)
				{
					if($#reference_hla_values == 0)
					{
						if(&compatibleAlleles_individual($locus, $reference_hla_values[0], $imputed_hla_values[0]))
						{
							$comparisons++;
							$problem_locus_examined{$locus}++;
							$reference_predictions{$locus}{$reference_hla_values[0]}{correct}++;
							$imputations_predictions{$locus}{$imputed_hla_values[0]}{correct}++;
							
							$add_to_calibration_basket->('correct', $imputed_hla_values_q[0], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[0]);
								
							$process_quality_measures->($locus, $quality_measures{$indivID}, 1, 1);
							
						}	
						else
						{
							$comparisons++;
							$compare_problems++;
							$problem_haplo_counter{$indivID}++;
							$problem_haplo_detail{$indivID}{$locus} = 'Reference: '.join('/', @reference_hla_values).' VS Imputed: '.join('/', @imputed_hla_values).'   (1 problem of 1)';
							$problem_locus_detail{$locus}++;
							$problem_locus_examined{$locus}++;
							
							$reference_predictions{$locus}{$reference_hla_values[0]}{incorrect}++;
							$imputations_predictions{$locus}{$imputed_hla_values[0]}{incorrect}++;
							
							$add_to_calibration_basket->('incorrect', $imputed_hla_values_q[0], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[0]);
							
							$process_quality_measures->($locus, $quality_measures{$indivID}, 0, 1);									
						}					
					}
					elsif($#reference_hla_values == 1)
					{
						if(&compatibleAlleles_individual($locus, $reference_hla_values[0], $imputed_hla_values[0]) or &compatibleAlleles_individual($locus, $reference_hla_values[1], $imputed_hla_values[0]))
						{
							$comparisons += 1;
							$problem_locus_examined{$locus} += 1;
							
							if(&compatibleAlleles_individual($locus, $reference_hla_values[0], $imputed_hla_values[0]))
							{
								$reference_predictions{$locus}{$reference_hla_values[0]}{correct}++;
								$imputations_predictions{$locus}{$imputed_hla_values[0]}{correct}++;

								$add_to_calibration_basket->('correct', $imputed_hla_values_q[0], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[0]);
								
							}
							elsif(&compatibleAlleles_individual($locus, $reference_hla_values[1], $imputed_hla_values[0]))
							{
								$reference_predictions{$locus}{$reference_hla_values[1]}{correct}++;
								$imputations_predictions{$locus}{$imputed_hla_values[0]}{correct}++;	

								$add_to_calibration_basket->('correct', $imputed_hla_values_q[0], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[0]);								
							}
							
							$process_quality_measures->($locus, $quality_measures{$indivID}, 1, 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[0]);

						}		
						else
						{
							$comparisons++;
							$compare_problems++;
							$problem_haplo_counter{$indivID}++;
							$problem_haplo_detail{$indivID}{$locus} = 'Reference: '.join('/', @reference_hla_values).' VS Imputed: '.join('/', @imputed_hla_values).'   (1 problem of 1)';
							$problem_locus_detail{$locus}++;
							$problem_locus_examined{$locus}++;
							
							$reference_predictions{$locus}{$reference_hla_values[0]}{incorrect} += 0.5;
							$reference_predictions{$locus}{$reference_hla_values[1]}{incorrect} += 0.5;
							$imputations_predictions{$locus}{$imputed_hla_values[0]}{incorrect}++;	
							
							$add_to_calibration_basket->('incorrect', $imputed_hla_values_q[0], 1, $locus, $indivID, $imputed_hla_values_propkMersCovered[0]);

							$process_quality_measures->($locus, $quality_measures{$indivID}, 0, 1);								
						}
					}
					else
					{
						die;
					}
				}		
			}
			
			my $thisIndiv_problems = $problem_locus_detail{$locus} - $problem_locus_detail_before;
			$errors_per_sample{$indivID} += $thisIndiv_problems;
			
			my $thisIndiv_examined = $problem_locus_examined{$locus} - $problem_locus_examined_before;
			$validated_per_sample{$indivID} += $thisIndiv_examined;
							
			my $avgCoverage = $imputed_HLA_avgCoverage{$locus}{$indivID};
			my $lowCoverage = $imputed_HLA_lowCoverage{$locus}{$indivID};
			my $minCoverage = $imputed_HLA_minCoverage{$locus}{$indivID};

			# average coverages
			if(($thisIndiv_problems > 0))
			{
				push(@{$locus_avgCoverages{$locus}{problems}}, $avgCoverage);
				push(@{$locus_lowCoverages{$locus}{problems}}, $lowCoverage);
				push(@{$locus_minCoverages{$locus}{problems}}, $minCoverage);
			}
			else
			{
				push(@{$locus_avgCoverages{$locus}{ok}}, $avgCoverage);
				push(@{$locus_lowCoverages{$locus}{ok}}, $lowCoverage);
				push(@{$locus_minCoverages{$locus}{ok}}, $minCoverage);	
			}
			
			push (@allLoci_allIndivs_avgCoverage, $avgCoverage);
			
			# print "\t", $thisIndiv_problems, "\n";
			
			# just for debugging - deactivated
			if($thisIndiv_problems == 0)
			{
				if($locus eq 'A')
				{
					# print join(' vs ', join('/', @reference_hla_values), join('/', @imputed_hla_values)), "\n";
				}	
			}
			
			my $thisIndiv_comparions = $comparisons - $comparisons_before;
			my $thisIndiv_OK = $thisIndiv_comparions - $thisIndiv_problems;
			
			my $indivID_withI = $sample_noI_toI{$indivID};
			die unless(defined $indivID_withI);				
			my $pileup_file = qq(../tmp/hla/$indivID_withI/${validation_round}_pileup_${locus}.txt);
				
			# my $coverages_href = load_coverages_from_pileup($pileup_file);
			my $coverages_href = {};
			my @k_coverages_existing;
			if($indivI_processedForEvaluation > 0)
			{
				foreach my $exon (keys %coverage_over_samples)
				{
					foreach my $exonPos (keys %{$coverage_over_samples{$exon}})
					{
						push(@k_coverages_existing, $exon . '-/-' . $exonPos);
					}
				}					
			}
			

			my @k_coverages_thisSample;
			foreach my $exon (keys %$coverages_href)
			{
				foreach my $exonPos (keys %{$coverages_href->{$exon}})
				{
					$coverage_over_samples{$exon}{$exonPos} += $coverages_href->{$exon}{$exonPos};
					push(@{$coverage_over_samples_individualValues{$exon}{$exonPos}}, $coverages_href->{$exon}{$exonPos});
					push(@k_coverages_thisSample, $exon . '-/-' . $exonPos);
				}
			}	
			$coverage_over_samples_nSamples++;
			
			if($indivI_processedForEvaluation > 0)
			{
				my ($n_shared, $aref_l1_excl, $aref_l2_excl) = list_comparison(\@k_coverages_existing, \@k_coverages_thisSample);
				if(($#{$aref_l1_excl} == -1) and ($#{$aref_l2_excl} == -1))
				{	
					die unless($n_shared == scalar(@k_coverages_existing));
					die unless($n_shared == scalar(@k_coverages_thisSample));					
				}
				else
				{
					die Dumper("There is a problem with per-exon coverage numbers.", $indivI_processedForEvaluation, scalar(@k_coverages_existing), scalar(@k_coverages_thisSample), scalar(@$aref_l1_excl), scalar(@$aref_l2_excl));
				}
			}

			if(($thisIndiv_problems > 0) and (not $all_2_dig) and not ($fromPHLAT) and not($fromHLAreporter))
			{
				my %readIDs;
				
				unless(-e $pileup_file)
				{
					die "Can't find pileup file $pileup_file";
				}
				
				load_pileup($pileup_href, $pileup_file, $indivID_withI);

			
				my $output_fn = '../tmp/hla_validation/pileup_'.$validation_round.'_'.$indivID_withI.'_'.$locus.'.txt';
				open(my $output_fh, '>', $output_fn) or die "Cannot open $output_fn";
				print $output_fh join("\t", $indivID_withI, $locus, $thisIndiv_OK), "\n";
				
				unless(scalar(@imputed_hla_values) == 2)
				{
					warn "Can't produce pileup for $locus / $indivID";
					next;
				}
				
				my $inferred = \@imputed_hla_values;				
				my $truth = \@reference_hla_values;
				
				print {$output_fh} $inferred->[0], "\t\t\t", $truth->[0], "\n";
				print {$output_fh} $inferred->[1], "\t\t\t", $truth->[1], "\n\n";
				
				print {$output_fh} join("\t", "Inferred", "", "", "Apparently true", ""), "\n";
				
				my @exons = print_which_exons($locus);				
				my @inferred_trimmed = twoClusterAlleles($inferred);
				
				foreach my $exon (@exons)
				{
					my $file = find_exon_file($locus, $exon);
					my $sequences = read_exon_sequences($file);
					my $randomKey = (keys %$sequences)[0];
					my $randomSequence = $sequences->{$randomKey};
					$randomSequence =~ s/./?/g;
					$sequences->{'?'} = $randomSequence;
					
					# die Dumper($truth, $inferred);
										
					my @validated_extended = twoValidationAlleles_2_proper_names($truth, $locus, $sequences);
								
					my $oneAllele = (keys %$sequences)[0];
					my $length = length($sequences->{$oneAllele});
										
					# print "-", $sequences->{$oneAllele}, "-\n\n";
					
					die Dumper("No sequences?", \@validated_extended) unless(scalar(grep {$sequences->{$_}} @validated_extended) == 2);
					die unless(scalar(grep {$sequences->{$_}} @inferred_trimmed) == 2);
					
					print {$output_fh} join("\t",
						$inferred_trimmed[0].' Exon '.$exon,
						$inferred_trimmed[1].' Exon '.$exon,
						"",
						$validated_extended[0].' Exon '.$exon,
						$validated_extended[1].' Exon '.$exon,
						), "\n";
					
					# print Dumper($locus, [keys %{$pileup_href->{$indivID_withI}}]);

					for(my $i = 0; $i < $length; $i++)
					{
						my @chars_for_print;
						
						
						push(@chars_for_print, map {substr($sequences->{$_}, $i, 1)} @inferred_trimmed);
						push(@chars_for_print, '');
						push(@chars_for_print, map {substr($sequences->{$_}, $i, 1)} @validated_extended);
						
						if($chars_for_print[0] ne $chars_for_print[1])
						{
							$chars_for_print[2] = '!';
						}
						
						if($chars_for_print[0] ne $chars_for_print[3])
						{
							$chars_for_print[5] = '!';
						}
						else
						{
							$chars_for_print[5] = '';
						}
						
						if($chars_for_print[1] ne $chars_for_print[4])
						{
							$chars_for_print[6] = '!';
						}			
						else
						{
							$chars_for_print[6] = '';				
						}
						
				
						if(exists $pileup_href->{$indivID_withI}{'HLA'.$locus})
						{
							my $pileUpString = $pileup_href->{$indivID_withI}{'HLA'.$locus}[$exon-2][$i];
							die unless(defined $pileup_href->{$indivID_withI});
							die unless(defined $pileup_href->{$indivID_withI}{'HLA'.$locus});
							die unless(defined $pileup_href->{$indivID_withI}{'HLA'.$locus}[$exon-2]);
							# die "Problem with pileup for $locus / $indivID / $exon / $i " unless(defined $pileUpString);
							# next unless(defined $pileUpString);
							
							
							# die Dumper("Pileup too short", $length, scalar(@{$pileup_href->{$indivID_withI}{'HLA'.$locus}[$exon-2]})) unless(defined $pileUpString);
							unless(($chars_for_print[5] eq '!') or ($chars_for_print[6] eq '!'))
							{
								# $pileUpString =~ s/\[.+?\]//g;
							}
							
							my $printPileUpDetails = (1 or (($chars_for_print[5] eq '!') or ($chars_for_print[6] eq '!')));
			
							my $getShortRead = sub {
								my $read = shift;
								my $rE = shift;
								
								my @readIDs = split(/ /, $read);
								die unless($#readIDs == 1);
								
								
								
								my $rI; 
								if(exists $readIDs{$readIDs[0]})
								{
									$rI = $readIDs{$readIDs[0]};
								}
								else
								{
									my $nR = scalar(keys %readIDs);
									$nR++;
									$rI = "Read${nR}X";
									$readIDs{$readIDs[0]} = $rI;
									$readIDs{$readIDs[1]} = $rI;
								}
								
								if($printPileUpDetails)
								{
									return $rI.$rE;
								}
								else
								{
									return $rE;
								}
							};
							
							$pileUpString =~ s/(\@\@.+?)(\](\,|$))/$getShortRead->($1, $2);/ge;
							
							if(($chars_for_print[5] eq '!') or ($chars_for_print[6] eq '!'))
							{
								push(@chars_for_print, $pileUpString);
							}
						}
						print {$output_fh} join("\t", @chars_for_print), "\n";
					}	
				}
				
				foreach my $readID (keys %readIDs)
				{
					print {$output_fh} $readIDs{$readID}, "\t", $readID, "\n";
				}
				close($output_fh);					
			}
		}
		
		if($T == 0)
		{
			my $totalAlleles = 0;
			
			my $calibration_file = 'temp/calibration_' . $locus . '_' . $sample_IDs_abbr . '.txt';	
			open(CALIBRATION, ">", $calibration_file) or die "Cannot open $calibration_file";
			print CALIBRATION join("\t", qw/Bin MeanPP PercCorrect NCorrect NIncorrect/), "\n";
			for(my $i = 0; $i <= 9; $i++)
			{
				my $meanPP = 0;
				my $percCorrect = 0;
				
				my $nCorrect = 0;
				my $nIncorrect = 0;
				
				my $mean_normalize = 0;
				foreach my $elem (@{$calibration_baskets{$i}{correct}})			
				{
					$nCorrect += $elem->{weight};
					$meanPP += $elem->{PP}*$elem->{weight};
					$mean_normalize += $elem->{weight};
				}
				foreach my $elem (@{$calibration_baskets{$i}{incorrect}})			
				{
					$nIncorrect += $elem->{weight};
					$meanPP += $elem->{PP}*$elem->{weight};
					$mean_normalize += $elem->{weight};				
				}			
				
				if(($nCorrect+$nIncorrect) != 0)
				{
					$percCorrect = $nCorrect/($nCorrect+$nIncorrect);
				}
				
				if($mean_normalize != 0)
				{
					$meanPP = $meanPP / $mean_normalize;
				}
				
				print CALIBRATION join("\t",
					$i,
					sprintf("%.2f", $meanPP),
					sprintf("%.2f", $percCorrect),
					sprintf("%.2f", $nCorrect),
					sprintf("%.2f", $nIncorrect),
				), "\n";
				
				$totalAlleles += ($nCorrect + $nIncorrect);
			}
			close(CALIBRATION);		
			
			die "Problem $totalAlleles vs $imputed_HLA_Calls{$locus}{called}" unless($totalAlleles == $imputed_HLA_Calls{$locus}{called});
					
			my $spatial_coverage_file = 'temp/spatialCoverage_' . $locus . '_' . $sample_IDs_abbr . '.txt';	
			open(SPATIALCOVERAGE, ">", $spatial_coverage_file) or die "Cannot open $spatial_coverage_file";			
			foreach my $exon (sort {$a <=> $b} keys %coverage_over_samples)
			{
				foreach my $exonPos (sort {$a <=> $b} keys %{$coverage_over_samples{$exon}})
				{
					my @individualValues = @{$coverage_over_samples_individualValues{$exon}{$exonPos}};
					@individualValues = sort {$a <=> $b} @individualValues;
					my $idx_10 = int($#individualValues * 0.1 + 0.5);
					my $idx_90 = int($#individualValues * 0.9 + 0.5);
					print SPATIALCOVERAGE join("\t", $exon, $exonPos, $exon . '-' . $exonPos, $coverage_over_samples{$exon}{$exonPos} / $coverage_over_samples_nSamples, $individualValues[$idx_10], $individualValues[$idx_90]), "\n";
				}
			}				
			close(SPATIALCOVERAGE)		
		}
				
		my $CR = sprintf("%.2f", $imputed_HLA_Calls{$locus}{sum} ? ($imputed_HLA_Calls{$locus}{called}/$imputed_HLA_Calls{$locus}{sum}) : 0);
		my $accuracy = sprintf("%.2f", 1 - (($problem_locus_examined{$locus} != 0) ? $problem_locus_detail{$locus}/$problem_locus_examined{$locus} : 0));
				
		print SUMMARY "\t", join("\t", $locus, $problem_locus_examined{$locus}, $CR, $accuracy), "\n";
	}
	
	my $comparions_OK = $comparisons - $compare_problems;
	print "\nComparisons: $comparisons -- OK: $comparions_OK\n";
		
	if($fromMHCPRG)
	{
		open(PROBLEMSPERSAMPLE, '>>', '_problems_per_sample_running.txt') or die;
		foreach my $indivID (keys %errors_per_sample)
		{
			my $fullSampleID = $sample_noI_toI{$indivID};
			die unless(defined $fullSampleID);
			
			my $sample_dir = '../tmp/hla/'.$fullSampleID;
			my $aligned_file = $sample_dir.'/reads.p.n.aligned';
			die "Aligned reads file $aligned_file not existing" unless(-e $aligned_file);
			open(ALIGNED, '<', $aligned_file) or die;
			my $firstLine = <ALIGNED>;
			chomp($firstLine);
			close(ALIGNED);
			my @IS = split(/ /, $firstLine);
			die unless(scalar(@IS) == 3);
			my $effectiveReadLength = inferReadLength($aligned_file);
			my $averageAlignmentOK = averageFractionAlignmentOK($sample_dir);
			print PROBLEMSPERSAMPLE join("\t", $fullSampleID, $vP, $validated_per_sample{$indivID}, $errors_per_sample{$indivID}, $IS[1], $IS[2], $effectiveReadLength, $averageAlignmentOK), "\n";
		}
		close(PROBLEMSPERSAMPLE);
	}
	open(TMP_OUTPUT, '>', '../tmp/hla_validation/validation_summary.txt') or die;
	print "\nPER-LOCUS SUMMARY:\n";
	foreach my $key (sort keys %problem_locus_detail)
	{
		my $OK_locus = $problem_locus_examined{$key} - $problem_locus_detail{$key};
		my $accuracy = sprintf("%.2f", 1 - (($problem_locus_examined{$key} != 0) ? $problem_locus_detail{$key}/$problem_locus_examined{$key} : 0));
		my $CR = sprintf("%.2f", $imputed_HLA_Calls{$key}{sum} ? ($imputed_HLA_Calls{$key}{called}/$imputed_HLA_Calls{$key}{sum}) : 0);
		print "\t", $key, ": ", $OK_locus, " of ", $problem_locus_examined{$key}, ",\tAccuracy ", $accuracy, " ";
		print "\tCall rate: ", $CR,  "\n";
		
		my @fields = ($key, $problem_locus_examined{$key}, $CR, $accuracy);
		print TMP_OUTPUT join("\t", @fields), "\n";
	
	}
	close(TMP_OUTPUT);	
		
	print "\nCorrect vs incorrect coverages per locus:\n";
	foreach my $locus (sort keys %problem_locus_detail)
	{
		my @avg_minMax_ok = min_avg_max(@{$locus_avgCoverages{$locus}{ok}});
		my @low_minMax_ok = min_avg_max(@{$locus_lowCoverages{$locus}{ok}});
		my @min_minMax_ok = min_avg_max(@{$locus_minCoverages{$locus}{ok}});
			
		my @avg_minMax_problems = min_avg_max(@{$locus_avgCoverages{$locus}{problems}});
		my @low_minMax_problems = min_avg_max(@{$locus_lowCoverages{$locus}{problems}});
		my @min_minMax_problems = min_avg_max(@{$locus_minCoverages{$locus}{problems}});
		
		print "\t", $locus, "\n";

		print "\t\tAverage ", join(' / ', @avg_minMax_ok), " vs ", join(' / ', @avg_minMax_problems), " [problems]", "\n";
		print "\t\tLow ", join(' / ', @low_minMax_ok), " vs ", join(' / ', @low_minMax_problems), " [problems]", "\n";
		print "\t\tMin ", join(' / ', @min_minMax_ok), " vs ", join(' / ', @min_minMax_problems), " [problems]", "\n";
	}
	
	print "\n";
	close(SUMMARY);
	
	print "Over all loci, all individuals:\n";
	my @avg_avg_minMax = min_avg_max(@allLoci_allIndivs_avgCoverage);
	print "\tAverage ", join(' / ', @avg_avg_minMax), "\n";
	print "\tLow ", join(' / ', @avg_avg_minMax), "\n";
	print "\tMin ", join(' / ', @avg_avg_minMax), "\n";
	
	if(scalar(keys %missing_reference_data))
	{
		print "Missing reference data for individuals:\n";
		foreach my $indivID (keys %missing_reference_data)
		{
			print " - ", $indivID, "\n";
		}
	}
	
	my $vPForFile = $vP;
	if($vPForFile)
	{
		$vPForFile .= '_';
	}
	open(TYPES, '>', '_' . $vPForFile . 'types_as_validated.txt') or die;
	my %perLocus_hom_het_missing;
	my @loci_for_print = sort {$a cmp $b} (keys %problem_locus_detail);
	print TYPES join("\t", 'IndividualID', map {'HLA' . $_ } @loci_for_print), "\n";
	foreach my $indivID (sort keys %types_as_validated)
	{
		my @fields_for_indiv = ($indivID);
		foreach my $locus (@loci_for_print)
		{
			my $locus_values;
			
			if((exists $types_as_validated{$indivID}{$locus}) and (scalar(@{$types_as_validated{$indivID}{$locus}}) > 0) )
			{
				my @alleles = @{$types_as_validated{$indivID}{$locus}};
				my @alleles_for_print;
				foreach my $allele (@alleles)
				{
					if($allele =~ /\:/)
					{
						push(@alleles_for_print, $allele);
					}
					else
					{
						die unless((length($allele) == 4) or (length($allele) == 5));
						$allele =~ s/g/G/;
						$allele = substr($allele, 0, 2) . ':' . substr($allele, 2);
						push(@alleles_for_print, $allele);

					}
				}
				$locus_values = join("/", @alleles_for_print);
				
				die unless($#alleles_for_print == 1);
				if($alleles_for_print[0] eq $alleles_for_print[1])
				{
					$perLocus_hom_het_missing{$locus}[0]++;
				}
				else
				{
					$perLocus_hom_het_missing{$locus}[1]++;			
				}
			}
			else
			{
				$locus_values = '????/????';
				$perLocus_hom_het_missing{$locus}[2]++;
			}
			
			die unless(defined $locus_values);
			
			push(@fields_for_indiv, $locus_values);
			
		}
		print TYPES join("\t", @fields_for_indiv), "\n";
	}
	close(TYPES);
	
	open(HOMHET, '>', '_' . $vPForFile . 'types_as_validated_homhet.txt') or die;
	print HOMHET join("\t", qw/Locus Hom Het HomMissing/), "\n";
	foreach my $locus (keys %perLocus_hom_het_missing)
	{
		my @printFields = ($locus);
		for(my $i = 0; $i <= 2; $i++)
		{
			my $v = 0;
			if(defined $perLocus_hom_het_missing{$locus}[$i])
			{
				$v = $perLocus_hom_het_missing{$locus}[$i];
			}
			push(@printFields, $v);
		}
		print HOMHET join("\t", @printFields), "\n";
	}
	close(HOMHET)
	
}


if($actions =~ /w/)
{
	my $validation_round = 'R2';
	die "Please specify --trueHaplotypes for validation" unless($trueHaplotypes);
	
	my $debug = 0;
	
	# read reference dataset
	my %reference_data;
	open(REFERENCE, "<", $trueHaplotypes) or die "Cannot open $trueHaplotypes";
	my $headerLine = <REFERENCE>;
	chomp($headerLine);
	$headerLine =~ s/\n//g;
	$headerLine =~ s/\r//g;
	my @header_fields = split(/\t/, $headerLine);
	@header_fields = map {if($_ =~ /HLAD((QA)|(QB)|(RB))$/){$_ .= '1';} $_} @header_fields;	
	while(<REFERENCE>)
	{
		my $line = $_;
		chomp($line);
		
		$line =~ s/\n//g;
		$line =~ s/\r//g;
		
		my @fields = split(/\t/, $line);
		my %line = (mesh @header_fields, @fields);
		
		my $primary_key = $line{'IndividualID'};
		$reference_data{$primary_key} = \%line;
	}
	close(REFERENCE);
	
	# perturbed positions
	my %reference_data_perturbedWhere;	
	my $trueHaplotypes_perturbed = $trueHaplotypes . '.perturbed';
	if(-e $trueHaplotypes_perturbed)
	{
		open(REFERENCE, "<", $trueHaplotypes_perturbed) or die "Cannot open $trueHaplotypes_perturbed";
		my $headerLine = <REFERENCE>;
		chomp($headerLine);
		$headerLine =~ s/\n//g;
		$headerLine =~ s/\r//g;
		my @header_fields = split(/\t/, $headerLine);
		@header_fields = map {if($_ =~ /HLAD((QA)|(QB)|(RB))$/){$_ .= '1';} $_} @header_fields;	
		while(<REFERENCE>)
		{
			my $line = $_;
			chomp($line);
			
			$line =~ s/\n//g;
			$line =~ s/\r//g;
			
			my @fields = split(/\t/, $line);
			my %line = (mesh @header_fields, @fields);
			
			my $primary_key = $line{'IndividualID'};
			$reference_data_perturbedWhere{$primary_key} = \%line;
		}
		close(REFERENCE);
	}
	else
	{
		warn "No perturbation information ($trueHaplotypes_perturbed) found.";
	}	
	
	
	my %imputed_haplotypes;
	my %sample_noI_toI;
	
	my %imputed_haplotypes_lines;
	
	$debug = 1;
	
	foreach my $sampleID (@sampleIDs) 
	{
		my $sampleID_noI = $sampleID;
		$sampleID_noI =~ s/^I\d+_//g;
		
		my @bestGuess_files = glob('../tmp/hla/'.$sampleID.'/'.$validation_round.'_haplotypes_bestguess_*.txt');
		
		foreach my $bestGuess_file (@bestGuess_files)
		{
			die unless($bestGuess_file =~ /haplotypes_bestguess_(.+)\.txt/);
			my $locus = $1;
			
			# next unless($locus eq 'A'); # todo remove   
			
			unless(-e $bestGuess_file)
			{
				warn "Best-guess file $bestGuess_file not existing";
				next;
			}		
			  
			open(BESTGUESS, '<', $bestGuess_file) or die "Cannot open $bestGuess_file";
			my @bestguess_header_fields = qw/Position GraphLevel Type TypeNumber PositionInType C1 C2 GTConfidence PruningInterval PruningC1 PruningC1HaploConfidence PruningC2 PruningC2HaploConfidence NReads ReadAlleles ConsideredGTPairs ConsideredH1 ConsideredH2 AA1 AA1HaploConf AA2 AA2HaploConf/;
			while(<BESTGUESS>)
			{
				my $line = $_;
				chomp($line);
				my @line_fields = split(/\t/, $line, -1);
				die Dumper("Line fields problem", $#line_fields, $#bestguess_header_fields) unless($#line_fields == $#bestguess_header_fields);
				my %line_hash = (mesh @bestguess_header_fields, @line_fields);
				
				my $Q = $line_hash{'GTConfidence'};
				if($Q < $T)
				{
					$imputed_haplotypes{$locus}{$sampleID_noI}[0] .= '?;';	
					$imputed_haplotypes{$locus}{$sampleID_noI}[1] .= '?;';	
				}
				else
				{
					$imputed_haplotypes{$locus}{$sampleID_noI}[0] .= $line_hash{'C1'}.';';	
					$imputed_haplotypes{$locus}{$sampleID_noI}[1] .= $line_hash{'C2'}.';';
				}
				
				if($debug)
				{
					push(@{$imputed_haplotypes_lines{$locus}{$sampleID_noI}}, $line);
				}
			}	
			close(BESTGUESS);
			
			die if((exists $sample_noI_toI{$sampleID_noI}) and ($sample_noI_toI{$sampleID_noI} ne $sampleID));
			$sample_noI_toI{$sampleID_noI} = $sampleID;
		}
	}
		
	my @loci = sort keys %imputed_haplotypes;
	
	my $process_quality_measures = sub {};
	
	
	foreach my $locus (@loci)
	{
		my $arbitraty_indiv = (keys %reference_data)[0];
		next unless((defined $reference_data{$arbitraty_indiv}{'HLA'.$locus}));
		
		my $locus_agree = 0;
		my $locus_disagree = 0;
		my $locus_missing = 0;

		my $locus_gt_agree = 0;
		my $locus_gt_disagree = 0;
		my $locus_gt_missing = 0;
					
		my $locus_pt_agree = 0;
		my $locus_pt_disagree = 0;
		my $locus_pt_missing = 0;

		my $locus_pt_gt_agree = 0;
		my $locus_pt_gt_disagree = 0;
		my $locus_pt_gt_missing = 0;
		
		my @indivIDs = keys %{$imputed_haplotypes{$locus}};
		INDIV: foreach my $indivID (@indivIDs)
		{	
			$debug = 0;
			
			# my @imputed_hla_values = map { $imputed_HLA{$locus}{$indivID}{$_} } keys %{$imputed_HLA{$locus}{$indivID}};
			my @imputed_haplotypes = @{ $imputed_haplotypes{$locus}{$indivID} };
								
			next INDIV unless($#imputed_haplotypes == 1);
			next INDIV unless(exists $reference_data{$indivID});
			next INDIV unless(exists $reference_data{$indivID}{'HLA'.$locus});
			
			$reference_data{$indivID}{'HLA'.$locus} or die;
			
			my @reference_haplotypes = split(/\//, $reference_data{$indivID}{'HLA'.$locus});
			if(not $reference_data_perturbedWhere{$indivID}{'HLA'.$locus})
			{
				warn "No reference haplotype for $locus";
				$reference_data_perturbedWhere{$indivID}{'HLA'.$locus} = '';
			}
			my @reference_haplotypes_perturbed = split(/\//, $reference_data_perturbedWhere{$indivID}{'HLA'.$locus});
			
			die Dumper($reference_data{$indivID}, \@reference_haplotypes) unless($#reference_haplotypes == 1);
				
			die unless($#imputed_haplotypes == 1);
			die unless($#reference_haplotypes == 1);
			
			my @reference_haplotypes_split = (map {[split(/;/, $_)]} @reference_haplotypes);
			my @reference_haplotypes_perturbed_split = (map {split(/;/, $_)} @reference_haplotypes_perturbed);
			
			my %perturbedWhere = map {$_ => 1} @reference_haplotypes_perturbed_split;
			
			# die Dumper(\%perturbedWhere);

			my @imputed_haplotypes_split = (map {[split(/;/, $_)]} @imputed_haplotypes);
			
			die unless($#reference_haplotypes_split == 1);
			die unless($#imputed_haplotypes_split == 1);
			
			die Dumper($#reference_haplotypes_split, $#imputed_haplotypes_split, [$#{$reference_haplotypes_split[0]}, $#{$reference_haplotypes_split[1]}], [$#{$imputed_haplotypes_split[0]}, $#{$imputed_haplotypes_split[1]}]) unless($#{$reference_haplotypes_split[0]} == $#{$reference_haplotypes_split[1]});
			
			die Dumper($#reference_haplotypes_split, $#imputed_haplotypes_split, [$#{$reference_haplotypes_split[0]}, $#{$reference_haplotypes_split[1]}], [$#{$imputed_haplotypes_split[0]}, $#{$imputed_haplotypes_split[1]}]) unless($#{$imputed_haplotypes_split[0]} == $#{$imputed_haplotypes_split[1]});

			die Dumper($#reference_haplotypes_split, $#imputed_haplotypes_split, [$#{$reference_haplotypes_split[0]}, $#{$reference_haplotypes_split[1]}], [$#{$imputed_haplotypes_split[0]}, $#{$imputed_haplotypes_split[1]}]) unless($#{$reference_haplotypes_split[0]} == $#{$imputed_haplotypes_split[1]});
			
			my @alleles_agree = (0, 0);
			my @alleles_missing = (0, 0);;
			my @alleles_disagree = (0, 0);;

			my @alleles_gt_agree = (0, 0);
			my @alleles_gt_missing = (0, 0);;
			my @alleles_gt_disagree = (0, 0);;
			
			my @alleles_pt_agree = (0, 0);
			my @alleles_pt_missing = (0, 0);;
			my @alleles_pt_disagree = (0, 0);;

			my @alleles_pt_gt_agree = (0, 0);
			my @alleles_pt_gt_missing = (0, 0);;
			my @alleles_pt_gt_disagree = (0, 0);;
			
			
			for(my $invertImputations = 0; $invertImputations <= 1; $invertImputations++)
			{
				my @imputed_haplotypes_split_forAnalysis = ($invertImputations) ? reverse(@imputed_haplotypes_split) : @imputed_haplotypes_split;
				
				for(my $i = 0; $i <= $#{$reference_haplotypes_split[0]}; $i++)
				{
					for(my $j = 0; $j <= 1; $j++)
					{
						die unless((defined $reference_haplotypes_split[$j][$i]) and (defined $imputed_haplotypes_split_forAnalysis[$j][$i]));
						if($imputed_haplotypes_split_forAnalysis[$j][$i] eq "?")
						{
							$alleles_missing[$invertImputations]++;
							if($perturbedWhere{$i})
							{
								$alleles_pt_missing[$invertImputations]++;
							}
						}
						else
						{
							if($reference_haplotypes_split[$j][$i] eq $imputed_haplotypes_split_forAnalysis[$j][$i])
							{
								$alleles_agree[$invertImputations]++;
								if($perturbedWhere{$i})
								{
									$alleles_pt_agree[$invertImputations]++;
								}								
							}
							else
							{
								$alleles_disagree[$invertImputations]++;
								if($perturbedWhere{$i})
								{
									$alleles_pt_disagree[$invertImputations]++;
								}									
							}
						}
					}
					
					my ($thisPosition_gt_agree, $thisPosition_gt_disagree, $thisPosition_gt_missing) = compatibleStringAlleles([$reference_haplotypes_split[0][$i], $reference_haplotypes_split[1][$i]], [$imputed_haplotypes_split_forAnalysis[0][$i], $imputed_haplotypes_split_forAnalysis[1][$i]]);
					
					$alleles_gt_agree[$invertImputations] += $thisPosition_gt_agree;
					$alleles_gt_disagree[$invertImputations] += $thisPosition_gt_disagree;
					$alleles_gt_missing[$invertImputations] += $thisPosition_gt_missing;
					
					if($perturbedWhere{$i})
					{					
						$alleles_pt_gt_agree[$invertImputations] += $thisPosition_gt_agree;
						$alleles_pt_gt_disagree[$invertImputations] += $thisPosition_gt_disagree;
						$alleles_pt_gt_missing[$invertImputations] += $thisPosition_gt_missing;
					
						if(($invertImputations == 0) and ($thisPosition_gt_agree != 2) and 0)
						{
							print "Position $i -- agreement: $thisPosition_gt_agree\n";
							print "\tTrue genotypes: ", join('/', $reference_haplotypes_split[0][$i], $reference_haplotypes_split[1][$i]), "\n";
							print "\tImputed genotypes: ", join('/', $imputed_haplotypes_split_forAnalysis[0][$i], $imputed_haplotypes_split_forAnalysis[1][$i]), "\n";
							print "\t\tLine: ",	$imputed_haplotypes_lines{$locus}{$indivID}[$i], "\n";
							print "\n";
						}					
					}
					
					if(($invertImputations == 0) and ($thisPosition_gt_agree != 2))
					{
						# print "Position $i -- agreement: $thisPosition_gt_agree\n";
						# print "\tTrue genotypes: ", join('/', $reference_haplotypes_split[0][$i], $reference_haplotypes_split[1][$i]), "\n";
						# print "\tImputed genotypes: ", join('/', $imputed_haplotypes_split_forAnalysis[0][$i], $imputed_haplotypes_split_forAnalysis[1][$i]), "\n";
						# print "\t\tLine: ",	$imputed_haplotypes_lines{$locus}{$indivID}[$i], "\n";
						# print "\n";
					}
				}
			}
			
			die unless($alleles_gt_agree[0] == $alleles_gt_agree[1]);
			die unless($alleles_gt_disagree[0] == $alleles_gt_disagree[1]);
			die unless($alleles_gt_missing[0] == $alleles_gt_missing[1]);
			
			# print "Individual $indivID Locus $locus\n";
			my @invertImputations_notOK;
			for(my $invertImputations = 0; $invertImputations <= 1; $invertImputations++)
			{	
				my $alleles_sum = $alleles_agree[$invertImputations] + $alleles_disagree[$invertImputations] + $alleles_missing[$invertImputations];
				die unless($alleles_sum > 0);
				
				# print "\t", "Inversion ", $invertImputations, "\n";
				# print "\t\tOK:       ", $alleles_agree[$invertImputations], " ", sprintf("%.2f", $alleles_agree[$invertImputations]/$alleles_sum * 100), "%\n";
				# print "\t\tNOT OK:   ", $alleles_disagree[$invertImputations], " ", sprintf("%.2f", $alleles_disagree[$invertImputations]/$alleles_sum * 100), "%\n";
				# print "\t\tMISSING:  ", $alleles_missing[$invertImputations], " ", sprintf("%.2f", $alleles_missing[$invertImputations]/$alleles_sum * 100), "%\n";
				# print "\n";
				
				$invertImputations_notOK[$invertImputations] = $alleles_disagree[$invertImputations];
				
				my $alleles_pt_sum = $alleles_pt_agree[$invertImputations] + $alleles_pt_disagree[$invertImputations] + $alleles_pt_missing[$invertImputations];
				
				# print "\t", "Inversion ", $invertImputations, " (perturbed alleles only)\n";
				# if($alleles_pt_sum > 0)		
				# {
					# print "\t\tOK:       ", $alleles_pt_agree[$invertImputations], " ", sprintf("%.2f", $alleles_pt_agree[$invertImputations]/$alleles_pt_sum * 100), "%\n";
					# print "\t\tNOT OK:   ", $alleles_pt_disagree[$invertImputations], " ", sprintf("%.2f", $alleles_pt_disagree[$invertImputations]/$alleles_pt_sum * 100), "%\n";
					# print "\t\tMISSING:  ", $alleles_pt_missing[$invertImputations], " ", sprintf("%.2f", $alleles_pt_missing[$invertImputations]/$alleles_pt_sum * 100), "%\n";
				# }
				# print "\n";
				
			}
			
			my $invertImputations_optimal = ($invertImputations_notOK[1] < $invertImputations_notOK[0]) ? 1 : 0;
			
			my $alleles_sum = $alleles_agree[$invertImputations_optimal] + $alleles_disagree[$invertImputations_optimal] + $alleles_missing[$invertImputations_optimal];
			die unless($alleles_sum > 0);
						
			
			# print "\t", "Haplotypes - inverted ", $invertImputations_optimal, "\n";
			# print "\t\tOK:       ", $alleles_agree[$invertImputations_optimal], " ", sprintf("%.2f", $alleles_agree[$invertImputations_optimal]/$alleles_sum * 100), "%\n";
			# print "\t\tNOT OK:   ", $alleles_disagree[$invertImputations_optimal], " ", sprintf("%.2f", $alleles_disagree[$invertImputations_optimal]/$alleles_sum * 100), "%\n";
			# print "\t\tMISSING:  ", $alleles_missing[$invertImputations_optimal], " ", sprintf("%.2f", $alleles_missing[$invertImputations_optimal]/$alleles_sum * 100), "%\n";
			# print "\n";
				
			
			my $alleles_gt_sum = $alleles_gt_agree[0] + $alleles_gt_disagree[0] + $alleles_gt_missing[0];
			# print "\t", "Genotypes ", "\n";
			# print "\t\tOK:       ", $alleles_gt_agree[0], " ", sprintf("%.2f", $alleles_gt_agree[0]/$alleles_gt_sum * 100), "%\n";
			# print "\t\tNOT OK:   ", $alleles_gt_disagree[0], " ", sprintf("%.2f", $alleles_gt_disagree[0]/$alleles_gt_sum * 100), "%\n";
			# print "\t\tMISSING:  ", $alleles_gt_missing[0], " ", sprintf("%.2f", $alleles_gt_missing[0]/$alleles_gt_sum * 100), "%\n";
			# print "\n";			
			
			my $alleles_pt_gt_sum = $alleles_pt_gt_agree[0] + $alleles_pt_gt_disagree[0] + $alleles_pt_gt_missing[0];
			# print "\t", "Genotypes at perturbed positions", "\n";
			# if($alleles_pt_gt_sum > 0)
			# {
				# print "\t\tOK:       ", $alleles_pt_gt_agree[0], " ", sprintf("%.2f", $alleles_pt_gt_agree[0]/$alleles_pt_gt_sum * 100), "%\n";
				# print "\t\tNOT OK:   ", $alleles_pt_gt_disagree[0], " ", sprintf("%.2f", $alleles_pt_gt_disagree[0]/$alleles_pt_gt_sum * 100), "%\n";
				# print "\t\tMISSING:  ", $alleles_pt_gt_missing[0], " ", sprintf("%.2f", $alleles_pt_gt_missing[0]/$alleles_pt_gt_sum * 100), "%\n";
			# }
			# print "\n";					
								
			$locus_agree += $alleles_agree[$invertImputations_optimal];
			$locus_disagree += $alleles_disagree[$invertImputations_optimal];
			$locus_missing += $alleles_missing[$invertImputations_optimal];
			
			$locus_gt_agree += $alleles_gt_agree[0];
			$locus_gt_disagree += $alleles_gt_disagree[0];
			$locus_gt_missing += $alleles_gt_missing[0];	

			$locus_pt_agree += $alleles_pt_agree[$invertImputations_optimal];
			$locus_pt_disagree += $alleles_pt_disagree[$invertImputations_optimal];
			$locus_pt_missing += $alleles_pt_missing[$invertImputations_optimal];
			
			$locus_pt_gt_agree += $alleles_pt_gt_agree[0];
			$locus_pt_gt_disagree += $alleles_pt_gt_disagree[0];
			$locus_pt_gt_missing += $alleles_pt_gt_missing[0];	
		}
		
		my $locus_sum = $locus_agree + $locus_disagree + $locus_missing;
		my $locus_gt_sum = $locus_gt_agree + $locus_gt_disagree + $locus_gt_missing;
		
		die if($locus_sum == 0);
		print "LOCUS SUMMARY $locus\n";
		print "\tHaplotypes:\n";
		print "\t\tOK:       ", $locus_agree, " ", sprintf("%.2f", $locus_agree/$locus_sum * 100), "%\n";
		print "\t\tNOT OK:   ", $locus_disagree, " ", sprintf("%.2f", $locus_disagree/$locus_sum * 100), "%\n";
		print "\t\tMISSING:  ", $locus_missing, " ", sprintf("%.2f", $locus_missing/$locus_sum * 100), "%\n\n";
		print "\tGenotypes:\n";
		print "\t\tOK:       ", $locus_gt_agree, " ", sprintf("%.2f", $locus_gt_agree/$locus_gt_sum * 100), "%\n";
		print "\t\tNOT OK:   ", $locus_gt_disagree, " ", sprintf("%.2f", $locus_gt_disagree/$locus_gt_sum * 100), "%\n";
		print "\t\tMISSING:  ", $locus_gt_missing, " ", sprintf("%.2f", $locus_gt_missing/$locus_gt_sum * 100), "%\n";
		print "\n";
		
		my $locus_pt_sum = $locus_pt_agree + $locus_pt_disagree + $locus_pt_missing;
		my $locus_pt_gt_sum = $locus_pt_gt_agree + $locus_pt_gt_disagree + $locus_pt_gt_missing;
		
		print "LOCUS SUMMARY $locus (perturbed only)\n";
		if($locus_pt_sum > 0)
		{
			print "\tHaplotypes:\n";
			print "\t\tOK:       ", $locus_pt_agree, " ", sprintf("%.2f", $locus_pt_agree/$locus_pt_sum * 100), "%\n";
			print "\t\tNOT OK:   ", $locus_pt_disagree, " ", sprintf("%.2f", $locus_pt_disagree/$locus_pt_sum * 100), "%\n";
			print "\t\tMISSING:  ", $locus_pt_missing, " ", sprintf("%.2f", $locus_pt_missing/$locus_pt_sum * 100), "%\n\n";
			print "\tGenotypes:\n";
			print "\t\tOK:       ", $locus_pt_gt_agree, " ", sprintf("%.2f", $locus_pt_gt_agree/$locus_pt_gt_sum * 100), "%\n";
			print "\t\tNOT OK:   ", $locus_pt_gt_disagree, " ", sprintf("%.2f", $locus_pt_gt_disagree/$locus_pt_gt_sum * 100), "%\n";
			print "\t\tMISSING:  ", $locus_pt_gt_missing, " ", sprintf("%.2f", $locus_pt_gt_missing/$locus_pt_gt_sum * 100), "%\n";
		}	
	
		# exit; # todo remove
	}

	# my $comparions_OK = $comparisons - $compare_problems;
	# print "\nComparisons: $comparisons -- OK: $comparions_OK\n";
		
	open(TMP_OUTPUT, '>', '../tmp/hla_validation/validation_haplotypes_summary.txt') or die;
	# print "\nPER-LOCUS SUMMARY:\n";
	# foreach my $key (sort keys %problem_locus_detail)
	# {
		# my $OK_locus = $problem_locus_examined{$key} - $problem_locus_detail{$key};
		# my $accuracy = sprintf("%.2f", 1 - (($problem_locus_examined{$key} != 0) ? $problem_locus_detail{$key}/$problem_locus_examined{$key} : 0));
		# my $CR = sprintf("%.2f", $imputed_HLA_Calls{$key}{sum} ? ($imputed_HLA_Calls{$key}{called}/$imputed_HLA_Calls{$key}{sum}) : 0);
		# print "\t", $key, ": ", $OK_locus, " of ", $problem_locus_examined{$key}, ",\tAccuracy ", $accuracy, " ";
		# print "\tCall rate: ", $CR,  "\n";
		
		# my @fields = ($key, $problem_locus_examined{$key}, $CR, $accuracy);
		# print TMP_OUTPUT join("\t", @fields), "\n";
	# }
	close(TMP_OUTPUT);	

	print "\n";
}


sub compatibleStringAlleles
{
	my $alleles_validation = shift;
	my $alleles_inference = shift;

	die unless($#{$alleles_inference} == 1);
	die unless($#{$alleles_validation} == 1);

	my ($OK1, $NOTOK1, $MISSING1) = compatibleStringAlleles_noFlip($alleles_validation, $alleles_inference);
	
	my $alleles_validation_flipped = [reverse(@$alleles_validation)];
	my ($OK2, $NOTOK2, $MISSING2) = compatibleStringAlleles_noFlip($alleles_validation_flipped, $alleles_inference);
	
	# die unless($MISSING1 == $MISSING2);
	
	if(($NOTOK1 < $NOTOK2))
	{
		return ($OK1, $NOTOK1, $MISSING1);
	}
	elsif($NOTOK1 == $NOTOK2)
	{
		return (($OK1 > $OK2) ? ($OK1, $NOTOK1, $MISSING1) : ($OK2, $NOTOK2, $MISSING2));		
	}
	else
	{
		return ($OK2, $NOTOK2, $MISSING2);
	}
	
	
}


sub compatibleStringAlleles_noFlip
{
	my $alleles_validation = shift;
	my $alleles_inference = shift;

	die unless($#{$alleles_inference} == 1);
	die unless($#{$alleles_validation} == 1);

	my $alleles_OK = 0;
	my $alleles_NOTOK = 0;
	my $alleles_MISSING = 0;
	
	for(my $aI = 0; $aI < 2; $aI++)
	{	
		my ($OK, $NOTOK, $MISSING) = (compatibleStringAlleles_individual(undef, $alleles_validation->[$aI], $alleles_inference->[$aI]));
		$alleles_OK += $OK;
		$alleles_NOTOK += $NOTOK;
		$alleles_MISSING += $MISSING;
	}
		
	return ($alleles_OK, $alleles_NOTOK, $alleles_MISSING);
}


sub compatibleStringAlleles_individual
{
	my $locus = shift;
	my $allele_validation = shift;	
	my $allele_inference = shift;

	if(($allele_validation =~ /\?/) || ($allele_inference =~ /\?/))
	{
		return (0, 0, 1);
	}
	else
	{
		if($allele_validation eq $allele_inference)
		{
			return (1, 0, 0);
		}
		else
		{
			return (0, 1, 0);
		}
	}
}



# sub compatibleAlleles
# {

	# my $alleles_validation = shift;
	# my $alleles_inference = shift;

	# die unless($#{$alleles_inference} == 1);
	# die unless($#{$alleles_validation} == 1);

	# my $r1 = compatibleAlleles_noFlip($alleles_validation, $alleles_inference);
	
	# my $alleles_validation_flipped = [reverse(@$alleles_validation)];
	# my $r2 = compatibleAlleles_noFlip($alleles_validation_flipped, $alleles_inference);
	
	# return (($r1 > $r2) ? $r1 : $r2);
# }


# sub compatibleAlleles_noFlip
# {

	# my $alleles_validation = shift;
	# my $alleles_inference = shift;

	# die unless($#{$alleles_inference} == 1);
	# die unless($#{$alleles_validation} == 1);

	
	# my $alleles_compatible = 0;
	
	# for(my $aI = 0; $aI < 2; $aI++)
	# {	

		# $alleles_compatible += (compatibleAlleles_individual(undef, $alleles_validation->[$aI], $alleles_inference->[$aI]));
	# }

		
	# return $alleles_compatible;
# }


sub compatibleAlleles_individual
{
	my $locus = shift;
	my $allele_validation = shift;
	my $allele_inference = shift;
	

	# die Dumper($allele_validation, $allele_inference);
	
	my @components_allele_validation = split(/;/, $allele_validation);
	if(scalar(@components_allele_validation) > 1)
	{
		my $found_compatible = 0;
		foreach my $validation_allele (@components_allele_validation)
		{
			$found_compatible += compatibleAlleles_individual($locus, $validation_allele, $allele_inference);
			last if($found_compatible);
		}
		if($found_compatible)
		{
			return 1;
		}
		else
		{
			return 0;
		}
	}	
	
	my $allele_validation_original = $allele_validation;
		
	if($locus eq 'C')
	{
		# print Dumper([$locus, $allele_validation, $allele_inference, scalar(@components_allele_validation)]);
	}
	
	die "Weird allele $locus $allele_validation" unless(length($allele_validation) >= 4);
	die unless($allele_inference =~ /:/);

	
	#die unless($allele_validation =~ /^\d\d/);

	my $components_validation;
	if($allele_validation !~ /\:/)
	{
		die if($allele_validation =~ /\//);
		$allele_validation = substr($allele_validation, 0, 2).':'.substr($allele_validation, 2);
		$components_validation = 2;
		$allele_validation =~ s/g//i;
		
	}
	else
	{
		my @_components = split(/\:/, $allele_validation);
		$components_validation = scalar(@_components);
		die "Weird allele code $allele_validation" if($allele_validation =~ /g/i);
		die if($allele_validation =~ /D/i);
			
	}
	die unless(defined $components_validation);
	die unless($components_validation >= 2);

	
	my $true_allele = $allele_validation;

		
	my $inferred_alleles = $allele_inference;
	my @inferred_alleles = split(/;/, $inferred_alleles);
	
	# if inferred allele has fewer components than validation allele: add 0s, mismatch if validation allele is not 0 at this position
	# => conservative
	# if inferred allele has more components than validation allele, truncate inferred allele
	# e.g. inferred: 02:03:01, validation allele: 03:01, truncated inferred allele: 02:03
	# if inferred and validation allele have different 4-digit groups, we always generate a mismatch
	# if inferred and validation allele have the same 4-digit group, we generate a match
	# e.g. inferred 02:03:01, validation: 02:03
	# is this a problem? No. Consider two scenarios:
	# - validation allele typed to complete 4-digit (amino acid sequence, all exons) resolution: this is what we want
	# - validation allele typed to SBT G-groups: this case is not a problem.
	# 		Case 1: specified as G, i.e. 02:03G
	#			This is a bad example, because 02:03G is not a valid G allele - needs to be six digits, e.g.
	#			02:03:01G. This is fine as long as our output is also at G group level (i.e. no
	#			higher resolution than exon 2 / exons 2,3), as 02:03:01 will be one of our output alleles
	#			if and only if we infer 02:03:01G as a result.
	# 			Gs are confusing and we will deactivate them now.
	#			(Or enable proper translation)
	#  		Case 2: specified as explicit allele list, separated with slashes - we now assume that 02:03 is 
	#   			one such explicit member of a list.
	#			This case does not exist. If 02:03:01 exists as a named allele, other alleles like 02:03:02 must also exist,
	#			and 02:03 would not be an individual member of any G group (because all members with identical
	#			4-digit group would be differentiated by their 6-digit groups).

	
	@inferred_alleles = map {
		# die "Can't parse allele $_" unless($_ =~ /^\s*\w+\*(\d\d\d?)\:(\d\d\d?N?).*$/);
		# my $allele = $1.':'.$2;
		# $allele
		
		die "Can't parse allele $_" unless($_ =~ /^\s*(\w+)\*([\d\:N]+Q?)L?S?$/);
		
		my $allele = $2;
		
		my @components_allele = split(/:/, $allele);
		my @components_allele_rightLength;
		
		for(my $i = 0; $i < $components_validation; $i++)
		{
			if($i <= $#components_allele)
			{
				push(@components_allele_rightLength, $components_allele[$i]);
			}
			else
			{
				push(@components_allele_rightLength, '00');
			}
		}
		$allele = join(':', @components_allele_rightLength);
		
	} @inferred_alleles;
	
	my %inferred = map {$_ => 1} @inferred_alleles;
	
	if($inferred{$true_allele})
	{
		# print Dumper($allele_validation_original, $allele_inference, \@inferred_alleles, 1), "\n" if ($locus eq 'DQB1');	
		
		return 1;  
	}		
	else
	{
		# print Dumper($allele_validation_original, $allele_inference, \@inferred_alleles, 0), "\n"  if ($locus eq 'DQB1');	
	
		return 0;
	}
}

sub intersection
{
	my $aref1 = shift;
	my $aref2 = shift;

	my %h1 = map {$_ => 1} @$aref1;
	my %h2 = map {$_ => 1} @$aref2;
	
	my %c = map {$_ => 1} ((keys %h1), (keys %h2));
	
	my @ret = grep {$h1{$_} and $h2{$_}} keys %c;
	
	return @ret;
}

sub print_which_exons
{
	my $locus = shift;
	if((length($locus) == 1) or ($locus =~ /HLA\w/))
	{
		return (2, 3);
	}
	else
	{
		return (2);
	}
}	


sub load_pileup
{
	my $r_href = shift;
	my $file = shift;
	my $indivID = shift;
	
	die unless($file =~ /R\d+_pileup_(\w+).txt$/);
	my $locus = $1;
	
	open(F, '<', $file) or die "Cannot open $file";
	while(<F>)
	{
		my $line = $_;
		chomp($line);
		next unless($line);
		my @f = split(/\t/, $line, -1);
		die unless(($#f == 3) or ($#f == 2));
		if($#f == 3)
		{
			my $exon = $f[0];
			my $exonPos = $f[1];
			my $pileUp = $f[3];		
			$r_href->{$indivID}{'HLA'.$locus}[$exon][$exonPos] = $pileUp;
		}
		else
		{
			my $exon = $f[0];
			my $exonPos = $f[1];
			my $pileUp = '';		
			$r_href->{$indivID}{'HLA'.$locus}[$exon][$exonPos] = $pileUp;
		}
	}	
	close(F);
}

sub load_coverages_from_pileup
{
	my $file = shift;
	
	die unless($file =~ /R\d+_pileup_(\w+).txt$/);
	my $locus = $1;
	
	my %forReturn;
	
	open(F, '<', $file) or die "Cannot open $file";
	while(<F>)
	{
		my $line = $_;
		chomp($line);
		next unless($line);
		die "Cannot parse pileup coverage line" unless($line =~ /^(\d+)\s(\d+)\s(\d+)/);
		my $exon = $1;
		my $exonPos = $2;
		my $coverage = $3;				
		
		die "Pos twice in $file ? $exon $exonPos" if(defined $forReturn{$exon}{$exonPos});
		$forReturn{$exon}{$exonPos} = $coverage;
	}	
	close(F);
	
	return \%forReturn;
}


sub twoValidationAlleles_2_proper_names
{
	my $alleles_validation = shift;
	my $locus = shift;
	my $exon_sequences = shift;
	
	die unless($#{$alleles_validation} == 1);
	
	my @forReturn;
	
	$locus =~ s/HLA//;
	for(my $aI = 0; $aI < 2; $aI++)
	{
		my @vA = split(/;/, $alleles_validation->[$aI]);
		REFALLELE: foreach my $validation_allele (@vA)
		{
			my $original_validation_allele = $validation_allele;
			
			if($validation_allele =~ /\:/)
			{
				$validation_allele = $locus . '*' . $validation_allele;
				# $validation_allele =~ s/g//;
				# my @components = split(/\:/, $validation_alleles),
				
				# die unless(length($validation_allele) >= 4);
				
				# $validation_allele = substr($validation_allele, 0, 2) . ':' . substr($validation_allele, 2);
				# $validation_allele = $locus . '*' . $validation_allele;
			
			}
			else
			{
				$validation_allele =~ s/g//;
				die unless(length($validation_allele) >= 4);
				
				$validation_allele = substr($validation_allele, 0, 2) . ':' . substr($validation_allele, 2);
				$validation_allele = $locus . '*' . $validation_allele;
			}
			my @validation_alleles = ($validation_allele);
			
			my @history_extensions = ($validation_allele);
			my $extensions = 0;
			my $notFoundFirstRound = 0;
			while( not exists $exon_sequences->{$validation_allele} )
			{
				$extensions++;
			
				$validation_allele .= ':01';
				push(@history_extensions, $validation_allele);
				if($extensions > 3)
				{
					$notFoundFirstRound = 1;
					last;
				}
				
			}
			
			if($notFoundFirstRound)
			{
				my $extensions = 0;
				while(scalar(grep {exists $exon_sequences->{$_}} @validation_alleles) == 0)
				{
					$extensions++;
					my $viMax = $#validation_alleles;
					for(my $vI = 0; $vI <= $viMax; $vI++)
					{
						my $a1 = $validation_alleles[$vI];
						$validation_alleles[$vI] .= ':01';
						push(@validation_alleles, $a1.':02');
					}
					
					$validation_allele .= ':01';
					if($extensions > 3)
					{
						# print Dumper([grep {$_ =~ /DQB1/} keys %$exon_sequences]);
						if($validation_allele eq $vA[$#vA])
						{
							push(@forReturn, '?');								
							warn Dumper("Can't identify (II) exon alleles for $alleles_validation->[$aI]", \@validation_alleles, $locus, $alleles_validation, $extensions, [(keys %$exon_sequences)[0 .. 10]], $validation_allele, $original_validation_allele, \@history_extensions);							
							next REFALLELE;
						}
						else
						{
							next REFALLELE;
						}
					}
				}		
				
				my @foundAlleles = grep {exists $exon_sequences->{$_}} @validation_alleles;
				die unless(scalar(@foundAlleles) > 0);
				push(@forReturn, $foundAlleles[0]);	
				last REFALLELE;				
				
			}
			else
			{
				push(@forReturn, $validation_allele);
				last REFALLELE;
			}
		}
	}
			
	return @forReturn;
}

sub list_comparison
{
	my $list1_aref = shift;
	my $list2_aref = shift;
	
	my %l1 = map {$_ => 1} @$list1_aref;
	my %l2 = map {$_ => 1} @$list2_aref;
	
	unless(scalar(keys %l1) == scalar(@$list1_aref))
	{
		die "List 1 non-unique elements";
	}
	
	unless(scalar(keys %l2) == scalar(@$list2_aref))
	{
		die "List 2 non-unique elements";
	}	
	
	my %combined = map {$_ => 1} (@$list1_aref, @$list2_aref);
	
	my @l1_exclusive;
	my @l2_exclusive;
	my $n_shared = 0;
	foreach my $e (keys %combined)
	{
		if($l1{$e} and $l2{$e})
		{
			$n_shared++;
		}
		elsif($l1{$e})
		{
			push(@l1_exclusive, $e);
		}
		elsif($l2{$e})
		{
			push(@l2_exclusive, $e);
		}
		else
		{
			die;
		}
				
	}
	
	die unless(($n_shared + scalar(@l1_exclusive) + scalar(@l2_exclusive)) == scalar(keys %combined));
	
	return ($n_shared, \@l1_exclusive, \@l2_exclusive);
}

sub twoClusterAlleles
{
	my $alleles_inference = shift;
	die Dumper("Don't have two inferred alleles", $alleles_inference) unless($#{$alleles_inference} == 1);
	
	my @forReturn;
	
	for(my $aI = 0; $aI < 2; $aI++)
	{
		my $inferred_alleles = $alleles_inference->[$aI];
		my @inferred_alleles = split(/;/, $inferred_alleles);
	
		$inferred_alleles[0] =~ s/^\s+//;
		
		push(@forReturn, $inferred_alleles[0]);
	}
	
	return @forReturn;
}

sub averageFractionAlignmentOK
{
	my $dir = shift;
	my $f = $dir . '/summaryStatistics.txt';
	die "File $f not there" unless(-e $f);
	
	my $fractionOK;
	open(STATISTICS, '<', $f) or die;
	while(<STATISTICS>)
	{
		my $line = $_;
		chomp($line);
		last if($line =~ /unpaired/);
		if($line =~ /Alignment pairs, average fraction alignment OK:\s+([\d\.]+)/)
		{
			die if(defined $fractionOK);
			$fractionOK = $1;
		}
	}
	close(STATISTICS);	
	
	die "Could not find fraction OK entry" unless(defined $fractionOK);
	return $fractionOK;
}

sub inferReadLength
{
	my $file = shift;
	open(ALIGNED, '<', $file) or die;
	my $firstLine = <ALIGNED>;
	chomp($firstLine);
	my @have_lengths;
	while(<ALIGNED>)
	{
		my $line = $_;
		die unless($line =~ /Aligned pair /);
		my @p1 = map {scalar(<ALIGNED>)} (1 .. 9);
		my @p2 = map {scalar(<ALIGNED>)} (1 .. 9);
		die unless($p1[0] =~ /Read /);
		die unless($p2[0] =~ /Read /);
		my $sequence_1 = $p1[7];
		my $sequence_2 = $p1[7];
		chomp($sequence_1);
		chomp($sequence_2);
		$sequence_1 =~ s/\s+//g;
		$sequence_2 =~ s/\s+//g;
		push(@have_lengths, length($sequence_1));
		push(@have_lengths, length($sequence_2));
		last if(scalar(@have_lengths) >= 20);
	}
	close(ALIGNED);	
	
	my $avg = int(sum(@have_lengths)/scalar(@have_lengths));
	return 2*$avg;
}

sub read_exon_sequences
{
	my $file = shift;
	my %r;
	open(F, '<', $file) or die "Cannot open $file";
	my $firstLine = <F>;
	chomp($firstLine);
	my @header_fields = split(/ /, $firstLine);
	die unless($header_fields[0] eq 'IndividualID');
	while(<F>)
	{
		my $line = $_;
		chomp($line);
		$line =~ s/\r//g;
		$line =~ s/\n//g;
		my @line_fields = split(/ /, $line);
		my $allele = shift(@line_fields);
		$r{$allele} = join('', @line_fields);
	}
	close(F);
	
	return \%r;
}


sub find_exon_file
{
	my $locus = shift;
	my $exon = shift;
	$locus =~ s/HLA//;
	
	opendir(my $dh, $exon_folder) or die;
	my @exon_folder_files = readdir($dh);
	closedir($dh);
	
	@exon_folder_files = grep {$_ =~ /^${locus}_\d+_exon_${exon}\.txt$/} @exon_folder_files;
	if($#exon_folder_files != 0)
	{
		die Dumper("Can't find exon file for $locus // $exon", @exon_folder_files);
	}
	else
	{
		return $exon_folder.'/'.$exon_folder_files[0];
	}	
}


sub min_avg_max
{
	my @v = @_;
	@v = sort {$a <=> $b} @v;
	if($#v >= 1)
	{
		die unless($v[0] <= $v[1]);
	}
	
	if(scalar(@v) == 0)
	{
		return ('', '', '');
	}
	if(scalar(@v) == 1)
	{
		return($v[0], $v[0], '');
	}
	
	my $min = $v[0];
	my $max = $v[$#v];
	  
	my $sum = 0;
	foreach my $vE (@v)
	{
		$sum += $vE;
	}
	
	my $avg = '';
	if(scalar(@v) > 0)
	{
		$avg = $sum / scalar(@v);
	}
	
	return ($min, $avg, $max);
}
Browse the archive

https://github.com/AlexanderDilthey/MHC-PRG