https://github.com/AlexanderDilthey/MHC-PRG
Raw File
Tip revision: e59943adb8855532573a6c276651efad1e18a6b1 authored by Alexander Dilthey on 18 December 2018, 10:20:48 UTC
Update HLA-PRG.md
Tip revision: e59943a
nextGenInferenceVariGraph.pl
#!/usr/bin/perl -w

use strict;
use 5.010;
use List::MoreUtils qw/all mesh any /;
use Data::Dumper;
use Getopt::Long;   
use Sys::Hostname;
use File::Copy;
use File::Basename;
use Storable;
  
die unless(more_max(1,2,3) == 3);
die unless(more_max(3,2,1) == 3);
die unless(more_max(1,3,2) == 3);

my $kMer_size = 55;  

# input parameters

my $graph;   
my $sample;
my $redo = 0;
my $collect = 0;
my $labelOnly = 1;
my $vcfPos;
my $mapping_display_only_instructions = 0;
my $localOxford = 0;
if(hostname() =~ /(sequoia)|(elm)|(birch)|(banyan)|(cluster3)/)
{
	$localOxford = 1;
}

GetOptions ('graph:s' => \$graph,
 'sample:s' => \$sample, 
 'redo:s' => \$redo,  
 'collect:s' => \$collect, 
 'labelOnly:s' => \$labelOnly,
 'kmer:s' => \$kMer_size,
 'vcfPos:s' => \$vcfPos,
 'mapping_display_only_instructions:s' => \$mapping_display_only_instructions,
 );         

 
# external programs path

my $stampy_bin = qq(/home/dilthey/Stampy/stampy-1.0.20/stampy.py);
my $PICARD_SAM2FASTQ = '/Net/fs1/home/dilthey/picard/picard-tools-1.83/SamToFastq.jar';
my $BWA_bin = qq(/home/dilthey/BWA/bwa-0.6.2/bwa);
my $samtools_bin = qq(/home/dilthey/samtools-0.1.18/samtools);
my $platypus_executable = qq(/home/dilthey/Platypus/Platypus_0.2.0/Platypus.py);

# external data paths

my $path_to_PGF_haplotype = qq(../data/mhc_ref_8_haplotypes/pgf_ref.fasta);
my $reference_genome_FASTA_path = qq(../data/GRCh37.60/fasta);
my $path_to_HumanGenome_graph = qq(../data/GRCh37.60/graph/alex_grc37_auto_X_Y.k${kMer_size}.ctx);

# temporary directory for read re-mapping

my $remapping_basedir = qq(../tmp/readReMapping/);

# data lookup paths

my $sample_path = $localOxford ? qq(/gpfs1/well/gsk_hla/CortexGraphs/) : qq(../data/samples/CortexGraphs);
my $base_path_BAMs = $localOxford ? qq(/gpfs1/well/gsk_hla/bam_output/) : qq(../data/samples/BAMs/);

### do not modify anything below this line

# my $genomic_SNP_information = qq(../data/GS v3 SNP information.txt);
# die "Cannot access $genomic_SNP_information" unless (-e $genomic_SNP_information);

# my $base_path_reads = qq(/Net/x9000/projects/gsk_hla/);


my $xMHC_reference = $path_to_PGF_haplotype;
die "Cannot access $path_to_PGF_haplotype" unless (-e $path_to_PGF_haplotype);

# my $tmpPGF = read_PGF();
# die "PGF spanning from 28702185 to ".(28702185+length($tmpPGF)-1)."\n";
# die read_PGF();

my $memory_height = 26;
my $memory_width = 100;


 
my $pgf_start = 28702185;
my $remapping_flank_length = 100;


 die "Cannot access $path_to_HumanGenome_graph" unless (-e $path_to_HumanGenome_graph);
die unless($kMer_size);

my $expected_normal_graph = $graph.'/graph.txt';
die "Normal graph $expected_normal_graph not there!" unless(-e $expected_normal_graph);

my $expected_kmer_graph_file = $expected_normal_graph.".kmers_".$kMer_size;
die "Augmented kMer graph $expected_kmer_graph_file not there!" unless(-e $expected_kmer_graph_file);

die "Normal graph more recent than kMer graph" unless((stat ($expected_normal_graph))[9] < (stat ($expected_kmer_graph_file))[9]);

my $sample_graph_file = $sample_path.'/'.$sample.'_'.$kMer_size.'.ctx';
#my $sample_graph_glob = $sample_path.'/'.$sample.'_'.$kMer_size.'_*.ctx';
#my @potential_sample_graph_files = glob($sample_graph_glob);
#unless($#potential_sample_graph_files == 0)
#{#
#	die "Failed to find out which binary to use.\nGlob: $sample_graph_glob\n".Dumper(@potential_sample_graph_files);
#}
#my $sample_graph_file = $potential_sample_graph_files[0];

if($sample ne 'N')
{
	die "Sample graph $sample_graph_file not there!" unless(-e $sample_graph_file)
}

# Load list of required kMers

my $expected_required_kMers_file = $expected_kmer_graph_file.".requiredKMers";
unless((-e $expected_required_kMers_file) and (!$redo) and ((stat ($expected_kmer_graph_file))[9] < (stat ($expected_required_kMers_file))[9]))
{
	print "Determining required kMers\n";
	my $cmd = qq(../bin/MHC-PRG domode determineRequiredKMers $expected_kmer_graph_file $expected_required_kMers_file);
	my $output = `$cmd`;    
	unless(-e $expected_required_kMers_file)
	{
		die "Could not determine required kMers.\n\nCommand:\n$cmd\n\nOutput:\n\n$output";
	}
	
	open(KMERS, "<", $expected_required_kMers_file) or die "Cannot open $expected_required_kMers_file";
	while(<KMERS>)
	{
		my $line = $_;
		chomp($line);
		last if ($. > 5);
		die "Wrong length of kMers in graph $expected_kmer_graph_file: expect $kMer_size, got ".length($line) unless(length($line) == $kMer_size);
	}
	close(KMERS);
}

# Count occurences of required kMers in the reference genome (cortex-based, expect output file)

my %kMer_reference_count;
my $kMer_count_reference = $expected_required_kMers_file.'.binaryCount';
unless((-e $kMer_count_reference) and (!$redo))
{
	print "Count reference kMers\n";
	#my $cmd = qq($cortex_binary --multicolour_bin $path_to_HumanGenome_graph  --kmer_size $kMer_size --mem_height ${memory_height} --mem_width ${memory_width} --dilthey2 $expected_required_kMers_file);
	my $cmd = qq(./readCortexCoverage.pl --cortex_bin $path_to_HumanGenome_graph  --kMer_size $kMer_size --interesting_kMers $expected_required_kMers_file);
	my $output = `$cmd`;
	unless(-e $kMer_count_reference)
	{
		die "Could not count reference kMers\n\nCommand:\n $cmd \n\nOutput:\n\n$output";
	}
}

open(KMERS, "<", $kMer_count_reference) or die "Cannot open $kMer_count_reference";
while(<KMERS>)
{#
	my $line = $_;
	chomp($line);
	next unless($line);
	next if($line =~ /^Mean/);
	next if($line =~ /^Total/);
	
	my @f = split(/ /, $line);
	unless($#f == 1)
	{
		die "Strange line in $kMer_count_reference -- expect two fields, separated by a space.\n\nL:$line";
	}
	die "length ".length($f[0])." vs expected $kMer_size in file $kMer_count_reference" unless(length($f[0])==$kMer_size);
	$kMer_reference_count{$f[0]} = $f[1];
}
close(KMERS);

# Count occurences of required kMers in the sample genome (cortex-based, expect output file)

my $graphForFileName = $graph;
$graphForFileName =~ s/^.+tmp2//;
$graphForFileName =~ s/\W/_/g;
my $kMer_count_sample_required = qq(../tmp/kMerCount_).join('_', $graphForFileName, $sample, $kMer_size,  'required');
my $kMer_count_sample = $kMer_count_sample_required.'.binaryCount';
if($sample ne 'N')
{
	unless((-e $kMer_count_sample) and (!$redo))
	{
		print "Count sample kMers\n";
		copy($expected_required_kMers_file, $kMer_count_sample_required) or die "Cannot copy $expected_required_kMers_file to $kMer_count_sample_required";	
		#my $cmd = qq($cortex_binary --multicolour_bin $sample_graph_file  --kmer_size $kMer_size --mem_height ${memory_height} --mem_width ${memory_width} --dilthey2 $kMer_count_sample_required);
		my $cmd = qq(./readCortexCoverage.pl --cortex_bin $sample_graph_file  --kMer_size $kMer_size --interesting_kMers $kMer_count_sample_required);
		
		my $output = `$cmd`;
		unless(-e $kMer_count_sample)
		{
			die "Could not count sample kMers\n\nCommand:\n$cmd\n\nOutput:\n\n$output";
		}
	}
}
else
{
	my @kMers;
	open(KMERSREQ, "<", $expected_required_kMers_file) or die "Cannot open $expected_required_kMers_file";
	while(<KMERSREQ>)
	{
		my $line = $_;
		chomp($line);
		push(@kMers, $line);
	}
	close(KMERSREQ);
		
	open(KMERS, '>', $kMer_count_sample) or die "Cannot open $kMer_count_sample";
	foreach my $kMer(@kMers)
	{
		print KMERS $kMer, " ", 1, "\n";
	}
	print KMERS 'MeanReadLen', ' ', 100, "\n";
	print KMERS 'TotalKMerCoverage', ' ', 3000000000*30 * (55/100), "\n";
	print KMERS 'TotalSeq', ' ', 3000000000*30, "\n";	
	
	close(KMERS);
}

# Correct the reference genome-based kMer count. How many of these kMers are  in the area that is covered by the haplotype
# graph? This number needs to be subtracted.

my $output_corrected_kMers_reference = $expected_kmer_graph_file.".kMersReferenceCorrected";

if((not -e $output_corrected_kMers_reference) or ($redo))
{
	# determine SNPs in graph file
	my %loci_in_graph;
	open(GRAPH, "<", $expected_normal_graph) or die "Cannot open $expected_normal_graph\n";
	my $firstLine = <GRAPH>;
	chomp($firstLine);
	unless($firstLine eq 'CODE:')
	{
		die "Expect first line to be CODE: - graph format changed, file $expected_normal_graph?";
	}
	while(<GRAPH>)
	{
		my $line = $_;
		chomp($line);
		if($line =~ /\:$/)
		{
			last;
		}
		my @fields = split(/\|\|\|/, $line);
		die unless($#fields == 2);
		my $locusID = $fields[0];
		$loci_in_graph{$locusID} = 1;
	}
	close(GRAPH);
	
	my @positions_in_graph = grep {$_ =~ /^S\d_(\d+)_(\d+)$/} keys %loci_in_graph;
	@positions_in_graph = map {$_ =~ /^S\d_(\d+)_(\d+)$/; $2} @positions_in_graph;
	
	unless(scalar(@positions_in_graph) > 1)
	{
		die "Have less than two loci with integrated coordinates - chr6:1234... - in the graph. Cannot determine graph boundaries";
	}
	
	# find the positions of the first and the last SNP in the graph on reference coordinates
	
	@positions_in_graph = sort {$a <=> $b} @positions_in_graph;
	
	my $first_position_graph_inPGF = $positions_in_graph[0];
	my $last_position_graph_inPGF = $positions_in_graph[$#positions_in_graph];
	
	die unless($first_position_graph_inPGF < $last_position_graph_inPGF);
	
	my %subtract_kMers;
	print "Remove kMers for positions $first_position_graph_inPGF - $last_position_graph_inPGF (in PGF)\n";
	
	# read the pgf (reference) haplotype, extract the stretch spanned by first and last SNP in graph,
	# count kMers, subtract from reference genome kMer count
	
	my $pgf_haplotype = read_PGF();
	(length($pgf_haplotype) > $first_position_graph_inPGF+$kMer_size-1) or die Dumper("PGF coordinate problem!", length($pgf_haplotype), $first_position_graph_inPGF, $kMer_size)."\n\n";
	
	my $kMer = substr($pgf_haplotype, $first_position_graph_inPGF, $kMer_size-1);
	(length($kMer) == ($kMer_size-1)) or die "Problem with extracting position $first_position_graph_inPGF + $kMer_size symbols, first position in graph $first_position_graph_inPGF , last position in graph $last_position_graph_inPGF , total PGF length ".length($pgf_haplotype);
	
	for(my $end_KMer = $first_position_graph_inPGF+$kMer_size-1; $end_KMer <= $last_position_graph_inPGF; $end_KMer++)
	{
		my $char_PGF = substr($pgf_haplotype, $end_KMer, 1);
		(length($char_PGF) == 1) or die;
		$kMer = $kMer.$char_PGF;
		(length($kMer) == $kMer_size) or die;
		$subtract_kMers{$kMer}++;
		substr($kMer, 0, 1) = '';       
	}
	
	# die Dumper(\%subtract_kMers);
	
	open(OUTPUT, '>', $output_corrected_kMers_reference) or die "Cannot open $output_corrected_kMers_reference";
	my $have_subtracted = 0;
	foreach my $kMer (sort keys %kMer_reference_count)
	{
		my $original_count = $kMer_reference_count{$kMer};
		my $subtract_count = (exists $subtract_kMers{$kMer}) ? $subtract_kMers{$kMer} : 0;
		my $new_count = $original_count - $subtract_count;
		if($subtract_count > 0)
		{
			$have_subtracted = 1;
		}
		die unless($new_count >= 0);
		print OUTPUT join(' ', $kMer, $new_count), "\n";
	}
	close(OUTPUT);
	
	unless($have_subtracted)
	{
		unlink $output_corrected_kMers_reference;		
		die "No PGF kMer was subtracted to correct the reference counts. Bad. Fix, and --redo!\n";
	}
}

my $label_part = ($labelOnly) ? ('--labelonly') : '';
my $final_command = qq(../bin/MHC-PRG domode nextGenInference $expected_kmer_graph_file $output_corrected_kMers_reference $kMer_count_sample $label_part --genotypingMode 8);
print "Execute:\n", $final_command, "\n\n";

if($collect eq '3')
{
	my $needGZ = 0;
	my $expected_output_haplotypes = $kMer_count_sample.'.viterbiHaplotypes';
	
	# get graph information
	
	my @loci_in_graph;
	my $graph_segments_file = $graph.'/segments.txt';
	die "File not there: $graph_segments_file" unless (-e $graph_segments_file);
	open(SEGMENTS, '<', $graph_segments_file) or die;
	while(<SEGMENTS>)
	{
		my $line = $_;
		chomp($line);
		$line =~ s/\n//g;
		$line =~ s/\r//g;	
		next unless($line);
		
		my $segment_file = $graph.'/'.$line;

		open(SEGMENT, '<', $segment_file) or die "Cannot open $segment_file";
		my $firstLine = <SEGMENT>;
		chomp($firstLine);
		$firstLine =~ s/\n//g;
		$firstLine =~ s/\r//g;			
		my @line_fields = split(/ /, $firstLine);
		shift(@line_fields); # kick out individual ID
		push(@loci_in_graph, @line_fields);
		close(SEGMENT);
	}
	close(SEGMENTS);
	
	my %locus_position;
	my $positions_file = $graph.'/positions.txt';
	die "File not there: $positions_file" unless (-e $positions_file);
	open(POSITIONS, '<', $positions_file) or die;	
	while(<POSITIONS>)
	{
		my $line = $_;
		chomp($line);
		$line =~ s/\n//g;
		$line =~ s/\r//g;		
		my @fields = split(/ /, $line);
		my $field_id = $fields[0];
		my $field_position = $fields[1];
		$locus_position{$field_id} = $field_position;
	}
	close(POSITIONS);
	
	for(my $lI = 1; $lI <= $#loci_in_graph; $lI++)
	{
		if(not defined $locus_position{$loci_in_graph[$lI]})
		{
			die Dumper("No position information for locus $loci_in_graph[$lI]", $lI);
		}
		if(not defined $locus_position{$loci_in_graph[$lI]})
		{
			die Dumper("No position information for locus $loci_in_graph[$lI-1]", $lI-1);
		}		
		die unless($locus_position{$loci_in_graph[$lI]} > $locus_position{$loci_in_graph[$lI-1]});
	}
	
	# read xMHC reference
	
	open(REF, "<", $xMHC_reference) or die "Cannot open $xMHC_reference";
	my $pgf_ref_seq;
	while(<REF>)
	{
		my $line = $_; 
		chomp($line);
		$line =~ s/[\x0A\x0D]+//g;	
		next if ($line =~ /^\>/);
		$pgf_ref_seq .= $line;
	}
	close(REF);
	my $pgf_stop = $pgf_start+length($pgf_ref_seq);
#die $pgf_stop;
	
	print "Reading graph information completed\n";
	
	# get the first two haplotypes
	
	open(HAPLOTYPES, '<', $expected_output_haplotypes) or die "Cannot open $expected_output_haplotypes";
	my $haplotype_1, my $haplotype_2;
	my @haplotype_1_alignment_fields;
	my @haplotype_2_alignment_fields;
	while(<HAPLOTYPES>)
	{
		my $line = $_;
		chomp($line);
		last if ($. > 2);
		my @fields = split(/ /, $line);
		my $haplotype = join('', @fields[2 .. ($#fields-2)]);
		if($. == 1)
		{
			$haplotype_1 = $haplotype;
			@haplotype_1_alignment_fields = @fields[2 .. ($#fields-2)];
		}
		elsif($. == 2)
		{
			$haplotype_2 = $haplotype;	
			@haplotype_2_alignment_fields = @fields[2 .. ($#fields-2)];
		}
	}
	close(HAPLOTYPES);
	
	$| = 1;
	
	die unless($haplotype_1 and $haplotype_2);
	die unless($#haplotype_1_alignment_fields == $#loci_in_graph);
	die unless($#haplotype_2_alignment_fields == $#loci_in_graph);
		
	print "Reading two inferred haplotypes completed\n";
	
	my @haplotype_1_alignment_index;
	my @haplotype_2_alignment_index;
	for(my $i = 0; $i < length($haplotype_1); $i++)
	{
		my $character = substr($haplotype_1, $i, 1);
		if($character ne '_')
		{
			push(@haplotype_1_alignment_index, $i);
		}
	}
	for(my $i = 0; $i < length($haplotype_2); $i++)
	{
		my $character = substr($haplotype_2, $i, 1);
		if($character ne '_')
		{
			push(@haplotype_2_alignment_index, $i);
		}
	}	
	
	# print join(', ', @haplotype_1_alignment_index[0 .. 30]), "\n";
	# print join(', ', @haplotype_2_alignment_index[0 .. 30]), "\n";	
	# exit;

	$haplotype_1 =~ s/\*/N/g;
	$haplotype_1 =~ s/_//g;
	unless($haplotype_1 =~ /^[ACGTNacgtn]+$/)
	{
		my $nonOK = $haplotype_1;
		$nonOK =~ s/[ACGTNacgtn]//g;
		die Dumper('h1', length($nonOK), $nonOK);
	}
	
	$haplotype_2 =~ s/\*/N/g;
	$haplotype_2 =~ s/_//g;
	unless($haplotype_2 =~ /^[ACGTNacgtn]+$/)
	{
		my $nonOK = $haplotype_2;
		$nonOK =~ s/[ACGTNacgtn]//g;
		die Dumper('h2', length($nonOK), $nonOK);
	}
	
	
	die Dumper(length($haplotype_1), scalar(@haplotype_1_alignment_index)) unless(length($haplotype_1) == scalar(@haplotype_1_alignment_index));
	die Dumper(length($haplotype_2), scalar(@haplotype_2_alignment_index)) unless(length($haplotype_2) == scalar(@haplotype_2_alignment_index));
	

	# mapping directory
	my $remapping_dir = $remapping_basedir.'/kMerCount_'.join('_', $graphForFileName, $sample, $kMer_size,  'remapping');
	my $xMHC_VCF_1_readDecision = $remapping_dir.'/mapped_xMHC_1.vcf';
	my $xMHC_VCF_2_readDecision = $remapping_dir.'/mapped_xMHC_2.vcf';
	unless((-e $xMHC_VCF_1_readDecision) and (-e $xMHC_VCF_2_readDecision))
	{
		my $remapping_dir_rawGenome_1 = $remapping_dir."/genomeRef_1";
		my $remapping_dir_rawGenome_2 = $remapping_dir."/genomeRef_2";
		unless(-e $remapping_dir)
		{
			mkdir($remapping_dir) or die "Cannot create $remapping_dir"; 	
		}
			
		
		unless(-e $remapping_dir_rawGenome_1)
		{
			mkdir($remapping_dir_rawGenome_1) or die "Cannot create $remapping_dir_rawGenome_1";
			mkdir($remapping_dir_rawGenome_2) or die "Cannot create $remapping_dir_rawGenome_2";
			
			# copy reference genome
			my @reference_FASTAs = glob($reference_genome_FASTA_path.'/*.fa');
			die "Cannot find reference FASTA files - did you specify the right directory? \[ $reference_genome_FASTA_path \]" unless(@reference_FASTAs);
			foreach my $fF (@reference_FASTAs)
			{
				my $basename = fileparse($fF);
				my $newfilename_1 = $remapping_dir_rawGenome_1.'/'.$basename;
				my $newfilename_2 = $remapping_dir_rawGenome_2.'/'.$basename;			
				print "Copying $fF ... \n";
				copy($fF, $newfilename_1) or die "Cannot copy file $fF to $newfilename_1";
				copy($fF, $newfilename_2) or die "Cannot copy file $fF to $newfilename_2";			
			}
		}
		

		my $remapping_dir_reads = $remapping_dir."/reads/";
		unless(-e $remapping_dir_reads )
		{
			mkdir($remapping_dir_reads) or die;
		}
		my $BAM_file;

		if($localOxford and ($sample =~ /Z[12]/))
		{
			$remapping_dir_reads = '/Net/birch/data/oa/NA12878_ILLUMINA_PLATINUM/';
			$needGZ = 1;
			unless($mapping_display_only_instructions)
			{
				die "We want to deal with the reads of NA12878, but we can do so only in external mapping!\n";
			}
		}	
		else
		{
			$BAM_file = find_individual_BAM($sample);
			my $extraction_dir_OK_flag = $remapping_dir_reads.'/'.'extraction_successful';

			unless(-e $extraction_dir_OK_flag)
			{	
				unless(-e $BAM_file)
				{
					die "BAM file $BAM_file not there, but required!";
				}
				
				my $cmd = qq(java -Xmx2g -jar ${PICARD_SAM2FASTQ} INPUT=${BAM_file} FASTQ=${remapping_dir_reads}/file_1.fastq SECOND_END_FASTQ=${remapping_dir_reads}/file_2.fastq VALIDATION_STRINGENCY=LENIENT);
				my $ret = system($cmd);
				unless($ret == 0)
				{
					die "PICARD extraction command $cmd failed";	
				}
				
				open(OK, '>', $extraction_dir_OK_flag) or die "Cannot open $extraction_dir_OK_flag";
				print OK 1;
				close(OK);
				
				print "Reads extracted\n";		
			}
		}
		
		# my $extraction_dir_OK_flag = $remapping_dir_reads.'/'.'unzipping_successful';

		# unless(-e $extraction_dir_OK_flag)
		# {
			# Get and extract reads
			
			# my $directory = &find_individual_readsPath($sample);
			# die "Directory not existing: $directory\n" unless (-e $directory);
			# unless(-e $remapping_dir_reads)
			# {
				# mkdir($remapping_dir_reads) or die "Cannot mkdir $remapping_dir_reads";
			# }
			
			# my @zipped_original_files = glob($directory.'/*.gz');
			# my @zipped_temp_files;
			# foreach my $file (@zipped_original_files)
			# {
				# my ($filename,$filepath) = fileparse($file);
				# my $zip_new_temp_name = $remapping_dir_reads.'/'.$filename;
				# my $copy_try = 1;
				# while($copy_try != 0)
				# {
					# my $success = copy($file, $zip_new_temp_name);
					# if($success)
					# {
						# $copy_try = 0;
						# next;
					# }
					# $copy_try++;
					# if($copy_try > 5)
					# {
						# die "Fifth attempt to copy $file to $zip_new_temp_name has failed!\n";
					# }
				# }
				# my $unzip_command = qq(gunzip $zip_new_temp_name);
				# print "Executing $unzip_command\n";
				# system $unzip_command;
				# push(@zipped_temp_files, $zip_new_temp_name);
			# }
			
			# open(OK, '>', $extraction_dir_OK_flag) or die "Cannot open $extraction_dir_OK_flag";
			# print OK 1;
			# close(OK);
			
			# print "Reads extracted\n";
		# }

		my @chromosomal_files_1 = (glob($remapping_dir_rawGenome_1.'/*.fa'));
		my @chromosomal_files_2 = (glob($remapping_dir_rawGenome_2.'/*.fa'));	
		my $chromosome_6_file_1;
		my $chromosome_6_file_2;	
		foreach my $f (@chromosomal_files_1)
		{
			open(CHROMO, '<', $f) or die "Cannot open $f";
			my $firstLine = <CHROMO>;
			close(CHROMO);
			if(($firstLine =~ /^>chr6/) or ($firstLine =~ />6/))
			{
				$chromosome_6_file_1 = $f;
			}
		}
		foreach my $f (@chromosomal_files_2)
		{
			open(CHROMO, '<', $f) or die "Cannot open $f";
			my $firstLine = <CHROMO>;
			close(CHROMO);
			if(($firstLine =~ /^>chr6/) or ($firstLine =~ />6/))
			{
				$chromosome_6_file_2 = $f;
			}
		}	
		
	
		die Dumper("Cannot find file for chromosome 6 [1]", \@chromosomal_files_1)   unless ($chromosome_6_file_1);
		die Dumper("Cannot find file for chromosome 6 [2]", \@chromosomal_files_2) unless ($chromosome_6_file_2);	
		
		foreach my $files ([$chromosome_6_file_1, $remapping_dir_rawGenome_1, $haplotype_1], [$chromosome_6_file_2, $remapping_dir_rawGenome_2, $haplotype_2])
		{
			my $chromosome_6_file = $files->[0];
			my $remapping_dir_rawGenome = $files->[1];
			my $haplotype = $files->[2];
					
			my $chromo_6_header;
			my $chromo_6_content;
					
			my $chromo6;
			open($chromo6, '<', $chromosome_6_file) or die "Cannot open $chromosome_6_file";
						
			$chromo_6_header = <$chromo6>;
			chomp($chromo_6_header);	
					
			$chromo_6_header =~ s/\n//g;
			$chromo_6_header =~ s/\r//g;
					
			if($chromo_6_header !~ /xMHC_excised/)
			{		
				while(<$chromo6>)
				{
					my $line = $_;
					chomp($line);	
					die if($line =~ />/);
					$line =~ s/\n//g;
					$line =~ s/\r//g;
					$chromo_6_content .= $line;
				}
				
				# now get positions to blind
				
				my %loci_in_graph;
				open(GRAPH, "<", $expected_normal_graph) or die "Cannot open $expected_normal_graph\n";
				my $firstLine = <GRAPH>;
				chomp($firstLine);
				unless($firstLine eq 'CODE:')
				{
					die "Expect first line to be CODE: - graph format changed, file $expected_normal_graph?";
				}
				while(<GRAPH>)
				{
					my $line = $_;
					chomp($line);
					if($line =~ /\:$/)
					{
						last;
					}
					my @fields = split(/\|\|\|/, $line);
					die unless($#fields == 2);
					my $locusID = $fields[0];
					$loci_in_graph{$locusID} = 1;
				}
				close(GRAPH);
				
				my @positions_in_graph = grep {$_ =~ /^S\d_(\d+)_(\d+)$/} keys %loci_in_graph;
				@positions_in_graph = map {$_ =~ /^S\d_(\d+)_(\d+)$/; $2} @positions_in_graph;
				
				unless(scalar(@positions_in_graph) > 1)
				{
					die "Have less than two loci with integrated coordinates - chr6:1234... - in the graph. Cannot determine graph boundaries";
				}
				
				# find the positions of the first and the last SNP in the graph on reference coordinates
				
				@positions_in_graph = sort {$a <=> $b} @positions_in_graph;
				
				my $first_position_graph_inPGF = $positions_in_graph[0];
				my $last_position_graph_inPGF = $positions_in_graph[$#positions_in_graph];
				
				die unless($first_position_graph_inPGF < $last_position_graph_inPGF);
							
				my $pgf_sequence = read_PGF();
				my $pgf_sequence_coveredByGraph = substr($pgf_sequence, $first_position_graph_inPGF, $last_position_graph_inPGF - $first_position_graph_inPGF + 1);
				my $expected_sequence_in_genome = substr($chromo_6_content, $pgf_start + $first_position_graph_inPGF - 1, $last_position_graph_inPGF - $first_position_graph_inPGF + 1);
				
				# die "PGF does not match up with reference, this is not good!" unless ($pgf_sequence_coveredByGraph eq $expected_sequence_in_genome);
				
				my $lengthN = 'N' x length($pgf_sequence_coveredByGraph);
				die unless(length($lengthN) == length($expected_sequence_in_genome));

				my $genomic_graph_start = $pgf_start + $first_position_graph_inPGF;
				my $genomic_graph_lastPosition = $pgf_start + $first_position_graph_inPGF - 1 + length($lengthN) - 1;
				my $graph_before_flank = substr($chromo_6_content, $genomic_graph_start - 1 - $remapping_flank_length, $remapping_flank_length);
				my $graph_after_flank = substr($chromo_6_content, $genomic_graph_lastPosition + 1, $remapping_flank_length);		
				my $combinedBeforeAfter = $graph_before_flank . $expected_sequence_in_genome . $graph_after_flank;
				my $genomeSurrounding = substr($chromo_6_content, $pgf_start + $first_position_graph_inPGF - 1 - 1000, $last_position_graph_inPGF - $first_position_graph_inPGF + 1 + 2000);
				
				die if(index($genomeSurrounding, $combinedBeforeAfter) == -1);

				substr($chromo_6_content, $pgf_start + $first_position_graph_inPGF - 1, $last_position_graph_inPGF - $first_position_graph_inPGF + 1) = $lengthN;
				
				close($chromo6);
				
				open($chromo6, '>', $chromosome_6_file) or die "Cannot open $chromosome_6_file";
				print { $chromo6 } $chromo_6_header.' xMHC_excised'."\n";
				
				while($chromo_6_content ne "")
				{
					my $consumeLength = 60;
					if($consumeLength > length($chromo_6_content))
					{
						$consumeLength = length($chromo_6_content);
					}
					print { $chromo6 } substr($chromo_6_content, 0, $consumeLength), "\n";
					substr($chromo_6_content, 0, $consumeLength) = '';
				}
				
				my $file_haplo1 = $remapping_dir_rawGenome.'/xMHC_haplo1.fa';
					
				open(CHR1, '>', $file_haplo1) or die "Cannot open $file_haplo1";
							
				print CHR1 ">xMHCpseudoChromHaplotype chr6:${genomic_graph_start}:${genomic_graph_lastPosition}", "\n";
				
				my $written_length = 0;
				
				my $haplotype_to_print = $graph_before_flank.$haplotype.$graph_after_flank;
				
				while($haplotype_to_print ne "")
				{
					my $consumeLength = 60;
					if($consumeLength > length($haplotype_to_print))
					{
						$consumeLength = length($haplotype_to_print);
					}
					my $toWrite = substr($haplotype_to_print, 0, $consumeLength);
					$written_length += length($toWrite);
					print CHR1 $toWrite, "\n";
					substr($haplotype_to_print, 0, $consumeLength) = '';
				}
					
				close(CHR1);
				
			}
			close($chromo6);	
		}
		

		# pseudo genome references
		
		my $xMHC_IGV_pseudoGenome_1 = $remapping_dir.'/IGV_pseudoGenome_1.fa';
		my $xMHC_IGV_pseudoGenome_2 = $remapping_dir.'/IGV_pseudoGenome_2.fa';

		copy($remapping_dir_rawGenome_1.'/xMHC_haplo1.fa', $remapping_dir.'/IGV_pseudoGenome_1.fa');
		copy($remapping_dir_rawGenome_2.'/xMHC_haplo1.fa', $remapping_dir.'/IGV_pseudoGenome_2.fa');
		
		unless(-e $samtools_bin)
		{
			die "Samtools binary $samtools_bin not found";
		}
		
		my $cmd = qq(${samtools_bin} faidx ${xMHC_IGV_pseudoGenome_1});
		print $cmd, "\n", `$cmd`, "\n";	
		$cmd = qq(${samtools_bin} faidx ${xMHC_IGV_pseudoGenome_2});
		print $cmd, "\n", `$cmd`, "\n";	
		
		print "Chromosome 6 and pseudo-chromosomes prepared!\n";
		
		my $combined_genome_file_f1 = $remapping_dir.'/combinedGenome_1.fasta';
		my $combined_genome_file_f2 = $remapping_dir.'/combinedGenome_2.fasta';;

		my $igv1_genome;
		open(IGV1, '<', $remapping_dir.'/IGV_pseudoGenome_1.fa') or die;
		while(<IGV1>)
		{
			my $line = $_;
			chomp($line);
			next unless($line);
			next if(substr($line, 0, 1) eq '>');
			$igv1_genome .= $line;
		}
		close(IGV1);
		$igv1_genome = substr($igv1_genome, $remapping_flank_length);
		$igv1_genome = substr($igv1_genome, 0, length($igv1_genome) - $remapping_flank_length);

		my $igv2_genome;
		open(IGV2, '<', $remapping_dir.'/IGV_pseudoGenome_2.fa') or die;
		while(<IGV2>)
		{
			my $line = $_;
			chomp($line);
			next unless($line);
			next if(substr($line, 0, 1) eq '>');
			$igv2_genome .= $line;
		}
		close(IGV2);
		$igv2_genome = substr($igv2_genome, $remapping_flank_length);
		$igv2_genome = substr($igv2_genome, 0, length($igv2_genome) - $remapping_flank_length);
		
		my $expected_igv1_genome = join('', @haplotype_1_alignment_fields);
		$expected_igv1_genome =~ s/\*/N/g;
		$expected_igv1_genome =~ s/_//g;

		my $expected_igv2_genome = join('', @haplotype_2_alignment_fields);
		$expected_igv2_genome =~ s/\*/N/g;   
		$expected_igv2_genome =~ s/_//g;

		die unless($expected_igv1_genome eq $igv1_genome);
		die unless($expected_igv2_genome eq $igv2_genome);

		# print Dumper("Position 4634404", substr($expected_igv2_genome, 4634404, 1), substr($expected_igv2_genome, 4634404-15, 31)), "\n";

		if($mapping_display_only_instructions)
		{
			my $fa_idx_2 = $combined_genome_file_f2.'.fai';   
			unless(-e $fa_idx_2)
			{
				my @chromosomal_files_1 = (glob($remapping_dir_rawGenome_1.'/*.fa'));
				my @chromosomal_files_2 = (glob($remapping_dir_rawGenome_2.'/*.fa'));	
					
				open(COMBINED1, '>', $combined_genome_file_f1) or die "Cannot open $combined_genome_file_f1";
				open(COMBINED2, '>', $combined_genome_file_f2) or die "Cannot open $combined_genome_file_f2";	
				foreach my $f (@chromosomal_files_1)
				{
					# next unless($f =~ /xMHC/); # remove later
					
					open(F, '<', $f) or die "Cannot open $f";
					while(<F>)
					{
						print COMBINED1 $_;
					}
					close(F);
				} 
			
				foreach my $f (@chromosomal_files_2)
				{
					# next unless($f =~ /xMHC/); # remove later
				
					open(F, '<', $f) or die "Cannot open $f";
					while(<F>)
					{
						print COMBINED2 $_;
					}
					close(F);
				}	
				close(COMBINED1);
				close(COMBINED2);	
				
				my $cmd = qq(${samtools_bin} faidx ${combined_genome_file_f1});
				print $cmd, "\n", `$cmd`, "\n";	
				$cmd = qq(${samtools_bin} faidx ${combined_genome_file_f2});
				print $cmd, "\n", `$cmd`, "\n";	
			}
			die "No fasta index file $fa_idx_2 ??" unless(-e $fa_idx_2);
		}
		else
		{
			unless($mapping_display_only_instructions or ((-e $combined_genome_file_f1.'.sa') and (-e $combined_genome_file_f2.'.sa')))
			{	
			
				my @chromosomal_files_1 = (glob($remapping_dir_rawGenome_1.'/*.fa'));
				my @chromosomal_files_2 = (glob($remapping_dir_rawGenome_2.'/*.fa'));	
					
				open(COMBINED1, '>', $combined_genome_file_f1) or die "Cannot open $combined_genome_file_f1";
				open(COMBINED2, '>', $combined_genome_file_f2) or die "Cannot open $combined_genome_file_f2";	
				foreach my $f (@chromosomal_files_1)
				{
					# next unless($f =~ /xMHC/); # remove later
					
					open(F, '<', $f) or die "Cannot open $f";
					while(<F>)
					{
						print COMBINED1 $_;
					}
					close(F);
				} 
			
				foreach my $f (@chromosomal_files_2)
				{
					# next unless($f =~ /xMHC/); # remove later
				
					open(F, '<', $f) or die "Cannot open $f";
					while(<F>)
					{
						print COMBINED2 $_;
					}
					close(F);
				}	
				close(COMBINED1);
				close(COMBINED2);	
				
				my $cmd = qq(${samtools_bin} faidx ${combined_genome_file_f1});
				print $cmd, "\n", `$cmd`, "\n";	
				$cmd = qq(${samtools_bin} faidx ${combined_genome_file_f2});
				print $cmd, "\n", `$cmd`, "\n";	
				
				unless(-e $BWA_bin)
				{
					die "BWA binary $BWA_bin not found -- and you should check whetehr you don't want to call me with --mapping_display_only_instructions";
				}
				$cmd = qq(${BWA_bin} index -a bwtsw $combined_genome_file_f1);
				print $cmd, "\n\n";
				print `$cmd`, "\n\n";

				$cmd = qq(${BWA_bin} index -a bwtsw $combined_genome_file_f2);
				print $cmd, "\n\n";
				print `$cmd`, "\n\n";
				
				print "Indices for BWA built\n";
			}
		}
		
		my $temp_alignment_1 = $remapping_dir.'/temp_alignment_1';
		my $temp_alignment_2 = $remapping_dir.'/temp_alignment_2';
		unless(-e $temp_alignment_1)
		{
			mkdir($temp_alignment_1) or die;
		}
		unless(-e $temp_alignment_2)
		{
			mkdir($temp_alignment_2) or die;
		}
		
		my $merged_BAM_1 = $remapping_dir.'/merged_1.bam';
		my $merged_BAM_2 = $remapping_dir.'/merged_2.bam';
		
		if(not $mapping_display_only_instructions)
		{
			unless(-e $BWA_bin)
			{
				die "BWA binary $BWA_bin not found -- and you should check whetehr you don't want to call me with --mapping_display_only_instructions";
			}
		}
		if($mapping_display_only_instructions)
		{
			my $sorted_bam_1 = "${merged_BAM_1}.sorted.bam";
			my $sorted_bam_2 = "${merged_BAM_2}.sorted.bam";
			if((-e $sorted_bam_1) and (-e $sorted_bam_2))
			{
				print "Found $sorted_bam_1 and $sorted_bam_2 , continue!\n";
			}
			else
			{
				unless($localOxford)
				{ 
					die "You activated the switch \$mapping_display_only_instructions, but the corresponding code makes only sense in our local Oxford environment - track down this error message in nextGenInferenceVariGraph.pl, and modify the code following that line according to your environment.";
				}	
				print "\n\nYou now need to make sure $sorted_bam_1 and $sorted_bam_2 are there, by mapping the following reads separately to the references specified:\n\n";
				print "READS: $remapping_dir_reads \n\n";
				print "REFERENCES: $remapping_dir_rawGenome_1 and $remapping_dir_rawGenome_2\n\n";
				print "PAIRED-END: No\n\n";				
				print "Sorting and indexing: yes\n\n";							
				if($localOxford)
				{
					print "Command suggestion:\ncd /gpfs1/well/gsk_hla/readLengthSimulator/src; ./alignGenome.pl --action prepare --reference_genome_FASTA_path $remapping_dir_rawGenome_1 --paired_end 0 --input_directory $remapping_dir_reads --name_for_project REMAPPING_${sample}_1 --gz $needGZ; ./alignGenome.pl --action prepare --reference_genome_FASTA_path $remapping_dir_rawGenome_2 --paired_end 0 --input_directory $remapping_dir_reads --name_for_project REMAPPING_${sample}_2 --gz $needGZ\n\n\n";
				}
				print "\n\nFinally, copy BAMs (sorted and indexed) to\n${sorted_bam_1}\n${sorted_bam_2}\n... also copy index files!\n\n";
				exit;
			}
		} 
		else
		{
			unless((-e $merged_BAM_1) and (-e $merged_BAM_2))
			{
			
				die if($needGZ);
				
				my @fqfiles = glob($remapping_dir_reads.'/*.fastq');
					
				my %pe_pairs = map {$_ => 1} map {($_ =~ /.+\/(.+)_(1|2).+?$/) or die "Cannot parse filename $_"; $1} @fqfiles;
				my %pe_counts;
				my @pe_lists = ([], []);
				my @se_lists = ();

				foreach my $pe_pair (keys %pe_pairs)
				{
					foreach my $d (1, 2)
					{			
						my $f = $remapping_dir_reads.'/'.$pe_pair.'_'.$d.'.fastq';
						next unless (-e $f);
						push(@{$pe_counts{$pe_pair}}, $f);  
					}
				}
					
				my $found_pe = 0;
				foreach my $present_two_key (grep {scalar(@{$pe_counts{$_}}) == 2} keys %pe_counts)
				{
					$found_pe++;
					push(@{$pe_lists[0]}, $pe_counts{$present_two_key}[0]);
					push(@{$pe_lists[1]}, $pe_counts{$present_two_key}[1]);
				}
				
				foreach my $present_one_key (grep {scalar(@{$pe_counts{$_}}) == 1} keys %pe_counts)
				{
					warn "There is a file without mated reads: $present_one_key -- not good, but ignore and resume.\n";
					push(@se_lists, $pe_counts{$present_one_key}[0]);
				}
				
				my @BAMs_toMerge_1;
				my @BAMs_toMerge_2;
				
				die "No files to merge?" unless (scalar(@se_lists) or scalar(@{$pe_lists[0]}));
				
				for(my $i = 0; $i < scalar(@se_lists); $i++)
				{
					print "Align reads chunk $i (no pairs) using BWA [1]\n";
					
					my $reads_file = $se_lists[$i];
					my $sai_file_1 = $temp_alignment_1.'/'.fileparse($reads_file).'.sai';
					my $sai_file_2 = $temp_alignment_2.'/'.fileparse($reads_file).'.sai';
					my $bam_file_1 = $sai_file_1.'.bam';
					my $bam_file_2 = $sai_file_2.'.bam';
					
					unless(-e $BWA_bin)
					{
						die "BWA binary $BWA_bin not found -- and you should check whetehr you don't want to call me with --mapping_display_only_instructions";					
					}
					my $cmd = qq(${BWA_bin} aln -q10 -t42 ${combined_genome_file_f1} ${reads_file} > ${sai_file_1});
					print $cmd, "\n", `$cmd`, "\n";
					$cmd = qq(${BWA_bin} samse ${combined_genome_file_f1} ${sai_file_1} ${reads_file} | ${samtools_bin} view -Sb - > ${bam_file_1});
					print $cmd, "\n", `$cmd`, "\n";
					
					$cmd = qq(${BWA_bin} aln -q10 -t42 ${combined_genome_file_f2} ${reads_file} > ${sai_file_2});
					print $cmd, "\n", `$cmd`, "\n";
					$cmd = qq(${BWA_bin} samse ${combined_genome_file_f2} ${sai_file_2} ${reads_file} | ${samtools_bin} view -Sb - > ${bam_file_2});
					print $cmd, "\n", `$cmd`, "\n";
					
					# exit;
					
					push(@BAMs_toMerge_1, $bam_file_1);
					push(@BAMs_toMerge_2, $bam_file_2);
					
					# last; # remove later
				}
				
				for(my $i = 0; $i < scalar(@{$pe_lists[0]}); $i++)
				{
					# last; # remove later
					
					print "Align reads chunk $i (paired reads) using BWA [1]\n";
					
					foreach my $reads_file ($pe_lists[0][$i], $pe_lists[1][$i])
					{
						my $sai_file_1 = $temp_alignment_1.'/'.fileparse($reads_file);
						my $sai_file_2 = $temp_alignment_2.'/'.fileparse($reads_file);
						my $bam_file_1 = $sai_file_1.'.bam';
						my $bam_file_2 = $sai_file_2.'.bam';
						
						my $cmd = qq(${BWA_bin} aln -q10 -t42 ${combined_genome_file_f1} ${reads_file} > ${sai_file_1});
						print $cmd, "\n", `$cmd`, "\n";
						$cmd = qq(${BWA_bin} samse ${combined_genome_file_f1} ${sai_file_1} ${reads_file} | ${samtools_bin} view -Sb - > ${bam_file_1});
						print $cmd, "\n", `$cmd`, "\n";
						
						$cmd = qq(${BWA_bin} aln -q10 -t42 ${combined_genome_file_f2} ${reads_file} > ${sai_file_2});
						print $cmd, "\n", `$cmd`, "\n";
						$cmd = qq(${BWA_bin} samse ${combined_genome_file_f2} ${sai_file_2} ${reads_file} | ${samtools_bin} view -Sb - > ${bam_file_2});
						print $cmd, "\n", `$cmd`, "\n";
						
						push(@BAMs_toMerge_1, $bam_file_1);
						push(@BAMs_toMerge_2, $bam_file_2);			
						
						# exit;
						
					}  
				}

				print "Merging BAMs... [1]\n";
				if(scalar(@BAMs_toMerge_1) > 1)
				{
					my $cmd = $samtools_bin.' merge '.$merged_BAM_1.' '.join(' ', @BAMs_toMerge_1);
					print $cmd, "\n", `$cmd`, "\n";
					die unless(-e $merged_BAM_1);
				}
				else
				{
					copy($BAMs_toMerge_1[0], $merged_BAM_1);
				}
				
				print "Merging BAMs... [2]\n";	
				if(scalar(@BAMs_toMerge_2) > 1)
				{		
					my $cmd = $samtools_bin.' merge '.$merged_BAM_2.' '.join(' ', @BAMs_toMerge_2);
					print $cmd, "\n", `$cmd`, "\n";
					die unless(-e $merged_BAM_2);
				}
				else
				{
					copy($BAMs_toMerge_2[0], $merged_BAM_2);		
				}
				
				print "Sorting BAMs... \n";	
				my $cmd = $samtools_bin." sort ${merged_BAM_1} ${merged_BAM_1}.sorted";
				print $cmd, "\n", `$cmd`, "\n";
				$cmd = $samtools_bin." sort ${merged_BAM_2} ${merged_BAM_2}.sorted";
				print $cmd, "\n", `$cmd`, "\n";		

				print "Indexing BAMs... \n";	
				$cmd = $samtools_bin." index ${merged_BAM_1}.sorted.bam";
				print $cmd, "\n", `$cmd`, "\n";
				$cmd = $samtools_bin." index ${merged_BAM_2}.sorted.bam";
				print $cmd, "\n", `$cmd`, "\n";				
			}
		}
		
		my $xMHC_SAM_1 = $remapping_dir.'/mapped_xMHC_1.sam';
		my $xMHC_SAM_2 = $remapping_dir.'/mapped_xMHC_2.sam';
		my $xMHC_SAM_1_readDecision = $remapping_dir.'/mapped_xMHC_1.sam.exclusiveReads';
		my $xMHC_SAM_2_readDecision = $remapping_dir.'/mapped_xMHC_2.sam.exclusiveReads';
		my $xMHC_BAM_1_readDecision = $remapping_dir.'/mapped_xMHC_1.bam.exclusiveReads';
		my $xMHC_BAM_2_readDecision = $remapping_dir.'/mapped_xMHC_2.bam.exclusiveReads';
		
		unless((-e $xMHC_VCF_1_readDecision) and (-e $xMHC_VCF_2_readDecision))
		{
			print "Extracting re-mapped xMHC\n";
			
			my $cmd = $samtools_bin." view -h ${merged_BAM_1}.sorted.bam xMHCpseudoChromHaplotype > ${xMHC_SAM_1}";
			print $cmd, "\n", `$cmd`, "\n";		
			$cmd = $samtools_bin." view -h ${merged_BAM_2}.sorted.bam xMHCpseudoChromHaplotype > ${xMHC_SAM_2}";
			print $cmd, "\n", `$cmd`, "\n";				
			
			print "Deciding where reads come from...\n";
			makeReadOriginDecision($xMHC_SAM_1, $xMHC_SAM_2);
			
			die "Missing file $xMHC_SAM_1_readDecision" unless(-e $xMHC_SAM_1_readDecision);
			die "Missing file $xMHC_SAM_2_readDecision" unless(-e $xMHC_SAM_2_readDecision);
					
			print "Converting files back to BAM...\n";

			$cmd = $samtools_bin." view -S -b -o ${xMHC_BAM_1_readDecision} ${xMHC_SAM_1_readDecision}";
			print $cmd, "\n", `$cmd`, "\n";		

			$cmd = $samtools_bin." view -S -b -o ${xMHC_BAM_2_readDecision} ${xMHC_SAM_2_readDecision}";
			print $cmd, "\n", `$cmd`, "\n";
					
			die unless((-e $xMHC_BAM_1_readDecision) and (-e $xMHC_BAM_2_readDecision));
			
			$cmd = $samtools_bin." sort ${xMHC_BAM_1_readDecision} ${xMHC_BAM_1_readDecision}.sorted";
			print $cmd, "\n", `$cmd`, "\n";		
			$cmd = $samtools_bin." index ${xMHC_BAM_1_readDecision}.sorted.bam";
			print $cmd, "\n", `$cmd`, "\n";		
			
			$cmd = $samtools_bin." sort ${xMHC_BAM_2_readDecision} ${xMHC_BAM_2_readDecision}.sorted";
			print $cmd, "\n", `$cmd`, "\n";		
			$cmd = $samtools_bin." index ${xMHC_BAM_2_readDecision}.sorted.bam";
			print $cmd, "\n", `$cmd`, "\n";		
			
			print "Executing Platypus...\n";

			$cmd = qq(python ${platypus_executable} callVariants --bamFiles=${xMHC_BAM_1_readDecision}.sorted.bam --output=${xMHC_VCF_1_readDecision} --refFile=${combined_genome_file_f1} --logFileName=${xMHC_VCF_1_readDecision}.log --nCPU=1 --mergeClusteredVariants=1 --region=xMHCpseudoChromHaplotype);			
			print $cmd, "\n", `$cmd`, "\n";
			
			$cmd = qq(python ${platypus_executable} callVariants --bamFiles=${xMHC_BAM_2_readDecision}.sorted.bam --output=${xMHC_VCF_2_readDecision} --refFile=${combined_genome_file_f2} --logFileName=${xMHC_VCF_2_readDecision}.log --nCPU=1 --mergeClusteredVariants=1 --region=xMHCpseudoChromHaplotype);			
			print $cmd, "\n", `$cmd`, "\n";
		}
	}
	die unless((-e $xMHC_VCF_1_readDecision) && (-e $xMHC_VCF_2_readDecision));
	my $modifyHaplotypeAccordingToVCF = sub {
	
		# this function takes a VCF and tries to modify a haplotype array accordingly
		
		my $haplotype_fields_aref = shift;
		my $haplotype_index_aref = shift;
		my $haplotype_alleles_aref = shift;		
		my $alleles_details_per_positions = shift;
		my $VCF_filename = shift;
		my $positions_alleles_href = shift;
		my $haplotype = shift;
		my $alignment_index_toFill_aref = shift;
		my $haplotype_id = shift;
		
		for(my $i = 0; $i <= $#{$haplotype_fields_aref}; $i++)
		{
			$alignment_index_toFill_aref->[$i] = undef;
			$haplotype_alleles_aref->[$i] = {};
		}
		
		my %_touched_position;
		for(my $i = 0; $i <= $#{$haplotype_index_aref}; $i++)
		{
			die if($_touched_position{$haplotype_index_aref->[$i]});
			$alignment_index_toFill_aref->[$haplotype_index_aref->[$i]] = $i;
			# die Dumper($alignment_index_toFill_aref->[$haplotype_index_aref->[$i]], $haplotype_index_aref->[$i], $i);
			$_touched_position{$haplotype_index_aref->[$i]}++;
			#if(($i >= 2330646) and ($i <= 2330678))
			#{
			#	my $pgf_pos = $pgf_start + $haplotype_index_aref->[$i];
			#	print join(' ', "Add", "i: $i", "
			#}
		}
			
		# print Dumper("Position 4876672", $haplotype_fields_aref->[4876672], join('', @$haplotype_fields_aref[4876672-15 .. 4876672+15])), "\n";
		
		my $new_haplotype_fields_aref = [@$haplotype_fields_aref];
		
		my $indivID;

		open(VCF, '<', $VCF_filename) or die "Cannot open $VCF_filename";
		my @header_fields;
		while(<VCF>)
		{
			my $line = $_;
			chomp($line);
			next unless ($line);
			next if ($line =~ /^\#\#/);
			if($line =~ /^\#/)
			{
				substr($line, 0, 1) = '';
				@header_fields = split(/\t/, $line);
				$indivID = $header_fields[-1];
				next;
			}
			
			my $lineNum = $.;
			
			my @fields = split(/\t/, $line);
			my %line = (mesh @header_fields, @fields);
		
			die unless (exists $line{POS});
			
			next unless ($line{FILTER} eq 'PASS');
			my $ref_start_haplo = $line{POS} - $remapping_flank_length - 1;
			next unless($ref_start_haplo >= 0);
			next if($ref_start_haplo >= length($haplotype));
			
			my $reference = $line{REF};
			my $alternatives = $line{ALT};
			
			my $ref_extracted_ref = substr($haplotype, $ref_start_haplo, length($reference));
			unless($ref_extracted_ref eq $reference)
			{
				die Dumper("Reference matching problem!", "VCF $VCF_filename line ".$., $ref_start_haplo, "Reference according to VCF: $reference", "Reference from reference haplotype: $ref_extracted_ref", "Context: ".substr($haplotype, $ref_start_haplo-3, 6+length($reference)));
			}
			
			my @alternatives = split(/,/, $alternatives);
			unshift(@alternatives, $reference);
			
			die unless ($line{$indivID});
			
			my $genotype_data_firstPart = $line{FORMAT};
			my $genotype_data_secondPart =  $line{$indivID};
			my @genotype_specifier = split(/\:/, $genotype_data_firstPart);
			die Dumper($line{FORMAT}, \@genotype_specifier) unless($genotype_specifier[0] eq 'GT');
			my @genotype_actualValues = split(/\:/, $genotype_data_secondPart);
			my $variant_genotype = $genotype_actualValues[0];
			my @alleles = split(/\//, $variant_genotype);
			
			my @variantAlleles = grep {$_ ne $reference}  map {(defined $alternatives[$_]) ? $alternatives[$_] : die} @alleles;
			
			if($line{POS} == 3687165)
			{
				my $position_in_alignment = $haplotype_index_aref->[$ref_start_haplo];			
				print Dumper($line{POS}, $ref_start_haplo, $position_in_alignment, $reference, $alternatives, \@alleles, \@variantAlleles, '$positions_alleles_href->{$position_in_alignment}: '.$positions_alleles_href->{$position_in_alignment}, '$new_haplotype_fields_aref->[$position_in_alignment]: '.$new_haplotype_fields_aref->[$position_in_alignment]), "\n";
			}
			
			my $line_id = 'VCF'.$haplotype_id.'_'.$line{POS}.'_refAllele_'.$reference;
			
			foreach my $insertAllele (@variantAlleles)
			{
				# $position_in_VCF specifies the position in the underlying haplotype estimate - 0-based index (?)
				# ... the flanks thing does not work, therefore ignore
				# my $position_in_VCF = $ref_start_haplo;
				die unless(defined $haplotype_index_aref->[$ref_start_haplo]);
				my $position_in_alignment = $haplotype_index_aref->[$ref_start_haplo];
				die unless($position_in_alignment <= $#{$haplotype_fields_aref});
				# now $position_in_alignment tells us which position in the original alignment this position refers to
				
				$positions_alleles_href->{$position_in_alignment} = $insertAllele;
	
				# make sure that the reference we get from the VCF agrees with the haplotype we called variants from
				for(my $i = 0; $i < length($reference); $i++)
				{
					my $c1 = substr($reference, $i, 1);
					die unless(defined $haplotype_index_aref->[$ref_start_haplo + $i]);
					my $pos_of_this_character_in_alignment = $haplotype_index_aref->[$ref_start_haplo + $i];
					my $c2 = $haplotype_fields_aref->[$pos_of_this_character_in_alignment];
					
					die Dumper($position_in_alignment.' of '.$#{$haplotype_fields_aref}.' (i: '.$i.')', 'c1: '.$c1, 'c2: '.$c2, 'ref: '.$reference, 'var[0]: '.$variantAlleles[0]) unless($c1 eq $c2);
				}
				
				# stratify according to the length of the variant allele											
			
				if(length($insertAllele) == length($reference))
				{
					# length identical, assume SNP
					# die "Variant and reference have same length, but > 1 -- this implementation cannot deal with this. Split into multiple 'SNPs'." unless(length($insertAllele) == 1);
					for(my $j = 0; $j < length($insertAllele); $j++)
					{					
						my $this_position_in_alignment = $haplotype_index_aref->[$ref_start_haplo+$j];
						die unless(defined  $this_position_in_alignment);
						die unless (defined $new_haplotype_fields_aref->[$this_position_in_alignment]);									
						$new_haplotype_fields_aref->[$this_position_in_alignment] = substr($insertAllele, $j, 1);	
						$haplotype_alleles_aref->[$this_position_in_alignment]{substr($insertAllele, $j, 1)}++;
						$alleles_details_per_positions->[$this_position_in_alignment]{substr($insertAllele, $j, 1)}{'SNP_'.$line_id.'_ALIGNMENTPOS_'.$this_position_in_alignment}++;
						# push(@{$alignment_index_toFill_aref->[$position_in_alignment+$j]}, $ref_start_haplo+$j);
					}
				}
				elsif(length($insertAllele) < length($reference))
				{
					# deletion
					
					my $pos_of_deletion_start = $haplotype_index_aref->[$ref_start_haplo+length($insertAllele)]; # first character to be deleted
					my $deletion_length = length($reference) - length($insertAllele); # length of deletion
					
					# make sure that the first shared characters agree - this is not VCF spec, but we impose it anyway
					for(my $i = 0; $i < length($insertAllele); $i++)
					{
						my $c1 = substr($insertAllele, $i, 1);
						my $c2 = substr($reference, $i, 1);
						die unless($c1 eq $c2);
					}
					
					# print join(' ', 'DELetion', $line{POS}, $position_in_alignment, $pos_of_deletion_start, length($reference) - length($insertAllele) ), "\n";
					
					# delete deleted characters from haplotype
					for(my $i = 0; $i < $deletion_length; $i++)
					{					
						my $this_position_in_alignment = $haplotype_index_aref->[$ref_start_haplo+length($insertAllele)+$i];
						die unless(defined $this_position_in_alignment);
						die unless (defined $new_haplotype_fields_aref->[$this_position_in_alignment]);	
					
						my $beforeValue = $new_haplotype_fields_aref->[$this_position_in_alignment];
						$new_haplotype_fields_aref->[$this_position_in_alignment] = '_';
						$haplotype_alleles_aref->[$this_position_in_alignment]{'_'}++;		
						$alleles_details_per_positions->[$this_position_in_alignment]{'_'}{'DEL_'.$line_id.'_ALIGNMENTPOS_'.$this_position_in_alignment}++;						

						if($line{POS} == 4634505)
						{
							# print "\t ", $i, " alignmentPos ", $this_position_in_alignment,  " => ", $new_haplotype_fields_aref->[$pos_of_deletion_start + $i], "(replaces $beforeValue)\n";
						}
						# push(@{$alignment_index_toFill_aref->[$pos_of_deletion_start + $i]}, $ref_start_haplo);						
					}
				}
				elsif(length($insertAllele) > length($reference))
				{
					# insertion
					
					# make sure that the first characters between REF and ALT agree - again, not VCF spec, but 
					# we impose it
					for(my $i = 0; $i < length($reference); $i++)
					{
						my $c1 = substr($insertAllele, $i, 1);
						my $c2 = substr($reference, $i, 1);
						warn Dumper("INDEL - first chars of REF and ALT not agreeing", $VCF_filename, $lineNum, $c1, $c2) unless($c1 eq $c2);
						next;
					}
					
					# We want to find out what region of the alignment is covered
					# by this insertion. We will then extract the corresponding sequence and
					# re-align the insertion to the sequence. Gaps are subsituted with their reference
					# characters (from PGF) if possible.
					
					my $insertion_first_position_in_alignment = $position_in_alignment;
					my $insertion_ref_last_position_in_VCF = $ref_start_haplo + length($reference); # the is one after the last position!
					
					my $insertion_last_position_in_alignment;
			
					if(($insertion_ref_last_position_in_VCF - 1) == $#{$haplotype_index_aref})
					{
						# if the last position of the ref allele in the VCF is at the end of the
						# haplotype, the last position in the alignment for the allele is the total last
						# position
						$insertion_last_position_in_alignment = $#{$haplotype_fields_aref};
					}
					else
					{
						# the last position of the ref allele in the VCF is NOT at the end of the haplotype
						# i.e. we have a defined alignment position for the following ref allele, and this minus 1
						# has to be the last position for the reference allele, possibly expanded by subsequent gaps
						die unless(defined $haplotype_index_aref->[$insertion_ref_last_position_in_VCF]);
						$insertion_last_position_in_alignment = $haplotype_index_aref->[$insertion_ref_last_position_in_VCF] - 1;
					}
					die unless($insertion_last_position_in_alignment >= $insertion_first_position_in_alignment);
					
					
					# print "\nInsertion $line{POS}: starts at $insertion_first_position_in_alignment in alignment and goes to $insertion_last_position_in_alignment\n";					
					
					# this is a hack to keep alignment complexity down
					if(($insertion_last_position_in_alignment - $insertion_first_position_in_alignment) > 2*length($insertAllele))
					{
						$insertion_last_position_in_alignment = 2*length($insertAllele) + $insertion_first_position_in_alignment;
						# print "\t decrease complexity -- set insertion end to $insertion_last_position_in_alignment\n";
					}
				
					# we now know beginning and end in the alignment of the bit of the underlying haplotype which served
					# as a basis for discovering the variant we are dealing with. Some positions in this haplotype are
					# potentially gaps. We will these with PGF positions if possible.
					my %got_PGF_positions;
					my $augmented_reference_sequence;
					my @augmented_reference_sequence_alignmentPositions;
					for(my $i = $insertion_first_position_in_alignment; $i <= $insertion_last_position_in_alignment; $i++)
					{
						my $alignment_char = $haplotype_fields_aref->[$i];
						# print join(' ', "\t\tcheckPGFpos", $i), "\n";
						
						if($i == $insertion_first_position_in_alignment)
						{
							# we need to make sure we don't get the first character en double
							die if($alignment_char eq '_'); # we check this again, a bit below!
							die unless(defined $loci_in_graph[$i]);
							my @locusID_fields = split(/_/, $loci_in_graph[$i]);
							die unless($#locusID_fields == 2);
							my $pgf_position = $locusID_fields[2]; # this is not a genomic position but an index into the PGF string
							$got_PGF_positions{$pgf_position} = 1;
						}

						if($alignment_char eq '_')
						{
							die if ($i == $insertion_first_position_in_alignment); # first position may not be gap
							die unless(defined $loci_in_graph[$i]);
							my @locusID_fields = split(/_/, $loci_in_graph[$i]);
							die unless($#locusID_fields == 2);
							my $pgf_position = $locusID_fields[2]; # this is not a genomic position but an index into the PGF string
							
							if($line{POS} eq '4843752')
							{
								#print join(' ', "\t\tcheckPGF", $i, $pgf_position, $loci_in_graph[$i]), "\n";
							}
								
							unless($got_PGF_positions{$pgf_position})
							{
								# the alignment can have gaps etc, we fill the first occurrence of a PGF position
								# with the respective character
								$got_PGF_positions{$pgf_position} = 1;
								$augmented_reference_sequence .= substr($pgf_ref_seq, $pgf_position, 1);
								push(@augmented_reference_sequence_alignmentPositions, $i);
								
								if($line{POS} eq '4843752')
								{
									#print join(' ', "\t\tFromPGF", $pgf_position,  substr($pgf_ref_seq, $pgf_position, 1)), "\n";
								}
					
							}
						}
						else
						{
							$augmented_reference_sequence .= $alignment_char;
							push(@augmented_reference_sequence_alignmentPositions, $i);							
						}
						
						$new_haplotype_fields_aref->[$i] = undef; # set the target region in the haplotype to undef - will be re-filled in a bit
						# push(@{$alignment_index_toFill_aref->[$i]}, $ref_start_haplo);						
						
					}
					die unless(scalar(@augmented_reference_sequence_alignmentPositions) == length($augmented_reference_sequence));
					
					# transpose alignment into new haplotype
					if($line{POS} eq '4843752')
					{
						#print "$line{POS}\n\taugmented ref: $augmented_reference_sequence\n";
					}								
					
					# print "Needleman-Wunsch:\n\taugmented reference: $augmented_reference_sequence\n\tquery: $insertAllele\n";
					
					# align the variant to the specific reference
					my @aligned_sequences = needleman_wunsch($augmented_reference_sequence, $insertAllele, 2, -1, -1);
					
					# print "\taligned ref seq: $aligned_sequences[0]\n\taligned query: $aligned_sequences[1]\n";
					

					# transpose alignment into new haplotype
					if($line{POS} eq '4843752')
					{
						# print "$line{POS} write into new haplotype!\n";
					}					
					
					my $lastAlignmentPositionInString = -1;
					for(my $i = 0; $i < length($aligned_sequences[0]); $i++)
					{
						my $aligned_ref_char = substr($aligned_sequences[0], $i, 1);
						my $aligned_variant_char = substr($aligned_sequences[1], $i, 1);
						die unless((defined $aligned_ref_char) and (defined $aligned_variant_char));
						
						if($aligned_ref_char ne '_')
						{  
							$lastAlignmentPositionInString++;
						}
						die unless($lastAlignmentPositionInString >= 0);
						
						my $alignmentPos = $augmented_reference_sequence_alignmentPositions[$lastAlignmentPositionInString];
						$new_haplotype_fields_aref->[$alignmentPos] .= $aligned_variant_char;

						# print "\t", join(' ', $i, $alignmentPos, $aligned_variant_char), "\n" if($line{POS} eq '4852866');
						# push(@{$alignment_index_toFill_aref->[$alignmentPos]}, $ref_start_haplo);						
						
					}
					
					# if there are any undef fields left, they have to be gaps
					for(my $i = $insertion_first_position_in_alignment; $i <= $insertion_last_position_in_alignment; $i++)
					{
						if(not defined $new_haplotype_fields_aref->[$i])
						{
							$new_haplotype_fields_aref->[$i] = '_';
							$haplotype_alleles_aref->[$i]{'_'}++;						
							$alleles_details_per_positions->[$i]{'_'}{'IN_DEL_'.$line_id.'_ALIGNMENTPOS_'.$i}++;						

							# print "\t", join(' ', '!!!!', $i, 'GAP'), "\n" if($line{POS} eq '4852866');													
							# push(@{$alignment_index_toFill_aref->[$i]}, $ref_start_haplo);													
						}
						else
						{
							$haplotype_alleles_aref->[$i]{$new_haplotype_fields_aref->[$i]}++;						
							$alleles_details_per_positions->[$i]{$new_haplotype_fields_aref->[$i]}{'IN_'.$line_id.'_ALIGNMENTPOS_'.$i}++;												
						}
					}
					
					# print "Final reocnstructed haplotype:\n", join('', @{$new_haplotype_fields_aref}[$insertion_first_position_in_alignment .. $insertion_last_position_in_alignment]), "\n";
					# print "\t in separate fields: ", join(', ', @{$new_haplotype_fields_aref}[$insertion_first_position_in_alignment .. $insertion_last_position_in_alignment]), "\n";
				}
			}
		}
		close(VCF);	
		
		for(my $i = 0; $i <= $#{$haplotype_fields_aref}; $i++)
		{
			my @existing_alleles = keys %{$haplotype_alleles_aref->[$i]};
			my $allele_sum = 0;
			for(@existing_alleles){$allele_sum += $haplotype_alleles_aref->[$i]{$_};}
			warn "Weird allele sum: $allele_sum" unless(($allele_sum == 0) or ($allele_sum == 1) or ($allele_sum == 2));
			if($allele_sum == 0)
			{
				$haplotype_alleles_aref->[$i]{$haplotype_fields_aref->[$i]} += 2;
				$alleles_details_per_positions->[$i]{$haplotype_fields_aref->[$i]}{'REF_2_VCF'.$haplotype_id.'_ALIGNMENTPOS_'.$i}++;						
				
			}
			elsif($allele_sum == 1)
			{
				$haplotype_alleles_aref->[$i]{$haplotype_fields_aref->[$i]} += 1;
				$alleles_details_per_positions->[$i]{$haplotype_fields_aref->[$i]}{'REF_1_VCF'.$haplotype_id.'_ALIGNMENTPOS_'.$i}++;										
			}			
			# $alignment_index_toFill_aref->[$i] = undef;
		}
		
		return $new_haplotype_fields_aref;
	};
	

	#die unless($haplotype_2_alignment_fields[5137237] eq '_');
	#die unless($haplotype_2_alignment_fields[5137238] eq '_');
	
	# modify haplotypes according to the VCFs
	my $aligment_positions_to_haplotypes = [[], []];
	my $positions_alleles_href = {};
	my $alleles_per_positions_1 = [];
	my $alleles_per_positions_2 = [];	
	my $alleles_details_per_positions = [];


	my $ameneded_haplotype_alignment_fields_1 = $modifyHaplotypeAccordingToVCF->(\@haplotype_1_alignment_fields, \@haplotype_1_alignment_index, $alleles_per_positions_1, $alleles_details_per_positions, $xMHC_VCF_1_readDecision, $positions_alleles_href, $haplotype_1, $aligment_positions_to_haplotypes->[0], 1);
	

	for(my $i = 0; $i < $#{$ameneded_haplotype_alignment_fields_1}; $i++)
	{
		$positions_alleles_href->{$i} = $ameneded_haplotype_alignment_fields_1->[$i];
	}

	
	my $ameneded_haplotype_alignment_fields_2 = $modifyHaplotypeAccordingToVCF->(\@haplotype_2_alignment_fields, \@haplotype_2_alignment_index, $alleles_per_positions_2, $alleles_details_per_positions, $xMHC_VCF_2_readDecision, $positions_alleles_href, $haplotype_2, $aligment_positions_to_haplotypes->[1], 2);
		
	#die if($ameneded_haplotype_alignment_fields_2->[5137237] eq '_');
	#die if($ameneded_haplotype_alignment_fields_2->[5137238] eq '_');


	die unless($#{$alleles_per_positions_1} == $#{$alleles_per_positions_2});
	die unless($#{$alleles_per_positions_1} == $#{$ameneded_haplotype_alignment_fields_1});
	die unless($#{$alleles_per_positions_1} == $#{$ameneded_haplotype_alignment_fields_2});

	for(my $i = 0; $i <= $#{$alleles_per_positions_1}; $i++)
	{
		my %combined_alleles;
		foreach my $key (keys %{$alleles_per_positions_1->[$i]}){$combined_alleles{$key} += $alleles_per_positions_1->[$i]{$key}}
		foreach my $key (keys %{$alleles_per_positions_2->[$i]}){$combined_alleles{$key} += $alleles_per_positions_2->[$i]{$key}}
		
		my @sorted_alleles = sort {$combined_alleles{$b} <=> $combined_alleles{$a}} keys %combined_alleles;
		if(scalar(@sorted_alleles) > 1)
		{
			die unless($combined_alleles{$sorted_alleles[0]} >= $combined_alleles{$sorted_alleles[1]});
		}
		die unless($#sorted_alleles >= 0);
		if($#sorted_alleles > 1)
		{
			print Dumper("Warning - more than two alleles at position!", \@sorted_alleles, \%combined_alleles, $alleles_details_per_positions->[$i]), "\n";
		}
		
		if($#sorted_alleles > 0)
		{
			if((exists $alleles_per_positions_1->[$i]{$sorted_alleles[0]}) and ($alleles_per_positions_1->[$i]{$sorted_alleles[0]} > 0))
			{
				# $sorted_alleles[0] can come from haplo 1 - can it also come from haplo 2?
				if((exists $alleles_per_positions_2->[$i]{$sorted_alleles[0]}) and ($alleles_per_positions_2->[$i]{$sorted_alleles[0]} > 0))
				{	
					# $sorted_alleles[0] can come from both haplotypes - what about $sorted_alleles[1]?
					if((exists $alleles_per_positions_1->[$i]{$sorted_alleles[1]}) and ($alleles_per_positions_1->[$i]{$sorted_alleles[1]} > 0))
					{
						# $sorted_alleles[1] can come from haplo 1 as well. Can it come from haplo 2 as well?
						if((exists $alleles_per_positions_2->[$i]{$sorted_alleles[1]}) and ($alleles_per_positions_2->[$i]{$sorted_alleles[1]} > 0))
						{
							# both alleles can come from both haplotypes. selection arbitrary.
							if(($haplotype_1_alignment_fields[$i] eq $sorted_alleles[0]) or ($haplotype_2_alignment_fields[$i] eq $sorted_alleles[1]))
							{
								die unless($alleles_per_positions_1->[$i]{$sorted_alleles[0]} > 0);			
								die unless($alleles_per_positions_2->[$i]{$sorted_alleles[1]} > 0);												
								$ameneded_haplotype_alignment_fields_1->[$i] = $sorted_alleles[0];		
								$ameneded_haplotype_alignment_fields_2->[$i] = $sorted_alleles[1];										
							}
							else
							{
								die unless($alleles_per_positions_1->[$i]{$sorted_alleles[1]} > 0);			
								die unless($alleles_per_positions_2->[$i]{$sorted_alleles[0]} > 0);												
								$ameneded_haplotype_alignment_fields_1->[$i] = $sorted_alleles[1];		
								$ameneded_haplotype_alignment_fields_2->[$i] = $sorted_alleles[0];											
							}
						}
						else
						{
							# $sorted_alleles[1] can NOT come from haplo 2, thus it comes from haplo 1!
							die unless($alleles_per_positions_1->[$i]{$sorted_alleles[1]} > 0);			
							die unless($alleles_per_positions_2->[$i]{$sorted_alleles[0]} > 0);												
							$ameneded_haplotype_alignment_fields_1->[$i] = $sorted_alleles[1];		
							$ameneded_haplotype_alignment_fields_2->[$i] = $sorted_alleles[0];										
						}
					}
					else
					{
						# $sorted_alleles[1] can come from haplo 2 exclusively
						die unless($alleles_per_positions_2->[$i]{$sorted_alleles[1]} > 0);					
						$ameneded_haplotype_alignment_fields_1->[$i] = $sorted_alleles[0];		
						$ameneded_haplotype_alignment_fields_2->[$i] = $sorted_alleles[1];					
					}
				}
				else
				{
					# $sorted_alleles[0]  can come ONLY from haplo 1.
					warn "An allele we are going to insert into a haplotype is not supported by the VCF output.\nAlleles $sorted_alleles[0], $sorted_alleles[1].\nSupported: ".join(', ', keys %{$alleles_per_positions_1->[$i]}).' // '.join(', ', keys %{$alleles_per_positions_2->[$i]})."\n\n"	unless($alleles_per_positions_2->[$i]{$sorted_alleles[1]} > 0);					
					$ameneded_haplotype_alignment_fields_1->[$i] = $sorted_alleles[0];		
					$ameneded_haplotype_alignment_fields_2->[$i] = $sorted_alleles[1];					
				}
			}
			else
			{
				# $sorted_alleles[0] can NOT come from haplo 1 thus it goes to haplo 2. Haplo 2 must be able to support the other allele!
			
				die unless($alleles_per_positions_2->[$i]{$sorted_alleles[0]} > 0);
				$ameneded_haplotype_alignment_fields_1->[$i] = $sorted_alleles[1];		
				$ameneded_haplotype_alignment_fields_2->[$i] = $sorted_alleles[0];			
			}
		}
		else
		{
			$ameneded_haplotype_alignment_fields_1->[$i] = $sorted_alleles[0];				
			$ameneded_haplotype_alignment_fields_2->[$i] = $sorted_alleles[0];		
		}
	}
	

		
	# save haplotypes to file
	my $output_amended_haplotypes = $kMer_count_sample.'.amendedHaplotypes';
	open(AMENDEDHAPLOTYPES, '>', $output_amended_haplotypes) or die "Cannot open $output_amended_haplotypes";
	print AMENDEDHAPLOTYPES join(' ', $sample, 1, @$ameneded_haplotype_alignment_fields_1), "\n";
	print AMENDEDHAPLOTYPES join(' ', $sample, 2, @$ameneded_haplotype_alignment_fields_2), "\n";
	close(AMENDEDHAPLOTYPES);
	
	# save haplotype positions to file
	my $haplo1_to_pgf_aref = alignHaplotypeCharactersToPGF(\@haplotype_1_alignment_fields, \@loci_in_graph);
	die unless(scalar(@$haplo1_to_pgf_aref) == length($haplotype_1));
	my $haplo1_within_alignment_positions = haplotypeCharacterAlignmentPositions(\@haplotype_1_alignment_fields, \@loci_in_graph);
	die unless(scalar(@$haplo1_within_alignment_positions) == length($haplotype_1));	
	my $output_amended_haplotype_1_positions = $kMer_count_sample.'.rawAmendedHaplotype_1.positionsToReference';
	open(AMENDEDHAPLOTYPESPOS, '>', $output_amended_haplotype_1_positions) or die "Cannot open $output_amended_haplotype_1_positions";
	print AMENDEDHAPLOTYPESPOS join(' ', 'pos_in_h_1', 'pos_genomic', 'pos_pgf', 'pos_alignment'), "\n";
	for(my $i = 0; $i <= $#{$haplo1_to_pgf_aref}; $i++)
	{
		my $position_in_pgf = $haplo1_to_pgf_aref->[$i];
		die unless(defined $position_in_pgf);
		my $position_in_alignment = $haplo1_within_alignment_positions->[$i];
		die unless(defined $position_in_alignment);		
		print AMENDEDHAPLOTYPESPOS join(' ', $i, $position_in_pgf, $position_in_pgf,-$pgf_start, $position_in_alignment), "\n";	
	} 
	close(AMENDEDHAPLOTYPESPOS);	
	
	my $haplo2_to_pgf_aref = alignHaplotypeCharactersToPGF(\@haplotype_2_alignment_fields, \@loci_in_graph);
	die unless(scalar(@$haplo2_to_pgf_aref) == length($haplotype_2));
	my $haplo2_within_alignment_positions = haplotypeCharacterAlignmentPositions(\@haplotype_2_alignment_fields, \@loci_in_graph);
	die unless(scalar(@$haplo2_within_alignment_positions) == length($haplotype_2));	
	my $output_amended_haplotype_2_positions = $kMer_count_sample.'.rawAmendedHaplotype_2.positionsToReference';
	open(AMENDEDHAPLOTYPESPOS, '>', $output_amended_haplotype_2_positions) or die "Cannot open $output_amended_haplotype_2_positions";
	print AMENDEDHAPLOTYPESPOS join(' ', 'pos_in_h_1', 'pos_genomic', 'pos_pgf', 'pos_alignment'), "\n";
	for(my $i = 0; $i <= $#{$haplo2_to_pgf_aref}; $i++)
	{
		my $position_in_pgf = $haplo2_to_pgf_aref->[$i];
		die unless(defined $position_in_pgf);
		my $position_in_alignment = $haplo2_within_alignment_positions->[$i];
		die unless(defined $position_in_alignment);		
		print AMENDEDHAPLOTYPESPOS join(' ', $i, $position_in_pgf, $position_in_pgf,-$pgf_start, $position_in_alignment), "\n";	
	} 
	close(AMENDEDHAPLOTYPESPOS);	
	
	my $output_file = $kMer_count_sample.".amendedHaplotypes.VCF";			
	haplotypesToVCF2($ameneded_haplotype_alignment_fields_1, $ameneded_haplotype_alignment_fields_2, $sample, \@loci_in_graph, $pgf_ref_seq, $output_file);

	my $output_amendedVCF_pseudoVCF = $kMer_count_sample.'.amendedHaplotypes.IGV_info';
	haplotypesToVCF($ameneded_haplotype_alignment_fields_1, $ameneded_haplotype_alignment_fields_2, $sample, \@loci_in_graph, $pgf_ref_seq, $output_amendedVCF_pseudoVCF, $aligment_positions_to_haplotypes);

	exit;
}
elsif($collect eq '2')
{
	die "Legacy code. Use --collect 2viterbi instead.";
	
	my $expected_output_filename = $kMer_count_sample.'.viterbiHaplotypes';
	
	# read loci from graph specification
	
	my @loci_in_graph;
	my $graph_segments_file = $graph.'/segments.txt';
	die "File not there: $graph_segments_file" unless (-e $graph_segments_file);
	open(SEGMENTS, '<', $graph_segments_file) or die;
	while(<SEGMENTS>)
	{
		my $line = $_;
		chomp($line);
		$line =~ s/\n//g;
		$line =~ s/\r//g;	
		next unless($line);
		
		my $segment_file = $graph.'/'.$line;

		open(SEGMENT, '<', $segment_file) or die "Cannot open $segment_file";
		my $firstLine = <SEGMENT>;
		chomp($firstLine);
		$firstLine =~ s/\n//g;
		$firstLine =~ s/\r//g;			
		my @line_fields = split(/ /, $firstLine);
		shift(@line_fields); # kick out individual ID
		push(@loci_in_graph, @line_fields);
		close(SEGMENT);
	}
	close(SEGMENTS);
	
	my %locus_position;
	my $positions_file = $graph.'/positions.txt';
	die "File not there: $positions_file" unless (-e $positions_file);
	open(POSITIONS, '<', $positions_file) or die;	
	while(<POSITIONS>)
	{
		my $line = $_;
		chomp($line);
		$line =~ s/\n//g;
		$line =~ s/\r//g;		
		my @fields = split(/ /, $line);
		my $field_id = $fields[0];
		my $field_position = $fields[1];
		$locus_position{$field_id} = $field_position;
	}
	close(POSITIONS);
	
	for(my $lI = 1; $lI <= $#loci_in_graph; $lI++)
	{
		if(not defined $locus_position{$loci_in_graph[$lI]})
		{
			die Dumper("No position information for locus $loci_in_graph[$lI]", $lI);
		}
		if(not defined $locus_position{$loci_in_graph[$lI]})
		{
			die Dumper("No position information for locus $loci_in_graph[$lI-1]", $lI-1);
		}		
		die unless($locus_position{$loci_in_graph[$lI]} > $locus_position{$loci_in_graph[$lI-1]});
	}
	
	# read xMHC reference
	
	open(REF, "<", $xMHC_reference) or die "Cannot open $xMHC_reference";
	my $pgf_ref_seq;
	while(<REF>)
	{
		my $line = $_; 
		chomp($line);
		$line =~ s/[\x0A\x0D]+//g;	
		next if ($line =~ /^\>/);
		$pgf_ref_seq .= $line;
	}
	close(REF);
	my $pgf_stop = $pgf_start+length($pgf_ref_seq);
	
	my %identifier_to_position;
	my %identifier_to_refalleles;
	if(-e $vcfPos)
	{
		open(VCFSNPS, '<', $vcfPos) or die "Cannot open $vcfPos";
		my $header_line = <VCFSNPS>;
		chomp($header_line);
		$header_line =~ s/\n//g;
		$header_line =~ s/\r//g;	
		
		my @header_fields = split(/\t/, $header_line);
		while(<VCFSNPS>)
		{
			my $line = $_;
			chomp($line);
			$line =~ s/\n//g;
			$line =~ s/\r//g;	
		
			my @line_fields = split(/\t/, $line);
			my %line = (mesh @header_fields, @line_fields);
		
			die unless(defined $line{Position});
			
			die unless($line{RefAllele});
			
			my $reference_charPos_PGF = $line{Position} - $pgf_start;
			
			next if($reference_charPos_PGF < 0);
			next if($reference_charPos_PGF > (length($pgf_ref_seq) - 1));
			
			$identifier_to_position{$line{ID}} = $line{Position};
			$identifier_to_refalleles{$line{ID}} = $line{RefAllele};
						
			my $reference_char = substr($pgf_ref_seq, $reference_charPos_PGF, length($line{RefAllele}));
			
			die "$reference_char ne $line{RefAllele}, $reference_charPos_PGF, line $." unless($reference_char eq $line{RefAllele});
		}			
		close(VCFSNPS);
	}
	
	if((-e $expected_output_filename))
	{
		print "Analyze $expected_output_filename\n";
		
		my %inference;
		
		my %inference_PP;
		my %inference_seen_pairs;
		
		my %inference_firstAllele;
		my %inference_secondAllele; 
		my %firstAllele_called;
		my %secondAllele_called;		
		
		my %seen_PGF_positions;
		my %pgf_genotypes_labels;
		
		open(INFERENCE, '<', $expected_output_filename) or die "Cannot open $expected_output_filename";				
		while(<INFERENCE>)
		{
			my $line = $_;
			chomp($line);
			$line =~ s/\n//g;
			$line =~ s/\r//g;
			
			my @fields = split(/ /, $line);
			
			my $indiv_spec = shift(@fields);
			shift(@fields); # chromosome
			
			die "Error indivID not parseable: $indiv_spec" unless($indiv_spec =~ /^(.+?)-x-(.+?)$/);
			my $individualID = $1;
			my $iteration = $2;
			
			die Dumper($#fields, $#loci_in_graph) unless($#fields == ($#loci_in_graph+2));
			@fields = @fields[0 .. ($#fields-2)];
			
			my %pgf_genotypes;
			
			for(my $pI = 0; $pI <= $#fields; $pI++)
			{
				my $locusID = $loci_in_graph[$pI];
				my @locusID_fields = split(/_/, $locusID);
				die unless($#locusID_fields == 2);

				my $pgf_position = $pgf_start + $locusID_fields[2];
				
				if($locusID_fields[0] =~ /^rs/)
				{
					$pgf_genotypes_labels{$pgf_position}{$locusID_fields[0]}++;
				}
				
				my $char_from_inference = $fields[$pI];
				
				$pgf_genotypes{$pgf_position} .= $char_from_inference;
				
				$seen_PGF_positions{$pgf_position}++;
			}
			
			foreach my $pgfPos (keys %pgf_genotypes)
			{
				push(@{$inference{$individualID}{$pgfPos}[$iteration-1]}, $pgf_genotypes{$pgfPos});
			}
		}
		close(INFERENCE);
			
		foreach my $individualID (keys %inference)
		{
			foreach my $pgfPos (keys %seen_PGF_positions)
			{
				foreach my $inference_combination (@{$inference{$individualID}{$pgfPos}})
				{
					warn "Empty field in inference combinations!" unless ($inference_combination);
					my @alleles = sort @{$inference_combination};
					
					my $alleles = join('_', @alleles);
					$inference_seen_pairs{$pgfPos}{$alleles} = 1;

		
					$inference_PP{$individualID}{$pgfPos}{$alleles}++;
					$inference_PP{$individualID}{$pgfPos}{'ALL'}++;			
		
					
					my %tmp_alleles = map {$_ => 1} @alleles;
					

					foreach my $allele (keys %tmp_alleles)
					{
						$inference_firstAllele{$individualID}{$pgfPos}{$allele}++;
					}
					$inference_firstAllele{$individualID}{$pgfPos}{'ALL'}++;		

				}
			}
		}
		
		foreach my $individualID (keys %inference)
		{
			foreach my $pgfPos (keys %seen_PGF_positions)
			{					
				my $call = maxAllele($inference_firstAllele{$individualID}{$pgfPos});
				$firstAllele_called{$individualID}{$pgfPos} = $call;
				$firstAllele_called{$individualID}{$pgfPos}[2] = $firstAllele_called{$individualID}{$pgfPos}[1];
			}
		}
			
		foreach my $individualID (keys %inference)
		{
			foreach my $pgfPos (keys %seen_PGF_positions)
			{		
				foreach my $inference_combination (@{$inference{$individualID}{$pgfPos}})
				{
					warn "Empty field in inference combinations!" unless ($inference_combination);
					my @alleles = sort @{$inference_combination};
					my $firstAllele = $firstAllele_called{$individualID}{$pgfPos}[0];
					if($firstAllele ~~ @alleles)
					{
						my $gotFirst = 0;
						my $otherAllele;
						for(@alleles)
						{
							if(! $gotFirst)
							{
								if($_ eq $firstAllele)
								{
									$gotFirst = 1;
									next;
								}
							}
							$otherAllele = $_;
						}
						die unless ($gotFirst);
						
						$inference_secondAllele{$individualID}{$pgfPos}{$otherAllele}++;
						$inference_secondAllele{$individualID}{$pgfPos}{'ALL'}++;	
					}
				}
			}
		}
			
		foreach my $individualID (keys %inference)
		{
			foreach my $pgfPos (keys %seen_PGF_positions)
			{		
				my $call = maxAllele($inference_secondAllele{$individualID}{$pgfPos});
				$secondAllele_called{$individualID}{$pgfPos} = $call;
				$secondAllele_called{$individualID}{$pgfPos}[2] = $secondAllele_called{$individualID}{$pgfPos}[1] * $firstAllele_called{$individualID}{$pgfPos}[2];
			}	
		}		
		
		
		my %max_pair_value;
		my %max_pair_alleles;
		my $output_file = $expected_output_filename.".PP";
		open(OUTPUT, '>', $output_file) or die "Cannot open $output_file";
		my @allele_headers;
		my %pairs_in_order;
			foreach my $pgfPos (keys %seen_PGF_positions)
		{	
			my @pairs_in_order = sort keys %{$inference_seen_pairs{$pgfPos}};
			$pairs_in_order{$pgfPos} = \@pairs_in_order;
			push(@allele_headers, @pairs_in_order);
		}
		print OUTPUT join(' ', 'IndividualID', @allele_headers), "\n";
		foreach my $individualID (sort keys %inference_PP)
		{
			my @allele_fields;
			foreach my $pgfPos (keys %seen_PGF_positions)
			{
				push(@allele_fields,
					map {sprintf("%.5f", $inference_PP{$individualID}{$pgfPos}{$_}/$inference_PP{$individualID}{$pgfPos}{'ALL'} )} @{$pairs_in_order{$pgfPos}}
				);
			}
			print OUTPUT join(' ', $individualID, @allele_fields), "\n";
			print OUTPUT join(' ', $individualID, ), "\n";
		}
		close(OUTPUT);
		
		$output_file = $expected_output_filename.".bestguess";
		open(OUTPUT, '>', $output_file) or die "Cannot open $output_file";
		
		@allele_headers = ();
		foreach my $pgfPos (keys %seen_PGF_positions)
		{	
			my @headers = ($pgfPos, qw/Q Q2 P/);			
			push(@allele_headers, @headers);
		}
		
		my %bestguess_exist_alleles;
		
		print OUTPUT join(' ', 'IndividualID', 'Chromosome', @allele_headers), "\n";
		foreach my $individualID (keys %inference_firstAllele)
		{	
			my $likelihood = 'NA';
			my @f_1;
			my @f_2;
			foreach my $pgfPos (keys %seen_PGF_positions)
			{	
				$bestguess_exist_alleles{$pgfPos}{$firstAllele_called{$individualID}{$pgfPos}[0]}++;
				$bestguess_exist_alleles{$pgfPos}{$secondAllele_called{$individualID}{$pgfPos}[0]}++;
				
				push(@f_1,
					$firstAllele_called{$individualID}{$pgfPos}[0], $firstAllele_called{$individualID}{$pgfPos}[1], $firstAllele_called{$individualID}{$pgfPos}[2], $likelihood
				
				);
				push(@f_2,
					$secondAllele_called{$individualID}{$pgfPos}[0], $secondAllele_called{$individualID}{$pgfPos}[1], $secondAllele_called{$individualID}{$pgfPos}[2], $likelihood
				);				
			}
			print OUTPUT join(' ', $individualID, 1, @f_1), "\n";
			print OUTPUT join(' ', $individualID, 2, @f_2), "\n";
		}
		close(OUTPUT);
		
		if($vcfPos)
		{
			my $output_file = $expected_output_filename.".VCF";
			open(VCF, '>', $output_file) or die "Cannot open $output_file";
my $VCF_header = qq(##fileformat=VCFv4.0
##fileDate=27/04/12
##phasing=none, though some calls involve phasing clustered variants
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=COV,Number=2,Type=Integer,Description="Number of reads on ref and alt alleles">
##FORMAT=<ID=GT_CONF,Number=1,Type=Float,Description="Genotype confidence. Difference in log likelihood of most likely and next most likely genotype">
##FORMAT=<ID=SITE_CONF,Number=1,Type=Float,Description="Probabilitic site classification confidence. Difference in log likelihood of most likely and next most likely model (models are variant, repeat and error)">
##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of variant">
##ALT=<ID=SNP,Description="SNP">
##ALT=<ID=SNP_FROM_COMPLEX,Description="SNP called from a cluster of phased SNPs or complex SNP/indel , split out for easier comparison with other SNP call sets">
##ALT=<ID=DEL,Description="Deletion">
##ALT=<ID=INS,Description="Insertion of novel sequence">
##ALT=<ID=INDEL,Description="Insertion-deletion">
##ALT=<ID=INV,Description="Inversion">
##ALT=<ID=INV_INDEL,Description="Inversion+indel - this script overcalls these, so worth checking">
##ALT=<ID=DEL_INV,Description="Deletion + Inversion">
##ALT=<ID=INS_INV,Description="Insertion + Inversion">
##ALT=<ID=PH_SNPS,Description="Phased SNPs">
##ALT=<ID=COMPLEX,Description="Complex variant, collection of SNPs and indels">
##FILTER=<ID=MAPQ,Description="5prime flank maps to reference with mapping quality below 40">);
			my @header_fields = qw/CHROM POS ID REF ALT QUAL FILTER INFO FORMAT/;
			my @individualIDs = sort (keys %inference_firstAllele);
			print VCF $VCF_header, "\n";
			print VCF '#'.join("\t", @header_fields, @individualIDs), "\n";
			
			my @output_variants = (keys %seen_PGF_positions);
			foreach my $pgfPos (sort {$a <=> $b} @output_variants)
			{
				my $variantID = '.';
				my $rsID;
				
				if(scalar( keys %{$pgf_genotypes_labels{$pgfPos}}) == 1)
				{
					$rsID = ((keys %{$pgf_genotypes_labels{$pgfPos}})[0]);
					die Dumper('$pgfPos != $identifier_to_position{$rsID}', $rsID, $pgfPos, $identifier_to_position{$rsID}) unless($pgfPos == $identifier_to_position{$rsID});
					$variantID = $rsID;
				}
				
				my @output_fields;
				push(@output_fields, 6);
				push(@output_fields, $pgfPos);
				push(@output_fields, $variantID);
				
				my $reference_charPos_PGF = $pgfPos - $pgf_start;
				die "???" if($reference_charPos_PGF < 0);
				die "???" if($reference_charPos_PGF > (length($pgf_ref_seq) - 1));
				my $reference_char = substr($pgf_ref_seq, $reference_charPos_PGF, 1);

				if($rsID)
				{
					die Dumper($., $pgfPos, $rsID, $identifier_to_refalleles{$rsID}, $reference_char) unless($identifier_to_refalleles{$rsID} eq $reference_char);
				}
				
				my $refAllele = $reference_char;
				push(@output_fields, $refAllele);
				
				die unless($bestguess_exist_alleles{$pgfPos});
				delete $bestguess_exist_alleles{$pgfPos}{$refAllele};
				my @nonref_alleles = sort keys %{$bestguess_exist_alleles{$pgfPos}};
				
				my %allele_2_num = ($refAllele => 0);
				my $allele_counter = 1;
				for(@nonref_alleles)
				{
					$allele_2_num{$nonref_alleles[$allele_counter-1]} = $allele_counter;
					$allele_counter++
				}
				if(scalar(@nonref_alleles) == 0)
				{
					push(@output_fields, '.');
				}
				else
				{
					push(@output_fields, join(',', @nonref_alleles));
				}
				push(@output_fields, '.');
				# this is a hack!
				die if($#individualIDs > 0);
				if(not ($secondAllele_called{$individualIDs[0]}{$pgfPos}[2] > 0.7))
				{
					push(@output_fields, 'FAIL');
				}
				else
				{
					push(@output_fields, 'PASS');
				}
				push(@output_fields, 'SVTYPE=SNP;SVLEN=0');
				push(@output_fields, 'GT');
				
				foreach my $indivID (@individualIDs)
				{
					(exists $allele_2_num{$firstAllele_called{$indivID}{$pgfPos}[0]}) or die;
					(exists $allele_2_num{$secondAllele_called{$indivID}{$pgfPos}[0]}) or die;					
					my $gt = join('/', $allele_2_num{$firstAllele_called{$indivID}{$pgfPos}[0]}, $allele_2_num{$secondAllele_called{$indivID}{$pgfPos}[0]});
					push(@output_fields, $gt);
				}
				
				print VCF join("\t", @output_fields), "\n";
			}
			
			close(VCF);
			
			print "\n\nVCF file $output_file produced.\n";
		}
		
		print "\n\nOutput collected in\n$output_file\n\n";		
		
		print "\n\n";
		
		# print `cat $output_file`;
	}
	else
	{
		print "Cannot find inference file $expected_output_filename\n";
	}
}
elsif($collect eq '2viterbi')
{
	my $expected_output_filename = $kMer_count_sample.'.viterbiHaplotypes';
	
	# read loci from graph specification
	
	my @loci_in_graph;
	my $graph_segments_file = $graph.'/segments.txt';
	die "File not there: $graph_segments_file" unless (-e $graph_segments_file);
	open(SEGMENTS, '<', $graph_segments_file) or die;
	while(<SEGMENTS>)
	{
		my $line = $_;
		chomp($line);
		$line =~ s/\n//g;
		$line =~ s/\r//g;	
		next unless($line);
		
		my $segment_file = $graph.'/'.$line;

		open(SEGMENT, '<', $segment_file) or die "Cannot open $segment_file";
		my $firstLine = <SEGMENT>;
		chomp($firstLine);
		$firstLine =~ s/\n//g;
		$firstLine =~ s/\r//g;			
		my @line_fields = split(/ /, $firstLine);
		shift(@line_fields); # kick out individual ID
		push(@loci_in_graph, @line_fields);
		close(SEGMENT);
	}
	close(SEGMENTS);
	
	my %locus_position;
	my $positions_file = $graph.'/positions.txt';
	die "File not there: $positions_file" unless (-e $positions_file);
	open(POSITIONS, '<', $positions_file) or die;	
	while(<POSITIONS>)
	{
		my $line = $_;
		chomp($line);
		$line =~ s/\n//g;
		$line =~ s/\r//g;		
		my @fields = split(/ /, $line);
		my $field_id = $fields[0];
		my $field_position = $fields[1];
		$locus_position{$field_id} = $field_position;
	}
	close(POSITIONS);
	
	for(my $lI = 1; $lI <= $#loci_in_graph; $lI++)
	{
		if(not defined $locus_position{$loci_in_graph[$lI]})
		{
			die Dumper("No position information for locus $loci_in_graph[$lI]", $lI);
		}
		if(not defined $locus_position{$loci_in_graph[$lI]})
		{
			die Dumper("No position information for locus $loci_in_graph[$lI-1]", $lI-1);
		}		
		die unless($locus_position{$loci_in_graph[$lI]} > $locus_position{$loci_in_graph[$lI-1]});
	}
	
	# read xMHC reference
	
	open(REF, "<", $xMHC_reference) or die "Cannot open $xMHC_reference";
	my $pgf_ref_seq;
	while(<REF>)
	{
		my $line = $_; 
		chomp($line);
		$line =~ s/[\x0A\x0D]+//g;	
		next if ($line =~ /^\>/);
		$pgf_ref_seq .= $line;
	}
	close(REF);
	my $pgf_stop = $pgf_start+length($pgf_ref_seq);
	

	if((-e $expected_output_filename))
	{
		print "Analyze $expected_output_filename\n";
		
		my %inference;
		
		my %inference_PP;
		my %inference_seen_pairs;
		
		my %inference_firstAllele;
		my %inference_secondAllele; 
		my %firstAllele_called;
		my %secondAllele_called;		
		
		my %seen_PGF_positions;
		my %pgf_genotypes_labels;
		
		open(HAPLOTYPES, '<', $expected_output_filename) or die "Cannot open $expected_output_filename";
		my $haplotype_1, my $haplotype_2;
		my @haplotype_1_alignment_fields;
		my @haplotype_2_alignment_fields;
		while(<HAPLOTYPES>)
		{
			my $line = $_;
			chomp($line);
			last if ($. > 2);
			my @fields = split(/ /, $line);
			my $haplotype = join('', @fields[2 .. ($#fields-2)]);
			if($. == 1)
			{
				$haplotype_1 = $haplotype;
				@haplotype_1_alignment_fields = @fields[2 .. ($#fields-2)];
			}
			elsif($. == 2)
			{
				$haplotype_2 = $haplotype;	
				@haplotype_2_alignment_fields = @fields[2 .. ($#fields-2)];
			}
		}
		close(HAPLOTYPES);    
		
		my $output_file = $expected_output_filename.".viterbiVCF";		
		haplotypesToVCF2(\@haplotype_1_alignment_fields, \@haplotype_2_alignment_fields, $sample, \@loci_in_graph, $pgf_ref_seq, $output_file);
			
		my $output_file_pseudoVCF = $expected_output_filename.".viterbi_IGV_info";		
		haplotypesToVCF(\@haplotype_1_alignment_fields, \@haplotype_2_alignment_fields, $sample, \@loci_in_graph, $pgf_ref_seq, $output_file_pseudoVCF);
	}
	else
	{
		print "Cannot find inference file $expected_output_filename\n";
	}
}
elsif($collect eq '1')
{
	my $expected_output_filename = $kMer_count_sample.'.inference';
	
	open(REF, "<", $xMHC_reference) or die "Cannot open $xMHC_reference";
	my $pgf_ref_seq;
	while(<REF>)
	{
		my $line = $_; 
		chomp($line);
		$line =~ s/[\x0A\x0D]+//g;	
		next if ($line =~ /^\>/);
		$pgf_ref_seq .= $line;
	}
	close(REF);
	my $pgf_stop = $pgf_start+length($pgf_ref_seq);
	
	my %identifier_to_position;
	my %identifier_to_refalleles;
	if(-e $vcfPos)
	{
		open(VCFSNPS, '<', $vcfPos) or die "Cannot open $vcfPos";
		my $header_line = <VCFSNPS>;
		chomp($header_line);
		my @header_fields = split(/\t/, $header_line);
		while(<VCFSNPS>)
		{
			my $line = $_;
			chomp($line);
			my @line_fields = split(/\t/, $line);
			my %line = (mesh @header_fields, @line_fields);
		
			die unless(defined $line{Position});
			
			die unless($line{RefAllele});
			
			my $reference_charPos_PGF = $line{Position} - $pgf_start;
			
			next if($reference_charPos_PGF < 0);
			next if($reference_charPos_PGF > (length($pgf_ref_seq) - 1));
			$identifier_to_position{$line{ID}} = $line{Position};
			$identifier_to_refalleles{$line{ID}} = $line{RefAllele};
						
			my $reference_char = substr($pgf_ref_seq, $reference_charPos_PGF, length($line{RefAllele}));
			
			die "$reference_char ne $line{RefAllele}, $reference_charPos_PGF, line $." unless($reference_char eq $line{RefAllele});
		}			
		close(VCFSNPS);
	}
	
	if((-e $expected_output_filename) && (! $redo))
	{
		print "Analyze $expected_output_filename\n";
		
		my %inference;
		
		my %inference_PP;
		my %inference_seen_pairs;
		
		my %inference_firstAllele;
		my %inference_secondAllele; 
		my %firstAllele_called;
		my %secondAllele_called;		
		
		open(INFERENCE, '<', $expected_output_filename) or die "Cannot open $expected_output_filename";
		my $firstLine_inference = <INFERENCE>;
		chomp($firstLine_inference);
		$firstLine_inference =~ s/\n//g;
		$firstLine_inference =~ s/\r//g;		
		my @header_fields = split(/ /, $firstLine_inference);
		die unless($header_fields[0] eq 'IndividualID');
		my @inference_fields =  @header_fields[2 .. $#header_fields];
				
		while(<INFERENCE>)
		{
			my $line = $_;
			chomp($line);
			$line =~ s/\n//g;
			$line =~ s/\r//g;
			
			my @fields = split(/ /, $line);
			my %linehash = (mesh @header_fields, @fields);
			
			my $individualID_mix = $linehash{'IndividualID'};
						
			die "Error indivID not parseable: $individualID_mix" unless($individualID_mix =~ /^(.+?)-x-(.+?)$/);
			my $individualID = $1;
			my $iteration = $2;
			
			foreach my $fN (@inference_fields)
			{
				die "Found question marks in output for the desired fieldname $fN -- not good!\nValue: $linehash{$fN}" if ($linehash{$fN} =~ /\?/);
				
				if($fN =~ /HLA/)
				{
					if(length($linehash{$fN}) > 5)
					{
						$linehash{$fN} = substr($linehash{$fN}, 0, 5);
					}	
				}
				
				push(@{$inference{$individualID}{$fN}[$iteration-1]}, $linehash{$fN});
			}			
		}
		close(INFERENCE);
			
		foreach my $individualID (keys %inference)
		{
			foreach my $fN (@inference_fields)
			{
				foreach my $inference_combination (@{$inference{$individualID}{$fN}})
				{
					warn "Empty field in inference combinations!" unless ($inference_combination);
					my @alleles = sort @{$inference_combination};
					
					my $alleles = join('_', @alleles);
					$inference_seen_pairs{$fN}{$alleles} = 1;

		
					$inference_PP{$individualID}{$fN}{$alleles}++;
					$inference_PP{$individualID}{$fN}{'ALL'}++;			
		
					
					my %tmp_alleles = map {$_ => 1} @alleles;
					

					foreach my $allele (keys %tmp_alleles)
					{
						$inference_firstAllele{$individualID}{$fN}{$allele}++;
					}
					$inference_firstAllele{$individualID}{$fN}{'ALL'}++;		

				}
			}
		}
		
		foreach my $individualID (keys %inference)
		{
			foreach my $fN (@inference_fields)
			{					
				my $call = maxAllele($inference_firstAllele{$individualID}{$fN});
				$firstAllele_called{$individualID}{$fN} = $call;
				$firstAllele_called{$individualID}{$fN}[2] = $firstAllele_called{$individualID}{$fN}[1];
			}
		}
			
		foreach my $individualID (keys %inference)
		{
			foreach my $fN (@inference_fields)
			{		
				foreach my $inference_combination (@{$inference{$individualID}{$fN}})
				{
					warn "Empty field in inference combinations!" unless ($inference_combination);
					my @alleles = sort @{$inference_combination};
					my $firstAllele = $firstAllele_called{$individualID}{$fN}[0];
					if($firstAllele ~~ @alleles)
					{
						my $gotFirst = 0;
						my $otherAllele;
						for(@alleles)
						{
							if(! $gotFirst)
							{
								if($_ eq $firstAllele)
								{
									$gotFirst = 1;
									next;
								}
							}
							$otherAllele = $_;
						}
						die unless ($gotFirst);
						
						$inference_secondAllele{$individualID}{$fN}{$otherAllele}++;
						$inference_secondAllele{$individualID}{$fN}{'ALL'}++;	

					}
				}
			}
		}
			
		foreach my $individualID (keys %inference)
		{
			foreach my $fN (@inference_fields)
			{		
				my $call = maxAllele($inference_secondAllele{$individualID}{$fN});
				$secondAllele_called{$individualID}{$fN} = $call;
				$secondAllele_called{$individualID}{$fN}[2] = $secondAllele_called{$individualID}{$fN}[1] * $firstAllele_called{$individualID}{$fN}[2];
			}	
		}		
		
		
		my %max_pair_value;
		my %max_pair_alleles;
		my $output_file = $expected_output_filename.".PP";
		open(OUTPUT, '>', $output_file) or die "Cannot open $output_file";
		my @allele_headers;
		my %pairs_in_order;
		foreach my $fN (@inference_fields)
		{	
			my @pairs_in_order = sort keys %{$inference_seen_pairs{$fN}};
			$pairs_in_order{$fN} = \@pairs_in_order;
			push(@allele_headers, @pairs_in_order);
		}
		print OUTPUT join(' ', 'IndividualID', @allele_headers), "\n";
		foreach my $individualID (sort keys %inference_PP)
		{
			my @allele_fields;
			foreach my $fN (@inference_fields)
			{
				push(@allele_fields,
					map {sprintf("%.5f", $inference_PP{$individualID}{$fN}{$_}/$inference_PP{$individualID}{$fN}{'ALL'} )} @{$pairs_in_order{$fN}}
				);
			}
			print OUTPUT join(' ', $individualID, @allele_fields), "\n";
			print OUTPUT join(' ', $individualID, ), "\n";
		}
		close(OUTPUT);
		
		$output_file = $expected_output_filename.".bestguess";
		open(OUTPUT, '>', $output_file) or die "Cannot open $output_file";
		
		@allele_headers = ();
		foreach my $fN (@inference_fields)
		{	
			my @headers = ($fN, qw/Q Q2 P/);			
			push(@allele_headers, @headers);
		}
		
		my %bestguess_exist_alleles;
		
		print OUTPUT join(' ', 'IndividualID', 'Chromosome', @allele_headers), "\n";
		foreach my $individualID (keys %inference_firstAllele)
		{	
			my $likelihood = 'NA';
			my @f_1;
			my @f_2;
			foreach my $fN (@inference_fields)
			{	
				$bestguess_exist_alleles{$fN}{$firstAllele_called{$individualID}{$fN}[0]}++;
				$bestguess_exist_alleles{$fN}{$secondAllele_called{$individualID}{$fN}[0]}++;
				
				push(@f_1,
					$firstAllele_called{$individualID}{$fN}[0], $firstAllele_called{$individualID}{$fN}[1], $firstAllele_called{$individualID}{$fN}[2], $likelihood
				
				);
				push(@f_2,
					$secondAllele_called{$individualID}{$fN}[0], $secondAllele_called{$individualID}{$fN}[1], $secondAllele_called{$individualID}{$fN}[2], $likelihood
				);				
			}
			print OUTPUT join(' ', $individualID, 1, @f_1), "\n";
			print OUTPUT join(' ', $individualID, 2, @f_2), "\n";
		}
		close(OUTPUT);
		
		if($vcfPos)
		{
			my $output_file = $expected_output_filename.".VCF";
			open(VCF, '>', $output_file) or die "Cannot open $output_file";
my $VCF_header = qq(##fileformat=VCFv4.0
##fileDate=27/04/12
##phasing=none, though some calls involve phasing clustered variants
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=COV,Number=2,Type=Integer,Description="Number of reads on ref and alt alleles">
##FORMAT=<ID=GT_CONF,Number=1,Type=Float,Description="Genotype confidence. Difference in log likelihood of most likely and next most likely genotype">
##FORMAT=<ID=SITE_CONF,Number=1,Type=Float,Description="Probabilitic site classification confidence. Difference in log likelihood of most likely and next most likely model (models are variant, repeat and error)">
##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of variant">
##ALT=<ID=SNP,Description="SNP">
##ALT=<ID=SNP_FROM_COMPLEX,Description="SNP called from a cluster of phased SNPs or complex SNP/indel , split out for easier comparison with other SNP call sets">
##ALT=<ID=DEL,Description="Deletion">
##ALT=<ID=INS,Description="Insertion of novel sequence">
##ALT=<ID=INDEL,Description="Insertion-deletion">
##ALT=<ID=INV,Description="Inversion">
##ALT=<ID=INV_INDEL,Description="Inversion+indel - this script overcalls these, so worth checking">
##ALT=<ID=DEL_INV,Description="Deletion + Inversion">
##ALT=<ID=INS_INV,Description="Insertion + Inversion">
##ALT=<ID=PH_SNPS,Description="Phased SNPs">
##ALT=<ID=COMPLEX,Description="Complex variant, collection of SNPs and indels">
##FILTER=<ID=MAPQ,Description="5prime flank maps to reference with mapping quality below 40">);
			my @header_fields = qw/CHROM POS ID REF ALT QUAL FILTER INFO FORMAT/;
			my @individualIDs = sort (keys %inference_firstAllele);
			print VCF $VCF_header, "\n";
			print VCF '#'.join("\t", @header_fields, @individualIDs), "\n";
			
			my @output_variants = grep {exists $identifier_to_position{$_}} @inference_fields;
			foreach my $variantID (@output_variants)
			{
				my @output_fields;
				push(@output_fields, 6);
				push(@output_fields, $identifier_to_position{$variantID});
				push(@output_fields, $variantID);
				
				my $refAllele = $identifier_to_refalleles{$variantID};
				push(@output_fields, $refAllele);
				
				die unless($bestguess_exist_alleles{$variantID});
				delete $bestguess_exist_alleles{$variantID}{$refAllele};
				my @nonref_alleles = sort keys %{$bestguess_exist_alleles{$variantID}};
				
				my %allele_2_num = ($refAllele => 0);
				my $allele_counter = 1;
				for(@nonref_alleles)
				{
					$allele_2_num{$nonref_alleles[$allele_counter-1]} = $allele_counter;
					$allele_counter++
				}
				if(scalar(@nonref_alleles) == 0)
				{
					push(@output_fields, '.');
				}
				else
				{
					push(@output_fields, join(',', @nonref_alleles));
				}
				push(@output_fields, '.');
				push(@output_fields, 'PASS');
				push(@output_fields, 'SVTYPE=SNP;SVLEN=0');
				push(@output_fields, 'GT');
				
				foreach my $indivID (@individualIDs)
				{
					(exists $allele_2_num{$firstAllele_called{$indivID}{$variantID}[0]}) or die;
					(exists $allele_2_num{$secondAllele_called{$indivID}{$variantID}[0]}) or die;					
					my $gt = join('/', $allele_2_num{$firstAllele_called{$indivID}{$variantID}[0]}, $allele_2_num{$secondAllele_called{$indivID}{$variantID}[0]});
					push(@output_fields, $gt);
				}
				
				print VCF join("\t", @output_fields), "\n";
			}
			
			close(VCF);
			
			print "\n\nVCF file $output_file produced.\n";
		}
		
		print "\n\nOutput collected in\n$output_file\n\n";		
		
		print "\n\n";
		
		# print `cat $output_file`;
	}
	else
	{
		print "Cannot find inference file $expected_output_filename\n";
	}
}


sub read_PGF
{
	my %alignment;
	my $file = $path_to_PGF_haplotype;
	open(ALIGNMENT, "<", $file) or die "Cannot open $file";
	my $current_name;
	while(<ALIGNMENT>)
	{
		my $line = $_;
		chomp($line);
		$line =~ s/\r//g;
		$line =~ s/\n//g;
		if($line =~ /^>/)
		{
			die if ($current_name);
			$current_name = 'pgf';
		}
		else
		{
			die unless($current_name);
			$alignment{$current_name} .= $line;
		}
	}
	close(ALIGNMENT);
	$alignment{'pgf'} or die;
	return $alignment{'pgf'};
}

sub maxAllele
{
	my $indiv_href = shift;
	my $max_allele;
	my $max_allele_Q = -1; 
	foreach my $allele (keys %$indiv_href)
	{
		next if ($allele eq 'ALL');
		if($indiv_href->{$allele} > $max_allele_Q)
		{
			$max_allele = $allele;
			$max_allele_Q = $indiv_href->{$allele};
		}
	}
	
	die "Field $indiv_href->{'ALL'} missing\n\n".Dumper($indiv_href) unless($indiv_href->{'ALL'});
	
	return [$max_allele, $max_allele_Q/$indiv_href->{'ALL'}];
}


sub find_individual_readsPath
{
	# my $individualID = shift;
	# my @possibilities = map {$base_path_reads.'/drive_'.$_.'/'.$individualID} qw/f l v w x/;
	# my @found = grep {(-e $_) && (-d $_)} @possibilities;
	# unless ($#found == 0)
	# {
		# die "Problem finding $individualID. Details:\n".Dumper(\@found);
	# }
	# return $found[0];
}

sub find_individual_BAM
{
	my $individualID = shift;
	my $BAM = $base_path_BAMs.'/'.$individualID.'.bam';
	unless(-e $BAM)
	{
		# die "Cannot find BAM for individual $individualID. Expected path:\n$BAM";
	}
	return $BAM;
}


sub makeReadOriginDecision
{
	my $input_1 = shift;
	my $input_2 = shift;
	
	my $output_1 = $input_1.'.exclusiveReads';
	my $output_2 = $input_2.'.exclusiveReads';
	
	my $mapped_flag = 2**2;
	die unless($mapped_flag == 0x4);
	
	my @header_lines_1;
	my @header_lines_2;
	
	my %reads;
	
	open(INPUT1, '<', $input_1) or die "Cannot open $input_1";
	open(INPUT2, '<', $input_2) or die "Cannot open $input_2";
	open(OUTPUT1, '>', $output_1) or die "Cannot open $output_1";
	open(OUTPUT2, '>', $output_2) or die "Cannot open $output_2";
	while(<INPUT1>)
	{
		my $line = $_;
		chomp($line);
		if(substr($line, 0, 1) eq '@')
		{
			push(@header_lines_1, $line);
		}
		else
		{
			my @fields = split(/\t/, $line);
			
			my $name = $fields[0];
			my $flags = $fields[1];
			my $quality = $fields[4];
			my $sequence = $fields[9];
			
			next if($flags & $mapped_flag);
			next if($quality == 255);
			
			my $edit_distance;
			unless($line =~ /NM:i:(\d+)/)
			{
				die "Weird line $. in $input_1 :\n$line";
			}
			$edit_distance = $1;
								
			$reads{$sequence}{$name}[0]{q} = $quality;
			$reads{$sequence}{$name}[0]{edit_distance} = $edit_distance;						
			$reads{$sequence}{$name}[0]{line} = $line;			
		}
	}
	
 
	while(<INPUT2>)
	{
		my $line = $_;
		chomp($line);
		if(substr($line, 0, 1) eq '@')
		{
			push(@header_lines_2, $line);
		}
		else
		{
			my @fields = split(/\t/, $line);
			
			my $name = $fields[0];
			my $flags = $fields[1];
			my $quality = $fields[4];
			my $sequence = $fields[9];
			
			next if($flags & $mapped_flag);
			next if($quality == 255);
			
			my $edit_distance;
			unless($line =~ /NM:i:(\d+)/)
			{
				die "Weird line $. in $input_2 :\n$line";
			}
			$edit_distance = $1;  
							
			$reads{$sequence}{$name}[1]{q} = $quality;
			$reads{$sequence}{$name}[1]{edit_distance} = $edit_distance;			
			$reads{$sequence}{$name}[1]{line} = $line;			
		}
	}	
	
	print OUTPUT1 join("\n", @header_lines_1), "\n";
	print OUTPUT2 join("\n", @header_lines_2), "\n";
	
	foreach my $sequence (keys %reads)
	{
		my @names = keys %{$reads{$sequence}};
		if($#names > 0)
		{
			print "Sequence $sequence, have ", scalar(@names), " names\n";
		}
		foreach my $name (@names)
		{
			my $q1 = $reads{$sequence}{$name}[0]{q};
			my $q2 = $reads{$sequence}{$name}[1]{q};
			my $e1 = $reads{$sequence}{$name}[0]{edit_distance};
			my $e2 = $reads{$sequence}{$name}[1]{edit_distance};
			
			my $num_of_qualities = (( (defined $q1) ? 1 : 0) + ( (defined $q2) ? 1 : 0 ) );
			my $num_of_editDistances = (( (defined $e1) ? 1 : 0) + ( (defined $e2) ? 1 : 0 ) );
			
			# print "\tName $name with $num_of_qualities qualities.\n" if ($#names > 0);
			
			$e1 = 1000 unless(defined $e1);
			$e2 = 1000 unless(defined $e2);
			die if(($e1 == -1) and ($e2 == -1));
			
			$q1 = -1 unless(defined $q1);
			$q2 = -1 unless(defined $q2);
			die if(($q1 == -1) and ($q2 == -1));
			
			# print $q1, " vs ", $q2, "\n";
			
			# if($e1 < $e2)
			# {
					# print OUTPUT1 $reads{$sequence}{$name}[0]{line}, "\n";			
			# }
			# elsif($e1 > $e2)
			# {
					# print OUTPUT2 $reads{$sequence}{$name}[1]{line}, "\n";			
			# }
			# else
			# {
				# die Dumper('$e1 != $e2', $e1, $e2) unless($e1 == $e2);
				
				if($q1 > $q2)
				{
					print OUTPUT1 $reads{$sequence}{$name}[0]{line}, "\n";
				}
				elsif($q1 == $q2)
				{
					if(rand() < 0.5)
					{
						die Dumper($q1, $q2)  unless(defined $reads{$sequence}{$name}[0]{line});
						print OUTPUT1 $reads{$sequence}{$name}[0]{line}, "\n";				
					}
					else
					{
						die Dumper($q1, $q2) unless(defined $reads{$sequence}{$name}[1]{line});
						print OUTPUT2 $reads{$sequence}{$name}[1]{line}, "\n";				
					}
				}
				else
				{
					print OUTPUT2 $reads{$sequence}{$name}[1]{line}, "\n";
				}
			# }
		}
	}
	
	
	close(INPUT1);
	close(INPUT2);
	close(OUTPUT1);
	close(OUTPUT2);
}
 

sub needleman_wunsch
{
	my $seq1 = shift;
	my $seq2 = shift;
	my $score_match = shift;
	my $score_mismatch = shift;
	my $score_gap = shift;
	
	my $length_x = length($seq1);
	my $length_y = length($seq2);
	
	my $score_firstGap = $score_gap - 1000;
	
	my $t = [];
	for(my $x = 0; $x <= $length_x; $x++)
	{
		$t->[$x][0] = $x * $score_firstGap;
	}
	
	for(my $y = 0; $y <= $length_y; $y++)
	{
		$t->[0]->[$y] = $y * $score_firstGap;
	}
	
	for(my $y = 1; $y <= $length_y; $y++)
	{
		for(my $x = 1; $x <= $length_x; $x++)
		{
			die unless(defined substr($seq1, $x-1, 1));
			die unless(defined substr($seq2, $y-1, 1));
			my @alt = ($t->[$x-1][$y] + $score_gap, $t->[$x][$y-1] + $score_gap, $t->[$x-1][$y-1] + (((substr($seq1, $x-1, 1) eq substr($seq2, $y-1, 1)) or (substr($seq1, $x-1, 1) eq '*') or (substr($seq1, $x-1, 1) eq 'N')) ? $score_match : $score_mismatch));
				
			$t->[$x][$y] = more_max(@alt);
		}
	}
	
	my $backtrack_x = $length_x, my $backtrack_y = $length_y;
	
	my $seq1_aligned, my $seq2_aligned;
	while(($backtrack_x != 0) or ($backtrack_y != 0))
	{
		die unless($backtrack_x >= 0);
		die unless($backtrack_y >= 0);
		
		
		die if( (($backtrack_x == 0) and ($backtrack_y == 1)) or (($backtrack_x == 1) and ($backtrack_y == 0)));
		
		my $v = $t->[$backtrack_x][$backtrack_y];
		my @alts = ($t->[$backtrack_x-1][$backtrack_y] + $score_gap, $t->[$backtrack_x][$backtrack_y-1] + $score_gap, $t->[$backtrack_x-1][$backtrack_y-1] + (((substr($seq1, $backtrack_x-1, 1) eq substr($seq2, $backtrack_y-1, 1)) or (substr($seq1, $backtrack_x-1, 1) eq '*') or (substr($seq1, $backtrack_x-1, 1) eq 'N')) ? $score_match : $score_mismatch));
		my $whereFrom = more_max_from(@alts);
		#print Dumper(@alts, $whereFrom), "\n\n";
		
		if($backtrack_x == 0)
		{
			$whereFrom = 1;
		}
		if($backtrack_y == 0)
		{
			$whereFrom = 0;
		}			
		
		if($whereFrom == 0)
		{
			$seq2_aligned .= '_';
			$seq1_aligned .= substr($seq1, $backtrack_x-1, 1);
			$backtrack_x--;
		}
		elsif($whereFrom == 1)
		{
			$seq1_aligned .= '_';
			$seq2_aligned .= substr($seq2, $backtrack_y-1, 1);
			$backtrack_y--;
		}
		elsif($whereFrom == 2)
		{
			$seq1_aligned .= substr($seq1, $backtrack_x-1, 1);
			$seq2_aligned .= substr($seq2, $backtrack_y-1, 1);
			$backtrack_x--;
			$backtrack_y--;
		}
		else
		{
			die;
		}
	}
	
	$seq1_aligned = reverse($seq1_aligned);
	$seq2_aligned = reverse($seq2_aligned);
	
	return ($seq1_aligned, $seq2_aligned);
}

sub more_max
{
	return (reverse sort {$a <=> $b}(@_))[0];
}

sub more_max_from
{
	my @alts = @_;
	my $m = more_max(@alts);
	my %from;
	for(my $i = 0; $i <= $#alts; $i++)
	{
		if($alts[$i] eq $m)
		{
			return $i;
		}
	}
	die;
}	

sub alignHaplotypeCharactersToPGF
{
	my $haplotype_aref = shift;
	my $loci_in_graph_aref = shift;
	
	die unless(scalar(@$haplotype_aref) == scalar(@$loci_in_graph_aref));
	
	my @forReturn;
	for(my $pI = 0; $pI <= $#{$loci_in_graph_aref}; $pI++)
	{
		my $locusID = $loci_in_graph_aref->[$pI];
		my @locusID_fields = split(/_/, $locusID);
		die unless($#locusID_fields == 2);
		my $haplo_char = $haplotype_aref->[$pI];
		if($haplo_char ne '_')
		{
			my $pgf_position = $pgf_start + $locusID_fields[2];
			push(@forReturn, $pgf_position);
		}	
	}
	
	return \@forReturn;
}

sub haplotypeCharacterAlignmentPositions
{
	my $haplotype_aref = shift;
	my $loci_in_graph_aref = shift;
	
	die unless(scalar(@$haplotype_aref) == scalar(@$loci_in_graph_aref));
	
	my @forReturn;
	for(my $pI = 0; $pI <= $#{$loci_in_graph_aref}; $pI++)
	{
		my $locusID = $loci_in_graph_aref->[$pI];
		my @locusID_fields = split(/_/, $locusID);
		die unless($#locusID_fields == 2);
		my $haplo_char = $haplotype_aref->[$pI];
		if($haplo_char ne '_')
		{
			push(@forReturn, $pI);
		}	
	}
	
	return \@forReturn;
}


sub haplotypesToVCF
{
	my $haplo1_fields_aref = shift;
	my $haplo2_fields_aref = shift;
	my $individualID = shift;
	my $loci_in_graph_aref = shift;
	my $pgf_ref_seq = shift;
	my $output_file = shift;
	my $individual_haplotypes_positions = shift;
	
	my @loci_in_graph = @$loci_in_graph_aref;
	 
	my %identifier_to_position;   
	my %identifier_to_refalleles;
	
	die "Please suppy parameter --vcfPos" unless ((defined $vcfPos));
	die "Specified --vcfPos ($vcfPos) not existing" unless(-e $vcfPos);

	open(VCFSNPS, '<', $vcfPos) or die "Cannot open $vcfPos";
	my $header_line = <VCFSNPS>;
	chomp($header_line);
	$header_line =~ s/\n//g;
	$header_line =~ s/\r//g;	
	
	my @header_fields = split(/\t/, $header_line);
	while(<VCFSNPS>)
	{
		my $line = $_;
		chomp($line);
		$line =~ s/\n//g;
		$line =~ s/\r//g;	
	
		my @line_fields = split(/\t/, $line);
		my %line = (mesh @header_fields, @line_fields);
	
		die unless(defined $line{Position});
		
		die unless($line{RefAllele});
		
		my $reference_charPos_PGF = $line{Position} - $pgf_start;
		
		next if($reference_charPos_PGF < 0);
		next if($reference_charPos_PGF > (length($pgf_ref_seq) - 1));
		
		$identifier_to_position{$line{ID}} = $line{Position};
		$identifier_to_refalleles{$line{ID}} = $line{RefAllele};
					
		my $reference_char = substr($pgf_ref_seq, $reference_charPos_PGF, length($line{RefAllele}));
		
		die "$reference_char ne $line{RefAllele}, $reference_charPos_PGF, line $." unless($reference_char eq $line{RefAllele});
	}			
	close(VCFSNPS);
	
	my %inference;
	
	my %inference_PP;
	my %inference_seen_pairs;
	
	my %inference_firstAllele;
	my %inference_secondAllele; 
	my %firstAllele_called;
	my %secondAllele_called;		
	my %bestguess_exist_alleles;
	
	my %seen_PGF_positions;
	my %pgf_genotypes_labels;
	
	my %taken_alignment_positions;
	my %taken_haplotype_positions;
	
	my $iteration = 1; # this is just one iteration
	foreach my $fRef ($haplo1_fields_aref, $haplo2_fields_aref)
	{
		my @fields = @$fRef;
		
		die Dumper($#fields, $#loci_in_graph) unless($#fields == $#loci_in_graph);
		
		my %pgf_genotypes;
		
		my $nextHaplo = 1;
		my %prepared_this_PGF_index;
		for(my $pI = 0; $pI <= $#fields; $pI++)
		{
			my $locusID = $loci_in_graph[$pI];
			my @locusID_fields = split(/_/, $locusID);
			die unless($#locusID_fields == 2);

			my $pgf_position = $pgf_start + $locusID_fields[2];
			unless($prepared_this_PGF_index{$pgf_position})
			{
				unless($taken_alignment_positions{$pgf_position})
				{	
					$taken_alignment_positions{$pgf_position} = [];
				}
				push(@{$taken_alignment_positions{$pgf_position}}, []);
				
				unless($taken_haplotype_positions{$pgf_position})
				{	
					$taken_haplotype_positions{$pgf_position} = [];
				}
				push(@{$taken_haplotype_positions{$pgf_position}}, []);		
				
				$prepared_this_PGF_index{$pgf_position} = 1;
			}
		}
		
		for(my $pI = 0; $pI <= $#fields; $pI++)
		{
			my $locusID = $loci_in_graph[$pI];
			my @locusID_fields = split(/_/, $locusID);
			die unless($#locusID_fields == 2);

			my $pgf_position = $pgf_start + $locusID_fields[2];
			
			if($locusID_fields[0] =~ /^rs/)
			{
				$pgf_genotypes_labels{$pgf_position}{$locusID_fields[0]}++;
			}
			
			my $char_from_inference = $fields[$pI];
			
			$pgf_genotypes{$pgf_position} .= $char_from_inference;
			
			$seen_PGF_positions{$pgf_position}++;
			
			die unless($#{$taken_alignment_positions{$pgf_position}} > -1);
			push(@{$taken_alignment_positions{$pgf_position}[$#{$taken_alignment_positions{$pgf_position}}]}, $pI);
			
			if($individual_haplotypes_positions)
			{
				my $haplotype_position;
				if($fRef == $haplo1_fields_aref)
				{
					$haplotype_position = $individual_haplotypes_positions->[0][$pI];
				}
				else
				{
					$haplotype_position = $individual_haplotypes_positions->[1][$pI];				
				}

				push(@{$taken_haplotype_positions{$pgf_position}[$#{$taken_haplotype_positions{$pgf_position}}]}, $haplotype_position) if(defined $haplotype_position);
			}
		}
		
		foreach my $pgfPos (keys %pgf_genotypes)
		{
			push(@{$inference{$individualID}{$pgfPos}[$iteration-1]}, $pgf_genotypes{$pgfPos});
		}
	}
	close(INFERENCE);
	
	# print the alignment positions that this VCF refers to
	
	my $output_alignment_positions = $output_file.'.positionsInRefToAlignment';
	
	open(POSITIONS, '>', $output_alignment_positions) or die "Cannot open $output_alignment_positions";
	print POSITIONS join(' ', 'reference_position', 'alignment_positions_h1', 'alignment_positions_h2'), "\n";
	my @k = sort {$a <=> $b} keys %taken_alignment_positions;
	foreach my $k (@k)
	{
		print POSITIONS join(' ', $k, join(';', @{$taken_alignment_positions{$k}[0]}), join(';', @{$taken_alignment_positions{$k}[1]}), ), "\n";
	}
	close(POSITIONS);
	
	if($individual_haplotypes_positions)
	{
		$output_alignment_positions = $output_file.'.positionsInRefToHaplotypes';
		
		open(POSITIONS, '>', $output_alignment_positions) or die "Cannot open $output_alignment_positions";
		print POSITIONS join(' ', 'reference_position', 'haplotype_positions_h1', 'haplotype_positions_h2'), "\n";
		my @k = sort {$a <=> $b} keys %taken_haplotype_positions;
		foreach my $k (@k)
		{
			print POSITIONS join(' ', $k, join(';', @{$taken_haplotype_positions{$k}[0]}), join(';', @{$taken_haplotype_positions{$k}[1]}), ), "\n";
		}
		close(POSITIONS);	
	}

		
	foreach my $individualID (keys %inference)
	{
		foreach my $pgfPos (keys %seen_PGF_positions)
		{
			foreach my $inference_combination (@{$inference{$individualID}{$pgfPos}})
			{
				warn "Empty field in inference combinations!" unless ($inference_combination);
				my @alleles = sort @{$inference_combination};
				
				my $alleles = join('_', @alleles);
				$inference_seen_pairs{$pgfPos}{$alleles} = 1;

				$inference_PP{$individualID}{$pgfPos}{$alleles}++;
				$inference_PP{$individualID}{$pgfPos}{'ALL'}++;			
				
				my %tmp_alleles = map {$_ => 1} @alleles;
			
				foreach my $allele (keys %tmp_alleles)
				{
					$inference_firstAllele{$individualID}{$pgfPos}{$allele}++;
				}
				$inference_firstAllele{$individualID}{$pgfPos}{'ALL'}++;		

			}
		}
	}
		
	foreach my $individualID (keys %inference)
	{
		foreach my $pgfPos (keys %seen_PGF_positions)
		{					
			my $call = maxAllele($inference_firstAllele{$individualID}{$pgfPos});
			$firstAllele_called{$individualID}{$pgfPos} = $call;
			$firstAllele_called{$individualID}{$pgfPos}[2] = $firstAllele_called{$individualID}{$pgfPos}[1];
		}
	}
			
	foreach my $individualID (keys %inference)
	{
		foreach my $pgfPos (keys %seen_PGF_positions)
		{		
			foreach my $inference_combination (@{$inference{$individualID}{$pgfPos}})
			{
				warn "Empty field in inference combinations!" unless ($inference_combination);
				my @alleles = sort @{$inference_combination};
				my $firstAllele = $firstAllele_called{$individualID}{$pgfPos}[0];
				if($firstAllele ~~ @alleles)
				{
					my $gotFirst = 0;
					my $otherAllele;
					for(@alleles)
					{
						if(! $gotFirst)
						{
							if($_ eq $firstAllele)
							{
								$gotFirst = 1;
								next;
							}
						}
						$otherAllele = $_;
					}
					die unless ($gotFirst);
					
					$inference_secondAllele{$individualID}{$pgfPos}{$otherAllele}++;
					$inference_secondAllele{$individualID}{$pgfPos}{'ALL'}++;	
				}
			}
		}
	}
			
	foreach my $individualID (keys %inference)
	{
		foreach my $pgfPos (keys %seen_PGF_positions)
		{		
			my $call = maxAllele($inference_secondAllele{$individualID}{$pgfPos});
			$secondAllele_called{$individualID}{$pgfPos} = $call;
			$secondAllele_called{$individualID}{$pgfPos}[2] = $secondAllele_called{$individualID}{$pgfPos}[1] * $firstAllele_called{$individualID}{$pgfPos}[2];
		}	
	}		
		
	foreach my $individualID (keys %inference_firstAllele)
	{	
		foreach my $pgfPos (keys %seen_PGF_positions)
		{	
			$bestguess_exist_alleles{$pgfPos}{$firstAllele_called{$individualID}{$pgfPos}[0]}++;
			$bestguess_exist_alleles{$pgfPos}{$secondAllele_called{$individualID}{$pgfPos}[0]}++;
		}
	}
		
		
	open(VCF, '>', $output_file) or die "Cannot open $output_file";
	my $VCF_header = qq(##fileformat=VCFv4.0
##fileDate=27/04/12
##phasing=none, though some calls involve phasing clustered variants
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
);
	my @header_fields_VCF = qw/CHROM POS ID REF ALT QUAL FILTER INFO FORMAT/;
	
	my @individualIDs = sort (keys %inference_firstAllele);
	print VCF $VCF_header, "\n";
	print VCF '#'.join("\t", @header_fields_VCF, @individualIDs), "\n";
	
	my $last_pgfPos_with_IGV_H1;
	my $last_pgfPos_with_IGV_H2;
	
	my @output_variants = (keys %seen_PGF_positions);
	foreach my $pgfPos (sort {$a <=> $b} @output_variants)
	{
		my $variantID = '.';
		my $rsID;
		
		if(scalar( keys %{$pgf_genotypes_labels{$pgfPos}}) == 1)
		{
			$rsID = ((keys %{$pgf_genotypes_labels{$pgfPos}})[0]);
			die Dumper('$pgfPos != $identifier_to_position{$rsID}', $rsID, $pgfPos, $identifier_to_position{$rsID}) unless($pgfPos == $identifier_to_position{$rsID});
			$variantID = $rsID;
		}
		
		my @output_fields;
		push(@output_fields, 6);
		push(@output_fields, $pgfPos);
		push(@output_fields, $variantID);
		
		my $reference_charPos_PGF = $pgfPos - $pgf_start;
		die "???" if($reference_charPos_PGF < 0);
		die "???" if($reference_charPos_PGF > (length($pgf_ref_seq) - 1));
		my $reference_char = substr($pgf_ref_seq, $reference_charPos_PGF, 1);

		if($rsID)
		{
			die Dumper($., $pgfPos, $rsID, $identifier_to_refalleles{$rsID}, $reference_char) unless($identifier_to_refalleles{$rsID} eq $reference_char);
		}
		
		my $refAllele = $reference_char;
		push(@output_fields, $refAllele);
		
		die unless($bestguess_exist_alleles{$pgfPos});
		delete $bestguess_exist_alleles{$pgfPos}{$refAllele};
		my @nonref_alleles = sort keys %{$bestguess_exist_alleles{$pgfPos}};
		
		my %allele_2_num = ($refAllele => 0);
		my $allele_counter = 1;
		for(@nonref_alleles)
		{
			$allele_2_num{$nonref_alleles[$allele_counter-1]} = $allele_counter;
			$allele_counter++
		}
		if(scalar(@nonref_alleles) == 0)
		{
			push(@output_fields, '.');
		}
		else
		{
			push(@output_fields, join(',', @nonref_alleles));
		}
		push(@output_fields, '.');
		
		# this is a hack!
		die if($#individualIDs > 0);
		if(not ($secondAllele_called{$individualIDs[0]}{$pgfPos}[2] > 0.7))
		{
			push(@output_fields, 'FAIL');
		}
		else
		{
			push(@output_fields, 'PASS');
		}
		
		my $real_pgfPos = $pgfPos - $pgf_start;
		
		my @positions_in_string_haplotypes;
		my @positions_in_IGV_haplotypes;
		my @preDeletion_IGV_positions;
		
		if($individual_haplotypes_positions)
		{
			if($pgfPos == 31009300)
			{
				# die Dumper($taken_haplotype_positions{$pgfPos}[0]);
			}
			@positions_in_string_haplotypes = (join('-', @{$taken_haplotype_positions{$pgfPos}[0]}), join('-', @{$taken_haplotype_positions{$pgfPos}[1]}));
			@positions_in_IGV_haplotypes = (join('-', map {$_ + $remapping_flank_length + 1} @{$taken_haplotype_positions{$pgfPos}[0]}), join('-', map {$_ + $remapping_flank_length + 1} @{$taken_haplotype_positions{$pgfPos}[1]}));
			
			if(scalar(@{$taken_haplotype_positions{$pgfPos}[0]}) == 0)
			{
				die unless($last_pgfPos_with_IGV_H1);
				$preDeletion_IGV_positions[0] = join('-', map {$_ + $remapping_flank_length + 1} @{$taken_haplotype_positions{$last_pgfPos_with_IGV_H1}[0]});				
			}
			else
			{
				$last_pgfPos_with_IGV_H1 = $pgfPos;
				$preDeletion_IGV_positions[0] = '';
			}
			
			if(scalar(@{$taken_haplotype_positions{$pgfPos}[1]}) == 0)
			{
				die unless($last_pgfPos_with_IGV_H2);
				$preDeletion_IGV_positions[1] = join('-', map {$_ + $remapping_flank_length + 1} @{$taken_haplotype_positions{$last_pgfPos_with_IGV_H2}[1]});				
			}
			else
			{
				$last_pgfPos_with_IGV_H2 = $pgfPos;
				$preDeletion_IGV_positions[1] = '';
			}			
		}
		else
		{		
			@positions_in_string_haplotypes = ('', '');
			@positions_in_IGV_haplotypes = ('', '');
			@preDeletion_IGV_positions = ('', '');
		}
		
		my @positions_in_alignments = (join('-', @{$taken_alignment_positions{$pgfPos}[0]}), join('-', @{$taken_alignment_positions{$pgfPos}[1]}));
		
		# warnings seem to be normal if reading Viterbi haplotypes
		push(@output_fields, ''.qq(POS_IN_PGF=${real_pgfPos};H1_ALIGNMENT=${positions_in_alignments[0]};H2_ALIGNMENT=${positions_in_alignments[1]};H1_STRING=${positions_in_string_haplotypes[0]};H2_STRING=${positions_in_string_haplotypes[1]};H1_IGV=${positions_in_IGV_haplotypes[0]};H2_IGV=${positions_in_IGV_haplotypes[1]};H1_PREDELETION_IGV=${preDeletion_IGV_positions[0]};H2_PREDELETION_IGV=${preDeletion_IGV_positions[1]};));
		push(@output_fields, 'GT');
		
		foreach my $indivID (@individualIDs)
		{
			(exists $allele_2_num{$firstAllele_called{$indivID}{$pgfPos}[0]}) or die;
			(exists $allele_2_num{$secondAllele_called{$indivID}{$pgfPos}[0]}) or die;					
			my $gt = join('/', $allele_2_num{$firstAllele_called{$indivID}{$pgfPos}[0]}, $allele_2_num{$secondAllele_called{$indivID}{$pgfPos}[0]});
			push(@output_fields, $gt);
		}
		
		print VCF join("\t", @output_fields), "\n";
	}
	
	close(VCF);
}



sub haplotypesToVCF2
{
	my $haplo1_fields_aref = shift;
	my $haplo2_fields_aref = shift;
	my $individualID = shift;
	my $loci_in_graph_aref = shift;
	my $pgf_ref_seq = shift;
	my $output_file = shift;
	
	my @loci_in_graph = @$loci_in_graph_aref;
	 
	my %identifier_to_position;   
	my %identifier_to_refalleles;
	
	die "Please suppy parameter --vcfPos" unless ((defined $vcfPos));
	die "Specified --vcfPos ($vcfPos) not existing" unless(-e $vcfPos);

	open(VCFSNPS, '<', $vcfPos) or die "Cannot open $vcfPos";
	my $header_line = <VCFSNPS>;
	chomp($header_line);
	$header_line =~ s/\n//g;
	$header_line =~ s/\r//g;	
	
	my @header_fields = split(/\t/, $header_line);
	while(<VCFSNPS>)
	{
		my $line = $_;
		chomp($line);
		$line =~ s/\n//g;
		$line =~ s/\r//g;	
	
		my @line_fields = split(/\t/, $line);
		my %line = (mesh @header_fields, @line_fields);
	
		die unless(defined $line{Position});
		
		die unless($line{RefAllele});
		
		my $reference_charPos_PGF = $line{Position} - $pgf_start;
		
		next if($reference_charPos_PGF < 0);
		next if($reference_charPos_PGF > (length($pgf_ref_seq) - 1));
		
		$identifier_to_position{$line{ID}} = $line{Position};
		$identifier_to_refalleles{$line{ID}} = $line{RefAllele};
					
		my $reference_char = substr($pgf_ref_seq, $reference_charPos_PGF, length($line{RefAllele}));
		
		die "$reference_char ne $line{RefAllele}, $reference_charPos_PGF, line $." unless($reference_char eq $line{RefAllele});
	}			
	close(VCFSNPS);
	
	my %inference;
	
	my %inference_PP;
	my %inference_seen_pairs;
	
	my %inference_firstAllele;
	my %inference_secondAllele; 
	my %firstAllele_called;
	my %secondAllele_called;		
	my %bestguess_exist_alleles;
	
	my %seen_PGF_positions;
	my %pgf_genotypes_labels;
	
	my %taken_alignment_positions;
	my %taken_haplotype_positions;
	
	my $haplotype_maxIndex = $#{$haplo1_fields_aref};
	die unless($#{$haplo2_fields_aref} == $haplotype_maxIndex);
	
	open(VCF, '>', $output_file) or die "Cannot open $output_file";
	my $VCF_header = qq(##fileformat=VCFv4.1
##phasing=none - DO NOT TRUST PHASE BETWEEN ALLELES MORE THAN 31 CHARACTERS AWAY FROM EACH OTHER
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
);
	print VCF $VCF_header;
	
	my @header_fields_VCF = qw/CHROM POS ID REF ALT QUAL FILTER INFO FORMAT/;
	push(@header_fields_VCF, $individualID);
	print VCF '#'.join("\t", @header_fields_VCF), "\n";
	
	my @open_haplotypes;
	my $open_haplotypes_startPGF;
	
	my $print_open_haplotypes = sub {
		my $lastPGFPosPlusOne = shift;
		die unless(defined $lastPGFPosPlusOne);
		die unless(defined $open_haplotypes_startPGF);
		
		my $referenceStretch_firstPos = $open_haplotypes_startPGF - $pgf_start;
		my $referenceStretch_lastPos = $lastPGFPosPlusOne - $pgf_start - 1;
		die unless($referenceStretch_firstPos >= 0);
		die unless($referenceStretch_lastPos >= $referenceStretch_firstPos);
		
		# print join("\t", 'reference_stretch', $referenceStretch_firstPos, $referenceStretch_lastPos), "\n";
		
		my $reference_stretch = substr($pgf_ref_seq, $referenceStretch_firstPos, $referenceStretch_lastPos - $referenceStretch_firstPos + 1);
		die unless($reference_stretch);
		
		my @output_fields;
		push(@output_fields, '6');
		push(@output_fields, $open_haplotypes_startPGF);
		push(@output_fields, '.');
		push(@output_fields, $reference_stretch);
		
		my $allele_1 = join('', @{$open_haplotypes[0]});
		my $allele_2 = join('', @{$open_haplotypes[1]});
		die unless(length($allele_1) == length($allele_2));
		
		$allele_1 =~ s/_//g;
		$allele_2 =~ s/_//g;
		$allele_1 =~ s/\*/N/g;
		$allele_2 =~ s/\*/N/g;
		
		my @alleles = ($reference_stretch);
		my %allele_indices = ($reference_stretch => 0);
		if(not exists $allele_indices{$allele_1})
		{
			my $new_index = scalar(keys %allele_indices);
			$allele_indices{$allele_1} = $new_index;
			push(@alleles, $allele_1);
			die unless($#alleles == $new_index);
		}	
		if(not exists $allele_indices{$allele_2})
		{
			my $new_index = scalar(keys %allele_indices);
			$allele_indices{$allele_2} = $new_index;
			push(@alleles, $allele_2);
			die unless($#alleles == $new_index);			
		}	
		
		my @alternative_alleles;
		if($#alleles >= 1)
		{
			@alternative_alleles = @alleles[1 .. $#alleles];
		}
		
		push(@output_fields, join(',', @alternative_alleles));
		push(@output_fields, '.');
		push(@output_fields, 'PASS');
		push(@output_fields, '.');
		push(@output_fields, 'GT');
		
		my $allele_1_code = $allele_indices{$allele_1};
		my $allele_2_code = $allele_indices{$allele_2};
		die unless(defined $allele_1_code);
		die unless(defined $allele_2_code);
		
		push(@output_fields, join('/', $allele_1_code, $allele_2_code));
		
		if(1 or (scalar(@alternative_alleles) > 0))
		{
			print VCF join("\t", @output_fields), "\n";
		}
		
		# ~/vcftools/vcftools_0.1.11/perl$ ./vcf-validator /Net/birch/data/dilthey/MHC-PRG/tmp/kMerCount__GS_nextGen_varigraph3_AA02O9Q_Z2_31_required.binaryCount.viterbiHaplotypes.viterbiVCF

		@open_haplotypes = ([], []);
		$open_haplotypes_startPGF = $lastPGFPosPlusOne;			
	};
	
	my $last_seen_PGFposition;
	
	for(my $pI = 0; $pI <= $haplotype_maxIndex; $pI++)
	{
		my $locusID = $loci_in_graph[$pI];
		my @locusID_fields = split(/_/, $locusID);
		die unless($#locusID_fields == 2);
		my $pgf_position = $pgf_start + $locusID_fields[2];		
		$last_seen_PGFposition = $pgf_position;
		
		if($pI == 0)
		{
			die unless(defined $pgf_position);
			@open_haplotypes = ([], []);		
			$open_haplotypes_startPGF = $pgf_position;
		}
		my $allele_1 = $haplo1_fields_aref->[$pI];
		my $allele_2 = $haplo2_fields_aref->[$pI];
		my $noGap = (($allele_1 !~ /_/) and ($allele_2 !~ /_/));
		
		if((defined $open_haplotypes_startPGF) and ($pgf_position != $open_haplotypes_startPGF) and $noGap and ($allele_1 eq $allele_2))
		{	
			$print_open_haplotypes->($pgf_position);
		}
		
		if(length($allele_1) < length($allele_2))
		{
			my $missing_characters = length($allele_2) - length($allele_1);
			die unless($missing_characters > 0);
			my $gapString = ('_' x $missing_characters);
			$allele_1 .= $gapString;
		}

		if(length($allele_2) < length($allele_1))
		{
			my $missing_characters = length($allele_1) - length($allele_2);
			die unless($missing_characters > 0);
			my $gapString = ('_' x $missing_characters);
			$allele_2 .= $gapString;
		}
		
		die unless(length($allele_1) == length($allele_2));
		push(@{$open_haplotypes[0]}, $allele_1);
		push(@{$open_haplotypes[1]}, $allele_2);
	}
	
	if((scalar(@{$open_haplotypes[0]})) or (scalar(@{$open_haplotypes[1]})))
	{
		die unless(defined $last_seen_PGFposition);
		$print_open_haplotypes->($last_seen_PGFposition+1);
	}
	
	close(VCF);
}

back to top