https://github.com/AlexanderDilthey/MHC-PRG
Raw File
Tip revision: e59943adb8855532573a6c276651efad1e18a6b1 authored by Alexander Dilthey on 18 December 2018, 10:20:48 UTC
Update HLA-PRG.md
Tip revision: e59943a
analyseTypesAsValidated.pl
#!/usr/bin/perl

use strict;
use Data::Dumper;
use List::MoreUtils qw/mesh/;


my %samples_to_populations;
open(POPULATIONS, '<', 'forPaper/_1000G_populations.txt') or die;
while(<POPULATIONS>)
{
	my $line = $_;
	chomp($line);
	my @fields = split(/\t/, $line);
	$samples_to_populations{$fields[0]} = $fields[1];
}

my %populations;
my %types_2digit;

my $inputFile = '_Platinum_types_as_validated.txt';
open(F, '<', $inputFile) or die "Cannot open $inputFile";
my $headerLine = <F>;
chomp($headerLine);
my @headerFields = split(/\t/, $headerLine);
my @loci = grep {$_ =~ /HLA/} @headerFields;
while(<F>)
{
	my $line = $_;
	chomp($line);
	next unless($line);
	my @line_fields = split(/\t/, $line);
	die Dumper("Field mismatch", $#line_fields, $#headerFields, $.) unless($#line_fields == $#headerFields);
	
	my %line = (mesh @headerFields, @line_fields);
	
	my $indivID = $line{'IndividualID'};
	die unless(defined $indivID);
	
	die unless(exists $samples_to_populations{$indivID});
	
	my $population = $samples_to_populations{$indivID};
	
	$populations{$population}++;
	
	foreach my $locus (@loci)
	{
		my $alleles = $line{$locus};
		die unless($alleles);
		my @alleles = split(/\//, $alleles);
		die unless($#alleles == 1);
		
		foreach my $allele (@alleles)
		{
			if($allele =~ /\?/)
			{
				$allele = 'Missing';
			}
			else
			{
				my @components = split(/:/, $allele);
				die unless(scalar(@components) >= 2);
				$allele = $components[0];
			}
			$types_2digit{$locus}{$allele}++;
		}
	}
}
close(F);

my $fn_output_populations = $inputFile . '.populations';
open(OUT, '>', $fn_output_populations) or die "Cannot open $fn_output_populations";
print OUT join("\t", "Population", "ValidationSamples"), "\n";
foreach my $population (sort keys %populations)
{
	print OUT join("\t", $population, $populations{$population}), "\n";
}
close(OUT);

my $fn_output_twodigit = $inputFile . '.groups_2digit';
open(OUT, '>', $fn_output_twodigit) or die "Cannot open $fn_output_twodigit";
print OUT join("\t", "Locus", "Group2Digit", "N"), "\n";
foreach my $locus (sort keys %types_2digit)
{
	foreach my $twoDigGroup (sort keys %{$types_2digit{$locus}})
	{
		print OUT join("\t", $locus, $twoDigGroup, $types_2digit{$locus}{$twoDigGroup}), "\n";
	}
}
close(OUT);
back to top