https://github.com/AlexanderDilthey/MHC-PRG
Raw File
Tip revision: e59943adb8855532573a6c276651efad1e18a6b1 authored by Alexander Dilthey on 18 December 2018, 10:20:48 UTC
Update HLA-PRG.md
Tip revision: e59943a
inspect_pgf_alignment.pl
#!/usr/bin/perl
use strict;
use Data::Dumper;

my %contigs;

my $alignment = qq(pgfContigs_for_testing.txt.aligned);
open(ALIGNMENT, '<', $alignment) or die "Cannot open $alignment";
my $currentContig;
while(<ALIGNMENT>)
{
	my $line = $_;
	chomp($line);
	if(substr($line, 0, 1) eq '>')
	{
		$currentContig = $line;
	}
	else
	{
		die unless($currentContig);
		$contigs{$currentContig} .= $line;
	}
}
close(ALIGNMENT);

my @contig_IDs = sort keys %contigs;

print "Have ", scalar(@contig_IDs), " contigs, \n";#, join("\n", map {' - -'.$_.'-'} @contig_IDs), "\n";

my @graph_contigs = grep {$_ =~ /\- graph \-/} @contig_IDs;

my $contigI = 0;
foreach my $graph_contig (@graph_contigs)
{
	print $graph_contig, "\n";
	$contigI++;
	
	my $sequence_contig = $graph_contig;
	$sequence_contig =~ s/ graph \-.+$/ sequence/;  
	#die Dumper($graph_contig, $sequence_contig);
	
	die "No sequence contig for -${graph_contig}- , would expect -${sequence_contig}- \n" unless(exists $contigs{$sequence_contig});
	
	my $sequence = $contigs{$sequence_contig};
	my $graph = $contigs{$graph_contig};
	
	die unless($sequence and $graph);
	
	my $sequence_startGap;
	if($sequence =~ /^(_+)/)
	{
		$sequence_startGap = $1;
	}
	
	my $sequence_stopGap;
	if($sequence =~ /(_+)$/)
	{
		$sequence_stopGap = $1;
	}
	
	my $sequence2 = $sequence;
	$sequence2 =~ s/_//g;
	die "Problem with sequence for contig $graph_contig!" unless(length($sequence2) > 0);
	
	print "Contig $sequence_contig, start gaps ", length($sequence_startGap), ", stop gaps ", length($sequence_stopGap), "\n";
	
	substr($sequence, 0, length($sequence_startGap)) = '';
	substr($graph, 0, length($sequence_startGap)) = '';

	substr($sequence, length($sequence) - length($sequence_stopGap), length($sequence_stopGap)) = '';
	substr($graph, length($graph) - length($sequence_stopGap), length($sequence_stopGap)) = '';
	
	if(length($sequence) == 0)
	{
		die "Problem with contig $graph_contig\n";
	}	
	
	die unless(length($sequence) == length($graph));
	
	my $disagreementNumber = 0;
	my $continuousOK = 0;
	my $inDisagreement = 0;
	my $disagreementStart = 0;
	my $disagreementStop = 0;
	my $totalOK = 0;
	
	my $nucleotideOK = 0;
	my $nucleotideTotal = 0;	
	my $nucleotideInIsolation = 0;
	
	for(my $i = 0; $i < length($sequence); $i++)
	{
		if(substr($sequence, $i, 1) ne '_')
		{
			$nucleotideTotal++;
		}
		
		if(($i > 0) and ($i < (length($sequence) - 1)))
		{
			if(substr($sequence, $i, 1) ne '_')
			{
				if((substr($sequence, $i-1, 1) eq '_') and (substr($sequence, $i+1, 1) eq '_'))
				{
					$nucleotideInIsolation++;
				}
			}
		}
		
		if(substr($sequence, $i, 1) eq substr($graph, $i, 1))
		{
			if(substr($sequence, $i, 1) ne '_')
			{
				$nucleotideOK++;
			}
			
			$continuousOK++;
			$totalOK++;
			if($inDisagreement and ($continuousOK > 20))
			{
				$disagreementStop = $i - $continuousOK;
				$inDisagreement = 0;
				
				
				my $flank_left_graph = substr($graph, $disagreementStart - 15, 15);
				my $flank_left_sequence = substr($sequence, $disagreementStart - 15, 15);

				my $flank_right_graph = substr($graph, $disagreementStop + 1, 15);
				my $flank_right_sequence = substr($sequence, $disagreementStop + 1, 15);				
				
				my $disagreement_graph = substr($graph, $disagreementStart, $disagreementStop - $disagreementStart + 1);
				my $disagreement_sequence = substr($sequence, $disagreementStart, $disagreementStop - $disagreementStart + 1);
				
				if(length($disagreement_graph) > 5)
				{
					print "\tDisagreement $disagreementNumber $disagreementStart - $disagreementStop \n";
				
					print "\t\tG: [", $flank_left_graph, "] ", $disagreement_graph, " [", $flank_right_graph, "]", "\n";
					print "\t\tS: [", $flank_left_sequence, "] ", $disagreement_sequence, " [", $flank_right_sequence, "]", "\n";
					print "\n";
				}
			}
		}
		else
		{
			if(not $inDisagreement)
			{
				$inDisagreement = 1;
				$disagreementStart = $i;
				$disagreementNumber++;
			}
			$continuousOK = 0;
		}
	}
	
	print "\tContig $contigI $totalOK / ", length($sequence), " positions identical -- $nucleotideOK / $nucleotideTotal of sequence characters, $nucleotideInIsolation in isolation!\n";
		
	# my @agreement_scores;
	
	# my $iPlus = 10;
	# for(my $i = 0; $i < length($sequence); $i += $iPlus)
	# {
		# my $ok = 0;
		# for(my $j = $i; $j < $iPlus; $j++)
		# {
			# if(substr($sequence, $j, 1) eq substr($graph, $sequence, 1))
			# {
				# $ok++;
			# }
		# }
		# push(@agreement_scores, $ok/$iPlus);
	# }
}

back to top