https://github.com/lstevison/great-ape-recombination
Raw File
Tip revision: f7c1f5bfdc4cb2f4f95cc4982e65407ac8067927 authored by Laurie Stevison on 30 May 2016, 22:18:53 UTC
Add files via upload
Tip revision: f7c1f5b
match_cold_spots_final.pl
#! /usr/bin/perl

#Program to sort through 1kb regions within 100kb of hotspots and identify best matched cold spots
#Created: December 19, 2013
#Last modified: October 9, 2014

$hotspots_fasta=$ARGV[0];
$hotspots_withrates=$ARGV[1];
$input=$ARGV[2];
$output=$ARGV[3];
$genome_avg=$ARGV[4];
$peak_cutoff=$ARGV[5];
$width=$ARGV[6];

unless ($#ARGV==6) {
    print STDERR "Please provide on command line filename of fasta formatted hotspots, filename for hotspots with rates, a list of 1kb regions for the whole genome, an output filename, the genome average rate, the peak cutoff used for hotspots and the width to search for matching coldspots.\n\n";
    die;
} #end unless

open(HOTSPOTS, $hotspots_fasta);
open(RATES, $hotspots_withrates);
open(INPUT, $input);
$header=(<INPUT>);
open(OUTPUT, ">$output");

@hotspot_names = ();
@hotspot_sequences = ();
@hotspot_peaks=();
@hotspot_avg=();

print STDERR "Reading in hotspots...";

while (<HOTSPOTS>) {
    chomp;

    if (/>/) { 
        $seq_name = $'; #'
            push(@hotspot_names, $seq_name);
        if ($newseq==1) {push(@hotspot_sequences, $sequence);} 
        $sequence = "";
        $newseq=0;
        next; 
    } else {
        $sequence .= $_;
        $newseq=1; 
    } #end else    
} #end while
push(@hotspot_sequences, $sequence);
$h=0;
$header=(<RATES>);
while (<RATES>) {
    chomp;

    @rates=split(/\s+/,$_);
    $new_hotspot="$rates[0].$rates[1]:$rates[2]";

    if ($hotspot_names[$h]=~$new_hotspot) {
	push(@hotspot_peaks,$rates[3]);
	push(@hotspot_avg,$rates[4]);
	$h++;
    } #end if

} #end while

print STDERR "done reading in $#hotspot_names hotspots and $#hotspot_peaks rates.\nNow reading in coldspots for matching...";

@coldspot_matches_array=();
$l=0;

while (<INPUT>) {
    chomp;
    push(@coldspot_matches_array,$_);
} #end while

print STDERR "done reading in $#coldspot_matches_array coldspots for matching.\nNow matching hotspots to coldspots...";
print OUTPUT "Hotspot_chr\tHotspot_start\tHotspot_end\tHotspot_GC\tHotspot_N\tHotspot_avgRate\tHotspot_PeakRate\tColdspot_chr\tColdspot_start\tColdspot_end\tColdspot_GC\tColdspot_N\tColdspot_avgRate\tColdspot_PeakRate\n";

for ($i=0; $i<=$#hotspot_names; $i++) {

    $hotspot_rate=0;
    @input_array = split(/\.|:/, $hotspot_names[$i]);
#    print STDERR "$input_array[0]\t$input_array[1]\t$input_array[2]\n";
    if ($i==0) {
	$last_chr=$input_array[0];
    } #end if
    $hot_start=$input_array[1]+1;
    $hotspot_center = $input_array[1] + (($input_array[2]-$input_array[1])/2);
    $hotspot_length=sprintf("%.0f",($input_array[2]-$input_array[1])/1000);
    $new_width=(int($width/$hotspot_length))*$hotspot_length;

#    if ($i<=10) {print STDERR "Hotspot length: $hotspot_length, Width: $width; New width: $new_width\n";}
    if ($input_array[1] < $new_width) {    
	$max = $hotspot_center + $new_width + 1;
	$min = 0;
    } else {
	$max = $hotspot_center + $new_width + 1;
	$min = $hotspot_center - $new_width - 1;
    } #end else

    $a=($hotspot_sequences[$i]=~tr/Aa//);
    $t=($hotspot_sequences[$i]=~tr/Tt//);
    $g=($hotspot_sequences[$i]=~tr/Gg//);
    $c=($hotspot_sequences[$i]=~tr/Cc//);
    $total=$a+$t+$g+$c;
    $interval_size = $input_array[2]-$input_array[1]+1;
    $percent_N=sprintf("%.2f",(($interval_size-$total)/$interval_size)*100);
    if ($total != 0) {
	$gc= sprintf("%.2f", (($g+$c)/$total)*100 );
    } else {
	$gc=0;
    } #end else

    print OUTPUT "$input_array[0]\t$input_array[1]\t$input_array[2]\t$gc\t$percent_N\t$hotspot_avg[$i]\t$hotspot_peaks[$i]\t";
    @matched_coldspots = ();
    $previous_difference = 2;
    $first_match=0;

    for ($k=$l; $k<=$#coldspot_matches_array; $k+=$hotspot_length) {
	$highest_k=sprintf("%.0f", $k+$hotspot_length-1);
	$rate_count=0;
	$count=0;
	$coldpeak=0;
	$coldavg=0;
	$coldgc=0;
	$coldN=0;
	$start="";
	$end="";
	$cold_chr="";

	for ($b=$k; $b<=$highest_k; $b++) {
	    @region_array = split(/\s+/, $coldspot_matches_array[$b]);

	    if ($region_array[0]=~/^$input_array[0]$/) { 
		$cold_chr=$region_array[0];
		if ($b==$k) { $start=$region_array[1]; } #end if 
		if ($b==$highest_k) { $end=$region_array[2]; } #end if 

		if ($region_array[3]!~/n\/a/) {
		    $coldpeak+=$region_array[3];
		    $coldavg+=$region_array[4];
		    $rate_count++;
		} #end if
		$coldgc+=$region_array[7];
		$coldN+=$region_array[8];
		$count++;
	    } #end if
	} #end for

	unless (defined($end)) {
	    next;
	} #end unless

	unless (defined($cold_chr)) {
	    next;
	} #end unless

	if ($rate_count>0) { 
	    $coldspot_peak=$coldpeak/$rate_count;
	    $coldspot_avg=$coldavg/$rate_count;
	} else {
	    $coldspot_peak="n/a";
	    $coldspot_avg="n/a";
	} #end else

	if ($count>0) {
	    $coldspot_gc=$coldgc/$count;
	    $coldspot_N=$coldN/$count;
	} #end if

	$coldspot_center = $start + (($end-$start)/2);
#	print STDERR "$cold_chr\t$start\t$end\t$coldspot_peak\t$coldspot_avg\t$coldspot_gc\t$coldspot_N\n";
	if ($first_match==1) {
	    $new_l=$k-1;
	    $first_match=2;
	} #end 

	if ($input_array[0]!~/^$last_chr$/ && $cold_chr=~/^$last_chr$/) { #coldspot on last chr, but hotspot on new chr
#	    print STDERR "Hotspot chr: $input_array[0]; Last chr: $last_chr; Coldspot chr: $cold_chr\n";
	    next;
	} elsif ($coldspot_center >=$min && $coldspot_center <= $max && $cold_chr=~/^$input_array[0]$/) { #cold spot center within 100kb of hotspot

	    $coldspot_length=sprintf("%.0f", ($end-$start)/1000);
	    
	    unless ($coldspot_length==$hotspot_length) {
#		print STDERR "distance disparity, hotspot: $hotspot_length; coldspot: $coldspot_length\t$cold_chr:$start-$end\t$input_array[0]:$input_array[1]-$input_array[2]\n";
		next;
	    } #end if

	    if ($first_match==0) {
		$first_match=1;
	    } #end if

	    if ($coldspot_peak!~/n\/a/ && $coldspot_avg <= $genome_avg && $coldspot_peak <= $peak_cutoff && $$start!=$input_array[1] && $coldspot_N<=25) { #cold spot avg rate <=0.827 rho/kb (genome average), peak rate <=4.113 rho/kb (hotspot cutoff), not at hotspot, and percent N <=XX%
		$difference = abs($gc - $coldspot_gc);
#		print STDERR "Difference: $difference\n";
		if ($previous_difference > $difference) {
		    @matched_coldspots = ();
		    push(@matched_coldspots, "$cold_chr\t$start\t$end\t$coldspot_peak\t$coldspot_avg\t$coldspot_gc\t$coldspot_N");
		    $previous_difference = $difference;
		    next;
		} elsif ($previous_difference==$difference) {
		    push(@matched_coldspots, "$cold_chr\t$start\t$end\t$coldspot_peak\t$coldspot_avg\t$coldspot_gc\t$coldspot_N");
		} else {
		    next;
		} #end else
		
#	    } elsif ($start==$hot_start) { #print percentN, avg and peak rate at hotspot
#		print OUTPUT "$coldspot_N\t$coldspot_avg\t$coldspot_peak\t";
#		$hotspot_rate = 1;
	    } else {
		next;
	    } #end else
	} elsif ($end > $max) {
#	    if ($i<100) {print STDERR "L: $l; K:$k; Hotspot center: $hotspot_center; Min: $min; Max: $max; Coldspot center: $coldspot_center; Coldspot end: $region_array[2]; Matches: $#matched_coldspots\n";}
	    $l=$new_l;
	    last;
	} #end else
    } #end k-for

#    if ($hotspot_rate==0) {
#	print OUTPUT "n/a\tn/a\tn/a\t"; #if no rate for hotspot available, print 3 extra tabs so columns line up
#   } #end if

    if ($#matched_coldspots >= 1) {
	$previous_distance = $new_width;
	for ($z=0; $z<=$#matched_coldspots; $z++) {
	    @coldspot_array = split(/\s+/, $matched_coldspots[$z]);
	    $coldspot_center = $coldspot_array[1] + (($coldspot_array[2]-$coldspot_array[1])/2);
	    $distance = abs($hotspot_center - $coldspot_center);

	    if ($previous_distance > $distance) {
		@new_matched_coldspot = ();
		push(@new_matched_coldspot, $matched_coldspots[$z]);
		$previous_distance = $distance;
#		print STDERR "Distance to hotspot: $distance\n";
	    } #end if

	} #end for
	@coldspot_array = split(/\s+/, $new_matched_coldspot[0]);
	print OUTPUT "$coldspot_array[0]\t$coldspot_array[1]\t$coldspot_array[2]\t$coldspot_array[5]\t$coldspot_array[6]\t$coldspot_array[4]\t$coldspot_array[3]\tMultipleMatchesFound:$#matched_coldspots\n";
#	print STDERR "Multiple Matches found!\n";
    } elsif (defined($matched_coldspots[0])) {
	@coldspot_array = split(/\s+/, $matched_coldspots[0]);
	print OUTPUT "$coldspot_array[0]\t$coldspot_array[1]\t$coldspot_array[2]\t$coldspot_array[5]\t$coldspot_array[6]\t$coldspot_array[4]\t$coldspot_array[3]\n";
#	print STDERR "Match found!\n";
    } else {
#	print STDERR "no matches for @input_array\n";
	print OUTPUT "\t\t\t\t\t\t\tNoMatchesFound, $l\n";
    } #end if
    $last_chr=$input_array[0];
} #end for 

print STDERR "done.\n";
back to top