https://github.com/andrewalverson/diatom_mt
Tip revision: 47ac69b3993ade459e0335b8e3f2fbe21d7d0aee authored by andrewalverson on 08 May 2018, 16:36:16 UTC
removed old unused R files
removed old unused R files
Tip revision: 47ac69b
map_introns.pl
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
# page coords = 612 x 792
my $SCALE = 1;
my $HEADER = 1;
my $SCALEBAR_WIDTH = 100;
my $PS2PDF = 1;
parseArgs();
my $SCALEBAR_LABEL = $SCALEBAR_WIDTH;
my $INFILE = shift @ARGV;
$SCALEBAR_WIDTH *= $SCALE;
my $y = 750;
my $left_margin = 40;
my $space_between_sequences = 20;
my $with_data_color = "black";
my $no_data_color = "gray";
my @file; #structure --> ( [$id, [@introns], [@sequence]], [$id, [@introns], [@sequence]], [$id, [@introns], [@sequence]] )
my %positions; # stores a non-redundant set of intron insertion sites
my $top = $y; # position of first gene line, for drawing vertial intron-position lines
my $bottom; # position of last gene line, for drawing vertial intron-position lines
my $outfile = $INFILE;
$outfile =~ s/\..*$//;
$outfile .= ".ps";
my $pdf = $outfile;
$pdf =~ s/\..*$/\.pdf/;
my $header = $INFILE;
$header =~ s/\..*$//;
parseFasta( \@file );
# includes left margin and space to accommodate the longest name; all features are drawn relative to this left-most point
my $feature_offset = $left_margin + 160;
# print postscript header
open( OUT, ">$outfile" ) || die "Can't open $outfile: $!\n";
print OUT "%!PS\n";
close OUT;
# print filename header
if( $HEADER ){
open( OUT, ">>$outfile" ) || die "Can't open $outfile: $!\n";
print OUT "% HEADER TEXT\n";
print OUT "/Arial findfont\n";
print OUT "12 scalefont\n";
print OUT "setfont\n";
print OUT "0 setgray\n";
print OUT $left_margin, " ", $y, " moveto\n";
print OUT "($header) show\n\n";
$y -= 40;
close OUT;
}
# print comment stating that sequence/intron drawing starts here
open( OUT, ">>$outfile" ) || die "Can't open $outfile: $!\n";
print OUT "\n% SEQUENCE/INTRON PRINTING STARTS HERE\n\n";
close OUT;
# iterate over all sequences in the file -- the real business starts here
for( my $i = 0; $i < @file; $i++ ){
# print $file[$i][1][0] and exit;
my( $counter, $j, $seq, $id, @missing_coords, @data_coords, @ranges, @introns, @edits );
# NOTE: CHANGE '[n]' to '[n-]' TO COLOR MISSING DATA AND GAPS THE SAME WAY; COULD ADD AN ELSIF
# TO GIVE GAPS A DIFFERENT COLOR
for( $j = 0; $j < @{$file[$i][2]}; $j++ ){
if( $file[$i][2][$j] =~ /[n]/i ){ #find and store coordinates for regions with missing data
push @missing_coords, $j+1;
}else{ #find and store coordinates for regions with data
push @data_coords, $j+1;
}
}
# get ranges of parts of sequence *with* data
if( @data_coords ){
@data_coords = sort {$a <=> $b} @data_coords;
getRanges( \@data_coords, \@ranges, $with_data_color );
}
# get ranges for gaps and missing data
if( @missing_coords ){
@missing_coords = sort {$a <=> $b} @missing_coords;
getRanges( \@missing_coords, \@ranges, $no_data_color );
}
#write taxon name
writeName( $left_margin, $y, $file[$i][0] );
#draw line segments (gray if missing data, black if there is data)
@ranges = sort {$a->[0] <=> $b->[0]} @ranges;
draw_gene( $feature_offset, $y, \@ranges, $outfile );
# print "$left_margin\n$y\n$ranges[0][0]\n"
#map introns
mapIntrons( $feature_offset, $y, $file[$i][1], $file[$i][2], $outfile, \%positions );
$y -= $space_between_sequences;
}
# print comment stating that sequence/intron drawing ends here, vertical line drawing begins here
open( OUT, ">>$outfile" ) || die "Can't open $outfile: $!\n";
print OUT "\n% SEQUENCE/INTRON PRINTING ENDS HERE\n";
print OUT "% BEGIN PRINTING VERTICAL LINES\n\n";
close OUT;
# draw vertical lines
vertial_lines( $feature_offset, $top, $y += $space_between_sequences, \%positions );
# print comment stating that sequence/intron drawing ends here, vertical line drawing begins here
open( OUT, ">>$outfile" ) || die "Can't open $outfile: $!\n";
print OUT "\n% VERTICAL LINE PRINTING ENDS HERE\n";
print OUT "% BEGIN PRINTING SCALE BAR\n\n";
close OUT;
# draw scale bar
scaleBar( $y, $outfile );
# covert to pdf
$PS2PDF and system( "ps2pdf $outfile $pdf" );
exit;
######################################################SUBROUTINES######################################################
sub parseFasta{
my $file = shift;
my $tmp;
#read file into string
open( FASTA, $INFILE ) || die "Couldn't open $INFILE: $!\n";
while( <FASTA> ){
$tmp .= $_;
}
close FASTA;
# parse string into array, each element is an array => ($header,[@introns],[@sequence])
foreach my $sequence ( split />/, $tmp ){
$sequence =~ /\S+/ or next;
my( $in_sequence, $seq, @introns, $id );
foreach( split /\n/, $sequence ){
if( $in_sequence ){
$seq .= $_;
}else{
@introns = split( /\s+/, $_ );
$id = shift @introns;
$in_sequence = 1;
}
}
$seq =~ s/[0-9\s]//g;
my @chars = split( //, $seq );
my $R = [$id, [@introns], [@chars]];
push @$file, $R;
}
$tmp = undef;
}
########################################################################################################################
sub getRanges{
my( $coords, $coord_ranges, $color ) = @_;
my $range_cnt = 0;
my $maxgap = 1;
my ( @ranges, $i );
$ranges[0][0] = $coords->[0];
for ($i = 1; $i < @$coords; $i++) {
if ($coords->[$i] - $coords->[$i - 1] > $maxgap) {
$ranges[$range_cnt][1] = $coords->[$i - 1];
$range_cnt++;
$ranges[$range_cnt][0] = $coords->[$i];
}
}
$ranges[$range_cnt][1] = $coords->[$#$coords];
for ($i = 0; $i < @ranges; $i++) {
my $R = [$ranges[$i][0], $ranges[$i][1], $color];
push @$coord_ranges, $R;
# print $color . "\n";
# printf "%d\t%d\t%d\t%d\n", $i + 1, $ranges[$i][1] - $ranges[$i][0] + 1, $ranges[$i][0], $ranges[$i][1];
}
}
########################################################################################################################
sub draw_gene {
my( $left_margin, $y, $coord_ranges, $outfile ) = @_;
my( $k, $line_color );
open( OUT, ">>$outfile" ) || die "Can't open $outfile: $!\n";
print OUT "1 setlinewidth\n";
foreach( $k = 0; $k < @$coord_ranges; $k++ ){
$$coord_ranges[$k][2] eq "black" and $line_color = "0 setgray\n";
# $$coord_ranges[$k][2] eq "gray" and $line_color = "0.6 setgray\n";
$$coord_ranges[$k][2] eq "gray" and $line_color = "1 0 0 setrgbcolor\n";
if( $k == 0 ){
print OUT "newpath\n";
print OUT "$left_margin $y moveto\n", $left_margin + (${$coord_ranges}[$k][1] * $SCALE), " $y lineto\n";
print OUT $line_color;
print OUT "stroke\n";
}else{
print OUT "newpath\n";
print OUT $left_margin + (${$coord_ranges}[$k-1][1] * $SCALE), " $y moveto\n", $left_margin + (${$coord_ranges}[$k][1] * $SCALE), " $y lineto\n";
print OUT $line_color;
print OUT "stroke\n";
}
#print "${$coord_ranges}[$k][2] ${$coord_ranges}[$k][0] ${$coord_ranges}[$k][1]\n";
}
close OUT;
}
########################################################################################################################
sub mapIntrons{
my( $left_margin, $y, $introns, $sequence, $outfile, $positions ) = @_;
my $intron_offset = $SCALE * 0.5; #put intron marker midway between exon coordinates
my $missing;
open( OUT, ">>$outfile" ) || die "Can't open $outfile: $!\n";
print OUT "0 setgray\n";
print OUT "0.5 setlinewidth\n";
for( my $i = 0; $i < @$introns; $i++ ){
if( $introns->[$i] =~ /m/ ){
$missing = 1;
$introns->[$i] =~ s/m//;
}
# print $introns->[$i] . " ";
#adjust intron positions for alignment gaps
for( my $j = 0; $j < $introns->[$i]; $j++ ){
$sequence->[$j] eq "-" and $introns->[$i]+=1;
}
# store unique intron positions in a hash, which we'll use to draw the vertical lines marking the insertion site of each intron
$positions->{$introns->[$i]} = $introns->[$i];
print OUT "newpath\n";
print OUT $left_margin + ($SCALE * ($introns->[$i] + 1) + $intron_offset), " ", $y+1, " moveto\n";
print OUT $left_margin + ($SCALE * ($introns->[$i] + 1) + $intron_offset) - 7, " ", $y+14, " lineto\n";
print OUT $left_margin + ($SCALE * ($introns->[$i] + 1) + $intron_offset) + 7, " ", $y+14, " lineto\n";
print OUT $left_margin + ($SCALE * ($introns->[$i] + 1) + $intron_offset), " ", $y+1, " lineto\n";
print OUT "closepath\n";
if( $missing ){
print OUT "stroke\n";
}else{
print OUT "gsave\n";
print OUT "fill\n";
print OUT "grestore\n";
print OUT "stroke\n";
}
$missing = undef;
}
close OUT;
}
########################################################################################################################
sub vertial_lines{
my( $left_margin, $top, $bottom, $positions ) = @_;
my $intron_offset = $SCALE * 0.5; #put intron marker midway between exon coordinates
open( OUT, ">>$outfile" ) || die "Can't open $outfile: $!\n";
print OUT "1 setlinewidth\n";
print OUT "0.6 setgray\n";
print OUT "0.25 setlinewidth\n";
# loop over intron positions
foreach my $intron ( keys %$positions ){
print $positions->{$intron}, "\n";
print OUT "newpath\n";
print OUT $left_margin + ($SCALE * ($positions->{$intron} + 1) + $intron_offset), " ", $top, " moveto\n";
print OUT $left_margin + ($SCALE * ($positions->{$intron} + 1) + $intron_offset), " ", $bottom, " lineto\n";
print OUT "stroke\n";
# print OUT "closepath\n";
}
close OUT;
}
########################################################################################################################
sub scaleBar{
my( $y, $outfile ) = @_;
$y -= 10;
open( OUT, ">>$outfile" ) || die "Can't open $outfile: $!\n";
print OUT "0 setgray\n";
print OUT "1 setlinewidth\n";
print OUT "newpath\n";
print OUT "$left_margin $y moveto\n", $left_margin + $SCALEBAR_WIDTH, " $y lineto\n";
print OUT "stroke\n";
print OUT "/Arial findfont\n";
print OUT "10 scalefont\n";
print OUT "setfont\n";
print OUT "newpath\n";
print OUT $left_margin + $SCALEBAR_WIDTH + 6, " ", $y-3, " moveto\n";
print OUT "($SCALEBAR_LABEL nt) show\n";
close OUT;
}
########################################################################################################################
sub writeName{
my( $left_margin, $y, $name ) = @_;
$name =~ s/_/ /g;
open( OUT, ">>$outfile" ) || die "Can't open $outfile: $!\n";
print OUT "0 setgray\n";
print OUT "/Arial findfont\n";
print OUT "10 scalefont\n";
print OUT "setfont\n";
print OUT $left_margin, " ", $y-3, " moveto\n";
print OUT "($name) show\n";
close OUT;
}
########################################################################################################################
sub parseArgs{
my $usage = "\nUsage: $0 [options] dna_alignment.fasta
options
--scale - proportion to scale width of gene line (default: none)
--bar_width - width of scale bar (default: 100 nt)
--header - print header at top of page (filename minus file extension) (default: yes)
--ps2pdf - call ps2pdf to convert postscript to pdf (default: yes)
NOTE: FASTA header can have list of intron coordinates (e.g., \">Citrullus 500 600 700m\"); introns are mapped as filled triangles and missing introns (e.g. 700m) are mapped as open triangles; it is assumed that intron locations do not consider any gap characters that might exist in the alignment; that is, the script will adjust intron locations to account for gap characters (\"-\") that precede them in that sequence
NOTE: The script calls ps2pdf (https://www.ghostscript.com/) to convert the postscript file into a pdf
\n\n";
my $result = GetOptions
(
'file=s' => \$INFILE,
'scale=s' => \$SCALE,
'bar_width=s' => \$SCALEBAR_WIDTH,
'header!' => \$HEADER,
'ps2pdf!' => \$PS2PDF,
);
$ARGV[0] or die $usage;
}
#######################################################################################################################