https://github.com/lhdo/GEDDplot
Tip revision: 4696f766d8ea2f7b722c792660e3fdf9cdaecf62 authored by Long H. Do on 01 July 2015, 15:12:07 UTC
Changes to be committed: added script to make GEDD figures and some sample data files from CuffDiff
Changes to be committed: added script to make GEDD figures and some sample data files from CuffDiff
Tip revision: 4696f76
makeSpearmanFigures.pl
#!/usr/bin/perl
#
# Author: Long H. Do (long.h.do@gmail.com)
#
# This script takes as input .diff files produced by CuffDiff to produce plots that search for GEDDs
# Requires R and the Bioconductor preprocessCore library
#
# usage: ./thisscript cuffdiff1 cuffdiff2 cuffdiff3
#
###########################################################################################
use strict;
use POSIX;
#takes cuffdiff .diff files and plots lowess smoothen lines across the chromosomes
my @y_title;# = ('Ts65Dn Fibroblast**','Ts65Dn Fibroblast*');
my @rho_order;# = ('1_3','2_3','1_2');
my @selected_chr = (); #shows all chromosomes
#my @selected_chr =(10,16); #plots only chromosomes 10 and 16
#########change this to your organism of choice to lable the chromosomes
my $box_prefix = 'Chr';
#my $box_prefix = 'HSA';
my $correlation_method = 'spearman';
my $pdf_filename = 'spearman.pdf';
my $pdf_width = 10;
my $pdf_height = 10;
my $pdf_column = 4;
my $pdf_row = 6;
my $plot_pointsize = .3;
my $plot_pointtype = 19;
###COLOR OPTIONS######
my %points_color = (1=>'lightpink',2=>'lightblue',3=>'lightgray');
my %lines_color = (1=>'red',2=>'blue',3=>'black');
my $box_title_size=.7;
my $box_title_pos = 0;
my $axis_label_font_size = .7;
my $axis_tick_size = .01;
my $axis_color = '#989898';
my $ylim = '-2,2';
my $rho_text_pos = '0,-1.7';
my $rho_font_size = 1.5;
my $plot_f = .1;
my $lowess_f = .3; #smoother span
my $lowess_iter = 1000; #iterations
my $tmpdir = '.tmp';
if(!-d $tmpdir){
mkdir($tmpdir) || die "Error creating directory: $tmpdir\n";
}
open (RSCRIPT,"|/usr/bin/Rscript -") || die "$!\n";
#open (RSCRIPT,">rscript.txt");
print RSCRIPT '#! /usr/bin/Rscript --vanilla --default-packages=utils' . "\n";
_print_plots(_get_spearman(\@ARGV));
sub _print_plots{
my ($spearman,$data) = @_;
print RSCRIPT 'pdf("'.$pdf_filename.'",width='.$pdf_width.',height='.$pdf_height.');par(mfrow=c('.$pdf_column.','.$pdf_row.'),oma = c(4,7,0,0) + .1, mar = c(1,0,1,1) + .1)' . "\n";
my @sorted;
if($selected_chr[0]){
@sorted = @selected_chr;
}
else{
my @chr = keys(%{$spearman});
@sorted = sort{$a <=> $b}@chr;
if($sorted[0] eq "X"){
shift(@sorted);
push @sorted,"X";
}
}
my (%uniq_num,%uniq_pair);
for my $chr (@sorted){
my @rho;
my $n=1;
for my $rho (@{$spearman->{$chr}}){
my ($i,$j) = split '_', $rho;
$uniq_pair{$rho}++;
$uniq_num{$i}++;
$uniq_num{$j}++;
push @rho,'rho["'.$i.','.$j.'"] == .(round(rho_'."$chr\_$rho".',2))';
$n++;
}
for my $i (@{$data->{$chr}}){
if($i == 1){
print RSCRIPT 'write("Plotting Chr' .$chr. '...",stderr()); log2data = log2(data.'."$chr".'[,'.($i*2).'] / data.'."$chr".'[,'.($i*2 - 1).']); plot(log2data,cex='.$plot_pointsize.',pch='.$plot_pointtype.',col="'.$points_color{$i}.'",ylim=c('.$ylim.'), xlab="",ylab="",axes=FALSE); box(lwd=.5,col="'.$axis_color.'");axis(side=1,lwd=.5,cex.axis='.$axis_label_font_size.', col="'.$axis_color.'",tck='.$axis_tick_size.',las=2,labels=NA);axis(side = 1, cex.axis='.$axis_label_font_size.', lwd = 0, line = -1.1, las = 1); axis(side=2,at=seq('.$ylim.',by=1),lwd=.5,cex.axis='.$axis_label_font_size.',col="'.$axis_color.'",tck='.$axis_tick_size.',las=3,labels=NA); axis(side = 2, at=seq('.$ylim.',by=1),cex.axis='.$axis_label_font_size.', lwd = 0, line = -.9, las = 2);lines(lowess(log2data,f='.$plot_f.',iter='.$lowess_iter.'),lwd=.4,col="'.$lines_color{$i}.'");' . "\n";
}
else{
print RSCRIPT 'log2data = log2(data.'."$chr".'[,'.($i*2).'] / data.'."$chr".'[,'.($i*2 - 1).']) ; points(log2data,cex='.$plot_pointsize.',pch='.$plot_pointtype.',col="'.$points_color{$i}.'");lines(lowess(log2data,f='.$plot_f.',iter='.$lowess_iter.'),lwd=.4,col="'.$lines_color{$i}.'");' . "\n";
}
}
print RSCRIPT 'mtext(side = 3, "'. $box_prefix . $chr.'", cex='.$box_title_size.',line ='.$box_title_pos.');text('.$rho_text_pos.',adj=0,cex='.$rho_font_size.',labels=bquote(' . join('*","~',@rho) . '));' . "\n";
}
print RSCRIPT 'mtext("Genes (chromosomal order)",side=1,cex=1,outer=TRUE,padj=1);';
my $i = 1;
my $padj = -.85;
for my $y_title(@y_title){
print RSCRIPT 'mtext(expression(paste(log[2],"[FC] ' . $y_title . '")),side=2,las=0,cex=1,adj=0,padj='. $padj .',outer=TRUE,at=0.43,col="'.$lines_color{$i}.'");';
$padj -= 1.65;
$i++;
}
unless( $selected_chr[0]){
for my $num (keys %uniq_num){
my @lowess_all;
for my $chr (@sorted){
push @lowess_all, 'lowess.'.$chr.'_'.$num . '$y';
}
print RSCRIPT "lowess.all_$num = c(" . join(",",@lowess_all) . ');';
}
for my $pair(sort keys %uniq_pair){
my ($i,$j) = split '_',$pair;
print RSCRIPT 'rho.all_' . "$i\_$j". '= cor(lowess.' . "all\_$i" . ',lowess.' . "all\_$j" . ',method="'.$correlation_method.'");' . "\n";
print RSCRIPT "print('$i\_$j');print(rho.all_" ."$i\_$j);\n";
}
}
close RSCRIPT;
if(-d $tmpdir){
system("rm -rf $tmpdir");
}
}
sub _makeFC_chr{
my ($files) = @_;
my %selected;
if($selected_chr[0]){
for my $chr(@selected_chr){
$selected{$chr}++;
}
}
my %fpkm_chr;
my @header;
my $f=1;
foreach my $file (@{$files}){
open (FILE,"$file") || die "$!\n";
my $n=1;
my $flag=1;
while (<FILE>){
chomp;
my @col = split;
next unless ($col[7] >= .1); #only keep if fpkm > .1
my($chr,$position) = split ':',$col[3];
next unless ($chr && $position);
$chr =~ s/chr//;
next unless $chr =~ /^\d+$|^X$/;
if($selected_chr[0]){
next unless($selected{$chr});
}
my ($start,$stop) = split '-',$position;
my $name = $col[4] . '__'. $col[5];
if(!defined $fpkm_chr{$chr}{$start}{$f}){
if($stop - $start > 0){
push @{$fpkm_chr{$chr}{$start}{$f}},\@col;
}
else{
push @{$fpkm_chr{$chr}{$stop}{$f}},\@col;
}
}
if($n == 1){
push @header,$name;
push @y_title,$col[5] . " \($f\)";
}
$n++;
}
close FILE;
$f++;
}
##modify row/columns based on total chromosomes
my $chr_count = scalar(keys(%fpkm_chr));
if($chr_count <=6){
$pdf_row = 2;
$plot_pointsize = .2;
}
else{
$pdf_row = 4;
if($f>3){
$rho_font_size = .9;
}
}
$pdf_column = ceil($chr_count/$pdf_row);
return(\%fpkm_chr,\@header);
}
sub _get_spearman{
my ($files) = @_;
my (%spearman,%data);
my $pairs = scalar(@{$files});
print RSCRIPT 'library(preprocessCore)' . "\n";
my ($fpkm_chr,$header) = _makeFC_chr($files);
my $header_string;
for my $exp(@{$header}){
my @e = split '__',$exp;
$header_string .= "\t" . join("\t",@e);
}
for my $chr(keys %{$fpkm_chr}){
next unless $chr =~ /^\d+$|^X$/;
my $i = 1;
open (CHRFPKM,">$tmpdir/$chr\_fpkm.txt") || die "$!: Unable to create chr fpkm file\n";
print CHRFPKM "gene\tlocus" . $header_string . "\n";
my @position = keys(%{$fpkm_chr->{$chr}});
my @sorted = sort{$a <=> $b}@position;
for my $pos (@sorted){
if( scalar(keys %{$fpkm_chr->{$chr}{$pos}}) == $pairs){
my $n=1;
for my $name (@{$header}){
my @col = @{$fpkm_chr->{$chr}{$pos}{$n}};
if($n==1){
print CHRFPKM $col[0]->[2] . "\t" . $col[0]->[3];
}
for my $col(@col){
print CHRFPKM "\t" . $col->[7] . "\t" . $col->[8];
}
$n++;
}
print CHRFPKM "\n";
}
}
close CHRFPKM;
print RSCRIPT 'write("Calculating lowess for Chr' .$chr. '...",stderr());' . 'chr_'.$chr.'_fpkm <- read.table("'. "$tmpdir\/$chr\_fpkm.txt" .'",header=T);' . "\n";
#print 'write("Calculating lowess for Chr' .$chr. '...",stderr());' . 'chr_'.$chr.'_fpkm <- read.table("'. "$tmpdir\/$chr\_fpkm.txt" .'",header=T);' . "\n";
print RSCRIPT 'data.'."$chr".' = normalize.quantiles(as.matrix('. "chr_$chr\_fpkm" .'[,3:'. (scalar(@{$files}) * 2 + 2) .']));' . "\n";
#print 'data.'."$chr".' = normalize.quantiles(as.matrix('. "chr_$chr\_fpkm" .'[,3:'. (scalar(@{$files}) * 2 + 2) .']));' . "\n";
for (my $i=1;$i<=$pairs;$i++){
print RSCRIPT 'lowess.'."$chr\_$i".' = lowess(log2(data.'."$chr".'[,'. ($i*2) .']/data.'."$chr".'[,'. ($i*2 - 1).']),f='.$lowess_f.',iter='.$lowess_iter.');' . "\n";
#print 'lowess.'."$chr\_$i".' = lowess(log2(data.'."$chr".'[,'. ($i*2) .']/data.'."$chr".'[,'. ($i*2 - 1).']),f='.$lowess_f.',iter='.$lowess_iter.');' . "\n";
push @{$data{$chr}},$i;
}
my %combo;
for (my $i=1;$i<=$pairs;$i++){
for (my $j=1;$j<=$pairs;$j++){
if($i<$j){
$combo{"$i\_$j"}++;
}
}
}
my @order = sort keys %combo;
my $n =1;
for my $order(@order){
my ($i,$j) = split '_',$order;
print RSCRIPT 'rho_' ."$chr\_$i\_$j". '= cor(lowess.' . "$chr\_$i" . '$y,lowess.' . "$chr\_$j" . '$y,method="'.$correlation_method.'");' . "\n";
#print 'rho_' ."$chr\_$i\_$j". '= cor(lowess.' . "$chr\_$i" . '$y,lowess.' . "$chr\_$j" . '$y,method="'.$correlation_method.'");' . "\n";
push @{$spearman{$chr}},"$i\_$j";
$n++;
}
}
return (\%spearman,\%data);
}