https://github.com/carden24/Bioinformatics_scripts
Raw File
Tip revision: 020af3000eeca9c2e270be798fe7264ece87f22b authored by Erick Cardenas on 30 September 2018, 20:38:24 UTC
Delete create_multiple_f.sh
Tip revision: 020af30
vcfutils.pl
#!/usr/bin/perl -w

# Author: lh3

use strict;
use warnings;
use Getopt::Std;

&main;
exit;

sub main {
  &usage if (@ARGV < 1);
  my $command = shift(@ARGV);
  my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
			  hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats,
			  gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq);
  die("Unknown command \"$command\".\n") if (!defined($func{$command}));
  &{$func{$command}};
}

sub splitchr {
  my %opts = (l=>5000000);
  getopts('l:', \%opts);
  my $l = $opts{l};
  die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] <in.fa.fai>\n/) if (@ARGV == 0 && -t STDIN);
  while (<>) {
	my @t = split;
	my $last = 0;
	for (my $i = 0; $i < $t[1];) {
	  my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l;
	  print "$t[0]:".($i+1)."-$e\n";
	  $i = $e;
	}
  }
}

sub subsam {
  die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
  my ($fh, %h);
  my $fn = shift(@ARGV);
  my @col;
  open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
  $h{$_} = 1 for (@ARGV);
  while (<$fh>) {
	if (/^##/) {
	  print;
	} elsif (/^#/) {
	  my @t = split;
	  my @s = @t[0..8]; # all fixed fields + FORMAT
	  for (9 .. $#t) {
		if ($h{$t[$_]}) {
		  push(@s, $t[$_]);
		  push(@col, $_);
		}
	  }
	  pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
	  print join("\t", @s), "\n";
	} else {
	  my @t = split;
	  if (@col == 0) {
		print join("\t", @t[0..7]), "\n";
	  } else {
		print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
	  }
	}
  }
  close($fh);
}

sub listsam {
  die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
  while (<>) {
	if (/^#/ && !/^##/) {
	  my @t = split;
	  print join("\n", @t[9..$#t]), "\n";
	  exit;
	}
  }
}

sub fillac {
  die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
  while (<>) {
	if (/^#/) {
	  print;
	} else {
	  my @t = split;
	  my @c = (0, 0);
	  my $n = 0;
	  my $s = -1;
	  @_ = split(":", $t[8]);
	  for (0 .. $#_) {
		if ($_[$_] eq 'GT') { $s = $_; last; }
	  }
	  if ($s < 0) {
		print join("\t", @t), "\n";
		next;
	  }
	  for (9 .. $#t) {
		if ($t[$_] =~ /^0,0,0/) {
		} elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
		  ++$c[$2]; ++$c[$3];
		  $n += 2;
		}
	  }
	  my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
	  my $info = $t[7];
	  $info =~ s/(;?)AC=(\d+)//;
	  $info =~ s/(;?)AN=(\d+)//;
	  if ($info eq '.') {
		$info = $AC;
	  } else {
		$info .= ";$AC";
	  }
	  $t[7] = $info;
	  print join("\t", @t), "\n";
	}
  }
}

sub ldstats {
  my %opts = (t=>0.9);
  getopts('t:', \%opts);
  die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
  my $cutoff = $opts{t};
  my ($last, $lastchr) = (0x7fffffff, '');
  my ($x, $y, $n) = (0, 0, 0);
  while (<>) {
	if (/^([^#\s]+)\s(\d+)/) {
	  my ($chr, $pos) = ($1, $2);
	  if (/NEIR=([\d\.]+)/) {
		++$n;
		++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
	  }
	  $last = $pos; $lastchr = $chr;
	}
  }
  print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
  print "Fraction: ", $y/$n, "\n";
  print "Length: $x\n";
}

sub qstats {
  my %opts = (r=>'', s=>0.02, v=>undef);
  getopts('r:s:v', \%opts);
  die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
  my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
  my %h = ();
  my $is_vcf = defined($opts{v})? 1 : 0;
  if ($opts{r}) { # read the reference positions
	my $fh;
	open($fh, $opts{r}) || die;
	while (<$fh>) {
	  next if (/^#/);
	  if ($is_vcf) {
		my @t = split;
		$h{$t[0],$t[1]} = $t[4];
	  } else {
		$h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
	  }
	}
	close($fh);
  }
  my $hsize = scalar(keys %h);
  my @a;
  while (<>) {
	next if (/^#/);
	my @t = split;
	next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
	$t[3] = uc($t[3]); $t[4] = uc($t[4]);
	my @s = split(',', $t[4]);
	$t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
	next if (length($s[0]) != 1);
	my $hit;
	if ($is_vcf) {
	  $hit = 0;
	  my $aa = $h{$t[0],$t[1]};
	  if (defined($aa)) {
		my @aaa = split(",", $aa);
		for (@aaa) {
		  $hit = 1 if ($_ eq $s[0]);
		}
	  }
	} else {
	  $hit = defined($h{$t[0],$t[1]})? 1 : 0;
	}
	push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
  }
  push(@a, [-1, 0, 0, 0]); # end marker
  die("[qstats] No SNP data!\n") if (@a == 0);
  @a = sort {$b->[0]<=>$a->[0]} @a;
  my $next = $opts{s};
  my $last = $a[0];
  my @c = (0, 0, 0, 0);
  my @lc;
  $lc[1] = $lc[2] = 0;
  for my $p (@a) {
	if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
	  my @x;
	  $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
	  $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
	  $x[2] = sprintf("%.4f", $c[3] / $c[1]);
	  my $a = $c[1] - $lc[1];
	  my $b = $c[2] - $lc[2];
	  $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
	  print join("\t", $last, @c, @x), "\n";
	  $next = $c[0]/@a + $opts{s};
	  $lc[1] = $c[1]; $lc[2] = $c[2];
	}
	++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
	$last = $p->[0];
  }
}

sub varFilter {
  my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>3, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000, e=>1e-4);
  getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:e:', \%opts);
  die(qq/
Usage:   vcfutils.pl varFilter [options] <in.vcf>

Options: -Q INT    minimum RMS mapping quality for SNPs [$opts{Q}]
         -d INT    minimum read depth [$opts{d}]
         -D INT    maximum read depth [$opts{D}]
         -a INT    minimum number of alternate bases [$opts{a}]
         -w INT    SNP within INT bp around a gap to be filtered [$opts{w}]
         -W INT    window size for filtering adjacent gaps [$opts{W}]
         -1 FLOAT  min P-value for strand bias (given PV4) [$opts{1}]
         -2 FLOAT  min P-value for baseQ bias [$opts{2}]
         -3 FLOAT  min P-value for mapQ bias [$opts{3}]
         -4 FLOAT  min P-value for end distance bias [$opts{4}]
		 -e FLOAT  min P-value for HWE (plus F<0) [$opts{e}]
         -p        print filtered variants

Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
\n/) if (@ARGV == 0 && -t STDIN);

  # calculate the window size
  my ($ol, $ow) = ($opts{W}, $opts{w});
  my $max_dist = $ol > $ow? $ol : $ow;
  # the core loop
  my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
  while (<>) {
	my @t = split;
    if (/^#/) {
	  print; next;
	}
	next if ($t[4] eq '.'); # skip non-var sites
    next if ($t[3] eq 'N'); # skip sites with unknown ref ('N')
	# check if the site is a SNP
	my $type = 1; # SNP
	if (length($t[3]) > 1) {
	  $type = 2; # MNP
	  my @s = split(',', $t[4]);
	  for (@s) {
		$type = 3 if (length != length($t[3]));
	  }
	} else {
	  my @s = split(',', $t[4]);
	  for (@s) {
		$type = 3 if (length > 1);
	  }
	}
	# clear the out-of-range elements
	while (@staging) {
      # Still on the same chromosome and the first element's window still affects this position?
	  last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
	  varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
	}
	my $flt = 0;
	# parse annotations
	my ($dp, $mq, $dp_alt) = (-1, -1, -1);
	if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
	  $dp = $1 + $2 + $3 + $4;
	  $dp_alt = $3 + $4;
	}
	if ($t[7] =~ /DP=(\d+)/i) {
	  $dp = $1;
	}
	$mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
	# the depth and mapQ filter
	if ($dp >= 0) {
	  if ($dp < $opts{d}) {
		$flt = 2;
	  } elsif ($dp > $opts{D}) {
		$flt = 3;
	  }
	}
	$flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
	$flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
	$flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
				 && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
	$flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S})));
	# HWE filter
	if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/ && $4 < $opts{e}) {
		my $p = 2*$1 + $2;
		my $f = ($p > 0 && $p < 1)? 1 - $2 / ($p * (1-$p)) : 0;
		$flt = 9 if ($f < 0);
	}

	my $score = $t[5] * 100 + $dp_alt;
	my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs
	if ($flt == 0) {
	  if ($type == 3) { # an indel
		# filtering SNPs and MNPs
		for my $x (@staging) {
		  next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
		  $x->[1] = 5;
		}
		# check the staging list for indel filtering
		for my $x (@staging) {
		  next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
		  if ($x->[0]>>2 < $score) {
			$x->[1] = 6;
		  } else {
			$flt = 6; last;
		  }
		}
	  } else { # SNP or MNP
		for my $x (@staging) {
		  next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]);
		  if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1)
			  && length($x->[7]) - length($x->[6]) == 1) {
			$x->[1] = 5;
		  } else { $flt = 5; }
		  last;
		}
		# check MNP
		for my $x (@staging) {
		  next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]);
		  if ($x->[0]>>2 < $score) {
			$x->[1] = 8;
		  } else {
			$flt = 8; last;
		  }
		}
	  }
	}
	push(@staging, [$score<<2|$type, $flt, $rlen, @t]);
  }
  # output the last few elements in the staging list
  while (@staging) {
	varFilter_aux(shift @staging, $opts{p});
  }
}

sub varFilter_aux {
  my ($first, $is_print) = @_;
  if ($first->[1] == 0) {
	print join("\t", @$first[3 .. @$first-1]), "\n";
  } elsif ($is_print) {
	print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
  }
}

sub gapstats {
  my (@c0, @c1);
  $c0[$_] = $c1[$_] = 0 for (0 .. 10000);
  while (<>) {
	next if (/^#/);
	my @t = split;
	next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel
	my @s = split(',', $t[4]);
	for my $x (@s) {
	  my $l = length($x) - length($t[3]) + 5000;
	  if ($x =~ /^-/) {
		$l = -(length($x) - 1) + 5000;
	  } elsif ($x =~ /^\+/) {
		$l = length($x) - 1 + 5000;
	  }
	  $c0[$l] += 1 / @s;
	}
  }
  for (my $i = 0; $i < 10000; ++$i) {
	next if ($c0[$i] == 0);
	$c1[0] += $c0[$i];
	$c1[1] += $c0[$i] if (($i-5000)%3 == 0);
	printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]);
  }
  printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[1]/$c1[0]);
}

sub ucscsnp2vcf {
  die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
  print "##fileformat=VCFv4.0\n";
  print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
  while (<>) {
	my @t = split("\t");
	my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
	my $pos = $t[2] + 1;
	my @alt;
	push(@alt, $t[7]);
	if ($t[6] eq '-') {
	  $t[9] = reverse($t[9]);
	  $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
	}
	my @a = split("/", $t[9]);
	for (@a) {
	  push(@alt, $_) if ($_ ne $alt[0]);
	}
	if ($indel) {
	  --$pos;
	  for (0 .. $#alt) {
		$alt[$_] =~ tr/-//d;
		$alt[$_] = "N$alt[$_]";
	  }
	}
	my $ref = shift(@alt);
	my $af = $t[13] > 0? ";AF=$t[13]" : '';
	my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
	my $info = "molType=$t[10];class=$t[11]$valid$af";
	print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
  }
}

sub hapmap2vcf {
  die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
  my $fn = shift(@ARGV);
  # parse UCSC SNP
  warn("Parsing UCSC SNPs...\n");
  my ($fh, %map);
  open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
  while (<$fh>) {
	my @t = split;
	next if ($t[3] - $t[2] != 1); # not SNP
	@{$map{$t[4]}} = @t[1,3,7];
  }
  close($fh);
  # write VCF
  warn("Writing VCF...\n");
  print "##fileformat=VCFv4.0\n";
  while (<>) {
	my @t = split;
	if ($t[0] eq 'rs#') { # the first line
	  print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
	} else {
	  next unless ($map{$t[0]});
	  next if (length($t[1]) != 3); # skip non-SNPs
	  my $a = \@{$map{$t[0]}};
	  my $ref = $a->[2];
	  my @u = split('/', $t[1]);
	  if ($u[1] eq $ref) {
		$u[1] = $u[0]; $u[0] = $ref;
	  } elsif ($u[0] ne $ref) { next; }
	  my $alt = $u[1];
	  my %w;
	  $w{$u[0]} = 0; $w{$u[1]} = 1;
	  my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
	  my $is_tri = 0;
	  for (@t[11..$#t]) {
		if ($_ eq 'NN') {
		  push(@s, './.');
		} else {
		  my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
		  if (!defined($a[0]) || !defined($a[1])) {
			$is_tri = 1;
			last;
		  }
		  push(@s, "$a[0]/$a[1]");
		}
	  }
	  next if ($is_tri);
	  print join("\t", @s), "\n";
	}
  }
}

sub vcf2fq {
  my %opts = (d=>3, D=>100000, Q=>10, l=>5);
  getopts('d:D:Q:l:', \%opts);
  die(qq/
Usage:   vcfutils.pl vcf2fq [options] <all-site.vcf>

Options: -d INT    minimum depth          [$opts{d}]
         -D INT    maximum depth          [$opts{D}]
         -Q INT    min RMS mapQ           [$opts{Q}]
         -l INT    INDEL filtering window [$opts{l}]
\n/) if (@ARGV == 0 && -t STDIN);

  my ($last_chr, $seq, $qual, $last_pos, @gaps);
  my $_Q = $opts{Q};
  my $_d = $opts{d};
  my $_D = $opts{D};

  my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y',
			 GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K');

  $last_chr = '';
  while (<>) {
	next if (/^#/);
	my @t = split;
	if ($last_chr ne $t[0]) {
	  &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr);
	  ($last_chr, $last_pos) = ($t[0], 0);
	  $seq = $qual = '';
	  @gaps = ();
	}
	die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0);
	if ($t[1] - $last_pos > 1) {
	  $seq .= 'n' x ($t[1] - $last_pos - 1);
	  $qual .= '!' x ($t[1] - $last_pos - 1);
	}
	if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference
	  my ($ref, $alt) = ($t[3], $1);
	  my ($b, $q);
	  $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/);
	  if ($q < 0) {
		$_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0;
		$b = ($_ < .5 || $alt eq '.')? $ref : $alt;
		$q = -$q;
	  } else {
		$b = $het{"$ref$alt"};
		$b ||= 'N';
	  }
	  $b = lc($b);
	  $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D));
	  $q = int($q + 33 + .499);
	  $q = chr($q <= 126? $q : 126);
	  $seq .= $b;
	  $qual .= $q;
	} elsif ($t[4] ne '.') { # an INDEL
	  push(@gaps, [$t[1], length($t[3])]);
	}
	$last_pos = $t[1];
  }
  &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l});
}

sub v2q_post_process {
  my ($chr, $seq, $qual, $gaps, $l) = @_;
  for my $g (@$gaps) {
	my $beg = $g->[0] > $l? $g->[0] - $l : 0;
	my $end = $g->[0] + $g->[1] + $l;
	$end = length($$seq) if ($end > length($$seq));
	substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg));
  }
  print "\@$chr\n"; &v2q_print_str($seq);
  print "+\n"; &v2q_print_str($qual);
}

sub v2q_print_str {
  my ($s) = @_;
  my $l = length($$s);
  for (my $i = 0; $i < $l; $i += 60) {
	print substr($$s, $i, 60), "\n";
  }
}

sub usage {
  die(qq/
Usage:   vcfutils.pl <command> [<arguments>]\n
Command: subsam       get a subset of samples
         listsam      list the samples
         fillac       fill the allele count field
         qstats       SNP stats stratified by QUAL

         hapmap2vcf   convert the hapmap format to VCF
         ucscsnp2vcf  convert UCSC SNP SQL dump to VCF

         varFilter    filtering short variants (*)
         vcf2fq       VCF->fastq (**)

Notes: Commands with description endting with (*) may need bcftools
       specific annotations.
\n/);
}
back to top