https://github.com/carden24/Bioinformatics_scripts
Tip revision: 020af3000eeca9c2e270be798fe7264ece87f22b authored by Erick Cardenas on 30 September 2018, 20:38:24 UTC
Delete create_multiple_f.sh
Delete create_multiple_f.sh
Tip revision: 020af30
vcfutils.pl
#!/usr/bin/perl -w
# Author: lh3
use strict;
use warnings;
use Getopt::Std;
&main;
exit;
sub main {
&usage if (@ARGV < 1);
my $command = shift(@ARGV);
my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats,
gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq);
die("Unknown command \"$command\".\n") if (!defined($func{$command}));
&{$func{$command}};
}
sub splitchr {
my %opts = (l=>5000000);
getopts('l:', \%opts);
my $l = $opts{l};
die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] <in.fa.fai>\n/) if (@ARGV == 0 && -t STDIN);
while (<>) {
my @t = split;
my $last = 0;
for (my $i = 0; $i < $t[1];) {
my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l;
print "$t[0]:".($i+1)."-$e\n";
$i = $e;
}
}
}
sub subsam {
die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
my ($fh, %h);
my $fn = shift(@ARGV);
my @col;
open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
$h{$_} = 1 for (@ARGV);
while (<$fh>) {
if (/^##/) {
print;
} elsif (/^#/) {
my @t = split;
my @s = @t[0..8]; # all fixed fields + FORMAT
for (9 .. $#t) {
if ($h{$t[$_]}) {
push(@s, $t[$_]);
push(@col, $_);
}
}
pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
print join("\t", @s), "\n";
} else {
my @t = split;
if (@col == 0) {
print join("\t", @t[0..7]), "\n";
} else {
print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
}
}
}
close($fh);
}
sub listsam {
die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
while (<>) {
if (/^#/ && !/^##/) {
my @t = split;
print join("\n", @t[9..$#t]), "\n";
exit;
}
}
}
sub fillac {
die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
while (<>) {
if (/^#/) {
print;
} else {
my @t = split;
my @c = (0, 0);
my $n = 0;
my $s = -1;
@_ = split(":", $t[8]);
for (0 .. $#_) {
if ($_[$_] eq 'GT') { $s = $_; last; }
}
if ($s < 0) {
print join("\t", @t), "\n";
next;
}
for (9 .. $#t) {
if ($t[$_] =~ /^0,0,0/) {
} elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
++$c[$2]; ++$c[$3];
$n += 2;
}
}
my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
my $info = $t[7];
$info =~ s/(;?)AC=(\d+)//;
$info =~ s/(;?)AN=(\d+)//;
if ($info eq '.') {
$info = $AC;
} else {
$info .= ";$AC";
}
$t[7] = $info;
print join("\t", @t), "\n";
}
}
}
sub ldstats {
my %opts = (t=>0.9);
getopts('t:', \%opts);
die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
my $cutoff = $opts{t};
my ($last, $lastchr) = (0x7fffffff, '');
my ($x, $y, $n) = (0, 0, 0);
while (<>) {
if (/^([^#\s]+)\s(\d+)/) {
my ($chr, $pos) = ($1, $2);
if (/NEIR=([\d\.]+)/) {
++$n;
++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
}
$last = $pos; $lastchr = $chr;
}
}
print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
print "Fraction: ", $y/$n, "\n";
print "Length: $x\n";
}
sub qstats {
my %opts = (r=>'', s=>0.02, v=>undef);
getopts('r:s:v', \%opts);
die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
my %h = ();
my $is_vcf = defined($opts{v})? 1 : 0;
if ($opts{r}) { # read the reference positions
my $fh;
open($fh, $opts{r}) || die;
while (<$fh>) {
next if (/^#/);
if ($is_vcf) {
my @t = split;
$h{$t[0],$t[1]} = $t[4];
} else {
$h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
}
}
close($fh);
}
my $hsize = scalar(keys %h);
my @a;
while (<>) {
next if (/^#/);
my @t = split;
next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
$t[3] = uc($t[3]); $t[4] = uc($t[4]);
my @s = split(',', $t[4]);
$t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
next if (length($s[0]) != 1);
my $hit;
if ($is_vcf) {
$hit = 0;
my $aa = $h{$t[0],$t[1]};
if (defined($aa)) {
my @aaa = split(",", $aa);
for (@aaa) {
$hit = 1 if ($_ eq $s[0]);
}
}
} else {
$hit = defined($h{$t[0],$t[1]})? 1 : 0;
}
push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
}
push(@a, [-1, 0, 0, 0]); # end marker
die("[qstats] No SNP data!\n") if (@a == 0);
@a = sort {$b->[0]<=>$a->[0]} @a;
my $next = $opts{s};
my $last = $a[0];
my @c = (0, 0, 0, 0);
my @lc;
$lc[1] = $lc[2] = 0;
for my $p (@a) {
if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
my @x;
$x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
$x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
$x[2] = sprintf("%.4f", $c[3] / $c[1]);
my $a = $c[1] - $lc[1];
my $b = $c[2] - $lc[2];
$x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
print join("\t", $last, @c, @x), "\n";
$next = $c[0]/@a + $opts{s};
$lc[1] = $c[1]; $lc[2] = $c[2];
}
++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
$last = $p->[0];
}
}
sub varFilter {
my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>3, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000, e=>1e-4);
getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:e:', \%opts);
die(qq/
Usage: vcfutils.pl varFilter [options] <in.vcf>
Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
-d INT minimum read depth [$opts{d}]
-D INT maximum read depth [$opts{D}]
-a INT minimum number of alternate bases [$opts{a}]
-w INT SNP within INT bp around a gap to be filtered [$opts{w}]
-W INT window size for filtering adjacent gaps [$opts{W}]
-1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
-2 FLOAT min P-value for baseQ bias [$opts{2}]
-3 FLOAT min P-value for mapQ bias [$opts{3}]
-4 FLOAT min P-value for end distance bias [$opts{4}]
-e FLOAT min P-value for HWE (plus F<0) [$opts{e}]
-p print filtered variants
Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
\n/) if (@ARGV == 0 && -t STDIN);
# calculate the window size
my ($ol, $ow) = ($opts{W}, $opts{w});
my $max_dist = $ol > $ow? $ol : $ow;
# the core loop
my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
while (<>) {
my @t = split;
if (/^#/) {
print; next;
}
next if ($t[4] eq '.'); # skip non-var sites
next if ($t[3] eq 'N'); # skip sites with unknown ref ('N')
# check if the site is a SNP
my $type = 1; # SNP
if (length($t[3]) > 1) {
$type = 2; # MNP
my @s = split(',', $t[4]);
for (@s) {
$type = 3 if (length != length($t[3]));
}
} else {
my @s = split(',', $t[4]);
for (@s) {
$type = 3 if (length > 1);
}
}
# clear the out-of-range elements
while (@staging) {
# Still on the same chromosome and the first element's window still affects this position?
last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
}
my $flt = 0;
# parse annotations
my ($dp, $mq, $dp_alt) = (-1, -1, -1);
if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
$dp = $1 + $2 + $3 + $4;
$dp_alt = $3 + $4;
}
if ($t[7] =~ /DP=(\d+)/i) {
$dp = $1;
}
$mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
# the depth and mapQ filter
if ($dp >= 0) {
if ($dp < $opts{d}) {
$flt = 2;
} elsif ($dp > $opts{D}) {
$flt = 3;
}
}
$flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
$flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
$flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
&& ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
$flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S})));
# HWE filter
if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/ && $4 < $opts{e}) {
my $p = 2*$1 + $2;
my $f = ($p > 0 && $p < 1)? 1 - $2 / ($p * (1-$p)) : 0;
$flt = 9 if ($f < 0);
}
my $score = $t[5] * 100 + $dp_alt;
my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs
if ($flt == 0) {
if ($type == 3) { # an indel
# filtering SNPs and MNPs
for my $x (@staging) {
next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
$x->[1] = 5;
}
# check the staging list for indel filtering
for my $x (@staging) {
next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
if ($x->[0]>>2 < $score) {
$x->[1] = 6;
} else {
$flt = 6; last;
}
}
} else { # SNP or MNP
for my $x (@staging) {
next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]);
if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1)
&& length($x->[7]) - length($x->[6]) == 1) {
$x->[1] = 5;
} else { $flt = 5; }
last;
}
# check MNP
for my $x (@staging) {
next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]);
if ($x->[0]>>2 < $score) {
$x->[1] = 8;
} else {
$flt = 8; last;
}
}
}
}
push(@staging, [$score<<2|$type, $flt, $rlen, @t]);
}
# output the last few elements in the staging list
while (@staging) {
varFilter_aux(shift @staging, $opts{p});
}
}
sub varFilter_aux {
my ($first, $is_print) = @_;
if ($first->[1] == 0) {
print join("\t", @$first[3 .. @$first-1]), "\n";
} elsif ($is_print) {
print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
}
}
sub gapstats {
my (@c0, @c1);
$c0[$_] = $c1[$_] = 0 for (0 .. 10000);
while (<>) {
next if (/^#/);
my @t = split;
next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel
my @s = split(',', $t[4]);
for my $x (@s) {
my $l = length($x) - length($t[3]) + 5000;
if ($x =~ /^-/) {
$l = -(length($x) - 1) + 5000;
} elsif ($x =~ /^\+/) {
$l = length($x) - 1 + 5000;
}
$c0[$l] += 1 / @s;
}
}
for (my $i = 0; $i < 10000; ++$i) {
next if ($c0[$i] == 0);
$c1[0] += $c0[$i];
$c1[1] += $c0[$i] if (($i-5000)%3 == 0);
printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]);
}
printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[1]/$c1[0]);
}
sub ucscsnp2vcf {
die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
print "##fileformat=VCFv4.0\n";
print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
while (<>) {
my @t = split("\t");
my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
my $pos = $t[2] + 1;
my @alt;
push(@alt, $t[7]);
if ($t[6] eq '-') {
$t[9] = reverse($t[9]);
$t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
}
my @a = split("/", $t[9]);
for (@a) {
push(@alt, $_) if ($_ ne $alt[0]);
}
if ($indel) {
--$pos;
for (0 .. $#alt) {
$alt[$_] =~ tr/-//d;
$alt[$_] = "N$alt[$_]";
}
}
my $ref = shift(@alt);
my $af = $t[13] > 0? ";AF=$t[13]" : '';
my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
my $info = "molType=$t[10];class=$t[11]$valid$af";
print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
}
}
sub hapmap2vcf {
die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
my $fn = shift(@ARGV);
# parse UCSC SNP
warn("Parsing UCSC SNPs...\n");
my ($fh, %map);
open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
while (<$fh>) {
my @t = split;
next if ($t[3] - $t[2] != 1); # not SNP
@{$map{$t[4]}} = @t[1,3,7];
}
close($fh);
# write VCF
warn("Writing VCF...\n");
print "##fileformat=VCFv4.0\n";
while (<>) {
my @t = split;
if ($t[0] eq 'rs#') { # the first line
print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
} else {
next unless ($map{$t[0]});
next if (length($t[1]) != 3); # skip non-SNPs
my $a = \@{$map{$t[0]}};
my $ref = $a->[2];
my @u = split('/', $t[1]);
if ($u[1] eq $ref) {
$u[1] = $u[0]; $u[0] = $ref;
} elsif ($u[0] ne $ref) { next; }
my $alt = $u[1];
my %w;
$w{$u[0]} = 0; $w{$u[1]} = 1;
my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
my $is_tri = 0;
for (@t[11..$#t]) {
if ($_ eq 'NN') {
push(@s, './.');
} else {
my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
if (!defined($a[0]) || !defined($a[1])) {
$is_tri = 1;
last;
}
push(@s, "$a[0]/$a[1]");
}
}
next if ($is_tri);
print join("\t", @s), "\n";
}
}
}
sub vcf2fq {
my %opts = (d=>3, D=>100000, Q=>10, l=>5);
getopts('d:D:Q:l:', \%opts);
die(qq/
Usage: vcfutils.pl vcf2fq [options] <all-site.vcf>
Options: -d INT minimum depth [$opts{d}]
-D INT maximum depth [$opts{D}]
-Q INT min RMS mapQ [$opts{Q}]
-l INT INDEL filtering window [$opts{l}]
\n/) if (@ARGV == 0 && -t STDIN);
my ($last_chr, $seq, $qual, $last_pos, @gaps);
my $_Q = $opts{Q};
my $_d = $opts{d};
my $_D = $opts{D};
my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y',
GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K');
$last_chr = '';
while (<>) {
next if (/^#/);
my @t = split;
if ($last_chr ne $t[0]) {
&v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr);
($last_chr, $last_pos) = ($t[0], 0);
$seq = $qual = '';
@gaps = ();
}
die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0);
if ($t[1] - $last_pos > 1) {
$seq .= 'n' x ($t[1] - $last_pos - 1);
$qual .= '!' x ($t[1] - $last_pos - 1);
}
if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference
my ($ref, $alt) = ($t[3], $1);
my ($b, $q);
$q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/);
if ($q < 0) {
$_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0;
$b = ($_ < .5 || $alt eq '.')? $ref : $alt;
$q = -$q;
} else {
$b = $het{"$ref$alt"};
$b ||= 'N';
}
$b = lc($b);
$b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D));
$q = int($q + 33 + .499);
$q = chr($q <= 126? $q : 126);
$seq .= $b;
$qual .= $q;
} elsif ($t[4] ne '.') { # an INDEL
push(@gaps, [$t[1], length($t[3])]);
}
$last_pos = $t[1];
}
&v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l});
}
sub v2q_post_process {
my ($chr, $seq, $qual, $gaps, $l) = @_;
for my $g (@$gaps) {
my $beg = $g->[0] > $l? $g->[0] - $l : 0;
my $end = $g->[0] + $g->[1] + $l;
$end = length($$seq) if ($end > length($$seq));
substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg));
}
print "\@$chr\n"; &v2q_print_str($seq);
print "+\n"; &v2q_print_str($qual);
}
sub v2q_print_str {
my ($s) = @_;
my $l = length($$s);
for (my $i = 0; $i < $l; $i += 60) {
print substr($$s, $i, 60), "\n";
}
}
sub usage {
die(qq/
Usage: vcfutils.pl <command> [<arguments>]\n
Command: subsam get a subset of samples
listsam list the samples
fillac fill the allele count field
qstats SNP stats stratified by QUAL
hapmap2vcf convert the hapmap format to VCF
ucscsnp2vcf convert UCSC SNP SQL dump to VCF
varFilter filtering short variants (*)
vcf2fq VCF->fastq (**)
Notes: Commands with description endting with (*) may need bcftools
specific annotations.
\n/);
}