https://github.com/thamala/lyrataRnaMet
Tip revision: 79c3fb242e5acf52d5a1711c739db416a065ef42 authored by Tuomas Hämälä on 21 October 2022, 08:10:44 UTC
Update README.md
Update README.md
Tip revision: 79c3fb2
probs2fst.c
/*
Copyright (C) 2022 Tuomas Hamala
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
For any other inquiries send an email to tuomas.hamala@gmail.com
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
Program for estimating Weir & Cockerham's Fst across arbitrary number of populations using genotype probabilities.
Compiling: gcc probs2fst.c -o probs2fst -lm
Usage:
-beagle [file] Posterior genotype probabilities in Beagle format (generated e.g. with Angsd or PCAngsd).
-pop [file] File listing individuals from a single population. Can be used >= 2 times.
-genes [file] Tab delimited file listing genes (format chr, start, end, strand [+ or -], id). Optional.
-bp [int] Distance around genes to calculate Fst for up- and downstream areas. Optional.
-min [int] Minimum number of individuals per population required to consider a site. Default 1.
-maf [double] Minimum minor allele frequency required to consider a site. Default 0.
Example:
./probs2fst -beagle postprobs.beagle -pop list1.txt -pop list2.txt -pop list3.txt -genes genes.txt -bp 1000 -min 6 -maf 0.05 > test.txt
*/
#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#define merror "\nERROR: System out of memory\n\n"
#define NA 0.333333
typedef struct {
int n;
double hw, hb;
} Var_s;
typedef struct {
int start, end;
char str, chr[101], id[101];
Var_s up, cds, down;
} Gene_s;
void openFiles(int argc, char *argv[]);
char **readPop(FILE *pop_file, int *n);
Gene_s *readGenes(FILE *gene_file, int *n);
void readBeagle(FILE *beagle_file, char ***pops, Gene_s *genes, int bp, int pop_n, int gene_n, int ind_n, int min, double maf);
Var_s estVars(double **dosage, int **plist, int pop_n, int ind_n, int plist_n, int min, double maf);
double estFst(Var_s vars);
int isNumeric(const char *s);
void lineTerminator(char *line);
int main(int argc, char *argv[]) {
int second = 0, minute = 0, hour = 0;
time_t timer = 0;
timer = time(NULL);
openFiles(argc, argv);
second = time(NULL) - timer;
minute = second / 60;
hour = second / 3600;
fprintf(stderr, "\nDone!");
if(hour > 0)
fprintf(stderr, "\nElapsed time: %i h, %i min & %i sec\n\n", hour, minute - hour * 60, second - minute * 60);
else if(minute > 0)
fprintf(stderr, "\nElapset time: %i min & %i sec\n\n", minute, second - minute * 60);
else if(second > 5)
fprintf(stderr, "\nElapsed time: %i sec\n\n", second);
else
fprintf(stderr, "\n\n");
return 0;
}
void openFiles(int argc, char *argv[]) {
int i, j, gene_n = 0, min = 1, pop_n = 0, ind_n = 0, bp = 0;
double maf = 0;
char ***pops = NULL;
Gene_s *genes = NULL;
FILE *beagle_file = NULL, *pop_file = NULL, *gene_file = NULL;
fprintf(stderr, "\nParameters:\n");
if((pops = malloc(argc * sizeof(char **))) == NULL) {
fprintf(stderr, merror);
exit(EXIT_FAILURE);
}
for(i = 1; i < argc; i++) {
if(strcmp(argv[i], "-beagle") == 0) {
if((beagle_file = fopen(argv[++i], "r")) == NULL) {
fprintf(stderr, "\nERROR: Cannot open file %s\n\n", argv[i]);
exit(EXIT_FAILURE);
}
fprintf(stderr, "\t-beagle %s\n", argv[i]);
}
else if(strcmp(argv[i], "-pop") == 0) {
if((pop_file = fopen(argv[++i], "r")) == NULL) {
fprintf(stderr, "\nERROR: Cannot open file %s\n\n", argv[i]);
exit(EXIT_FAILURE);
}
fprintf(stderr, "\t-pop %s\n", argv[i]);
pops[pop_n] = readPop(pop_file, &ind_n);
pop_n++;
}
else if(strcmp(argv[i], "-genes") == 0) {
if((gene_file = fopen(argv[++i], "r")) == NULL) {
fprintf(stderr, "\nERROR: Cannot open file %s\n\n", argv[i]);
exit(EXIT_FAILURE);
}
fprintf(stderr, "\t-genes %s\n", argv[i]);
}
else if(strcmp(argv[i], "-bp") == 0) {
if(isNumeric(argv[++i]))
bp = atoi(argv[i]);
fprintf(stderr, "\t-bp %s\n", argv[i]);
}
else if(strcmp(argv[i], "-min") == 0) {
if(isNumeric(argv[++i]))
min = atoi(argv[i]);
fprintf(stderr, "\t-min %s\n", argv[i]);
}
else if(strcmp(argv[i], "-maf") == 0) {
if(isNumeric(argv[++i]))
maf = atof(argv[i]);
fprintf(stderr, "\t-maf %s\n", argv[i]);
}
else {
fprintf(stderr, "\nERROR: Unknown argument '%s'\n\n", argv[i]);
exit(EXIT_FAILURE);
}
}
fprintf(stderr, "\n");
if(beagle_file == NULL) {
fprintf(stderr, "\nERROR: -beagle [file] is required!\n");
exit(EXIT_FAILURE);
}
if(pop_n < 2) {
fprintf(stderr, "\nERROR: at least two population files (-pop [file) are required!\n");
exit(EXIT_FAILURE);
}
if(gene_file != NULL)
genes = readGenes(gene_file, &gene_n);
readBeagle(beagle_file, pops, genes, bp, pop_n, gene_n, ind_n, min, maf);
}
char **readPop(FILE *pop_file, int *n) {
int i = 0, char_i = 0, maxchar = 0, line_i = 0;
char c, *line, **list;
while((c = fgetc(pop_file)) != EOF) {
char_i++;
if(c == '\n') {
line_i++;
if(char_i > maxchar)
maxchar = char_i;
char_i = 0;
}
}
rewind(pop_file);
if((line = malloc((maxchar + 1) * sizeof(char))) == NULL) {
fprintf(stderr, merror);
exit(EXIT_FAILURE);
}
if((list = malloc((line_i + 1) * sizeof(char *))) == NULL) {
fprintf(stderr, merror);
exit(EXIT_FAILURE);
}
for(i = 0; i < line_i + 1; i++) {
if((list[i] = malloc((maxchar + 1) * sizeof(char))) == NULL) {
fprintf(stderr, merror);
exit(EXIT_FAILURE);
}
}
i = 0;
while(fgets(line, maxchar + 1, pop_file) != NULL) {
if(line[0] == '\n')
continue;
lineTerminator(line);
strcpy(list[i], line);
i++;
*n = *n + 1;
}
list[i][0] = '\0';
free(line);
fclose(pop_file);
return list;
}
Gene_s *readGenes(FILE *gene_file, int *n) {
int i, char_i = 0, line_i = 0, maxchar = 0;
char c, *line = NULL;
Gene_s *list = NULL;
while((c = fgetc(gene_file)) != EOF) {
char_i++;
if(c == '\n') {
line_i++;
if(char_i > maxchar)
maxchar = char_i;
char_i = 0;
}
}
rewind(gene_file);
if((line = malloc((maxchar + 1) * sizeof(char))) == NULL) {
fprintf(stderr, merror);
exit(EXIT_FAILURE);
}
if((list = malloc(line_i * sizeof(Gene_s))) == NULL) {
fprintf(stderr, merror);
exit(EXIT_FAILURE);
}
memset(list, 0, sizeof(Gene_s) * line_i);
while(fgets(line, maxchar + 1, gene_file) != NULL) {
if(line[0] == '\n')
continue;
lineTerminator(line);
strncpy(list[*n].chr, strtok(line, "\t"), 100);
list[*n].start = atoi(strtok(NULL, "\t"));
list[*n].end = atoi(strtok(NULL, "\t"));
list[*n].str = strtok(NULL, "\t")[0];
strncpy(list[*n].id, strtok(NULL, "\t"), 100);
*n = *n + 1;
}
free(line);
fclose(gene_file);
return list;
}
void readBeagle(FILE *beagle_file, char ***pops, Gene_s *genes, int bp, int pop_n, int gene_n, int ind_n, int min, double maf) {
int i, j = 0, k = 0, l = 0, m = 0, n = 0, pos = 0, ok = 0, p_i = 0, gene_i = 0, kept_i = 0, site_i = 0, **plist = NULL;
double prob[3] = {0}, **dosage;
char chr[101], *line = NULL, *temp = NULL, *end = NULL;
Var_s vars = {0};
size_t len = 0;
ssize_t read;
if((plist = malloc(ind_n * sizeof(int *))) == NULL) {
fprintf(stderr, merror);
exit(EXIT_FAILURE);
}
for(i = 0; i < ind_n; i++) {
if((plist[i] = malloc(2 * sizeof(int))) == NULL) {
fprintf(stderr, merror);
exit(EXIT_FAILURE);
}
}
while((read = getline(&line, &len, beagle_file)) != -1) {
if(line[0] == '\n')
continue;
lineTerminator(line);
temp = strtok_r(line, "\t", &end);
i = 1;
j = 0;
k = 0;
if(strcmp(temp, "marker") == 0) {
while(temp != NULL) {
if(i > 3) {
if(j == 0) {
for(k = 0; k < pop_n; k++) {
for(l = 0; pops[k][l][0] != '\0'; l++) {
if(strcmp(temp, pops[k][l]) == 0) {
plist[p_i][0] = n;
plist[p_i][1] = k;
p_i++;
}
}
}
n++;
}
j++;
if(j == 3)
j = 0;
}
temp = strtok_r(NULL, "\t", &end);
i++;
}
if(p_i == 0) {
fprintf(stderr, "ERROR: Individuals in pop files were not found in the Beagle file!\n\n");
exit(EXIT_FAILURE);
}
if(p_i < ind_n)
fprintf(stderr, "Warning: Pop files contain individuals that are not in the Beagle file\n");
fprintf(stderr, "Kept %i individuals from %i populations\n", p_i, pop_n);
if((dosage = malloc(n * sizeof(double *))) == NULL) {
fprintf(stderr, merror);
exit(EXIT_FAILURE);
}
for(j = 0; j < n; j++) {
if((dosage[j] = malloc(2 * sizeof(double))) == NULL) {
fprintf(stderr, merror);
exit(EXIT_FAILURE);
}
}
if(gene_n == 0) {
if(isatty(1))
fprintf(stderr, "\n");
printf("chr\tbp\tfst\n");
}
continue;
}
site_i++;
strncpy(chr, strtok(temp, "_"), 100);
pos = atoi(strtok(NULL, "_"));
if(gene_n > 0) {
ok = 0;
while(gene_i < gene_n) {
if(strcmp(chr, genes[gene_i].chr) == 0) {
if(pos <= genes[gene_i].end + bp && pos >= genes[gene_i].start - bp) {
ok = 1;
break;
} else if(pos < genes[gene_i].start - bp)
break;
} else if(strcmp(chr, genes[gene_i].chr) < 0)
break;
gene_i++;
}
if(ok == 0)
continue;
}
while(temp != NULL) {
if(i > 3) {
prob[j] = atof(temp);
j++;
if(j == 3) {
if(prob[0] != NA | prob[1] != NA | prob[2] != NA) {
dosage[k][0] = prob[1] + 2 * prob[2];
dosage[k][1] = prob[1];
} else
dosage[k][0] = 9;
j = 0;
k++;
}
}
temp = strtok_r(NULL, "\t", &end);
i++;
}
if(temp == NULL) {
vars = estVars(dosage, plist, pop_n, n, p_i, min, maf);
if(isnan(vars.hw) == 1)
continue;
kept_i++;
if(gene_n == 0)
printf("%s\t%i\t%f\n", chr, pos, estFst(vars));
else {
for(i = m; i < gene_n; i++) {
if(strcmp(chr, genes[i].chr) == 0) {
if(pos <= genes[i].end + bp && pos >= genes[i].start - bp) {
if(pos < genes[i].start && genes[i].str == '+') {
genes[i].up.hw += vars.hw;
genes[i].up.hb += vars.hb;
genes[i].up.n++;
} else if(pos < genes[i].start && genes[i].str == '-') {
genes[i].down.hw += vars.hw;
genes[i].down.hb += vars.hb;
genes[i].down.n++;
} else if(pos > genes[i].end && genes[i].str == '+') {
genes[i].down.hw += vars.hw;
genes[i].down.hb += vars.hb;
genes[i].down.n++;
} else if(pos > genes[i].end && genes[i].str == '-') {
genes[i].up.hw += vars.hw;
genes[i].up.hb += vars.hb;
genes[i].up.n++;
} else {
genes[i].cds.hw += vars.hw;
genes[i].cds.hb += vars.hb;
genes[i].cds.n++;
}
} else if(pos < genes[i].start - bp) {
m = i;
for(j = 1; j <= i; j++) {
if(pos <= genes[i - j].end + bp && pos >= genes[i - j].start - bp)
m = i - j;
else if(pos > genes[i - j].end + bp) {
if(i - j - 1 >= 0) {
if(pos > genes[i - j - 1].end + bp)
break;
} else
break;
}
}
break;
}
} else if(strcmp(chr, genes[i].chr) < 0) {
m = i;
for(j = 1; j <= i; j++) {
if(strcmp(chr, genes[i - j].chr) == 0) {
if(pos <= genes[i - j].end + bp && pos >= genes[i - j].start - bp)
m = i - j;
else if(pos > genes[i - j].end + bp) {
if(i - j - 1 >= 0) {
if(pos > genes[i - j - 1].end + bp)
break;
} else
break;
}
} else if(strcmp(chr, genes[i - j].chr) > 0)
break;
}
break;
}
}
}
}
}
if(gene_n > 0) {
if(isatty(1))
fprintf(stderr, "\n");
if(bp == 0) {
printf("id\tcoding_fst\tcoding_n\n");
for(i = 0; i < gene_n; i++)
printf("%s\t%f\t%i\n", genes[i].id, estFst(genes[i].cds), genes[i].cds.n);
} else {
printf("id\tup_fst\tup_n\tcoding_fst\tcoding_n\tdown_fst\tdown_n\n");
for(i = 0; i < gene_n; i++)
printf("%s\t%f\t%i\t%f\t%i\t%f\t%i\n", genes[i].id, estFst(genes[i].up), genes[i].up.n, estFst(genes[i].cds), genes[i].cds.n, estFst(genes[i].down), genes[i].down.n);
}
}
if(isatty(1))
fprintf(stderr, "\n");
fprintf(stderr, "Kept %i out of %i sites\n", kept_i, site_i);
free(line);
free(plist);
free(pops);
if(gene_n > 0)
free(genes);
fclose(beagle_file);
}
Var_s estVars(double **dosage, int **plist, int pop_n, int ind_n, int plist_n, int min, double maf) {
int i, j, ok = 1;
double a = 0, b = 0, c = 0, pbar = 0, nbar = 0, hbar = 0, n_sum = 0, n_sum2 = 0, nc = 0, r = 0, s2 = 0, *p = NULL, *n = NULL;
Var_s vars;
if((p = malloc(pop_n * sizeof(double))) == NULL) {
fprintf(stderr, merror);
exit(EXIT_FAILURE);
}
if((n = malloc(pop_n * sizeof(double))) == NULL) {
fprintf(stderr, merror);
exit(EXIT_FAILURE);
}
memset(p, 0, sizeof(double) * pop_n);
memset(n, 0, sizeof(double) * pop_n);
for(i = 0; i < ind_n; i++) {
if(dosage[i][0] == 9)
continue;
for(j = 0; j < plist_n; j++) {
if(plist[j][0] == i) {
p[plist[j][1]] += dosage[i][0];
n[plist[j][1]]++;
pbar += dosage[i][0];
hbar += dosage[i][1];
n_sum++;
}
}
}
for(i = 0; i < pop_n; i++) {
p[i] /= n[i] * 2;
n_sum2 += (n[i] * n[i]);
if(n[i] < min)
ok = 0;
}
r = (double)pop_n;
nbar = n_sum / r;
pbar /= n_sum * 2;
hbar /= n_sum;
if(ok == 0 || pbar < maf || 1 - pbar < maf) {
free(p);
free(n);
vars.hw = 0.0 / 0.0;
return vars;
}
for(i = 0; i < pop_n; i++)
s2 += n[i] * (p[i] - pbar) * (p[i] - pbar);
s2 /= (r - 1.0) * nbar;
nc = n_sum - (n_sum2 / n_sum) / (r - 1.0);
a = (s2 - (pbar * (1.0 - pbar) - (((r - 1.0) * s2) / r) - (hbar / 4.0)) / (nbar - 1.0)) * nbar / nc;
b = (pbar * (1.0 - pbar) - (s2 * (r - 1.0) / r) - hbar * (((2.0 * nbar) - 1.0) / (4.0 * nbar))) * nbar / (nbar - 1.0);
c = hbar / 2.0;
if(isnan(a) || isnan(b) || isnan(c))
vars.hw = 0.0 / 0.0;
else {
vars.hw = a;
vars.hb = a + b + c;
}
free(p);
free(n);
return vars;
}
double estFst(Var_s vars) {
return vars.hw / vars.hb;
}
int isNumeric(const char *s) {
char *p;
if(s == NULL || *s == '\0' || isspace(*s))
return 0;
strtod(s, &p);
return *p == '\0';
}
void lineTerminator(char *line) {
int i;
for(i = 0; line[i] != 0; i++) {
if(line[i] == '\n' || line[i] == '\r')
line[i] = '\0';
}
}