https://github.com/MoorjaniLab/DATES_v4010
Raw File
Tip revision: e034dc0d6fe8d41a828796f07791d50011b6bb04 authored by MoorjaniLab on 09 May 2022, 23:55:41 UTC
Add files via upload
Tip revision: e034dc0
admutils.c
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
// #include <openssl/md2.h>
#include "admutils.h" 

#include  "packit.h"   
static int fastdupnum = 10 ;
static double fastdupthresh = 0.75 ;
static double fastdupkill = 0.75 ;
static int snptab = NO ;

#define MAXSTR 1024

int hashit (char *str)  ;

int countcol(char *fname) {
 FILE *fp ;
 int t ;

 openit(fname, &fp, "r") ;
 t = countcolumns(fp) ;
 fclose(fp) ;
 return t ;
}


int countcolumns(FILE *fp)
{  /* count number of text columns separated by whitespace */
  int i=0,c;
  fpos_t ptr;

  if (fgetpos(fp,&ptr)) {
    printf("error counting columns\n");
    return 0;
  }

  while ( (c = getc(fp)) != '\n' ) {
    if (isgraph(c)) {
      i++;
      while (isgraph(c = getc(fp))) {}
      ungetc(c,fp);
    }
  }
  fsetpos(fp,&ptr);
  return i;
}

void sett1(double *tt, double theta, int numstates) 
{
    if (numstates==2)  { 
     tt[0] = 1.0-theta ;
     tt[1] = theta ;
     tt[2] = 0.0 ;
    }
    if (numstates==3)  { 
     tt[0] = (1.0-theta)*(1.0-theta) ;
     tt[1] = 2.0*theta*(1.0-theta) ;
     tt[2] = theta*theta ;
    }
}

void sett1r(double *tt, double theta, int numstates, double risk)
{
  double y ;
  sett1(tt, theta, numstates) ;
  tt[1] *= risk ;
  tt[2] *= risk*risk ;  
  y = asum(tt, numstates) ;
  vst(tt, tt, 1.0/y, numstates) ;
}

void gettln(SNP *cupt, Indiv *indx, 
  double *ptheta, double *plambda, int *pnumstates, int *pignore) 
/* set theta, lambda numstates */
{
   double theta, lambda ; 
   int numstates, chrom, ignore ;

   theta  = indx->theta_mode ;
   lambda = indx->lambda_mode ;
   ignore = indx->ignore ;
   numstates = 3 ;

   chrom = cupt -> chrom ;

   if (chrom == 23) { 
    theta  = indx->Xtheta_mode ;
    lambda = indx->Xlambda_mode ;
    if (indx -> gender == 'M') numstates = 2;
    if (indx -> gender == 'U') ignore = YES ;
   }
   if (ptheta !=NULL) *ptheta = theta ;
   if (plambda != NULL) *plambda = lambda ;
   if (pnumstates != NULL) *pnumstates = numstates ;
   if (pignore != NULL) *pignore = ignore ;
}


void puttln(SNP *cupt, Indiv *indx, 
  double ptheta, double plambda) 
/* put theta, lambda */
{

   int chrom ;


   chrom = cupt -> chrom ;

   if (chrom == 23) { 
    indx->Xtheta_mode = ptheta;
    indx->Xlambda_mode  = plambda ;
    return ;
   }
   indx->theta_mode = ptheta;
   indx->lambda_mode  = plambda ;
   return ;
}


double hwcheck (SNP *cupt, double *cc) 
{

      int i, n, g ;


      vzero(cc, 3) ;

      n = cupt -> ngtypes ;
      for (i=0; i<n; i++)  {
        g = getgtypes(cupt, i) ;
        if (g<0) continue ;
        ++cc[g] ;
     }
     return (hwstat(cc)) ;

}

double hwcheckx (SNP *cupt, Indiv **indm, double *cc) 
// deals with X  
{

      int i, n, g, ignore, numstates ;
      Indiv *indx ;
      double t, l ; 


      vzero(cc, 3) ;

      n = cupt -> ngtypes ;
      for (i=0; i<n; i++)  {
        indx = indm[i] ;
        gettln(cupt, indx, &t, &l, &numstates, &ignore) ;
        if (ignore) continue ; 
        if (numstates != 3) continue ;
        g = getgtypes(cupt, i) ;
        if (g<0) continue ;
        ++cc[g] ;
     }
     return (hwstat(cc)) ;

}
void hap2dip(SNP *cupt) 
// duplicate chromosomes           
{
   int i, n, g ;

   n = cupt -> ngtypes ;
   for (i=0; i<n; i++) {  
    g = getgtypes(cupt, i) ; 
    if (g<0) continue ;
    if (g==2) g = -1 ; 
    if (g==1) g = 2 ; 
    putgtypes(cupt, i, g) ;
   }
}

void flipalleles(SNP *cupt) 
// flip reference, variant counts
{
   int i, n, g ;
   n = cupt -> ngtypes ;
   for (i=0; i<n; i++) {  
    g = getgtypes(cupt, i) ; 
    if (g<0) continue ;
    putgtypes(cupt, i, 2-g) ;
   }
}

void flipalleles_phased(SNP *cupt) 
// flip reference, variant counts
{
   int i, n, g ;
   n = cupt -> ngtypes ;
   for (i=0; i<n; i++) {  
    g = getgtypes(cupt, i) ; 
    if (g<0) continue ;
    if (g>1) fatalx("genotype > 1 in phased file\n") ;
    putgtypes(cupt, i, 1-g) ;
   }
}


void cntit(double *xc, SNP *c1, SNP *c2) 
{
 int n, i, e, f ;

 n = MIN(c1->ngtypes, c2->ngtypes) ;
 vzero(xc, 9) ;
 for (i=0; i<n; i++) {  
  e = getgtypes(c1, i) ;
  f = getgtypes(c2, i) ;
  if (e<0) continue ;
  if (f<0) continue ;
  ++xc[3*e+f] ;
 }
}

/******** UTILITY FUNCTIONS **********/
void fataly(const char *name)
{
  printf("%s",name);
  exit(1);
}

int compare_doubles (const void *a, const void *b)
{
  const double *da = (const double *) a;
  const double *db = (const double *) b;

  return (*da > *db) - (*da < *db);
}

void pcheck (char *name, char x) 
{
 
  if (name != NULL) return ;
  if (x != CNULL) 
   fatalx ("parameter %c compulsory\n",x) ;
  else fatalx("missing argument\n") ;
}

void printm(double **M, int numstates)  
{
     int i,j ;
     printf("M:\n") ;
     for (i=0; i<numstates; i++)  {  
      for (j=0; j<numstates; j++)  {  
       printf("%9.3f ", M[j][i]) ;
      }
      printf("\n") ;
     }
}


int numvalidgtypes(SNP *cupt) 
{
  int nvalid, n, i, k ;
  if (cupt -> isfake) return 0 ;
  n = cupt -> ngtypes ;
  nvalid = 0 ;
  for (i=0; i<n; i++)  {
   k = getgtypes(cupt, i) ;
   if (k >= 0) ++nvalid ;  
  }
  return nvalid ;
}  

int numvalids(Indiv *indx, SNP **snpmarkers, int fc, int lc)  
{ 
  SNP *cupt ; 
  int idnum, numstates, ignore ;  
  int k, nval= 0 ; 
 
  if (fc>lc) return 0 ; 
  if (lc<0) return 0 ; 
  idnum = indx -> idnum ; 
  for (k=fc; k<=lc; ++k) {   
    cupt = snpmarkers[k] ; 
    if (cupt -> isfake) continue ; 
    gettln(cupt, indx, NULL, NULL, &numstates, &ignore) ; 
    if (ignore) continue ; 
    if (cupt -> ngtypes == 0) continue ; 
    if (getgtypes(cupt, idnum) >= 0) ++nval ; 
  } 
  return nval ; 
} 

double malefreq(Indiv **indmarkers, int numindivs) 

/* pop freq of males in sample */
{
   int i ;
   Indiv *indx ; 
   double cmale, cfemale ;

   cmale = 0 ;
   for (i=0; i<numindivs; ++i) { 
    indx = indmarkers[i] ;
    if (indx -> gender == 'M') ++cmale ;
   }

   cmale /= (double) numindivs ;

   return cmale ;
}

int isimatch(int a, int b)
{
   if (a < 0)  return YES ;
   if (b < 0)  return YES ;
   if (a==b)   return YES ;
   return NO;
}

void gethpos(int *fc, int *lc, SNP **snpm, int numsnps,  
 int xchrom, int lo, int hi)  
{ 
   int k, xfc, xlc, pos ; 
   SNP *cupt ; 
 
   xfc = 9999999 ; 
   xlc = -9999999 ; 
   for (k=0; k<numsnps; k++) {  
    cupt = snpm[k] ; 
    if (cupt -> chrom != xchrom) continue ; 
    pos = cupt -> physpos ; 
    if (pos < lo) continue ; 
    if (pos > hi) continue ; 
    xfc = MIN(xfc, k) ; 
    xlc = MAX(xlc, k) ; 
   } 
   *fc = xfc ;  
   *lc = xlc ; 
} 

void makedir (char * dirname) 
// AT wrote original 
// sets up directory.  Will fail hard if directory does not 
// exist and can't be made 
{
 int fdir ; 
   fdir  = open(dirname,O_RDONLY,0);
   if (fdir >= 0) {  
    close (fdir) ;
    return ;
   }
   fdir = mkdir(dirname,0775);
   if (fdir < 0) {  
    perror("makedir") ;
    fatalx("(makedir) directory %s not created\n") ;
   }
   printf("Created a new directory %s\n",dirname);
}


int
indxindex(char **namelist, int len, char *strid)  
// look for string in list
{
     int k ; 
     for (k=0; k< len; k++) {  
      if (strcmp(namelist[k], strid) == 0) return k ;
     }
     return -1 ;
}

int indindex(Indiv **indivmarkers, int numindivs, char *indid) 
/* hash table would be good here */
{
     int k ; 
     for (k=0; k< numindivs; k++) {  
      if (strcmp(indivmarkers[k] -> ID, indid) == 0) return k ;
     }
     return -1 ;
}

void inddupcheck (Indiv ** indivmarkers, int numindivs)
{
  // fail hard if duplicate
  int t, k, n;
  char **ss;

  freesnpindex ();
  n = numindivs;
  ZALLOC (ss, n, char *);
  for (k = 0; k < n; k++) {
    ss[k] = strdup (indivmarkers[k]->ID);
  }
  t = finddup (ss, n);
  if (t >= 0)
    fatalx ("duplicate sample: %s\n", ss[t]);
  freeup (ss, n);
  free (ss);
}


int snpindex(SNP **snpmarkers, int numsnps, char *snpid) 
{
     int k ; 
     char **ss ;  

     if (snptab==NO) {  
// set up hash table
      snptab = YES ;
      ZALLOC(ss, numsnps, char *) ;
      for (k=0; k< numsnps; k++) {  
       ss[k] = strdup(snpmarkers[k] -> ID) ;
      }
      xloadsearch(ss, numsnps) ;
      freeup(ss, numsnps) ;
      free(ss) ;
     }
     
     k = xfindit(snpid) ;
     return k ;
}
void freesnpindex() 
{
 if (snptab == NO) return ;
 snptab = NO ; 
 xdestroy() ;
}

int ignoresnp(SNP *cupt)  
{
 if (cupt -> ignore) return YES ;
 if (cupt -> isfake) return YES ;
 if (cupt -> ngtypes == 0) return YES ;
 if (cupt -> isrfake) return NO ; 
 return NO ;
}

double entrop(double *a, int n) 
{
  int i ;
  double ysum, t, ans ;

  ans = 0.0 ; 
  ysum = asum(a,n) ;
  for (i=0; i<n ; i++) {
   t = a[i]/ysum ;
   ans += xxlog2(t) ;
  }
  return -ans ;
}
double xxlog2(double t) 
{
  if (t<=0.0) return t ;
  return t * log(t) / log(2.0) ;
}

void 
testnan(double *a, int n) 
{
  int i ;

  for (i=0; i<n; i++) {
   if (!finite(a[i])) fatalx("(testnan) fails:  index %d\n",i) ;
  }
}
void getgall(SNP *cupt, int *x, int n) 
{
  int k, t, a ;
  unsigned char b, w ;

  if (cupt -> gtypes == NULL) {          
   ivclear(x, -1, n) ;
   return ;
  }

  if (!packmode) {  
   copyiarr(cupt-> gtypes, x, n) ;
   return ;
  }

  k = 0 ;
  for (a=0; 4*a<n; ++a) {  
   w = cupt -> pbuff[a] ;
   for (t = 0; t < 4 ; t++)  { 
    b = w >> 2*(3-t) ;
    x[k] = b & 3 ;
    ++k ; 
    if (k>=n) break ;
   }
  }
}

int getgtypes(SNP *cupt, int k) 
{
  char *buff ; 
  int  g ;

  if (cupt -> gtypes == NULL) return -1 ;

  if (packmode)  {  
   buff = cupt -> pbuff ;          
   g = rbuff((unsigned char *)buff, k) ;  
   if (g==3) g = -1 ; 
   return g ;
  }

  return cupt -> gtypes[k] ;
}

void putgtypes(SNP *cupt, int k, int val) 
{
  char *buff ; 
  int   g ;

  if (k>= cupt -> ngtypes) fatalx("(putgtypes)\n") ;
  if (packmode)  {  
   buff = cupt -> pbuff ;          
   g = val ; 
   if (g <0) g=3 ;
   wbuff((unsigned char *)buff, k, g) ;  
   return ;
  }
  cupt ->gtypes[k] = val ;
}

int getep(SNP *cupt, int k) 
{
  char *buff ; 
  int   g ;

  if (cupt -> gtypes == NULL) return -1 ;
  if (k>= cupt -> ngtypes) return -1 ; 
   buff = cupt -> ebuff ;          
   g = rbuff((unsigned char *)buff, k) ;  
   if (g==3) g = -1 ; 
   return g ;

}

void putep(SNP *cupt, int k, int val) 
{
  char *buff ; 
  int   g ;

  buff = cupt -> ebuff ;          
  if (buff == NULL) fatalx("(putep) no buffer\n") ;
  if (k>= cupt -> ngtypes) fatalx("(putep)\n") ;
   g = val ; 
   if (g <0) g=3 ;
   wbuff((unsigned char *)buff, k, g) ;  
   return ;

}

int hasharr(char **xarr, int nxarr)  
// in application ordering  matters so we hash order dependent 
{
  
   int hash, thash, i, n ;  

   hash = 0 ;  

   for (i=0; i< nxarr; i++)  { 
    thash = hashit(xarr[i]) ;
    hash *= 17 ;
    hash ^= thash ;
   }
   return hash ;
}
int hashit (char *str) 
{
/* simple and unimpressive hash function NJP */
   int j, len, hash ; 

   hash = 0 ;
   len = strlen(str) ;

   for (j=0; j<len ; j++) {  
     hash *= 23 ;
     hash += (int) str[j] ;
   }
   return hash ;
}
void
wbuff(unsigned char *buff, int num, int g) 
// low level routine writes 2 bits to buffer 
// g should be 0 1 2 or 3   (3 = missing) 
{

   int wnum, wplace ;  
   unsigned char mm, msk, ones ;
   static int ncall = 0 ;
 
   if ((g<0) || (g>3)) fatalx("(wbuff) invalid g value\n", g) ;

   ++ncall ;

   msk = 3 << 6 ;  
   mm = g << 6 ;
   ones = 0XFF  ;

   wnum = num/4 ;  
   wplace = num%4 ;

   mm = mm >> (wplace * 2) ;  
   msk = (msk >> (wplace *2)) ^ ones ;  
   buff[wnum] &= msk ;
   buff[wnum] |= mm ;

/**
     printf("zz %d  %d %d %d %02x\n", num, wnum, wplace, g, buff[wnum]) ;
     printf("yyy %d %d\n", g, rbuff(buff,num)) ;
*/
}
int
rbuff(unsigned char *buff, int num) 
{
   int wnum, wplace, rshft ;  
   unsigned char  b ;
   static int ncall = 0 ;
 
// ++ncall ;


   wnum = num >> 2 ;
   wplace = num & 3 ; 

   rshft = (3-wplace) << 1 ;
   b = buff[wnum] >> rshft ;

   b = b & 3 ;
   return b ;
}
// fast dup code
void setfastdupnum(int  num) 
{
 fastdupnum = num ;
}
void setfastdupthresh(double thresh, double kill) 
{
 fastdupthresh = thresh ;
 fastdupkill = kill ;
}

void
killxhets(SNP **snpmarkers, Indiv **indivmarkers, int numsnps, int numindivs)
{
 SNP *cupt ; 
 Indiv *indx ;
 int i, k, g ;

 for (i=0; i<numsnps; i++) {  
  cupt = snpmarkers[i]  ;
  if (cupt -> ignore) continue ;  
  if (cupt -> isfake) continue ; 
  if (cupt -> chrom != 23) continue ;
  for (k=0; k<numindivs; k++) { 
    indx = indivmarkers[k]  ;
    if (indx -> gender != 'M') continue ;
    g = getgtypes(cupt, k) ; 
    if (g != 1) continue ;
    putgtypes(cupt, k, -1) ;
  }
 }
}
  

void
fastdupcheck(SNP **snpmarkers, Indiv **indivmarkers, int numsnps, int numindivs)
{
 SNP *cupt ; 
 Indiv *indx ;
 int *gtypes ;
 int i, j, k, n ;
 int *snphets, *indsnp, tab[15], ww[15], **codeit, *cc, g, *cbuff ; 
 int *buff, val, vv, lbuff, itry, ilo, ihi ;

 ZALLOC(gtypes, numindivs, int) ;
 ZALLOC(cbuff, 2*numindivs, int) ;
 ZALLOC(codeit, numindivs, int *) ;
 ZALLOC(snphets, numsnps, int) ;
 ZALLOC(indsnp, numsnps, int) ;
 for (i=0; i<numsnps; i++) {  
  cupt = snpmarkers[i]  ;
  if (cupt -> ignore) continue ;  
  if (cupt -> isfake) continue ; 
  if (cupt -> chrom > 22) continue ;
  grabgtypes(gtypes, cupt, numindivs) ;
  for (k=0; k<numindivs; k++) { 
    if (gtypes[k] == 1) ++snphets[i] ;
  }
 }
 ivst(snphets, snphets, -1, numsnps) ;
 isortit(snphets, indsnp, numsnps) ;
// make fastdupnum shots at exact match on 15 snps */
 for (itry = 1; itry < fastdupnum; itry++) {
  ilo = 15*itry ;  
  if ((ilo+15)>=numsnps) break ;
  for (i=0; i<15; i++) { 
   j = indsnp[i+ilo] ; 
   tab[i] = j ;
  }
  n = 0 ;
  for (k=0; k<numindivs; ++k) {  
   indx = indivmarkers[k] ; 
   if (indx -> ignore) continue ;
   for (i=0; i<15; i++) { 
    j = tab[i] ;
    g  = getgtypes(snpmarkers[j], k) ;
    if (g<0) break ;
    ww[i] = g ; 
   }
   if (g < 0 ) continue ;
   cc = codeit[n] = cbuff+2*n ;
   cc[0] = kcode(ww, 15, 4) ;
   cc[1] = k ; 
   ++n  ;
  }


  if (n==0) continue ;
  ipsortit(codeit, NULL, n, 2) ;  

  buff = gtypes ; lbuff = 0; val = -1 ;  

  for (i=0; i<n; i++)  {  
   cc=codeit[i] ;  
   vv = cc[0] ;
   if (vv != val) {  
    cdup(snpmarkers, indivmarkers, numsnps, buff, lbuff) ;
    lbuff = 0 ; 
    val = vv ; 
   }
   buff[lbuff] = cc[1] ;
   ++lbuff ;
  }
  cdup(snpmarkers, indivmarkers, numsnps, buff, lbuff) ;
 }  // itry

 free(snphets) ;
 free(indsnp) ;
 free(gtypes) ;
 free(codeit) ;
 free(cbuff) ;
}

void cdup(SNP **snpm, Indiv **indm, int nsnp, int *buff, int lbuff) 
{ 
 static int ncall = 0 ;
 SNP * cupt ;
 Indiv *inda, *indb ;
 double ytot, yhit ;
 int g1, g2, k1, k2, match, nomatch ;
 int i1, i2, j ;
#define MINMARK 200 


 if (lbuff <= 1) return ;
 ++ncall ;

 if (ncall<=1) {
//printf ("cdup: %d\n", ncall) ;
  printf("fastdupthresh, kill: %9.3f %9.3f\n", fastdupthresh, fastdupkill) ;
//printimat(buff, lbuff, 1) ;
 }
// if (ncall==1) printf(" cdup %9.3f %9.3f\n", fastdupthresh, fastdupkill) ;
 for (i1=0; i1<lbuff; ++i1) {  
  for (i2=i1+1; i2<lbuff; ++i2) {
   k1 = buff[i1] ; 
   k2 = buff[i2] ;  
   match = nomatch = 0 ;
   for (j=0; j<nsnp; ++j) {  
    cupt = snpm[j] ;
    if (cupt -> ignore) continue ;
    if (cupt -> isfake) continue ;
    g1 = getgtypes(cupt, k1) ;
    g2 = getgtypes(cupt, k2) ;
    if ( (g1<0) || (g2<0) ) continue ;
    if (g1==g2) ++match ;
    if (g1!=g2) ++nomatch ;
   }

   inda = indm[k1] ;
   indb = indm[k2] ;
   ytot = (double) (match + nomatch) ;
   if (ytot< MINMARK) continue ;
   yhit = ((double) match) / ytot  ; 

   if (yhit>fastdupthresh) { 
    printdup(snpm, nsnp, inda, indb, match, nomatch)  ;
    if (yhit>fastdupkill) killdup(inda, indb, snpm, nsnp) ;
   }
  }
 }
}

void killdup(Indiv *inda, Indiv *indb, SNP **snpm, int nsnp) 
{
       int t1, t2 ;
       Indiv *indx ;

       t1 = numvalids(inda, snpm, 0, nsnp-1) ;
       t2 = numvalids(indb, snpm, 0, nsnp-1) ;
       indx = inda ; 
       if (t1>t2) indx = indb ;
       indx -> ignore = YES ; 
       printf("dup.  %s ignored\n", indx -> ID) ;
}

void printdup(SNP **snpm, int numsnp, Indiv *inda, Indiv *indb, int nmatch, int nnomatch) 
{
       int t1, t2 ;
       double y ;

       if (nmatch<=0) return ;
       if (inda -> ignore) return ;
       if (indb -> ignore) return ;

       t1 = numvalids(inda, snpm, 0, numsnp-1) ;
       t2 = numvalids(indb, snpm, 0, numsnp-1) ;
       printf("dup? %s %s  match: %d mismatch: %d   %d %d ", 
        inda->ID, indb -> ID, nmatch, nnomatch, t1, t2) ;
       printf("%20s ", inda->egroup) ;
       printf("%20s", indb->egroup) ;
       y = nnomatch / (double) (nnomatch+nmatch) ;
       printf(" %9.3f", y) ;
       printnl() ; 
}

int kcode(int *w, int len, int base) 
{
 int i, t ;
 t = 0; 
 for (i=0; i<len; i++) {  
  t *= base ;  
  t += w[i] ;
 }
 return t ;
}

int 
 grabgtypes(int *gtypes, SNP *cupt, int numindivs) 
{

 int k ;

 for (k=0; k<numindivs; k++) { 
  gtypes[k] = getgtypes(cupt, k) ;
 }

}

double kurtosis(double *a, int n) 
{

  double  y1, y2, y4 ;  
  double *w  ;

  ZALLOC(w, n, double) ;

  y1 = asum(a,n) / (double) n ;  
  vsp(w, a, -y1, n) ; 
  
  y2 = asum2(w, n) / (double) n ;
  vst(w, w, 1.0/sqrt(y2), n) ;

  vvt(w, w, w, n) ;  

  y4 = asum2(w, n) / (double) n ;


  free(w) ;
  return y4 ; 

}

int getlist(char *name, char **list) 
{
#define MAXFF 5
  FILE *fff ;
  char line[MAXSTR] ;
  char *spt[MAXFF] ;
  char *sx ;
  int nsplit, num=0 ;

  num = 0;
  if (name == NULL) fatalx("(numlines)  no name")  ;
  openit(name, &fff, "r") ;
  while (fgets(line, MAXSTR, fff) != NULL)  {
   nsplit = splitup(line, spt, MAXFF) ;
   if (nsplit==0) continue ;
   sx = spt[0] ;
   if (sx[0] == '#') {
    freeup(spt, nsplit) ;
    continue ;
   }
   list[num] = strdup(sx) ;
   ++num ;
   freeup(spt, nsplit) ;
  }
  fclose(fff) ;
  return num ;
}
void printvers(char *progname, char *vers) 
{
 printf("## %s  version: %s", progname, vers) ; 
 printnl() ;
}
int numvalidind(Indiv **indivmarkers, int  numind)  
{ 
  int i ;              
  int nvalids = 0 ;
  Indiv *indx ;

  for (i=0; i< numind;  i++) {  
   indx = indivmarkers[i] ; 
   if (indx -> ignore) continue ;
   ++nvalids ;
  }
  return nvalids ; 
} 

void numvalidgtallind(int *x, SNP **snpm, int numsnps, int numind) { 
// count valids for all 
  int n = 0 ;
  int k, t, j ; 
  SNP *cupt ;
  int *z ;  
  
  ZALLOC(z, numind, int) ;
  ivzero(x, numind) ;
  for (k=0; k< numsnps; ++k) {  
   cupt = snpm[k]  ;
   if (cupt -> ignore) continue ;
   getgall(cupt, z, numind) ;
   for (j=0; j<numind; ++j) { 
    if (z[j] >=0) ++x[j] ;
   }
  }
  free(z) ;
}


int numvalidgtind(SNP **snpm, int numsnps, int ind) { 
  int n = 0 ;
  int k, t ; 
  SNP *cupt ;
  
  for (k=0; k< numsnps; ++k) {  
   cupt = snpm[k]  ;
   if (cupt -> ignore) continue ;
   t = getgtypes(cupt, ind) ;
   if (t >= 0)   ++n ;
  }
  return n ;
}

int numvalidgt(Indiv **indivmarkers, SNP *cupt)  
     // like numvalidgtypes but tests ignore   
{ 
  int n, k, nvalids ; 
  if (cupt -> gtypes == NULL) return 0 ; 
  nvalids = 0 ; 
  n = cupt -> ngtypes ; 
  for (k=0; k<n; k++) {   
    if (indivmarkers[k] -> ignore)  continue ; 
    if (getgtypes(cupt, k) >=0) ++nvalids ; 
  } 
  return nvalids ; 
} 
int numvalidgtx(Indiv **indivmarkers, SNP *cupt, int affst)  
// like numvalidgtypes but tests ignore   counts only when status=affst
{ 
  int n, k, nvalids ; 
  if (cupt -> gtypes == NULL) return 0 ; 
  nvalids = 0 ; 
  n = cupt -> ngtypes ; 
  for (k=0; k<n; k++) {   
    if (indivmarkers[k] -> ignore)  continue ; 
    if (indivmarkers[k] -> affstatus != affst) continue ;
    if (getgtypes(cupt, k) >=0) ++nvalids ; 
  } 
  return nvalids ; 
} 

int isxmale(SNP *cupt, Indiv *indx) 
{
  if (cupt -> chrom != 23) return NO ; 
  if (indx -> gender != 'M') return NO ;
  return YES ;
}

void
printmatz(double *ww, char **eglist, int n) 
{
  int i, j , x ;
  printf(" %4s", "   ") ;
  for (i=0; i<n; i++)  {  
   printf(" %4s", get3(eglist[i])) ; 
  }
  printnl() ;
  for (i=0; i<n; i++)  {  
    printf("%4s", get3(eglist[i])) ;
    for (j=0; j<n; j++) { 
     x = nnint(1000*ww[i*n+j]) ; 
     printf(" %4d", x)  ;              
    }
    printnl() ;
  }
}
void
printmatz5(double *ww, char **eglist, int n) 
{
  int i, j , x ;
  printf(" %5s", "   ") ;
  for (i=0; i<n; i++)  {  
   printf(" %5s", get3(eglist[i])) ; 
  }
  printnl() ;
  for (i=0; i<n; i++)  {  
    printf("%5s", get3(eglist[i])) ;
    for (j=0; j<n; j++) { 
     x = nnint(1000*ww[i*n+j]) ; 
     printf(" %5d", x)  ;              
    }
    printnl() ;
  }
}
void
printmatz10(double *ww, char **eglist, int n) 
{
  int i, j , x ;
  printf(" %5s", "   ") ;
  for (i=0; i<n; i++)  {  
   printf(" %10s", get3(eglist[i])) ; 
  }
  printnl() ;
  for (i=0; i<n; i++)  {  
    printf("%5s", get3(eglist[i])) ;
    for (j=0; j<n; j++) { 
     x = nnint(1000000*ww[i*n+j]) ; 
     printf(" %10d", x)  ;              
    }
    printnl() ;
  }
}



char *get3(char *ss) 
{
  return getshort(ss, 3) ;
}
char *getshort(char *ss, int n) 
{
  static char xxx[MAXSTR] ;
  strcpy(xxx, ss)  ;
  xxx[n-2] = ' ' ;  
  xxx[n-1] = CNULL ;
  return xxx ;
}




int setid2pops(char *idpopstring, Indiv **indmarkers, int numindivs) 
// replace pop by ID for certain samples
{
#define MAXS 1000 
  char *spt[MAXS] ;
  char *sx ;
  int nsplit, k, t ;
  Indiv *indx ;
 
  nsplit = splitupx(idpopstring, spt, MAXS, ':') ;
  for (k=0; k<nsplit; ++k) { 
   sx = spt[k] ;
   t = indindex(indmarkers, numindivs, sx) ; 
   if (t<0) { 
    printf("(setid2pops): %s not found\n") ;
    continue ;
   }
   indx = indmarkers[t] ;
   freestring(&indx -> egroup) ;
   indx -> egroup = strdup(indx -> ID) ;
  }
  freeup(spt, nsplit) ;
  return nsplit ;
}



int gvalm(char cc, char cm, char c1, char c2, int minfilterval) 
{
   int x=0, t ;
   char cb[2] ; 

   if (fvalid(cm, minfilterval) == NO) return 9 ;      
   if (isiub2(cc) == NO) return 9  ;  
   t = iubcbases(cb, cc) ; 

   if (cb[0] == c1) ++x ;
   if (cb[1] == c1) ++x ;


   return x ;

}

int fvalid(char cm, int minfilterval) 
// is cm indicating valid? 
{
  int t ; 

  if (minfilterval < 0) return YES ;
  t = (int) cm - (int) '0' ;

  if (t<0) return NO;
  if (t>=10) return NO ;
  if (t<minfilterval) return NO ;

  return YES ; 
}


int setfalist(char **poplist, int npops, char *dbfile, char **iublist, char *table_path) 
{
	int t;
	for (t = 0; t < npops; ++t) {
		iublist[t] = strdup(table_path);
		iublist[t] = (char*) realloc(iublist[t], 64);
		iublist[t] = strcat(iublist[t], poplist[t]);
		if ((!strcmp (poplist[t], "Chimp") || !strcmp (poplist[t], "Href")) && strcmp (dbfile, ".fa")) {
			free (iublist[t]);
			iublist[t] = "NULL";
		} else 
			iublist[t] = strcat(iublist[t], dbfile);
	}
	return npops;
}
  

int mkmaplist(char *fname, char **kword, char **val) 
// return list of (kword, values) sorted in descending kword lenth
// return # of keywords
{
   FILE *fff ;  
  char line[MAXSTR + 1];
  char ww[MAXSTR];
  char rest[MAXSTR];
  int len, plen, nmap, n ;
  char **tkword, **tval ; 
  int *slenm, *indx, k, x, l ;

   
   
   openit(fname, &fff, "r") ;

   n = numlines(fname) ; 
   ZALLOC(tkword, n, char *) ;
   ZALLOC(tval, n, char *) ;
 
   nmap = 0 ; 

  line[MAXSTR] = '\0';          // defensive programmming
  while (fgets (line, MAXSTR, fff) != NULL) {

    len = strlen (line);

    if (isspace (line[len - 1]))
      line[len - 1] = CNULL ;

    if (first_word (line, ww, rest) > 0) {

      if (ww[0] == '#')
        continue;
      plen = strlen (ww);
      if (ww[plen - 1] != ':') continue ; 
      ww[plen-1] = CNULL ;
      if (upstring(ww) == NO) continue ; 
  
      tkword[nmap] = strdup(ww) ; 
      tval[nmap] = strdup(rest) ; 
      ++nmap ; 

    }

  }

  fclose(fff) ;

  if (nmap == 0) { 
   freeup(tkword, n) ;
   freeup(val, n) ;
   return 0 ;
  }
 
// sort in descending order of length
   ZALLOC(slenm, nmap, int) ; 
   ZALLOC(indx, nmap, int) ; 
   for (k=0; k<nmap; ++k) { 
    x = 10000 * strlen(tkword[k]) + (nmap - k) ; 
    slenm[k] = -x ;
   }
   isortit(slenm, indx, nmap) ; 
   for (k=0; k<nmap; ++k)  { 
    x = indx[k] ;
    kword[k] = strdup(tkword[x]) ;
    val[k] = strdup(tval[x]) ;
   }


  freeup(tkword, n) ;
  freeup(tval, n) ;
  free(slenm) ;
  free(indx) ;

  return nmap ;

}
int getfalist(char **poplist, int npops, char *dbfile, char **iublist)  

{
 char line[MAXSTR+1] ;
 char *spt[MAXFF], *sx ;
 char c ;
 int nsplit ;
 int  tt, t, k, s, nx = 0  ;
 FILE *fff ;
 char *scolon ; 
 char **kword, **val ;
 int nn, nu ; 
  
  if (dbfile == NULL) return 0 ;
  openit(dbfile, &fff, "r") ;

  nn = numlines(dbfile) ;

  ZALLOC(kword, nn, char *) ;
  ZALLOC(val, nn, char *) ;

  nu =  mkmaplist(dbfile, kword, val)  ; 
/**
  printf("##zz nu: %d\n", nu) ;
  printstrings(kword, nu) ;
  printstrings(val, nu) ;
  printnl() ;
*/
  

  while (fgets(line, MAXSTR, fff) != NULL)  {
   nsplit = splitup(line, spt, MAXFF) ; 
   if (nsplit<1) continue ;
   sx = spt[0] ;
   if (sx[0] == '#') { 
    freeup(spt, nsplit) ;
    continue ;
   }
   t = indxstring(poplist, npops, spt[0]) ; 
   if (t<0) { 
    freeup(spt, nsplit) ; 
    continue ;
   }
    sx = spt[2] ;
    tt = strcmp(sx, "NULL") ; 
    if (tt == 0) {  
     iublist[t] = NULL ;
    }
    else {
     mapstrings(&sx, kword, val, nu) ;  
     iublist[t] = strdup(sx) ; 
    }
    freeup(spt, nsplit) ;
    ++nx ;
  }

   fclose(fff) ;
   return nx ;

}


int getdbname(char *dbase, char *name, char **pfqname) 
{
 int n, k, t, i ; 
 char **tpoplist, **tfalist ; 
 int numfalist ; 

// fprintf(stderr, "call numlines in [getdbname]\n");

 ZALLOC(tpoplist, 1, char *) ;
 ZALLOC(tfalist, 1, char *) ;
 tpoplist[0] = strdup(name) ; 
 numfalist = getfalist(tpoplist, 1, dbase, tfalist) ;	
 if (numfalist == 0) fatalx("name: %s not found in %s\n", name, dbase) ;

 *pfqname = strdup(tfalist[0]) ;
 
 freeup(tpoplist, 1) ;
 freeup(tfalist, 1) ;
 
 return 1 ; 
}


int cmap(SNP **snpmarkers, int numsnps) 
{
   int t, k ; 
   double y1, y2 ; 
   SNP *cupt ;  
   for (k=1 ; k<=10; ++k) { 
    t = ranmod(numsnps) ;
    cupt = snpmarkers[t] ; 
    y1 = cupt -> genpos ; 
    y2 = cupt -> physpos / 1.0e8  ; 
    if (fabs(y1-y2) > .001) return YES ;
   }
   return NO ;
}

void setinfiles(char **pind, char **psnp, char **pgeno, char *stem) 
{
  char ss[MAXSTR] ; 

  snprintf(ss, MAXSTR, "%s.ind", stem) ; 
  fcheckr(ss) ; 
  *pind = strdup(ss) ; 

  snprintf(ss, MAXSTR, "%s.snp", stem) ; 
  fcheckr(ss) ; 
  *psnp = strdup(ss) ; 


  snprintf(ss, MAXSTR, "%s.geno", stem) ; 
  fcheckr(ss) ; 
  *pgeno = strdup(ss) ; 

  printf("input files set from %s\n", stem) ; 



}














back to top