Content - d7ba61d16c9351b245b1f1971c5527dddf6288ad - e491604/SNPReader.c

visit type:
https://github.com/sfu-compbio/mrsfast

05 April 2024, 19:47:20 UTC
Tip revision: cf8e678d30f68b6c5af722e5bfd36f1d1076092e authored by Faraz Hach on 10 June 2020, 20:07:02 UTC
Update Makefile
Tip revision: cf8e678
SNPReader.c
/*
 * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, 
 * are permitted provided that the following conditions are met:
 *   
 * Redistributions of source code must retain the above copyright notice, this list
 * of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright notice, this
 *   list of conditions and the following disclaimer in the documentation and/or other
 *   materials provided with the distribution.
 * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be
 *   used to endorse or promote products derived from this software without specific
 *   prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Author: 
 *        Faraz Hach (fhach AT cs DOT sfu DOT ca)
 *        Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca)
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "Common.h"
#include "HashTable.h"
#include "SNPReader.h"

CompressedSeq	*_snp_SNPMap		= NULL;
int				_snp_SNPMapLength	= 0;
ChrSNPs			*_snp_chrSNPs		= NULL;
int				_snp_chrCnt			= 0;
char 			**_snp_chrNames		= NULL;
int				_snp_currentChr		= 0;
int				_snp_currentLoc		= 0;

/**********************************************/
int findChrIndex(char *chrName)
{
	int i;
	char cname[CONTIG_NAME_SIZE];	// chr name in FASTA file

	for (i = 0; i < _snp_chrCnt; i++)
	{
		strcpy(cname, _snp_chrNames[i]);
		if (strlen(cname) > 3 && cname[0] == 'c' && cname[1] == 'h' && cname[2] == 'r')
			strcpy(cname, _snp_chrNames[i] + 3);		// get rid of the potential "chr" at the beginning
		if (! strcmp(cname, "MT"))						// change "MT" to "M" for consistency with dbSNP naming
			cname[1] = '\0';

		if (! strcmp(chrName, cname))
			return i;
	}
	return -1;
}
/**********************************************/
void initLoadingSNPs(char *fileName)
{
	int i, loc, t, locCnt, chrIndex, nameLen;
	char cname[CONTIG_NAME_SIZE];	// chromosome name from dbSNP
	int ccnt;						// number of chromosomes in dbSNP
	//int chrNameOffset = 0;			// used to trim the "chr" at the beginning of chromosome names
	SNPLoc *dummy = getMem(MAX_SNP_PER_CHR * sizeof(SNPLoc));

	_snp_chrCnt = getChrCnt();
	_snp_chrNames = getChrNames();
	_snp_chrSNPs = getMem(_snp_chrCnt * sizeof(ChrSNPs));
	
	for (i = 0; i < _snp_chrCnt; i++)		// FASTA chromosomes
	{
		//chrNameOffset = (strlen(_snp_chrNames[i]) > 3 && chrNames[i][0] == 'c' && chrNames[i][1] == 'h' && chrNames[i][2] == 'r') ?3 :0;
		_snp_chrSNPs[i].chrName = _snp_chrNames[i];// + chrNameOffset;

		_snp_chrSNPs[i].locCnt = 0;
		_snp_chrSNPs[i].snpLocs = NULL;	//getMem(MAX_SNP_PER_CHR * sizeof(SNPLoc));
	}

	_snp_SNPMapLength = (calculateCompressedLen(CONTIG_MAX_SIZE)+1) * sizeof(CompressedSeq);
	_snp_SNPMap = getMem(_snp_SNPMapLength);

	FILE *fp = fopen(fileName, "rt");
	t = fread(&ccnt, sizeof(int), 1, fp);		// ccnt = number of chromosomes in dbSNP

	// look for each dbSNP chromosome in the reference
	for (i = 0; i < ccnt; i++)
	{
		t = fread(&nameLen, sizeof(int), 1, fp);
		t = fread(cname, sizeof(char), nameLen, fp);
		t = fread(&locCnt, sizeof(int), 1, fp);

		cname[nameLen] = '\0';
		chrIndex = findChrIndex(cname);

		if (chrIndex != -1)		// found in FASTA chromosomes
		{
			_snp_chrSNPs[chrIndex].locCnt = locCnt;
			_snp_chrSNPs[chrIndex].snpLocs = getMem(locCnt * sizeof(SNPLoc));
			t = fread(_snp_chrSNPs[chrIndex].snpLocs, sizeof(SNPLoc), locCnt, fp);
		}
		else					// not found
		{
			t = fread(dummy, sizeof(SNPLoc), locCnt, fp);	// read dummy
			fprintf(stdout, "Warning: chromosome %s is present in the SNP database but not found in the reference genome\n", cname);
		}
	}

	fclose(fp);
	freeMem(dummy, MAX_SNP_PER_CHR * sizeof(SNPLoc));
}
/**********************************************/
void finalizeSNPs()
{
	int i;
	for (i = 0; i < _snp_chrCnt; i++)
		freeMem(_snp_chrSNPs[i].snpLocs, _snp_chrSNPs[i].locCnt * sizeof(SNPLoc));
	freeMem(_snp_chrSNPs, _snp_chrCnt * sizeof(ChrSNPs));
	freeMem(_snp_SNPMap, _snp_SNPMapLength);
}
/**********************************************/
CompressedSeq *loadSNPMap(char *chrName, int contigStartIndex, int contigLength, char *alt)
{
	//memset(_snp_SNPMap, -1, calculateCompressedLen(contigLength) * sizeof(CompressedSeq));
	memset(_snp_SNPMap, -1, _snp_SNPMapLength);
	int contigEnd = contigStartIndex + contigLength;
	int loc, offset;
	CompressedSeq *snp, mask;

	if ( strcmp(chrName, _snp_chrSNPs[_snp_currentChr].chrName) )		// new chr
	{
		_snp_currentChr++;
		_snp_currentLoc = 0;
	}

	if (_snp_chrSNPs[_snp_currentChr].locCnt)
	{
		int i = _snp_currentChr;		// just to make the code more readable
		int pos = _snp_currentLoc;

		while ( pos < _snp_chrSNPs[i].locCnt && _snp_chrSNPs[i].snpLocs[pos].loc < contigStartIndex )	// this should never happen!
			pos ++;

		while ( pos < _snp_chrSNPs[i].locCnt && _snp_chrSNPs[i].snpLocs[pos].loc < contigEnd )
		{
			loc = _snp_chrSNPs[i].snpLocs[pos].loc - contigStartIndex - 1;
			alt[loc] = _snp_chrSNPs[i].snpLocs[pos].alt;
			offset = loc % 21;
			mask = 0x7000000000000000;
			mask = ~(mask >> offset*3);
			
			snp = _snp_SNPMap + (loc/21);
			*snp &= mask;

			pos ++;
		}

		_snp_currentLoc = pos;
	}
	return _snp_SNPMap;
}
/**********************************************/
void rewindSNPIndex()
{
	_snp_currentChr = 0;
	_snp_currentLoc = 0;
}