#include #include #include #include #include #include #include #include #include #include #include using namespace std; int M, N, L, W, G; // # of sequences, # of sites, length, width, gap size struct SparseTable { static const int B = 30; int N, blocks; vector v, mask; vector> table; SparseTable(vector& _v) { this->v = _v; this->N = (int)v.size(); blocks = N / B; mask = vector(N); table = vector>(blocks, vector(msb(blocks) + 1)); int cur = 0; // sliding mask for (int i = 0; i < N; ++i) { cur = (cur << 1) & ((1 << B) - 1); while (cur > 0 && max(v[i], v[i - msb(lsb(cur))]) == v[i]) cur ^= lsb(cur); cur |= 1; mask[i] = cur; } for (int i = 0; i < blocks; ++i) table[i][0] = mini_query(B * i + B - 1); for (int j = 1; (1 << j) <= blocks; ++j) { for (int i = 0; i + (1 << j) - 1 < blocks; ++i) { table[i][j] = max(table[i][j - 1], table[i + (1 << (j - 1))][j - 1]); } } } // least significant set bit int lsb(int num) {return num & -num;} // index of most significant set bit int msb(int num) {return __builtin_clz(1) - __builtin_clz(num);} int mini_query(int r, int len = B) { return v[r - msb(mask[r] & ((1 << len) - 1))]; } int query(int l, int r) { if (r - l + 1 <= B) return mini_query(r, r - l + 1); int ret = max(mini_query(l + B - 1), mini_query(r)); int blockL = l / B + 1, blockR = r / B - 1; if (blockL <= blockR) { int j = msb(blockR - blockL + 1); ret = max({ret, table[blockL][j], table[blockR - (1 << j) + 1][j]}); } return ret; } }; struct VCFReader { ifstream vcf; int G, M, p1 = 0; vector> gap; // stores haplotype data in the gap vector ID; VCFReader(string file, int _G, int _M) { vcf = ifstream(file); G = max(_G, 1), M = _M; // gap size of 0 is equivalent to a gap size of 1 when handling the VCF file and gap gap = vector>(G, vector(M)); ID = vector(M); preprocess(); initGap(); } // gets haplotype IDs and moves input stream pointer to start of raw data void preprocess() { // skip meta-info lines and get header line string header; while (getline(vcf, header)) { if ((int)header.size() < 2 || header[0] != '#' || header[1] != '#') break; } // input sample IDs stringstream ss(header); for (int i = 0; i < 9; ++i) getline(ss, ID[0], '\t'); // skip fixed columns, assumes 9 columns (FORMAT column) for (int i = 0; i < M / 2; ++i) { getline(ss, ID[2 * i], '\t'); ID[2 * i + 1] = ID[2 * i] + "-1"; ID[2 * i] += "-0"; } } // initializes sliding window for the gap void initGap() { for (int i = 0; i < G; ++i) nextSite(); } // reads the next site in the VCF file void nextSite() { char s[2 * M + 5000]; // assumes fixed fields take up less than 5000 characters vcf.getline(s, 2 * M + 5000); // skip fixed fields int offset = 0, tabs = 0; // offset = position in "s" of first sequence - points to the first character after 9 tabs while (tabs < 9) { if (s[offset] == '\t') ++tabs; ++offset; } for (int i = 0; i < M; ++i) { assert(s[offset + (i / 2) * 4 + 1] == '|'); // sanity check gap[p1][i] = (s[offset + 2 * i] == '0' ? 0 : 1); } p1 = (p1 + 1) % G; } int getGap(int g, int idx) { return gap[(p1 + g) % G][idx]; } void close() {vcf.close();} }; void countingSort(vector>& v, int idx) { vector>> table(M + 1); for (int i = 0; i < M; ++i) { table[v[i][idx]].push_back(v[i]); } int p = 0; for (int i = 0; i <= M; ++i) { for (int j = 0; j < (int)table[i].size(); ++j) { v[p++] = table[i][j]; } } } void processBlock(vector>& link, int start, int end, vector& idx, vector& rIdx, SparseTable& forwardSparse, SparseTable& backwardSparse, int site, int rsite, vector& positions, vector& ID, ofstream& blocks, double& MI, vector& blockSize, vector& rBlockSize, VCFReader& vcf) { // [start, end) // compute MI double pxy = (double)(end - start) / M; double px = (double)blockSize[link[start][1]] / M; double py = (double)rBlockSize[link[start][2]] / M; MI += pxy * log2(pxy / px / py); if (end - start < W) return; // width too small int f_mini = M - 1, f_maxi = 0, r_mini = M - 1, r_maxi = 0; vector zero(G), one(G); for (int j = start; j < end; ++j) { int id = link[j][0]; f_mini = min(f_mini, idx[id]); f_maxi = max(f_maxi, idx[id]); r_mini = min(r_mini, rIdx[id]); r_maxi = max(r_maxi, rIdx[id]); for (int k = 0; k < G; ++k) { if (vcf.getGap(k, id) == 0) ++zero[k]; else ++one[k]; } } // check if every site in the gap has a mismatch bool flag = false; for (int k = 0; k < G; ++k) { if (min(zero[k], one[k]) == 0) flag = true; // no mismatch at this site in the gap } if (flag) return; int fL = (site - 1) - forwardSparse.query(f_mini + 1, f_maxi) + 1; // length of forward block int rL = rsite - backwardSparse.query(r_mini + 1, r_maxi) + 1; // length of reverse block blocks << site << ' ' << positions[site] << ' ' << fL << ' ' << rL << ' ' << positions[site - fL] << ' ' << positions[site + G + rL - 1] << ' ' << (end - start); for (int j = start; j < end; ++j) blocks << ' ' << ID[link[j][0]]; blocks << '\n'; } int main(int argc, char* argv[]) { ios_base::sync_with_stdio(0); cin.tie(0); string writeTo = string(argv[2]); ifstream backward(writeTo + ".rpbwt"), sites(writeTo + ".sites"), meta(writeTo + ".meta"); ofstream blocks(writeTo + ".blocks"), resultMI(writeTo + ".MI"); int checkpoint = atoi(argv[3]); L = atoi(argv[4]), W = atoi(argv[5]), G = atoi(argv[6]); // retrieve M and N from meta file meta >> M >> N; VCFReader vcf(string(argv[1]), G, M); // input chromosome site positions vector positions(N); for (int i = 0; i < N; ++i) sites >> positions[i]; vector pre(M), div(M), backwardPre(M), backwardDiv(M); // prefix and divergence arrays for forward and backward PBWT iota(pre.begin(), pre.end(), 0); vector a(M), b(M), d(M), e(M); vector idx(M), rIdx(M); // idx[i] = index of sample i in the positional prefix array; r = reverse vector block(M), blockSize(M + 1), rBlock(M), rBlockSize(M + 1); // block[i] = block ID of sample i in the reverse PBWT; block IDs go from [1, M] for (int site = 0; site + G < N; ++site) { if (site != 0) { int rsite = (N - 1) - site - G; // index of the corresponding reverse site backward.seekg((long long)rsite * M * 8); // initialize rIdx, backwardPre, rBlock, and rBlockSize int start = -1, id = 0; for (int i = 0; i < M; ++i) { backward.read((char*)&backwardPre[i], sizeof backwardPre[i]); rIdx[backwardPre[i]] = i; int rDiv; backward.read((char*)&rDiv, sizeof rDiv); backwardDiv[i] = rDiv; rDiv = (N - 1) - rDiv; // get forward index for position comparision if ((string(argv[7]) == "0" && positions[rDiv] < positions[site + (G - 1)] + L) || (string(argv[7]) == "1" && rDiv < site + (G - 1) + L)) { rBlockSize[id] = i - start; ++id; start = i; } rBlock[backwardPre[i]] = id; } // special case where a matching block extends up to the final haplotype rBlockSize[id] = M - start; // initialize idx, block, and blockSize start = -1, id = 0; for (int i = 0; i < M; ++i) { idx[pre[i]] = i; if ((string(argv[7]) == "0" && positions[div[i]] > positions[site] - L) || (string(argv[7]) == "1" && div[i] > site - L)) { blockSize[id] = i - start; ++id; start = i; } block[pre[i]] = id; } // special case where a matching block extends up to the final haplotype blockSize[id] = M - start; SparseTable forwardSparse(div), backwardSparse(backwardDiv); // build sparse tables // Algorithm 2 - block matching vector> link(M, vector(3)); // [sample ID, forward block ID, reverse block ID] for (int i = 0; i < M; ++i) { link[i][0] = i, link[i][1] = block[i], link[i][2] = rBlock[i]; } // radix sort countingSort(link, 2); countingSort(link, 1); double MI = 0; // mutual information start = 0; for (int i = 1; i < M; ++i) { if (link[i][1] != link[i - 1][1] || link[i][2] != link[i - 1][2]) { processBlock(link, start, i, idx, rIdx, forwardSparse, backwardSparse, site, rsite, positions, vcf.ID, blocks, MI, blockSize, rBlockSize, vcf); start = i; } } processBlock(link, start, M, idx, rIdx, forwardSparse, backwardSparse, site, rsite, positions, vcf.ID, blocks, MI, blockSize, rBlockSize, vcf); resultMI << positions[site] << ' ' << MI << '\n'; } // pbwt algorithm int u = 0, v = 0, p = site + 1, q = site + 1; for (int i = 0; i < M; ++i) { int id = pre[i]; if (div[i] > p) p = div[i]; if (div[i] > q) q = div[i]; if (vcf.getGap(0, id) == 0) { a[u] = id; d[u] = p; ++u; p = 0; } else { b[v] = id; e[v] = q; ++v; q = 0; } } for (int i = 0; i < u; ++i) { pre[i] = a[i]; div[i] = d[i]; } for (int i = 0; i < v; ++i) { pre[u + i] = b[i]; div[u + i] = e[i]; } vcf.nextSite(); // move input stream pointer if (site % checkpoint == 0) cout << "Checkpoint " << site << endl; } vcf.close(); sites.close(); meta.close(); blocks.close(); resultMI.close(); return 0; }