#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <numeric>
#include <cassert>
using namespace std;
int M = 0; // # of sequences
// skip meta-info lines and header line
void skipMeta(ifstream& in) {
string s;
while (getline(in, s)) {
if ((int)s.size() < 2 || s[0] != '#' || s[1] != '#') break;
}
}
// find M using vcf's info field - resets input stream pointer when finished
void getM(ifstream& in) {
int startPos = in.tellg();
string s; getline(in, s);
int tabs = 0;
for (int i = 0; i < (int)s.size(); ++i) {
if (s[i] == '\t') ++tabs;
if (tabs >= 9 && s[i] == '|') ++M;
}
in.seekg(startPos);
M *= 2;
}
int main(int argc, char* argv[]) {
ios_base::sync_with_stdio(0); cin.tie(0);
ifstream in(argv[1]);
ofstream out(string(argv[2]) + ".rpbwt", ios::binary), sites(string(argv[2]) + ".sites"), meta(string(argv[2]) + ".meta");
int checkpoint = atoi(argv[3]);
skipMeta(in);
getM(in);
int site = 0;
vector<int> positions;
vector<int> pre(M), div(M); // prefix and divergence array
iota(pre.begin(), pre.end(), 0);
vector<int> a(M), b(M), d(M), e(M);
char s[2 * M + 5000]; // assumes fixed fields take up less than 5000 characters
in.seekg(-1, ios::end);
if (in.peek() == '\n') in.seekg(-1, ios::cur); // the process won't work if it starts on a newline character
in.seekg(-2 * M, ios::cur);
while (in.tellg() >= 0) {
// process for reading file backwards
while (in.peek() != '\n') in.seekg(-1, ios::cur);
streampos pos = in.tellg();
in.seekg(1, ios::cur);
in.getline(s, 2 * M + 5000);
in.seekg(pos - (streampos)1);
in.seekg(-2 * M, ios::cur);
if (s[0] == '#' && s[1] == 'C') break; // header line that starts with "#CHROM POS ID etc."
// skip fixed fields and get chromosome position
string position;
int offset = 0, tabs = 0; // offset = position in "s" of first sequence - points to the first character after 9 tabs
while (tabs < 9) {
if (tabs == 1 && s[offset] != '\t') position += s[offset];
if (s[offset] == '\t') ++tabs;
++offset;
}
positions.push_back(stoi(position));
// pbwt algorithm
int u = 0, v = 0, p = site + 1, q = site + 1;
for (int i = 0; i < M; ++i) {
int id = pre[i];
if (div[i] > p) p = div[i];
if (div[i] > q) q = div[i];
assert(s[offset + (id / 2) * 4 + 1] == '|'); // sanity check
if (s[offset + (id / 2) * 4 + (id % 2) * 2] == '0') {
a[u] = id;
d[u] = p;
++u;
p = 0;
}
else {
b[v] = id;
e[v] = q;
++v;
q = 0;
}
}
for (int i = 0; i < u; ++i) {
pre[i] = a[i];
div[i] = d[i];
}
for (int i = 0; i < v; ++i) {
pre[u + i] = b[i];
div[u + i] = e[i];
}
// output
for (int i = 0; i < M; ++i) {
out.write((char*)&pre[i], sizeof pre[i]);
out.write((char*)&div[i], sizeof div[i]);
}
if (site % checkpoint == 0) cout << "Checkpoint " << site << endl;
++site;
}
// create file with site positions
for (int i = (int)positions.size() - 1; i >= 0; --i) {
sites << positions[i] << '\n';
}
// create meta file
meta << M << ' ' << site << '\n'; // M and N
in.close();
out.close();
sites.close();
meta.close();
return 0;
}