/* Characters
   
   Copyright 2009, 2010 Andreas Enge, Pierrick Gaudry, Fran\c{c}ois Morain, Emmanuel Thom\'e, Paul Zimmermann
   
   This file is part of CADO-NFS.
   
   CADO-NFS is free software; you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published
   by the Free Software Foundation; either version 2.1 of the License, or
   (at your option) any later version.
   
   CADO-NFS is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more
   details.
   
   You should have received a copy of the GNU Lesser General Public
   License along with CADO-NFS; see the file COPYING.  If not, write to
   the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
   Boston, MA 02110-1301, USA.
*/

// Input:
//
// * [k] A bit matrix of (small_nrows) rows and (some multiple of 64)
//   cols, whose column span _contains_ the kernel of the matrix which
//   has been fed to the linear algebra.
// * A list of the (npurged) a,b pairs. This is obtained from the
//   purgedfile.
// * A matrix of (small_nrows) rows and (npurged) cols, which indicates
//   the contents of each relation-set. This is obtained from the
//   indexfile.
//
// Output:
//
// * A subspace of the column span of [k], whose vectors happen to also
//   cancel the characters used. By construction, it might contain zero
//   vectors, which are eventually discarded from the output.
//
// Algorithm.
//
// * [big character matrix, bcmat] First we build a matrix of size
//   npurged x nchars, where each row corresponds to the character values
//   at the corresponding pair (a,b)
// * [small character matrix, scmat] Then this matrix is multiplied by
//   the corresponding matrix taken from the indexfile.
//              scmat = index * bcmat
//   We thus obtain a matrix of size small_nrows x nchars, containing the
//   character values for each relation set.
// * [heavy block, h] According to the value of the ``skip'' parameter,
//   ``dense'' block which was taken out of the matrix during replay is
//   now read again. This is a matrix of size small_nrows x skip -- the
//   concatenation of scmat and h is thus a matrix of size (small_nrows)
//   x (nchars + skip), although this concatenation is not computed.
//
// * [t] At this point, if k were completely satisfactory, we would have:
//            transpose(k) * [scmat | h] == 0
//   this is a priori not the case. Therefore we compute the product:
//            t = transpose(k) * [scmat | h]
// * [kb] Now we compute the transpose of the left nullspace of t, namely
//   a matrix kb of size ncols(k) * ncols(k) (number of ``kernel
//   vectors'' out of bwc), and satisfying:
//            transpose(kb) * t == 0
//            transpose(k * kb) * [scmat | h] == 0
// * [nk] The ``new kernel'' k*kb is computed as nk.
// * [nkt] Its transpose is computed, so that each kernel vector can be
//   printed separately.
//
// Notes.
//
// Because [k] is only guaranteed to _contain_ the kernel in its column
// span, it is reasonable to first compute a basis of this column span.
// This is done on a heuristic basis, taking the first 4k coordinates
// only.

#include "cado.h"

#include <stdio.h>
#include <stdlib.h>
#include <gmp.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/stat.h>

#include "mod_ul.c"

#include "cado-endian.h"
#include "portability.h"
#include "utils_with_io.h"
#include "blockmatrix.h"
#include "gauss.h"
#include "worker-threads.h"

// #if defined(__FreeBSD__) && (__FreeBSD__ <= 7)
/* pthread_cond_wait seems to be buggy on FreeBSD 7 (shard.starfyre.net) */
/* #define NTHREADS 1
#else
#define NTHREADS 16
#endif */

/* Calculates a 64-bit word with the values of the characters chi(a,b), where
 * chi ranges from chars to chars+64
 */
uint64_t eval_64chars(int64_t a, uint64_t b, alg_prime_t * chars, cado_poly_ptr pol)
{
    /* FIXME: do better. E.g. use 16-bit primes, and a look-up table. Could
     * beat this. */
    uint64_t v = 0;
    for(int i = 0 ; i < 64 ; i++) {
        alg_prime_t * ch = chars + i;
        int res;
        if (ch->p == 0) {
            // all special characters are identified by p==0
            if (ch->r == 0) {
                res = 0;        // trivial character
            } else if (ch->r == 1) {
                res = 1;        // parity character
            } else if (ch->r == 2) {
                /* Special: rational sign (sign of m1*a+m2*b) */
                mpz_t tmp1, tmp2;

		/* FIXME: the code below only works for a rational g(x),
		   extend it to non-linear g(x) */
		ASSERT_ALWAYS(pol->rat->deg == 1);

                /* first perform a quick check */
                res = (a > 0) ? mpz_sgn(pol->rat->coeff[1]) : -mpz_sgn(pol->rat->coeff[1]);
                if (mpz_sgn(pol->rat->coeff[0]) != res) {
                    mpz_init(tmp1);
                    mpz_mul_si(tmp1, pol->rat->coeff[1], a);
                    mpz_init(tmp2);
                    mpz_mul_ui(tmp2, pol->rat->coeff[0], b);
                    mpz_add(tmp1, tmp1, tmp2);
                    res = mpz_sgn(tmp1) < 0;
                    mpz_clear(tmp1);
                    mpz_clear(tmp2);
                } else {
                    res = res < 0;
                }
            } else if (ch->r == 3) {
                res = (b==0);        // parity of the number of free relations
            } else {
                abort();
            }
        } else {
            /* Compute b*r-a (mod p) */
            residueul_t ra, rb, rr;
            modulusul_t mp;
            modul_initmod_ul(mp, ch->p);
            modul_init(ra, mp);
            modul_init(rb, mp);
            modul_init(rr, mp);
            if (a < 0) {
                ASSERT((uint64_t)(-a) <= ULONG_MAX);
                modul_set_ul(ra, (unsigned long)(-a), mp);
                modul_neg(ra, ra, mp);
            } else {
                ASSERT((uint64_t)a <= ULONG_MAX);
                modul_set_ul(ra, a, mp);
            }
            ASSERT(b <= ULONG_MAX);
            modul_set_ul(rb, (unsigned long)b, mp);
            modul_set_ul_reduced(rr, ch->r, mp);
            modul_mul(rr, rb, rr, mp);
            modul_sub(rr, rr, ra, mp);
            res = modul_jacobi(rr, mp);
            modul_clear(ra, mp);
            modul_clear(rb, mp);
            modul_clear(rr, mp);
            modul_clearmod(mp);
            
            // If res is 0, it means that ch->p divides the norm, which
            // should not be, unless the special-q has been chosen to be
            // larger than lpb.
            // A fix, for that case that should not happen in real life
            // would be to artificially enlarge the large prime bound
            // if we had to sieve beyond it, so that subsequent program
            // (including characters) behaves correctly.
            if (res == 0) {
                fprintf (stderr, "Error, Jacobi symbol is 0 for a = %" PRId64 
                         ", b = %" PRIu64 ", p = %lu, r = %lu\n", 
                         a, b, ch->p, ch->r);
                ASSERT_ALWAYS(res != 0);
            }
            res = res < 0;   // -1->1, 1->0
        }
        v |= ((uint64_t) res) << i;
    }
    return v;
}

struct charbatch {
    uint64_t * W;
    int64_t * A;
    uint64_t *B;
    unsigned int n;
    alg_prime_t * chars;
    cado_poly_ptr pol;
};
void eval_64chars_batch_thread(struct worker_threads_group * g, int tnum, void * t)
{
    struct charbatch * ss = (struct charbatch *) t;

    for(unsigned int z = tnum * ss->n / g->n ; z < (tnum + 1) * ss->n / g->n ; z++) {
        int64_t a = ss->A[z];
        uint64_t b = ss->B[z];
        ss->W[z] = eval_64chars(a,b,ss->chars,ss->pol);
    }
    return;
}

static alg_prime_t * create_characters(int nchars, int nratchars,
        cado_poly pol, unsigned long *lpb)
{
    unsigned long p;
    int ret;
    mpz_t pp;
    unsigned long *roots;

    ASSERT_ALWAYS(nchars);

    int nchars2 = iceildiv(nchars, 64) * 64;

    mpz_init (pp);
    roots = malloc(pol->alg->deg * sizeof(unsigned long));

    alg_prime_t * chars = malloc(nchars2 * sizeof(alg_prime_t));

    /* force rational sign */
    chars[0] = (alg_prime_t) { .p = 0, .r = 2 };
    /* force parity */
    chars[1] = (alg_prime_t) { .p = 0, .r = 1 };
    /* force parity of the free relations. This is really only because
     * we're lazy -- it's been asserted that it eases stuff at some
     * point, but nobody remembers the why and how. */
    chars[2] = (alg_prime_t) { .p = 0, .r = 3 };

    /* we might want to force evenness of the number of relations as well. Easy
     * to put this in chars[1] if needed (and add the appropriate stuff above
     * of course).  */

    /* Rational characters. Normally we have none. But the -nratchars
     * option inserts some */
    /* we want some prime beyond the (rational) large prime bound */
    mpz_set_ui (pp, 1UL << lpb[RATIONAL_SIDE]);
    for(int i = 3 ; i < 3 + nratchars && i < nchars ; ) {
        mpz_nextprime(pp, pp);
        p = mpz_get_ui(pp);
        ret = poly_roots_ulong(roots, pol->rat->coeff, pol->rat->deg, p);
        for(int j = 0 ; j < ret && i < 3 + nratchars && i < nchars ; j++, i++) {
            chars[i].p = p;
            chars[i].r = roots[j];
        }
    }
    /* we want some prime beyond the (algebraic) large prime bound */
    mpz_set_ui (pp, 1UL << lpb[ALGEBRAIC_SIDE]);
    for(int i = 3 + nratchars ; i < nchars ; ) {
        mpz_nextprime(pp, pp);
        p = mpz_get_ui(pp);
        ret = poly_roots_ulong(roots, pol->alg->coeff, pol->alg->deg, p);
        for(int j = 0 ; j < ret && i < nchars ; j++, i++) {
            chars[i].p = p;
            chars[i].r = roots[j];
        }
    }
    /* pad with trivial characters */
    for(int i = nchars ; i < nchars2 ; i++) {
        chars[i] = (alg_prime_t) { .p = 0, .r = 0 };
    }
    if (nchars < nchars2) {
        fprintf(stderr, "Note: total %d characters, including %d trivial padding characters\n", nchars2, nchars2-nchars);
    }

    free(roots);
    mpz_clear(pp);

    return chars;
}

typedef struct
{
  int64_t *a;
  uint64_t *b;
} chars_data_t;

void *
thread_chars (void * context_data, earlyparsed_relation_ptr rel)
{
  chars_data_t *data = (chars_data_t *) context_data;
  data->a[rel->num] = rel->a;
  data->b[rel->num] = rel->b;
  return NULL;
}

// The big character matrix has (number of purged rels) rows, and (number of
// characters) cols

static blockmatrix big_character_matrix(alg_prime_t * chars, unsigned int nchars2, const char * purgedname, cado_poly_ptr pol, struct worker_threads_group * g)
{
    uint64_t nrows, ncols;
    purgedfile_read_firstline (purgedname, &nrows, &ncols);

    int64_t  *all_A = (int64_t *)  malloc (nrows * sizeof(int64_t));
    uint64_t *all_B = (uint64_t *) malloc (nrows * sizeof(uint64_t));
    ASSERT_ALWAYS(all_A != NULL && all_B != NULL);
    blockmatrix res = blockmatrix_alloc(nrows, nchars2);
    blockmatrix_set_zero(res);

    /* For each rel, read the a,b-pair and init the corresponding poly pairs[] */
    fprintf(stderr, "Reading %" PRIu64 " (a,b) pairs from %s\n", nrows,
                     purgedname);
    chars_data_t data = {.a = all_A, .b=all_B};
    char *fic[2] = {(char *) purgedname, NULL};
    filter_rels (fic, (filter_rels_callback_t) thread_chars, &data,
          EARLYPARSE_NEED_AB_HEXA, NULL, NULL);

    fprintf(stderr, "Computing %u characters for %" PRIu64 " (a,b) pairs\n",
            nchars2, nrows);

    for(uint64_t i = 0 ; i < nrows; ) {
        static const int batchsize = 16384;
        int64_t A[batchsize];
        uint64_t B[batchsize];
        uint64_t W[batchsize];
        int bs = 0;
        while (bs < batchsize && i+bs < nrows) {
            A[bs] = all_A[i+bs];
            B[bs] = all_B[i+bs];
            bs++;
        }
        struct charbatch ss = { .W=W,.A=A,.B=B,.pol=pol,.n=bs };
        for(unsigned int cg = 0 ; cg < nchars2 ; cg+=64) {
            ss.chars = chars + cg;
            worker_threads_do(g, eval_64chars_batch_thread, &ss);
            for(int z = 0 ; z < bs ; z++) {
                *blockmatrix_subrow_ptr(res, i+z, cg) = W[z];
            }
        }
        i += bs;
    }
    free(all_A);
    fprintf (stderr, "all_A freed\n");
    free(all_B);
    fprintf (stderr, "all_B freed\n");

    return res;
}

/* The small character matrix has only (number of relation-sets) rows -- its
 * number of cols is still (number of characters) */
static blockmatrix small_character_matrix(blockmatrix bcmat, const char * indexname)
{
    FILE * ix = fopen_maybe_compressed(indexname, "r");
    int small_nrows, small_ncols;
    int ret;

    /* small_ncols isn't used here: we don't care. */
    ret = fscanf(ix, "%d %d", &small_nrows, &small_ncols);
    ASSERT(ret == 2);

    unsigned int nchars2 = bcmat->ncols;

    blockmatrix res = blockmatrix_alloc(small_nrows, nchars2);

    for(int i = 0 ; i < small_nrows ; i++) {
        int nc;
        ret = fscanf(ix, "%d", &nc); ASSERT_ALWAYS(ret == 1);
        for(unsigned int cg = 0 ; cg < nchars2 ; cg+=64) {
            res->mb[(i/64) + (cg/64) * res->nrblocks][i%64] = 0;
        }
        for(int k = 0 ; k < nc ; k++) {
            unsigned int col;
            ret = fscanf(ix, "%x", &col); ASSERT_ALWAYS(ret == 1);
            ASSERT_ALWAYS(col < bcmat->nrows);
            for(unsigned int cg = 0 ; cg < nchars2 ; cg+=64) {
                res->mb[(i/64) + (cg/64) * res->nrblocks][i%64] ^=
                    bcmat->mb[(col/64) + (cg/64) * bcmat->nrblocks][col%64];
            }
        }
    }
    fclose_maybe_compressed(ix, indexname);
    return res;
}

/* We support both ascii and binary format, which is close to a bug. */

static blockmatrix
read_heavyblock_matrix_binary(const char * heavyblockname)
{
    FILE * f = fopen(heavyblockname, "rb");

    if (f == NULL) {
        fprintf(stderr, "Warning: %s not found, assuming empty\n", heavyblockname);
        return blockmatrix_alloc(0,0);
    }

    unsigned int nrows, ncols;

    /* If we're binary, we insist on having the companion files as well,
     * which provide a quick hint at the matrix dimensions */

    {
        char * rwname = derived_filename(heavyblockname, "rw", ".bin");
        struct stat sbuf[1];
        int rc = stat(rwname, sbuf);
        if (rc < 0) { perror(rwname); exit(1); }
        nrows = sbuf->st_size / sizeof(uint32_t);
        free(rwname);
    }
    {
        char * cwname = derived_filename(heavyblockname, "cw", ".bin");
        struct stat sbuf[1];
        int rc = stat(cwname, sbuf);
        if (rc < 0) { perror(cwname); exit(1); }
        ncols = sbuf->st_size / sizeof(uint32_t);
        free(cwname);
    }

    blockmatrix res = blockmatrix_alloc(nrows, ncols);

    /* Sometimes the heavy block width is not a multiple of 64. Thus we pad
     * with zeros */
    blockmatrix_set_zero(res);

    for(unsigned int i = 0 ; i < nrows ; i++) {
        uint32_t len;
        int r = fread32_little(&len, 1, f);
        ASSERT_ALWAYS(r == 1);
        for( ; len-- ; ) {
            uint32_t v;
            r = fread32_little(&v, 1, f); ASSERT_ALWAYS(r == 1);
            res->mb[(i/64) + (v/64) * res->nrblocks][i%64] ^= ((uint64_t)1) << (v%64);
        }
    }
    fclose (f);
    return res;
}

static blockmatrix
read_heavyblock_matrix_ascii(const char * heavyblockname)
{
    FILE * f = fopen(heavyblockname, "r");

    if (f == NULL) {
        fprintf(stderr, "Warning: %s not found, assuming empty\n", heavyblockname);
        return NULL;
    }

    unsigned int nrows, ncols;
    int rc = fscanf(f,"%u %u", &nrows, &ncols);
    ASSERT_ALWAYS(rc == 2);

    blockmatrix res = blockmatrix_alloc(nrows, ncols);

    /* Sometimes the heavy block width is not a multiple of 64. Thus we pad
     * with zeros */
    blockmatrix_set_zero(res);

    for(unsigned int i = 0 ; i < nrows ; i++) {
        uint32_t len;
        int r = fscanf(f, "%"SCNu32, &len); ASSERT_ALWAYS(r == 1);
        for( ; len-- ; ) {
            uint32_t v;
            r = fscanf(f, "%"SCNu32, &v); ASSERT_ALWAYS(r == 1);
            res->mb[(i/64) + (v/64) * res->nrblocks][i%64] ^= ((uint64_t)1) << (v%64);
        }
    }
    fclose (f);
    return res;
}

static blockmatrix
read_heavyblock_matrix (const char * heavyblockname)
{
  if (heavyblockname == NULL)
    return blockmatrix_alloc (0, 0);
  else if (has_suffix(heavyblockname, ".bin"))
    return read_heavyblock_matrix_binary(heavyblockname);
  else
    return read_heavyblock_matrix_ascii(heavyblockname);
}

int compute_transpose_of_blockmatrix_kernel(blockmatrix kb, blockmatrix t)
{
    /* gauss.c's kernel() function takes its input with a different ordering.
     * It's tiny data anyway. */

    fprintf(stderr, "Computing left nullspace of %u x %u matrix\n",
            t->nrows, t->ncols);
    unsigned int tiny_nrows = t->nrows;
    unsigned int tiny_ncols = t->ncols;
    unsigned int tiny_limbs_per_row = iceildiv(tiny_ncols, 64);
    unsigned int tiny_limbs_per_col = iceildiv(tiny_nrows, 64);
    unsigned int tiny_chars = FLAT_BYTES_WITH_READAHEAD(t->nrows, t->ncols);
    unsigned int tiny_64bit_words = tiny_chars / sizeof(uint64_t);

    /* we need some readahead zones because of the block matrix structure */
    uint64_t * tiny = malloc (tiny_chars);
    memset(tiny, 0, tiny_chars);
    
    blockmatrix_copy_to_flat(tiny, tiny_limbs_per_row, 0, 0, t);

    /* The kernel matrix is essentially a square matrix of tiny_nrows rows and
     * columns (tiny_nrows is the same as the number of kernel vectors)
     */
    /* we need some readahead zones because of the block matrix structure */
    uint64_t * kerdata = malloc(FLAT_BYTES_WITH_READAHEAD(t->nrows, t->nrows));
    memset(kerdata, 0, FLAT_BYTES_WITH_READAHEAD(t->nrows, t->nrows));
    unsigned int kerdata_64bit_words = FLAT_BYTES_WITH_READAHEAD(t->nrows, t->nrows) / sizeof(uint64_t);


    uint64_t ** myker = (uint64_t **) malloc(tiny_nrows * sizeof(uint64_t *));
    ASSERT(myker != NULL);
    for (unsigned int i = 0; i < tiny_nrows; ++i)
        myker[i] = kerdata + i * tiny_limbs_per_col;
    /* gauss.c knows about mp_limb_t's only */
    ASSERT_ALWAYS(sizeof(uint64_t) % sizeof(mp_limb_t) == 0);
    swap_words_if_needed (tiny, tiny_64bit_words);
    int dim = kernel((mp_limb_t *) tiny,
            (mp_limb_t **) myker,
            tiny_nrows, tiny_ncols,
            sizeof(uint64_t) / sizeof(mp_limb_t) * tiny_limbs_per_row,
            sizeof(uint64_t) / sizeof(mp_limb_t) * tiny_limbs_per_col);
    swap_words_if_needed (tiny, tiny_64bit_words); /* FIXME: this is maybe not
                                                      needed since tiny is
                                                      destroyed, but keep it
                                                      for debugging */
    swap_words_if_needed(kerdata, kerdata_64bit_words);
    free(tiny);
    /* Now take back our kernel to block format, and multiply. Exciting. */
    if (kb)
      blockmatrix_copy_transpose_from_flat(kb, kerdata, tiny_limbs_per_col,
                                           0, 0);
    free(myker);
    free(kerdata);
    return dim;
}

/* This only builds a basis, not an echelonized basis */
blockmatrix blockmatrix_column_reduce(blockmatrix m, unsigned int max_rows_to_consider)
{
    blockmatrix t = blockmatrix_submatrix(m, 0, 0, MIN(max_rows_to_consider, m->nrows), m->ncols);

    unsigned int tiny_nrows = t->ncols;
    unsigned int tiny_ncols = t->nrows;
    unsigned int tiny_limbs_per_row = iceildiv(tiny_ncols, 64);
    unsigned int tiny_limbs_per_col = iceildiv(tiny_nrows, 64);

    unsigned int tiny_nlimbs = tiny_nrows * tiny_limbs_per_row;
    uint64_t * tiny = malloc(tiny_nlimbs * sizeof(uint64_t));
    memset(tiny, 0, tiny_nlimbs * sizeof(uint64_t));
    blockmatrix_copy_transpose_to_flat(tiny, tiny_limbs_per_row, 0, 0, t);

    uint64_t * sdata = (uint64_t *) malloc(tiny_nrows * tiny_limbs_per_col * sizeof(uint64_t));
    memset(sdata, 0, tiny_nrows * tiny_limbs_per_col * sizeof(uint64_t *));
    unsigned int sdata_64bit_words = tiny_nrows * tiny_limbs_per_col;
    free(t);

    swap_words_if_needed (tiny, tiny_nlimbs);
    int rank = spanned_basis(
            (mp_limb_t *) sdata,
            (mp_limb_t *) tiny,
            tiny_nrows,
            tiny_ncols,
            sizeof(uint64_t) / sizeof(mp_limb_t) * tiny_limbs_per_row,
            sizeof(uint64_t) / sizeof(mp_limb_t) * tiny_limbs_per_col,
            NULL
            );
    swap_words_if_needed (tiny, tiny_nlimbs);
    swap_words_if_needed (sdata, sdata_64bit_words);
    free(tiny);

    blockmatrix s = blockmatrix_alloc(m->ncols, rank);
    blockmatrix_copy_transpose_from_flat(s, sdata, tiny_limbs_per_col, 0, 0);
    blockmatrix k2 = blockmatrix_alloc(m->nrows, rank);
    blockmatrix_mul_smallb(k2, m, s);
    blockmatrix_free(s);
    free(sdata);
    return k2;
}

static void
declare_usage (param_list pl)
{
  param_list_decl_usage (pl, "purged", "output-from-purge file");
  param_list_decl_usage (pl, "index",  "index file");
  param_list_decl_usage (pl, "out",    "output file");
  param_list_decl_usage (pl, "heavyblock", "heavyblock output file");
  param_list_decl_usage (pl, "poly",   "polynomial file");
  param_list_decl_usage (pl, "nchar",  "number of (algebraic) characters");
  param_list_decl_usage (pl, "lpbr",   "large prime bound on rational side");
  param_list_decl_usage (pl, "lpba",   "large prime bound on algebraic side");
  param_list_decl_usage (pl, "nratchars", "number of rational characters");
  param_list_decl_usage (pl, "t",      "number of threads");
  param_list_decl_usage (pl, "ker",      "input kernel file");
}

int main(int argc, char **argv)
{
    const char * heavyblockname = NULL;
    int nchars;
    int nratchars = 0;
    alg_prime_t *chars;
    cado_poly pol;
    const char *purgedname = NULL;
    const char *indexname = NULL;
    const char *outname = NULL;
    int nthreads = 1;
    unsigned long lpb[2] = {0,0};
    char *argv0 = argv[0];

    /* print the command line */
    fprintf (stderr, "%s.r%s", argv[0], CADO_REV);
    for (int i = 1; i < argc; i++)
      fprintf (stderr, " %s", argv[i]);
    fprintf (stderr, "\n");

    param_list pl;
    param_list_init(pl);
    declare_usage(pl);

    argc--,argv++;
    const char *bw_kernel_file = NULL;

    for( ; argc ; ) {
        if (param_list_update_cmdline(pl, &argc, &argv)) continue;

        fprintf(stderr, "Unhandled parameter %s\n", argv[0]);
        param_list_print_usage(pl, argv0, stderr);
        exit (EXIT_FAILURE);
    }
    purgedname = param_list_lookup_string(pl, "purged");
    indexname = param_list_lookup_string(pl, "index");
    outname = param_list_lookup_string(pl, "out");
    heavyblockname = param_list_lookup_string(pl, "heavyblock");
    bw_kernel_file = param_list_lookup_string(pl, "ker");

    cado_poly_init (pol);

    const char * tmp;

    if ((tmp = param_list_lookup_string(pl, "poly")) == NULL)
      {
        fprintf (stderr, "Error: parameter -poly is mandatory\n");
        param_list_print_usage (pl, argv0, stderr);
        exit (EXIT_FAILURE);
      }
    cado_poly_read(pol, tmp);

    if (param_list_parse_int(pl, "nchar", &nchars) == 0)
      {
        fprintf (stderr, "Error: parameter -nchar is mandatory\n");
        param_list_print_usage (pl, argv0, stderr);
        exit (EXIT_FAILURE);
      }
    if (param_list_parse_ulong(pl, "lpbr", &lpb[RATIONAL_SIDE]) == 0)
      {
        fprintf (stderr, "Error: parameter -lpbr is mandatory\n");
        param_list_print_usage (pl, argv0, stderr);
        exit (EXIT_FAILURE);
      }
    if (param_list_parse_ulong(pl, "lpba", &lpb[ALGEBRAIC_SIDE]) == 0)
      {
        fprintf (stderr, "Error: parameter -lpba is mandatory\n");
        param_list_print_usage (pl, argv0, stderr);
        exit (EXIT_FAILURE);
      }

    param_list_parse_int(pl, "nratchars", &nratchars);
    param_list_parse_int(pl, "t", &nthreads);

    if (purgedname == NULL)
      {
        fprintf (stderr, "Error: parameter -purged is mandatory\n");
        param_list_print_usage (pl, argv0, stderr);
        exit (EXIT_FAILURE);
      }
     if (indexname == NULL)
       {
         fprintf (stderr, "Error: parameter -index is mandatory\n");
         param_list_print_usage (pl, argv0, stderr);
         exit (EXIT_FAILURE);
       }

    struct worker_threads_group * g = worker_threads_init (nthreads);
    chars = create_characters (nchars, nratchars, pol, lpb);
    int nchars2 = iceildiv(nchars, 64) * 64;
    double tt=wct_seconds();
    blockmatrix bcmat = big_character_matrix(chars, nchars2, purgedname, pol, g);
    free(chars);
    worker_threads_clear(g);

    fprintf(stderr, "done building big character matrix at %.1f\n", wct_seconds()-tt);

    blockmatrix scmat = small_character_matrix(bcmat, indexname);
    fprintf(stderr, "done building small character matrix at %.1f\n", wct_seconds()-tt);

    blockmatrix_free(bcmat);
    
    unsigned int small_nrows = scmat->nrows;

    /* It's ok if heavyblockname == 0. After all sufficiently many
     * characters should be enough */
    blockmatrix h = read_heavyblock_matrix(heavyblockname);
    if (h->ncols == 0) { h->nrows = small_nrows; }
    if (h->ncols)
          fprintf(stderr, "done reading heavy block of size %u x %u at %.1f\n",
                  h->nrows, h->ncols, wct_seconds()-tt);
    ASSERT_ALWAYS(h->nrows == small_nrows);

    /* Now do dot products of these matrices by the kernel vectors
     * supplied on input */

    /* First compute how many kernel vectors we have */

    unsigned int total_kernel_cols = 0;
    unsigned int kncols;
    struct stat sbuf[1];
    int rc = stat(bw_kernel_file, sbuf);
    if (rc < 0) { perror(bw_kernel_file); exit(1); }
    ASSERT_ALWAYS(sbuf->st_size % small_nrows == 0);
    unsigned int ncols = 8 * (sbuf->st_size / small_nrows);
    fprintf(stderr, "%s: %u vectors\n", bw_kernel_file, ncols);
    ASSERT_ALWAYS(ncols % 64 == 0);
    total_kernel_cols += kncols = ncols;

    fprintf(stderr, "Total: %u kernel vectors\n", total_kernel_cols);

    /* kmat is the join of all kernel vectors */
    blockmatrix k = blockmatrix_alloc(small_nrows, total_kernel_cols);
    int j0 = 0;
    blockmatrix_read_from_flat_file(k, 0, j0, bw_kernel_file, small_nrows, kncols);

    fprintf(stderr, "done reading %u kernel vectors at %.1f\n",
            total_kernel_cols, wct_seconds() - tt);

    blockmatrix k2 = blockmatrix_column_reduce(k, 4096);
    blockmatrix_free(k);
    k = k2;
    total_kernel_cols = k->ncols;
    fprintf(stderr, "Info: input kernel vectors reduced to dimension %u\n",
            total_kernel_cols);

    /* tmat is the product of the character matrices times the kernel vector */
    blockmatrix t = blockmatrix_alloc(total_kernel_cols, scmat->ncols + h->ncols);
    blockmatrix tc = blockmatrix_submatrix(t, 0, 0, total_kernel_cols, scmat->ncols);
    blockmatrix th = blockmatrix_submatrix(t, 0, scmat->ncols, total_kernel_cols, h->ncols);

    blockmatrix_mul_Ta_b(tc, k, scmat);
    blockmatrix_mul_Ta_b(th, k, h);

    fprintf(stderr, "done multiplying matrices at %.1f\n", wct_seconds() - tt);

    free(tc);
    free(th);
    blockmatrix_free(scmat);
    blockmatrix_free(h);


    blockmatrix kb = blockmatrix_alloc(k->ncols, k->ncols);
    blockmatrix_set_zero(kb);
    int dim = compute_transpose_of_blockmatrix_kernel(kb, t);
    blockmatrix_free(t);
    fprintf(stderr, "dim of ker = %d\n", dim);

    blockmatrix kbsub = blockmatrix_submatrix(kb, 0, 0, kb->nrows, dim);

    blockmatrix nk = blockmatrix_alloc(small_nrows, kbsub->ncols);
    blockmatrix_mul_smallb(nk, k, kbsub);
    free(kbsub);
    blockmatrix_free(k);
    blockmatrix_free(kb);

    /* Sanity check: count the number of zero dependencies */
    unsigned int nonzero_deps = 0;
    for(unsigned int j = 0 ; j < nk->ncblocks ; j++) {
        uint64_t sanity = 0 ;
        for(unsigned int i = 0 ; i < nk->nrows ; i+=64) {
            for(unsigned int ii = 0 ; ii < 64 && i + ii < nk->nrows ; ii++)
                sanity |= nk->mb[j*nk->stride + ii/64][i];
        }
        // do popcount.
        for( ; sanity ; sanity>>=1) nonzero_deps += sanity&1UL;
    }
    if (!nonzero_deps) {
        fprintf(stderr, "Error, all dependencies are zero !\n");
        exit(1);
    }
    blockmatrix_write_to_flat_file(outname, nk, 0, 0, nk->nrows, nk->ncols);

    fprintf(stderr, "Wrote %d non-zero dependencies to %s\n",
            nonzero_deps, outname);
    if (nonzero_deps < (unsigned int) dim || nk->ncols % 64) {
        fprintf(stderr, "This includes %u discarded zero dependencies, as well as %u padding zeros\n",
                dim - nonzero_deps,
                nk->ncblocks * 64 - dim);
    }
    blockmatrix_free(nk);

    cado_poly_clear(pol);
    param_list_clear(pl);

    return 0;
}