https://gitlab.inria.fr/cado-nfs/cado-nfs
Raw File
Tip revision: 5b2f4e4ce413fc083713cbf6b164b7bb5ec452ce authored by Emmanuel Thomé on 03 April 2021, 13:59:23 UTC
estimate_matsize: allow uncompressed files
Tip revision: 5b2f4e4
shrink_rels.cpp
#include "cado.h" // IWYU pragma: keep
#include <sstream>
#include <istream>
#include <ostream>
#include <iostream>
#include <memory>
#include <type_traits>
#include "fmt/core.h"
#include "fmt/format.h"
#include "macros.h"
#include "misc.h"
#include "params.h"
#include "relation.hpp"
#include "relation-tools.h"
#include "indexed_relation.hpp"
#include "gzip.h"

/*
 * The goal of this binary is to read relations that come out of
 * dup1/dup2, and shrink them in a way similar to what fake_rels does
 * with the "shrink" option.
 *
 * Two distinct shrink parameters are provided:
 *      - an integer sigma that denotes the shrink factor with the same meaning
 *        as for fake_rels: it divides column indices by sigma.
 *      - optionally, a second parameter is passed (a floating point
 *        between 0 and 1) that gives the fraction of the input rows that
 *        are kept. If unspecified, this parameter defaults to 1/sigma.
 * Note that if dup1/dup2 split the input into several slices (say 2),
 * the caller has the following options:
 *      - apply a shrink factor sigma to columns, and leave the second
 *      parameter unspecified (i.e. keep one row every sigma), and do
 *      that to the two input files.
 *      - assuming sigma>1, read only the first file, apply a shrink
 *      factor sigma to columns, and 2/sigma to rows.
 * Assuming the split that is done by dup1 introduces no statistical
 * bias, the second option is faster since it reads only part of the
 * input.
 *
 *
 * Note that relations that are read are post-dup2, and hence these are
 * renumbered relations!
 *
 * A random seed can be provided (this has an impact on which rows
 * are kept).
 */


struct shrink_action {
    double row_fraction = 0;
    int shrink_factor = 0;
    int dl = 0;
    gmp_randstate_t rstate;

    shrink_action() {
        gmp_randinit_default(rstate);
    }
    shrink_action(shrink_action const &) = delete;
    shrink_action& operator=(shrink_action const &) = delete;
    ~shrink_action() {
        gmp_randclear(rstate);
    }

    void process(std::ostream & os, std::istream& is) 
    {
        for(std::string line ; std::getline(is, line) ; ) {
            if (line[0] == '#')
                continue;
            indexed_relation rel;
            if (!(std::istringstream(line) >> rel))
                throw std::runtime_error(fmt::format(FMT_STRING("parse error while reading {}"), line));

            double rnd = double(u64_random(rstate)) / double(UINT64_MAX);
            if (rnd >= row_fraction)
                continue;

            rel.shrink(shrink_factor);
            rel.sort();
            rel.compress(dl);

            os << rel << std::endl;
        }
    }
};


static void declare_usage(param_list pl)
{
    param_list_decl_usage(pl, "out", "output file (defaults to stdout)");
    param_list_decl_usage(pl, "in", "input file (defaults to stdin)");
    param_list_decl_usage(pl, "shrink-factor", "divide all column indices by n");
    param_list_decl_usage(pl, "row-fraction", "ratio of rows to keep");
    param_list_decl_usage(pl, "seed", "random seed");
    param_list_decl_usage(pl, "dl", "DL mode (do not reduce valuations mod 2)");
}

int
main (int argc, char *argv[])
{
    char * argv0 = argv[0];
    cxx_param_list pl;

    declare_usage(pl);

    argv++, argc--;

    shrink_action A;

    param_list_configure_switch(pl, "-dl", &A.dl);

    for( ; argc ; ) {
        if (param_list_update_cmdline(pl, &argc, &argv)) { continue; }

        /* Could also be a file */
        FILE *f;
        if ((f = fopen(argv[0], "r")) != NULL) {
            param_list_read_stream(pl, f, 0);
            fclose(f);
            argv++,argc--;
            continue;
        }

        fprintf(stderr, "Unhandled parameter %s\n", argv[0]);
        param_list_print_usage(pl, argv0, stderr);
        exit (EXIT_FAILURE);
    }
    // param_list_print_command_line(stdout, pl);
    //

    param_list_parse_int(pl, "shrink-factor", &A.shrink_factor);
    if (A.shrink_factor < 1) {
        fprintf(stderr, "Error: shrink factor must be an integer >= 1\n");
        param_list_print_usage(pl, argv0, stderr);
        exit(EXIT_FAILURE);
    }

    unsigned long seed;

    if (param_list_parse_ulong(pl, "seed", &seed)) {
        gmp_randseed_ui(A.rstate, seed);
    }

    if (param_list_parse_double(pl, "row-fraction", &A.row_fraction)) {
        if (A.row_fraction < 0 || A.row_fraction > 1) {
            fprintf(stderr, "Error: row-fraction must be an real number in [0,1]\n");
            param_list_print_usage(pl, argv0, stderr);
            exit(EXIT_FAILURE);
        }
    } else {
        A.row_fraction = 1 / (double) A.shrink_factor;
    }

    std::istream * ptr_in = &std::cin;
    const char * in = param_list_lookup_string(pl, "in");
    std::unique_ptr<std::istream> p_in;
    if (in) {
        p_in = std::unique_ptr<std::istream>(new ifstream_maybe_compressed(in));
        ptr_in = p_in.get();
    }

    std::ostream * ptr_out = &std::cout;
    const char * out = param_list_lookup_string(pl, "out");
    std::unique_ptr<std::ostream> p_out;
    if (out) {
        p_out = std::unique_ptr<std::ostream>(new ofstream_maybe_compressed(out));
        ptr_out = p_out.get();
    }

    A.process(*ptr_out, *ptr_in);

    return 0;
}
back to top