https://gitlab.inria.fr/cado-nfs/cado-nfs
Raw File
Tip revision: d5a5c566e3ab7037e0960b1441613062ade661c9 authored by Alexander Kruppa on 29 March 2021, 19:23:48 UTC
Merge branch 'torture_redc_timing' into 'master'
Tip revision: d5a5c56
dup1.c
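/* dup1: 1st duplicate pass, split relation files into 'nslices'
         slices (adapted from check).

   Usage:
   dup1 [-n nslices_log] [-only i] [-outfmt <fmt>] [-ab] [-abhexa]
        [-lognrels <log>] -out <dir> -prefix <name>
        (file1 ... filen | -filelist <file> [-basepath <path>])
   by default nslices_log = 1 (nslices = 2).

   The relations of files file1 ... filen are split into 'nslices' slices.
   Slice i (0 <= i < nslices) is written to the files
   <dir>/<i>/<name>.<xxxx><suffix>, where <xxxx> is a 4-digit hexadecimal
   counter: a new file is started each time the current one has received
   2^lognrels relations.

   If option -outfmt is given, it selects the format (suffix) of the output
   files; otherwise the suffix of the first input file is used.
   Input can be in gzipped or bzipped format.
*/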

#include "cado.h" // IWYU pragma: keep

// IWYU pragma: no_include <bits/types/struct_rusage.h>

#define MAX_NSLICES_LOG 6

#include <inttypes.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_MINGW
#include <fcntl.h>   /* for _O_BINARY */
#endif

#include "filter_config.h"
#include "filter_io.h"  // filter_rels
#include "gzip.h"       // fopen_maybe_compressed
#include "macros.h"
#include "portability.h" // strdup // IWYU pragma: keep
#include "misc.h"       // filelist_clear
#include "params.h"     // param_list_parse_*
#include "timing.h"     // timingstats_dict_t
#include "verbose.h"

#define DEFAULT_LOG_MAX_NRELS_PER_FILES 25

/* Only (a,b) are parsed on input. This flag controls whether we copy the
 * rest of the relation data to the output file, or whether we content
 * ourselves with smaller .ab files. It is set by the -ab command-line
 * switch. */
static int only_ab = 0;

/* nr_rels_tot[i]: number of relations written to slice i. */
static uint64_t nr_rels_tot[(1 << MAX_NSLICES_LOG)];
/* nslices_log: base-2 logarithm of the number of slices (-n option).
 * do_slice[i]: nonzero if the relations of slice i must be written. */
static unsigned int nslices_log = 1, do_slice[(1 << MAX_NSLICES_LOG)];


/* State of one split output stream. Lines are written to a sequence of
   files named <prefix><index as 4 hex digits><suffix>; a new file is
   started once the current one has received lines_per_file lines. */
typedef struct {
  const char *prefix;      /* start of every file name (heap copy, freed by split_iter_end) */
  const char *suffix;      /* end of every file name (heap copy, freed by split_iter_end) */
  char *filename;          /* name of the file currently open, or NULL */
  FILE *file;              /* stream currently written to, or NULL if none is open */
  const char *msg;         /* text printed on stderr before each new file name, or NULL */
  unsigned int next_idx;   /* index that will name the next file to be opened */
  size_t lines_per_file;   /* number of lines after which a new file is started */
  size_t lines_left;       /* lines that can still be written to the current file */
} split_output_iter_t;

/* Allocates and initializes a split output iterator.

   prefix, suffix: leading and trailing parts of the output file names; the
                   file index, printed as 4 hexadecimal digits, goes between
                   them. Both strings are copied.
   lines_per_file: number of lines written to a file before switching to the
                   next one; must be positive.
   msg:            optional text printed on stderr in front of the name of
                   each file that gets opened; may be NULL. Copied if given.

   No file is opened here: the first one is opened on the first write.
   Aborts (ASSERT_ALWAYS) if an allocation fails. The returned iterator must
   be released with split_iter_end(). */
static split_output_iter_t *
split_iter_init(const char *prefix, const char *suffix,
                const size_t lines_per_file, const char *msg)
{
  ASSERT_ALWAYS(lines_per_file > 0);

  split_output_iter_t *iter = malloc(sizeof *iter);
  ASSERT_ALWAYS(iter != NULL);

  /* strdup can fail just like malloc: check it the same way rather than
     storing a NULL that would only be noticed when building a file name. */
  iter->prefix = strdup(prefix);
  ASSERT_ALWAYS(iter->prefix != NULL);
  iter->suffix = strdup(suffix);
  ASSERT_ALWAYS(iter->suffix != NULL);
  if (msg != NULL) {
    iter->msg = strdup(msg);
    ASSERT_ALWAYS(iter->msg != NULL);
  } else {
    iter->msg = NULL;
  }

  iter->next_idx = 0;
  iter->filename = NULL;
  iter->file = NULL;
  iter->lines_per_file = lines_per_file;
  iter->lines_left = 0; /* Force opening of file on next write */
  return iter;
}

/* Used for counting time in different processes: the rusage data obtained
   when closing an output file is added to it (when getrusage is available),
   it is handed to filter_rels, and it is displayed at the end of main. */
timingstats_dict_t stats;


/* Closes the file currently open in ITER, if any, and releases all the
   memory owned by the iterator, including ITER itself.

   Aborts (ASSERT_ALWAYS) if closing the output file fails: the file was
   opened for writing, so a failed close means that data may not have
   reached the disk, and this must not go unnoticed. */
static void
split_iter_end(split_output_iter_t *iter)
{
  if (iter->file != NULL) {
    int rc = fclose_maybe_compressed(iter->file, iter->filename);
    ASSERT_ALWAYS (rc == 0);
  }
  free(iter->filename);
  /* prefix, suffix and msg are const-qualified in the struct but are heap
     copies made by split_iter_init, hence the casts. */
  free((void *) iter->prefix);
  free((void *) iter->suffix);
  free((void *) iter->msg);
  free(iter);
}

/* Switches ITER to its next output file.

   The file currently open, if any, is closed first (its rusage data is
   added to the global timing statistics when getrusage is available), and
   a failure to close aborts the program. The next file name is built from
   the prefix, the next index printed as 4 hexadecimal digits, and the
   suffix. If the iterator has a message, it is printed on stderr followed
   by the new file name. If the new file cannot be opened for writing, an
   error is reported with perror and the program exits with EXIT_FAILURE. */
void
split_iter_open_next_file(split_output_iter_t *iter)
{
  int status;

  if (iter->file != NULL) {
#ifdef  HAVE_GETRUSAGE
    struct rusage ru[1];
    status = fclose_maybe_compressed2(iter->file, iter->filename, ru);
    timingstats_dict_add(stats, iter->prefix, ru);
#else
    status = fclose_maybe_compressed(iter->file, iter->filename);
#endif
    ASSERT_ALWAYS (status == 0);
  }

  /* Replace the previous name (NULL the first time) by the next one. */
  free (iter->filename);
  status = asprintf(&iter->filename, "%s%04x%s",
                    iter->prefix, iter->next_idx++, iter->suffix);
  ASSERT_ALWAYS (status >= 0);

  if (iter->msg != NULL)
    fprintf (stderr, "%s%s\n", iter->msg, iter->filename);

  iter->file = fopen_maybe_compressed(iter->filename, "w");
  if (iter->file != NULL) {
    /* A fresh file can receive a full quota of lines. */
    iter->lines_left = iter->lines_per_file;
    return;
  }

  /* Opening failed: include the file name in the report if we can. */
  char *errmsg;
  status = asprintf(&errmsg, "Could not open file %s for writing", iter->filename);
  if (status >= 0) {
    perror(errmsg);
    free(errmsg);
  } else {
    perror("Could not open file for writing");
  }
  exit(EXIT_FAILURE);
}

/* Writes LINE to the current output file of ITER, first switching to the
   next file if the current one is full (or if none has been opened yet).
   LINE is written as is, without adding a newline. A write error is
   reported with perror and aborts the program. */
static void
split_iter_write_next(split_output_iter_t *iter, const char *line)
{
  if (iter->lines_left == 0)
    split_iter_open_next_file(iter);

  const int written = fputs (line, iter->file);
  if (written == EOF) {
    perror("Error writing relation");
    abort();
  }

  iter->lines_left -= 1;
}


/* Returns the index of the slice the relation (a,b) belongs to, in
   [0, 2^nslices_log).

   Must be called only when nslices_log > 0 (the shift count below would be
   64 otherwise). */
static inline unsigned int
compute_slice (int64_t a, uint64_t b)
{
  const uint64_t hash = CA_DUP1 * (uint64_t) a + CB_DUP1 * b;
  /* Using the low bit of the hash is not a good idea, since then
     odd values of i are twice more likely. The second low bit
     also gives a small bias with RSA768 (but not for random
     coprime a, b). We use here the nslices_log high bits.
  */
  const unsigned int shift = 64 - nslices_log;
  return (unsigned int) (hash >> shift);
}

/* Callback function given to filter_rels when there are several slices;
   it receives one parsed relation per call.

   context_data: array of split_output_iter_t pointers, one per slice.
   rel:          the relation; a and b select the slice, line is the text
                 that gets written.

   If the slice of the relation is enabled in do_slice, the relation line
   is appended to the output of that slice and the slice counter is
   incremented. With -ab, the line is first cut at its first ':' so that
   only the "a,b" part, followed by a newline, is written. Note that the
   line is modified in place in that case.

   Always returns NULL. */
static void *
thread_dup1 (void * context_data, earlyparsed_relation_ptr rel)
{
    unsigned int slice = compute_slice (rel->a, rel->b);
    split_output_iter_t **outiters = (split_output_iter_t**)context_data;

    if (!do_slice[slice])
      return NULL;

    if (only_ab)
    {
      /* Replace the first ':' by an end of line AND terminate the string
         there: the line is written with fputs, which would otherwise go on
         and write the rest of the relation on a line of its own.
         colon[1] always exists, it is at worst the terminating NUL.
         A line without any ':' is left untouched. */
      char *colon = strchr (rel->line, ':');
      if (colon != NULL)
      {
        colon[0] = '\n';
        colon[1] = '\0';
      }
    }

    split_iter_write_next(outiters[slice], rel->line);
    nr_rels_tot[slice]++;
    return NULL;
}

/* Special callback function for when nslices = 1: every relation belongs
   to slice 0, so no slice computation is needed.

   context_data: array of split_output_iter_t pointers; only entry 0 is used.
   rel:          the relation; line is the text that gets written.

   If slice 0 is enabled in do_slice, the relation line is appended to its
   output and its counter is incremented. With -ab, the line is first cut at
   its first ':' so that only the "a,b" part, followed by a newline, is
   written. Note that the line is modified in place in that case.

   Always returns NULL. */
static void *
thread_dup1_special (void * context_data, earlyparsed_relation_ptr rel)
{
  split_output_iter_t **outiters = (split_output_iter_t**)context_data;

  if (!do_slice[0])
    return NULL;

  if (only_ab)
  {
    /* Replace the first ':' by an end of line AND terminate the string
       there: the line is written with fputs, which would otherwise go on
       and write the rest of the relation on a line of its own.
       colon[1] always exists, it is at worst the terminating NUL.
       A line without any ':' is left untouched. */
    char *colon = strchr (rel->line, ':');
    if (colon != NULL)
    {
      colon[0] = '\n';
      colon[1] = '\0';
    }
  }

  split_iter_write_next(outiters[0], rel->line);
  nr_rels_tot[0]++;
  return NULL;
}

/* Registers in PL the description of every command-line option of this
   program, in the order of the table below, followed by the options
   declared by verbose_decl_usage. */
static void declare_usage(param_list pl)
{
  static const struct {
    const char *key;
    const char *doc;
  } options[] = {
    { "filelist", "file containing a list of input files" },
    { "basepath", "path added to all file in filelist" },
    { "out", "output directory" },
    { "prefix", "prefix for output files" },
    { "lognrels", "log of number of rels per output file" },
    { "n", "log of number of slices (default: 1)" },
    { "only", "do only slice i (default: all)" },
    { "outfmt", "format of output file (default same as input)" },
    { "ab", "only print a and b in the output" },
    { "abhexa", "read a and b as hexa not decimal" },
    { "force-posix-threads", "force the use of posix threads, do not rely on platform memory semantics" },
    { "path_antebuffer", "path to antebuffer program" },
  };

  for (size_t k = 0; k < sizeof options / sizeof options[0]; k++)
    param_list_decl_usage(pl, options[k].key, options[k].doc);

  verbose_decl_usage(pl);
}

/* Has param_list_print_usage write the usage of the program (PL, with
   ARGV0 as program name) to stderr, then terminates the program with
   EXIT_FAILURE. Does not return. */
static void
usage (param_list pl, char *argv0)
{
  param_list_print_usage (pl, argv0, stderr);
  exit (EXIT_FAILURE);
}


int
main (int argc, char * argv[])
{
    char * argv0 = argv[0];
    unsigned int log_max_nrels_per_files = DEFAULT_LOG_MAX_NRELS_PER_FILES;
    int only_slice = -1;
    int abhexa = 0;

    param_list pl;
    param_list_init(pl);
    declare_usage(pl);
    argv++,argc--;

    param_list_configure_switch(pl, "ab", &only_ab);
    param_list_configure_switch(pl, "abhexa", &abhexa);
    param_list_configure_switch(pl, "force-posix-threads", &filter_rels_force_posix_threads);

#ifdef HAVE_MINGW
    _fmode = _O_BINARY;     /* Binary open for all files */
#endif

    if (argc == 0)
      usage (pl, argv0);

    for( ; argc ; ) {
        if (param_list_update_cmdline(pl, &argc, &argv)) { continue; }
        /* Since we accept file names freeform, we decide to never abort
         * on unrecognized options */
        break;
        // fprintf (stderr, "Unknown option: %s\n", argv[0]);
        // abort();
    }
    /* print command-line arguments */
    verbose_interpret_parameters(pl);
    param_list_print_command_line (stdout, pl);
    fflush(stdout);

    param_list_parse_uint(pl, "n", &nslices_log);
    const char *outdir = param_list_lookup_string(pl, "out");
    param_list_parse_int(pl, "only", &only_slice);
    param_list_parse_uint(pl, "lognrels", &log_max_nrels_per_files);
    const char *outfmt = param_list_lookup_string(pl, "outfmt");
    const char * filelist = param_list_lookup_string(pl, "filelist");
    const char * basepath = param_list_lookup_string(pl, "basepath");
    const char * path_antebuffer = param_list_lookup_string(pl, "path_antebuffer");
    const char *prefix_files = param_list_lookup_string(pl, "prefix");

    if (param_list_warn_unused(pl))
    {
      fprintf(stderr, "Error, unused parameters are given\n");
      usage(pl, argv0);
    }

    if (nslices_log > MAX_NSLICES_LOG)
    {
      fprintf(stderr, "Error, -n is too large\n");
      usage(pl, argv0);
    }
    if (basepath && !filelist)
    {
      fprintf(stderr, "Error, -basepath only valid with -filelist\n");
      usage(pl, argv0);
    }

    if (!prefix_files)
    {
      fprintf(stderr, "Error, missing -prefix command line argument\n");
      usage(pl, argv0);
    }

    if (!outdir)
    {
      fprintf(stderr, "Error, missing -out command line argument\n");
      usage(pl, argv0);
    }
    if (outfmt && !is_supported_compression_format(outfmt)) {
        fprintf(stderr, "Error, output compression format unsupported\n");
        usage(pl, argv0);
    }

    unsigned int nslices = 1 << nslices_log;
    if (only_slice < 0) /* split all slices */
    {
      for (unsigned int i = 0; i < nslices; i++)
        do_slice[i] = 1;
    }
    else /* split only slide i */
    {
      for (unsigned int i = 0; i < nslices; i++)
        do_slice[i] = (i == (unsigned int) only_slice);
    }

    if ((filelist != NULL) + (argc != 0) != 1) {
      fprintf(stderr, "Error, provide either -filelist or freeform file names\n");
      usage(pl, argv0);
    }

    set_antebuffer_path (argv0, path_antebuffer);
    char ** files = filelist ? filelist_from_file(basepath, filelist, 0) : argv;

    // If not output suffix is specified, use suffix of first input file
    if (!outfmt && files[0] != NULL)
      get_suffix_from_filename (files[0], &outfmt);

    memset (nr_rels_tot, 0, sizeof(uint64_t) * nslices);

    split_output_iter_t **outiters;
    outiters = malloc(sizeof(split_output_iter_t *) * nslices);
    ASSERT_ALWAYS(outiters != NULL);
    for(unsigned int i = 0 ; i < nslices ; i++)
    {
      char *prefix, *suffix, *msg;
      int rc = asprintf(&prefix, "%s/%d/%s.",
                        outdir, i, prefix_files);
      ASSERT_ALWAYS(rc >= 0);
      rc = asprintf(&suffix, only_ab ? ".ab%s" : "%s", outfmt);
      ASSERT_ALWAYS(rc >= 0);
      rc = asprintf (&msg, "# Opening output file for slice %d : ", i);
      ASSERT_ALWAYS(rc >= 0);
      outiters[i] = split_iter_init(prefix, suffix, 1UL<<log_max_nrels_per_files, msg);
      free(prefix);
      free(suffix);
      free(msg);
    }

    timingstats_dict_init(stats);
    if (nslices == 1)
      filter_rels(files, (filter_rels_callback_t) &thread_dup1_special,
            (void*)outiters, EARLYPARSE_NEED_LINE |
            (abhexa ? EARLYPARSE_NEED_AB_HEXA : EARLYPARSE_NEED_AB_DECIMAL),
            NULL, stats);
    else
      filter_rels(files, (filter_rels_callback_t) &thread_dup1, (void*)outiters,
            EARLYPARSE_NEED_LINE |
            (abhexa ? EARLYPARSE_NEED_AB_HEXA : EARLYPARSE_NEED_AB_DECIMAL),
            NULL, stats);

    for(unsigned int i = 0 ; i < nslices ; i++)
      split_iter_end(outiters[i]);

    for (unsigned int i = 0; i < nslices; i++)
        fprintf (stderr, "# slice %d received %" PRIu64 " relations\n", i,
                                                                nr_rels_tot[i]);

    if (filelist) filelist_clear(files);

    free(outiters);

    param_list_clear(pl);

    // double thread_times[2];
    // thread_seconds_user_sys(thread_times);
    timingstats_dict_add_mythread(stats, "main");
    // fprintf(stderr, "Main thread ends after having spent %.2fs+%.2fs on cpu \n", thread_times[0], thread_times[1]);
    timingstats_dict_disp(stats);
    timingstats_dict_clear(stats);

    return 0;
}
back to top