https://github.com/DBWangGroupUNSW/SRS
Raw File
Tip revision: 0bdf8aa436d237c23d278160dce85c589298daec authored by yifangs on 25 May 2015, 00:30:17 UTC
Update run_toy_data.sh
Tip revision: 0bdf8aa
gen_gt.cpp
/*
 *   This file is part of SRS project.
 *
 *   SRS is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   SRS is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with SRS. If not, see <http://www.gnu.org/licenses/>.
 *
 *   Created by: Yifang Sun, Jianbin Qin
 *   Last modified by: Yifang Sun, Jianbin Qin
 */

#include <iostream>
#include <fstream>
#include <stdio.h>
#include <getopt.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <queue>

#define no_argument 0
#define required_argument 1
#define optional_argument 2

//#define _LARGEFILE_SOURCE
//#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64
#define O_LARGEFILE 0

#define MAXINT 1e9

typedef std::priority_queue<double> max_queue_t;

void usage();
bool file_exists(const char *);
double l2_dist(int *_p1, int *_p2, int _dim);
double l2_dist(float *_p1, float *_p2, int _dim);
void gen_gt(int, int, int, const char *, const char *, const char *,
            bool is_integer);

int main(int argc, char * argv[]) {
  const struct option longopts[] ={
    {"dimension",                   required_argument, 0, 'd'},
    {"dataset-file-path",           required_argument, 0, 's'},
    {"cardinality",                 required_argument, 0, 'n'},
    {"k",                           required_argument, 0, 'k'},
    {"ground-truth-file-path",      required_argument, 0, 'g'},
    {"query-file-path",             required_argument, 0, 'q'},
    {"data-type",                   required_argument, 0, 'y'},
    {0, 0, 0, 0},
  };

  int index;
  int iarg = 0;
  opterr = 1;    //getopt error message (off: 0)

  int d = -1;
  int n = -1;
  int k = -1;
  bool is_integer = true;

  char ground_truth_file_path[100] = "";
  char query_file_path[100] = "";
  char data_file_path[100] = "";

  bool is_valid_command = true;

  while (iarg != -1) {
    iarg = getopt_long(argc, argv, "d:g:k:n:q:s:y:h", longopts, &index);

    switch (iarg) {
      case 'd':
        if (optarg) {
          d = atoi(optarg);
        }
        break;
      case 'g':
        if (optarg) {
          strcpy(ground_truth_file_path, optarg);
        }
        break;
      case 'h':
        usage();
        return 0;
      case 'k':
        if (optarg) {
          k = atoi(optarg);
        }
        break;
      case 'n':
        if (optarg) {
          n = atoi(optarg);
        }
        break;
      case 'q':
        if (optarg) {
          strcpy(query_file_path, optarg);
        }
        break;
      case 's':
        if (optarg) {
          strcpy(data_file_path, optarg);
        }
        break;
      case 'y':
        if (optarg) {
          if (strcmp(optarg, "f") == 0) {
            is_integer = false;
          } else if (strcmp(optarg, "i") != 0) {
            is_valid_command = false;
          }
        }
        break;
    }
  }

  if (d < 0 || k < 0 || n < 0 || !file_exists(data_file_path)
      || !file_exists(query_file_path)) {
    is_valid_command = false;
  }
  if (!is_valid_command) {
    usage();
  } else {
    gen_gt(n, d, k, data_file_path, query_file_path, ground_truth_file_path,
           is_integer);
  }

  return 0;
}

void usage() {
  printf("SRS-Mem (v1.0)\n");
  printf("Options\n");
  printf("-d {value}\tdimensionality of data\n");
  printf("-g {string}\tground truth file\n");
  printf("-k {value}\tnumber of neighbors wanted\n");
  printf("-n {value}\tcardinality\n");
  printf("-q {string}\tquery file\n");
  printf("-s {string}\tdataset file\n");
  printf(
      "-y {string}\tdata type (i: integer; f: floating number), default value: integer\n");
  printf("\n");

  printf("Usage:\n");

  printf("Generate ground truth file\n");
  printf("-d -g -k -n -q -s [-y]\n");
}

bool file_exists(const char *filename) {
  std::ifstream ifile(filename);
  return ifile;
}

double l2_dist_int(int *_p1, int *_p2, int _dim) {
  double ret = 0;
  for (int i = 0; i < _dim; i++) {
    long long dif = (_p1[i] - _p2[i]);
    ret += dif * dif;
  }
  return ret;
}

double l2_dist_float(float *_p1, float *_p2, int _dim) {
  double ret = 0;
  for (int i = 0; i < _dim; i++) {
    double dif = (_p1[i] - _p2[i]);
    ret += dif * dif;
  }
  return ret;
}

void gen_gt(int n, int d, int k, const char * data_file,
            const char * query_file, const char * gt_file, bool is_integer) {
  FILE * dfp = fopen(data_file, "r");
  FILE * qfp = fopen(query_file, "r");
  FILE * rfp = fopen(gt_file, "w+");

  if (dfp == NULL || qfp == NULL || rfp == NULL) {
    printf("open file error\n");
  }

  int nq;
  fscanf(qfp, "%d %d", &nq, &d);
  fprintf(rfp, "%d %d\n", nq, k);

  int * qs_int = new int[d * nq];
  int * data_int = new int[d];
  float * qs_float = new float[d * nq];
  float * data_float = new float[d];

  int temp = -1;
  double dist = -1;

  for (int i = 0; i < nq; ++i) {
    fscanf(qfp, "%d", &temp);
    for (int j = 0; j < d; ++j) {
      if (is_integer) {
        fscanf(qfp, "%d", &qs_int[i * d + j]);
      } else {
        fscanf(qfp, "%f", &qs_float[i * d + j]);
      }
    }
  }

  max_queue_t *max_queue_list = new max_queue_t[nq];
  for (int i = 0; i < n; ++i) {
    if (i % 5000 == 0) {
      fprintf(stderr, "\r%d (%.3f\%)", i, 1.0 * i / n * 100);
    }
    for (int j = 0; j < d; ++j) {
      if (is_integer) {
        fscanf(dfp, "%d", &data_int[j]);
      } else {
        fscanf(dfp, "%f", &data_float[j]);
      }
    }

    for (int j = 0; j < nq; ++j) {
      if (is_integer) {
        dist = l2_dist_int(data_int, &qs_int[j * d], d);
      } else {
        dist = l2_dist_float(data_float, &qs_float[j * d], d);
      }
      if (max_queue_list[j].size() < k) {
        max_queue_list[j].push(dist);
      } else if (max_queue_list[j].top() > dist) {
        max_queue_list[j].pop();
        max_queue_list[j].push(dist);
      }
    }
  }
  fprintf(stderr, "\r%d (%.3f\%)\n", n, 100.0);

  double *klist = new double[k];
  for (int i = 0; i < nq; ++i) {
    fprintf(rfp, "%d", i + 1);
    while (max_queue_list[i].size() > 0) {
      klist[max_queue_list[i].size() - 1] = max_queue_list[i].top();
      max_queue_list[i].pop();
    }
    for (int j = 0; j < k; ++j) {
      fprintf(rfp, " %f", sqrt(klist[j]));
    }
    fprintf(rfp, "\n");
  }

  fclose(dfp);
  fclose(qfp);
  fclose(rfp);

  delete[] qs_int;
  delete[] data_int;
  delete[] qs_float;
  delete[] data_float;
  delete[] max_queue_list;
  delete[] klist;
}
back to top