# Content - 9158a043497bea2e081b4bdcdd142e25b96f701f

# Permalink
# =============================================================================
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

import argparse
import os
import random

from converter import generate_vocab_from_source_file
from converter import init_location
from converter import save_to_vocab_location_file
from converter import save_to_vocab_file

def _build_parser():
    """Create the command-line parser for this preprocessing script.

    Returns:
        argparse.ArgumentParser: a parser exposing the options ``datadir``,
        ``outputdir``, ``vocab_file``, ``alloc_file``, ``vocabsize`` and
        ``seed``.
    """
    cli = argparse.ArgumentParser(description="Language Model with LightRNN")

    # Mandatory locations: the input data and the output directory.
    cli.add_argument(
        '-datadir', '--datadir',
        default=None,
        required=True,
        help='Data Directory where the dataset is located',
    )
    cli.add_argument(
        '-outputdir', '--outputdir',
        default=None,
        required=True,
        help='Vocab directory put word allocation and vocab file',
    )

    # Names of the two files written under the output directory.
    cli.add_argument(
        '-vocab_file', '--vocab_file',
        default='vocab.txt',
        help='The file name of vocab file',
    )
    cli.add_argument(
        '-alloc_file', '--alloc_file',
        default='word-0.location',
        help='The file name of word allocation table',
    )

    # Integer-valued settings; vocabsize stays None when it is not given.
    cli.add_argument(
        '-vocabsize', '--vocabsize',
        default=None,
        type=int,
        help='The vocab size',
    )
    cli.add_argument(
        '-seed', '--seed',
        default=0,
        type=int,
        help='The random seed',
    )
    return cli


# NOTE: the command line is parsed as soon as this module is loaded, so
# `opt` is available to every function below as a module-level namespace.
parser = _build_parser()
opt = parser.parse_args()


def preprocess():
    """Build the vocabulary and a word allocation table and save both.

    All settings are read from the module-level ``opt`` namespace:
    ``seed``, ``datadir``, ``vocabsize``, ``outputdir``, ``vocab_file`` and
    ``alloc_file``. The two output files are written under ``opt.outputdir``
    by the ``converter`` helpers.
    """
    # Seed Python's RNG before any other work. Presumably this makes the
    # allocation produced by init_location reproducible -- confirm in
    # the converter module.
    random.seed(opt.seed)

    # Step 1: build the vocabulary from the data directory and store it.
    words = generate_vocab_from_source_file(opt.datadir, opt.vocabsize)
    vocab_path = os.path.join(opt.outputdir, opt.vocab_file)
    save_to_vocab_file(words, vocab_path)

    # Step 2: derive the initial word allocation for that vocabulary
    # and store it next to the vocabulary file.
    allocation = init_location(words)
    alloc_path = os.path.join(opt.outputdir, opt.alloc_file)
    save_to_vocab_location_file(allocation, alloc_path)


def main():
    """Script entry point: run the vocabulary preprocessing step."""
    preprocess()


# Only run the preprocessing when executed as a script, not when imported.
if __name__ == "__main__":
    main()