# https://github.com/Microsoft/CNTK
# Raw File
# Tip revision: 10a8ffcf50d7b9225f3236ffcfdc422b2014fb92 authored by microsoft-github-policy-service[bot] on 23 September 2022, 14:06:50 UTC
# Microsoft mandatory file (#3870)
# Tip revision: 10a8ffc
# preprocess.py
# =============================================================================
# copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

import argparse
import os
import random

from converter import generate_vocab_from_source_file
from converter import init_location
from converter import save_to_vocab_location_file
from converter import save_to_vocab_file

# Command-line interface. Every option is registered under both a
# single-dash and a double-dash spelling of the same name.
parser = argparse.ArgumentParser(description="Language Model with LightRNN")


def _add_option(name, **settings):
    """Register option ``name`` as both ``-name`` and ``--name``."""
    parser.add_argument('-' + name, '--' + name, **settings)


# Mandatory directories.
_add_option('datadir', required=True,
            help='Data Directory where the dataset is located')
_add_option('outputdir', required=True,
            help='Vocab directory put word allocation and vocab file')

# File names, each with a default.
_add_option('vocab_file', default='vocab.txt',
            help='The file name of vocab file')
_add_option('alloc_file', default='word-0.location',
            help='The file name of word allocation table')

# Optional numeric settings.
_add_option('vocabsize', type=int, help='The vocab size')
_add_option('seed', type=int, default=0, help='The random seed')

# Parsed at import time; the result is kept in the module-level `opt`.
opt = parser.parse_args()


def preprocess():
    """Build the vocabulary and an initial word allocation, then save both.

    All settings are read from the module-level ``opt`` namespace:
    ``seed``, ``datadir``, ``vocabsize``, ``outputdir``, ``vocab_file`` and
    ``alloc_file``. The vocabulary and the allocation table are handed to the
    ``converter`` save helpers together with paths formed by joining
    ``opt.outputdir`` with the respective file name.
    """
    # Seed Python's RNG up front, before any converter helper runs.
    random.seed(opt.seed)

    # Destination paths for the two output files.
    vocab_path = os.path.join(opt.outputdir, opt.vocab_file)
    alloc_path = os.path.join(opt.outputdir, opt.alloc_file)

    # Vocabulary: built from the files under the data directory, then saved.
    words = generate_vocab_from_source_file(opt.datadir, opt.vocabsize)
    save_to_vocab_file(words, vocab_path)

    # Word allocation: initialised from the vocabulary, then saved.
    allocation = init_location(words)
    save_to_vocab_location_file(allocation, alloc_path)


def main():
    """Script entry point: run the preprocessing step."""
    preprocess()

# Call main() only when this file is executed as a script.
if __name__ == '__main__':
    main()
# back to top