https://github.com/Microsoft/CNTK
Tip revision: c92d560d9bb2099c75ffc4c7a1b447e7b0885f1a authored by Peyman Manikashani on 07 September 2018, 22:41:43 UTC
fixes on Batchnorm and Pooling for v1 pretrained models after removal of sequence axis from input
fixes on Batchnorm and Pooling for v1 pretrained models after removal of sequence axis from input
Tip revision: c92d560
uci2ctf.py
import argparse
def convert(file_in, file_out, features_start, features_dim,
            labels_start, labels_dim, num_labels, label_type='Category', mapping_file=None):
    """Convert a whitespace-separated UCI-style text file to CNTK text format.

    Every input line is split on whitespace into columns.  For each line one
    output line is written of the form ``|labels ...<TAB>|features ...`` (the
    ``|labels`` part is omitted when ``label_type`` is ``'None'``).

    Args:
        file_in: path of the input file.
        file_out: path of the output file (overwritten if it exists).
        features_start: index of the first feature column.
        features_dim: number of feature columns.
        labels_start: index of the first label column (unused when
            ``label_type`` is ``'None'``).
        labels_dim: number of label columns.
        num_labels: number of distinct categorical labels, i.e. the width of
            the one-hot vector.  May be ``None`` when ``mapping_file`` is
            given; the width is then the number of entries in the mapping.
        label_type: ``'Category'`` (first label column is one-hot encoded),
            ``'None'`` (no labels are written) or any other value, e.g.
            ``'Regression'`` (label columns are copied verbatim).
        mapping_file: optional path to a file with one label value per line;
            a label's line position is its one-hot index.  When omitted,
            labels must be the strings ``'0'`` .. ``str(num_labels - 1)``.

    Raises:
        RuntimeError: if a line has too few columns or, for categorical
            labels, a label value is not present in the label map.
    """
    label_map = {}
    if label_type == "Category":
        if mapping_file is not None:
            with open(mapping_file, 'r') as f:
                for line in f.read().splitlines():
                    label_map[line] = len(label_map)
            # num_labels may be None when only a mapping file is supplied;
            # max(None, int) raises TypeError on Python 3.
            num_labels = max(num_labels or 0, len(label_map))
        else:
            label_map = {str(x): x for x in range(num_labels)}

    # Context managers guarantee both files are closed (and the output is
    # flushed) even when a malformed line raises part-way through.
    with open(file_in, 'r') as input_file, open(file_out, 'w') as output_file:
        # Iterate lazily instead of materialising the whole file in memory.
        for line in input_file:
            values = line.split()
            fields = []
            if label_type != 'None':
                max_length = max(labels_start + labels_dim,
                                 features_start + features_dim)
                if len(values) < (labels_dim + features_dim):
                    raise RuntimeError(
                        ("Too few input columns ({} out of expected {}) ")
                        .format(len(values), (labels_dim + features_dim)))
                elif len(values) < max_length:
                    raise RuntimeError(
                        ("Too few input columns ({} out of expected {}) ")
                        .format(len(values), max_length))

                labels = values[labels_start:labels_start + labels_dim]
                if label_type == 'Category':
                    one_hot = ['0'] * num_labels
                    # there's only one label
                    label = labels[0]
                    if label not in label_map:
                        raise RuntimeError(
                            ("Illegal label value: '{}'").format(label))
                    one_hot[label_map[label]] = '1'
                    labels = one_hot
                fields.append("|labels " + " ".join(labels))
            elif len(values) < features_start + features_dim:
                raise RuntimeError(
                    ("Too few input columns ({} out of expected {}) ")
                    .format(len(values), features_start + features_dim))

            fields.append(
                "|features "
                + " ".join(values[features_start:features_start + features_dim]))
            # All validation for this line has passed, so the record is
            # written in one go: labels and features are tab-separated.
            output_file.write("\t".join(fields) + "\n")
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, validate their
    # combination, derive a default output path if needed and run convert().
    import os  # local import: only needed to derive the default output path

    parser = argparse.ArgumentParser(
        description="UCI to CNTKText format converter",
        epilog=("Quick example - converting MNIST data (see Examples/Image/MNIST):"
                "\n\n\t"
                "--input_file Examples/Image/MNIST/Data/Train-28x28.txt "
                "--features_start 1 "
                "--features_dim 784 "
                "--labels_start 0 "
                "--labels_dim 1 "
                "--num_labels 10 "
                "--output_file Examples/Image/MNIST/Data/Train-28x28_cntk_text.txt"
                "\n\n"
                "For more information please visit "
                "https://docs.microsoft.com/en-us/cognitive-toolkit/BrainScript-CNTKTextFormat-Reader"),
        formatter_class=argparse.RawTextHelpFormatter)

    # Required options get their own group so they are listed separately
    # in the --help output.
    requiredNamed = parser.add_argument_group('required arguments')

    requiredNamed.add_argument("-in", "--input_file",
                               help="input file path", required=True)
    requiredNamed.add_argument("-fs", "--features_start", type=int,
                               help="start offset of feature columns", required=True)
    requiredNamed.add_argument("-fd", "--features_dim", type=int,
                               help=("dimension of the feature vector "
                                     "(number of feature columns in the input file)"),
                               required=True)

    parser.add_argument("-lt", "--label_type", default="Category",
                        help=("Label type (indicates how the label columns should "
                              " be interpreted)"),
                        choices=["Category", "Regression", "None"])
    # The help text previously duplicated the --labels_dim description.
    parser.add_argument("-ls", "--labels_start", type=int,
                        help="start offset of label columns")
    parser.add_argument("-nl", "--num_labels", type=int,
                        help="number of possible label values "
                             "(required for categorical labels)")
    parser.add_argument("-ld", "--labels_dim", type=int, default=1,
                        help=("dimension of the input label vector "
                              "(number of label columns in the input file, "
                              "default is 1)"))
    parser.add_argument("--mapping_file",
                        help=("the path to a file used to map from the label value "
                              "to a numerical label identifier (if omitted, the "
                              "label value is interpreted as a numerical "
                              "identifier)"))
    parser.add_argument("-out", "--output_file", help="output file path")

    args = parser.parse_args()

    # A number of sanity checks on the combination of arguments.
    if args.label_type != "None" and args.labels_start is None:
        parser.error("-ls/--labels_start is required when label type is not 'None'")

    if args.label_type == "Category":
        if args.num_labels is None:
            parser.error("-nl/--num_labels is required when label type is 'Category'")
        if args.labels_dim > 1:
            parser.error("-ld/--labels_dim cannot be greater than 1 "
                         "when label type is 'Category'")

    if args.label_type == "Regression":
        # --num_labels is optional here; comparing None with an int raises
        # TypeError on Python 3, so only check it when it was supplied.
        if args.num_labels is not None and args.num_labels > args.labels_dim:
            parser.error("-nl/--num_labels is optional and "
                         " cannot exceed -ld/--labels_dim "
                         " when label type is 'Regression'")

    if args.label_type != 'None':
        # Reject label and feature column ranges that intersect.
        if (((args.labels_start <= args.features_start) and
             (args.labels_start + args.labels_dim > args.features_start)) or
                ((args.labels_start > args.features_start) and
                 (args.features_start + args.features_dim > args.labels_start))):
            parser.error("Label and feature column ranges must not overlap.")

    file_in = args.input_file
    file_out = args.output_file

    if not file_out:
        # Insert "_cntk_text" before the file extension.  splitext only looks
        # at the last path component, so dots in directory names (e.g.
        # "./data/train") no longer corrupt the generated path.
        root, ext = os.path.splitext(file_in)
        file_out = root + "_cntk_text" + ext

    print(" Converting from UCI format\n\t '{}'\n"
          " to CNTK text format\n\t '{}'".format(file_in, file_out))

    convert(file_in, file_out, args.features_start, args.features_dim,
            args.labels_start, args.labels_dim, args.num_labels, args.label_type, args.mapping_file)