#!/usr/bin/env python

# This script takes a CNTK text format file and a header file, and converts it
# to a CNTK binary format file.
#
# The header file must list all of the streams in the input file in the
# following format:
#
#    <stream name> <stream alias> <matrix type> <sample dimension>
#
# Where:
#    <stream name> is the desired name for the input in CNTK.
#    <stream alias> is the alias for the stream in the input file.
#    <matrix type> is the matrix type, i.e., dense or sparse.
#    <sample dimension> is the dimension of each sample for the input.

import sys
import argparse
import struct
import os
from collections import OrderedDict

MAGIC_NUMBER = 0x636e746b5f62696e
CBF_VERSION = 1


class ElementType:
    FLOAT = 0
    DOUBLE = 1


class MatrixEncodingType:
    DENSE = 0
    SPARSE = 1
    # TODO: use varint encoding for sparse indices,
    # use varint encoding for integer values,
    # use a single byte for boolean values (e.g., one-hot values).
    # COMPRESSED_DENSE = 2
    # COMPRESSED_SPARSE = 3


# Converts the data of a single stream from the CTF format into the binary format.
class Converter(object):
    def __init__(self, name, sample_dim, element_type):
        self.name = name
        self.sample_dim = sample_dim
        # contains the list of samples of each sequence in the current chunk
        self.sequences = []
        self.element_type = element_type

    def write_header(self, output):
        # First is the matrix type.
        output.write(struct.pack('<B', self.get_matrix_type()))
        # Next comes the stream name, prefixed by its length.
        output.write(struct.pack('<I', len(self.name)))
        output.write(self.name.encode('ascii'))
        # Next is the element type.
        output.write(struct.pack('<B', self.element_type))
        # Finally, the sample dimension.
        output.write(struct.pack('<I', self.sample_dim))

    def write_floats(self, output, floats):
        format = 'f' if self.is_float() else 'd'
        output.write(b''.join([struct.pack('<' + format, x) for x in floats]))

    def write_signed_ints(self, output, ints):
        output.write(b''.join([struct.pack('<i', x) for x in ints]))

    def is_float(self):
        return self.element_type == ElementType.FLOAT

    def start_sequence(self):
        self.sequences.append([])

    def reset(self):
        # drop the accumulated data once a chunk has been written out
        self.sequences = []

    def get_matrix_type(self):
        raise NotImplementedError()

    def add_sample(self, sample):
        raise NotImplementedError()

    def write_data(self, output):
        raise NotImplementedError()


# Specialization for dense inputs.
class DenseConverter(Converter):
    def get_matrix_type(self):
        return MatrixEncodingType.DENSE

    def add_sample(self, sample):
        if len(sample) != self.sample_dim:
            raise ValueError("Invalid sample dimension for input {0}. Expected {1}, given {2}"
                             .format(self.name, self.sample_dim, len(sample)))

        byte_size = self.sample_dim * (4 if self.is_float() else 8)

        if len(self.sequences) == 0:
            self.sequences.append([])
            byte_size += 8

        self.sequences[-1].append([float(x) for x in sample])
        return byte_size

    def write_data(self, output):
        for sequence in self.sequences:
            for sample in sequence:
                self.write_floats(output, sample)


# Specialization for sparse inputs.
class SparseConverter(Converter):
    def add_sample(self, sample):
        # a sparse sample is a list of 'index:value' pairs
        pairs = [(int(index), float(value))
                 for (index, value) in [pair.split(':', 1) for pair in sample]]

        for (index, _) in pairs:
            if index >= self.sample_dim:
                raise ValueError("Invalid sample dimension for input {0}. Max {1}, given {2}"
                                 .format(self.name, self.sample_dim, index))

        byte_size = len(pairs) * (8 if self.is_float() else 12) + 4

        if len(self.sequences) == 0:
            self.sequences.append([])
            byte_size += 8

        self.sequences[-1].append(pairs)
        return byte_size

    def get_matrix_type(self):
        return MatrixEncodingType.SPARSE

    def write_data(self, output):
        for sequence in self.sequences:
            # write out each sequence in sparse format: the total number of
            # non-zero values, the values themselves, their row indices, and
            # the number of non-zero values in each sample
            values = []
            indices = []
            sizes = []
            for sample in sequence:
                sizes.append(len(sample))
                sample.sort(key=lambda x: x[0])
                for (index, value) in sample:
                    indices.append(index)
                    values.append(value)

            output.write(struct.pack('<I', len(values)))
            self.write_floats(output, values)
            self.write_signed_ints(output, indices)
            self.write_signed_ints(output, sizes)


# Parse a single sequence (a group of consecutive CTF lines sharing the same
# sequence id), add it to the current chunk and return its estimated size in bytes.
def process_sequence(data, converters, chunk):
    byte_size = 0

    # every stream gets a new (possibly empty) sequence
    for converter in converters.values():
        converter.start_sequence()

    for line in data:
        for input_stream in line.split("|")[1:]:
            (alias, _, values) = input_stream.partition(' ')
            alias = alias.strip()
            if len(alias) > 0 and alias[0] != '#':
                byte_size += converters[alias].add_sample(values.split())

    sequence_length_samples = max([len(x.sequences[-1]) for x in converters.values()])
    chunk.add_sequence(sequence_length_samples)
    return byte_size


# Output a binary chunk
def write_chunk(binfile, converters, chunk):
    binfile.flush()
    chunk.offset = binfile.tell()

    # write out the number of samples for each sequence in the chunk
    binfile.write(b''.join([struct.pack('<I', x) for x in chunk.sequences]))

    # write out the data of every stream and reset the converters,
    # so that the next chunk starts from a clean state
    for converter in converters.values():
        converter.write_data(binfile)
        converter.reset()


# Returns a converter matching the matrix type given in the header file.
def get_converter(input_type, name, sample_dim, element_type):
    if input_type == 'dense':
        return DenseConverter(name, sample_dim, element_type)
    if input_type == 'sparse':
        return SparseConverter(name, sample_dim, element_type)
    raise ValueError("Unsupported matrix type: {0}".format(input_type))


# Creates a converter for each line of the header file:
# <stream name> <stream alias> <matrix type> <sample dimension>
def build_converters(streams_header, element_type):
    converters = OrderedDict()
    for line in streams_header:
        if len(line.strip()) == 0:
            continue
        (name, alias, input_type, sample_dim) = line.strip().split()
        converters[alias] = get_converter(input_type, name, int(sample_dim), element_type)
    return converters


class Chunk:
    def __init__(self):
        self.offset = 0
        # contains the length (in samples) of each sequence in the chunk
        self.sequences = []

    def num_sequences(self):
        return len(self.sequences)

    def num_samples(self):
        return sum(self.sequences)

    def add_sequence(self, num_samples):
        self.sequences.append(num_samples)


class Header:
    def __init__(self, converters):
        self.converters = converters
        self.chunks = []

    def add_chunk(self, chunk):
        assert isinstance(chunk, Chunk)
        self.chunks.append(chunk)

    # Output the binary format header.
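    # The header is written after the last chunk. As laid out by write()
    # below, it contains the magic number, the number of chunks, the number
    # of streams, one description per stream, a table of (offset, sequence
    # count, sample count) per chunk, and finally the header offset itself
    # in the last 8 bytes of the file, so a reader can locate the header.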
    def write(self, output_file):
        output_file.flush()
        header_offset = output_file.tell()
        # First, write the magic number (uint64, 8 bytes)
        output_file.write(struct.pack('<Q', MAGIC_NUMBER))
        # Next, the number of chunks (uint32, 4 bytes)
        output_file.write(struct.pack('<I', len(self.chunks)))
        # Next, the number of streams (uint32, 4 bytes)
        output_file.write(struct.pack('<I', len(self.converters)))
        # Next, a description of each stream.
        for converter in self.converters.values():
            converter.write_header(output_file)
        # Next, the chunk table: offset, number of sequences and number of
        # samples for each chunk.
        for chunk in self.chunks:
            output_file.write(struct.pack('<q', chunk.offset))
            output_file.write(struct.pack('<I', chunk.num_sequences()))
            output_file.write(struct.pack('<I', chunk.num_samples()))
        # Finally, the offset of the header itself, so that a reader can
        # locate the header from the last 8 bytes of the file.
        output_file.write(struct.pack('<q', header_offset))


# Convert the CTF file into the binary format, writing it out one chunk at a time.
def process(input_filename, output_filename, streams_header, element_type, chunk_size):
    converters = build_converters(streams_header, element_type)

    output = open(output_filename, "wb")
    # The file starts with the magic number and the format version.
    output.write(struct.pack('<Q', MAGIC_NUMBER))
    output.write(struct.pack('<I', CBF_VERSION))

    header = Header(converters)
    chunk = Chunk()

    seq_id = ''
    sequence = []
    estimated_chunk_size = 0

    with open(input_filename, "r") as input_file:
        for line in input_file:
            # everything up to the first '|' is the (optional) sequence id
            prefix = line.split('|', 1)[0].strip()
            # a missing or changed sequence id marks the start of a new sequence
            if len(prefix) == 0 or (len(seq_id) > 0 and seq_id != prefix):
                if len(sequence) > 0:
                    estimated_chunk_size += process_sequence(sequence, converters, chunk)
                    sequence = []
                if estimated_chunk_size >= chunk_size:
                    write_chunk(output, converters, chunk)
                    header.add_chunk(chunk)
                    chunk = Chunk()
                    estimated_chunk_size = 0
            seq_id = prefix
            sequence.append(line)

    # we must not forget to process the last sequence
    if len(sequence) > 0:
        process_sequence(sequence, converters, chunk)

    # write out the last (possibly incomplete) chunk, then the header
    write_chunk(output, converters, chunk)
    header.add_chunk(chunk)
    header.write(output)
    output.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Transforms a CNTK Text Format file into CNTK binary format given a header.")
    parser.add_argument('--input', help="CNTK Text Format file to convert to binary.", required=True)
    parser.add_argument('--header', help="Header file describing each stream in the input.", required=True)
    parser.add_argument('--chunk_size', type=int, help="Chunk size in bytes.", required=True)
    parser.add_argument('--output', help="Name of the output file.", required=True)
    parser.add_argument('--precision', help="Floating point precision (double or float). Default is float.",
                        choices=["float", "double"], default="float", required=False)
    args = parser.parse_args()

    with open(args.header) as header:
        streams = header.readlines()

    element_type = ElementType.FLOAT if args.precision == 'float' else ElementType.DOUBLE

    process(args.input, args.output, streams, element_type, args.chunk_size)
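# Example usage (illustrative only; the file names, stream names and aliases
# below are made up and not part of the script). A header file, say
# streams.txt, describes each stream on its own line:
#
#     features F dense 100
#     labels   L sparse 10
#
# A matching CTF file (e.g. train.ctf) can then be converted with:
#
#     python ctf2bin.py --input train.ctf --header streams.txt \
#         --chunk_size 33554432 --output train.bin --precision float
#
# (assuming this script is saved as ctf2bin.py; 33554432 bytes = 32 MB per chunk)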