https://github.com/Microsoft/CNTK
Tip revision: a05c3c642648373f4ede0956e4286257c3d59a61 authored by liqfu on 24 August 2018, 17:46:51 UTC
CNTK splice allows broadcast; this case is handled in this change. For noop (identity) ops, the input and output types shall be set according to the upstream ops. ToBatch/ToSequence and Unpack Batch/Sequence ops added during model importing need to be skipped. Model import needs to handle ops with multiple outputs.
word_rnn.py
# ==============================================================================
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
import numpy as np
import os
import cntk as C
import timeit
from cntk import Axis
from cntk.train import Trainer
from cntk.learners import momentum_sgd
from cntk.ops import sequence
from cntk.losses import cross_entropy_with_softmax
from cntk.metrics import classification_error
from cntk.ops.functions import load_model
from cntk.layers import Recurrence, Dense, LSTM, Stabilizer, For, Sequential
from cntk.logging import log_number_of_parameters, ProgressPrinter
from data_reader import DataReader
from math import log, exp
from cntk.device import try_set_default_device, cpu, gpu
# Setting global parameters
use_sampled_softmax = True
softmax_sample_size = 500 # Applies only when 'use_sampled_softmax = True'
use_sparse = True
hidden_dim = 200
num_layers = 2
num_epochs = 10
sequence_length = 40
sequences_per_batch = 10
alpha = 0.75
learning_rate = 0.002
momentum_per_sample = 0.9999000049998333
clipping_threshold_per_sample = 5.0
token_to_id_path = './ptb/token2id.txt'
validation_file_path = './ptb/valid.txt'
train_file_path = './ptb/train.txt'
token_frequencies_file_path = './ptb/freq.txt'
segment_separator = '<eos>'
num_samples_between_progress_report = 100000
# reads a file with one number per line and returns the numbers as a list
def load_sampling_weights(sampling_weights_file_path):
    weights = []
    with open(sampling_weights_file_path, 'r') as f:
        for line in f:
            # Skip blank lines; every other line holds one sampling weight.
            if line.strip():
                weights.append(float(line))
    return weights
# Creates model subgraph computing cross-entropy with softmax.
def cross_entropy_with_full_softmax(
hidden_vector, # Node providing the output of the recurrent layers
target_vector, # Node providing the expected labels (as sparse vectors)
vocab_dim, # Vocabulary size
hidden_dim # Dimension of the hidden vector
):
bias = C.Parameter(shape = (vocab_dim, 1), init = 0)
weights = C.Parameter(shape = (vocab_dim, hidden_dim), init = C.initializer.glorot_uniform())
z = C.reshape(C.times_transpose(weights, hidden_vector) + bias, (1,vocab_dim))
zT = C.times_transpose(z, target_vector)
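    # Cross entropy of the full softmax: log(sum_k exp(z_k)) - z_target,
    # i.e. the negative log-probability the model assigns to the true token.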
ce = C.reduce_log_sum_exp(z) - zT
zMax = C.reduce_max(z)
error_on_samples = C.less(zT, zMax)
return (z, ce, error_on_samples)
# Creates model subgraph computing cross-entropy with sampled softmax.
def cross_entropy_with_sampled_softmax(
hidden_vector, # Node providing the output of the recurrent layers
target_vector, # Node providing the expected labels (as sparse vectors)
vocab_dim, # Vocabulary size
hidden_dim, # Dimension of the hidden vector
num_samples, # Number of samples to use for sampled softmax
sampling_weights, # Node providing weights to be used for the weighted sampling
allow_duplicates = False # Boolean flag to control whether to use sampling with replacement (allow_duplicates == True) or without replacement.
):
bias = C.layers.Parameter(shape = (vocab_dim, 1), init = 0)
weights = C.layers.Parameter(shape = (vocab_dim, hidden_dim), init = C.initializer.glorot_uniform())
sample_selector_sparse = C.random_sample(sampling_weights, num_samples, allow_duplicates) # sparse matrix [num_samples * vocab_size]
if use_sparse:
sample_selector = sample_selector_sparse
else:
# Note: Sampled softmax with dense data is only supported for debugging purposes.
# It might easily run into memory issues as the matrix 'I' below might be quite large.
        # If we want a dense representation for all data, we have to convert the sample selector.
I = C.Constant(np.eye(vocab_dim, dtype=np.float32))
sample_selector = C.times(sample_selector_sparse, I)
inclusion_probs = C.random_sample_inclusion_frequency(sampling_weights, num_samples, allow_duplicates) # dense row [1 * vocab_size]
log_prior = C.log(inclusion_probs) # dense row [1 * vocab_dim]
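    # Subtracting log_prior from the logits below corrects for the non-uniform
    # probabilities with which the candidate classes are sampled.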
print("hidden_vector: "+str(hidden_vector.shape))
wS = C.times(sample_selector, weights, name='wS') # [num_samples * hidden_dim]
print("ws:"+str(wS.shape))
    zS = C.times_transpose(wS, hidden_vector, name='zS1') + C.times(sample_selector, bias, name='zS2') - C.times_transpose(sample_selector, log_prior, name='zS3')  # [num_samples]
# Getting the weight vector for the true label. Dimension hidden_dim
wT = C.times(target_vector, weights, name='wT') # [1 * hidden_dim]
zT = C.times_transpose(wT, hidden_vector, name='zT1') + C.times(target_vector, bias, name='zT2') - C.times_transpose(target_vector, log_prior, name='zT3') # [1]
zSReduced = C.reduce_log_sum_exp(zS)
# Compute the cross entropy that is used for training.
# We don't check whether any of the classes in the random samples coincides with the true label, so it might happen that the true class is counted
# twice in the normalizing denominator of sampled softmax.
cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT
# For applying the model we also output a node providing the input for the full softmax
z = C.times_transpose(weights, hidden_vector) + bias
z = C.reshape(z, shape = (vocab_dim))
zSMax = C.reduce_max(zS)
error_on_samples = C.less(zT, zSMax)
return (z, cross_entropy_on_samples, error_on_samples)
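# Computes the average cross entropy (with full softmax) over the validation data,
# used for the progress reports and perplexity numbers printed during training.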
def average_cross_entropy(full_cross_entropy_node, input_node, label_node, data):
count = 0
ce_sum = 0
for features, labels, _ in data.minibatch_generator(validation_file_path, sequence_length, sequences_per_batch):
arguments = ({input_node : features, label_node : labels})
full_cross_entropy = full_cross_entropy_node.eval(arguments)
for ce_list in full_cross_entropy:
ce_sum += np.sum(ce_list)
count += len(ce_list)
return ce_sum / count
def create_model(input_sequence, label_sequence, vocab_dim, hidden_dim):
# Create the rnn that computes the latent representation for the next token.
rnn_with_latent_output = Sequential([
C.layers.Embedding(hidden_dim),
For(range(num_layers), lambda:
Sequential([Stabilizer(), Recurrence(LSTM(hidden_dim), go_backwards=False)])),
])
# Apply it to the input sequence.
latent_vector = rnn_with_latent_output(input_sequence)
# Connect the latent output to (sampled/full) softmax.
if use_sampled_softmax:
weights = load_sampling_weights(token_frequencies_file_path)
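        # Raising the raw token frequencies to the power alpha (< 1) flattens the
        # sampling distribution, so rare tokens are sampled more often than their
        # raw frequency would suggest.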
smoothed_weights = np.float32( np.power(weights, alpha))
sampling_weights = C.reshape(C.Constant(smoothed_weights), shape = (1,vocab_dim))
z, ce, errs = cross_entropy_with_sampled_softmax(latent_vector, label_sequence, vocab_dim, hidden_dim, softmax_sample_size, sampling_weights)
else:
z, ce, errs = cross_entropy_with_full_softmax(latent_vector, label_sequence, vocab_dim, hidden_dim)
return z, ce, errs
# Creates model inputs
def create_inputs(vocab_dim):
input_seq_axis = Axis('inputAxis')
input_sequence = sequence.input_variable(shape=vocab_dim, sequence_axis=input_seq_axis, is_sparse = use_sparse)
label_sequence = sequence.input_variable(shape=vocab_dim, sequence_axis=input_seq_axis, is_sparse = use_sparse)
return input_sequence, label_sequence
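# Prints one progress line (cross entropy, perplexity, throughput) and appends it to log.txt.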
def print_progress(samples_per_second, average_full_ce, total_samples, total_time):
print("time=%.3f ce=%.3f perplexity=%.3f samples=%d samples/second=%.1f" % (total_time, average_full_ce, exp(average_full_ce), total_samples, samples_per_second))
with open("log.txt", "a+") as myfile:
myfile.write("%.3f\t%.3f\t%.3f\t%d\t%.1f\n" % (total_time, average_full_ce, exp(average_full_ce), total_samples, samples_per_second))
# Creates and trains an rnn language model.
def train_lm(testing=False):
    data = DataReader(token_to_id_path, segment_separator)
# Create model nodes for the source and target inputs
input_sequence, label_sequence = create_inputs(data.vocab_dim)
# Create the model. It has three output nodes
# z: the input to softmax that provides the latent representation of the next token
    # cross_entropy: this is used as the training criterion
    # error: this is a binary indicator of whether the model predicts the correct token
z, cross_entropy, error = create_model(input_sequence, label_sequence, data.vocab_dim, hidden_dim)
    # For measurement we use the (built-in) full softmax.
full_ce = C.cross_entropy_with_softmax(z, label_sequence)
# print out some useful training information
log_number_of_parameters(z) ; print()
# Run the training loop
num_trained_samples = 0
num_trained_samples_since_last_report = 0
# Instantiate the trainer object to drive the model training
lr_schedule = C.learning_parameter_schedule_per_sample(learning_rate)
momentum_schedule = C.momentum_schedule_per_sample(momentum_per_sample)
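    # Gradient clipping is applied per sample with the threshold defined in the global
    # parameters; with truncation enabled the gradient values are clipped directly
    # rather than rescaled by their norm.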
gradient_clipping_with_truncation = True
learner = momentum_sgd(z.parameters, lr_schedule, momentum_schedule,
gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
gradient_clipping_with_truncation=gradient_clipping_with_truncation)
trainer = Trainer(z, (cross_entropy, error), learner)
last_avg_ce = 0
for epoch_count in range(num_epochs):
for features, labels, token_count in data.minibatch_generator(train_file_path, sequence_length, sequences_per_batch):
arguments = ({input_sequence : features, label_sequence : labels})
t_start = timeit.default_timer()
trainer.train_minibatch(arguments)
t_end = timeit.default_timer()
samples_per_second = token_count / (t_end - t_start)
# Print progress report every num_samples_between_progress_report samples
if num_trained_samples_since_last_report >= num_samples_between_progress_report or num_trained_samples == 0:
av_ce = average_cross_entropy(full_ce, input_sequence, label_sequence, data)
print_progress(samples_per_second, av_ce, num_trained_samples, t_start)
num_trained_samples_since_last_report = 0
last_avg_ce = av_ce
num_trained_samples += token_count
num_trained_samples_since_last_report += token_count
if not testing:
# after each epoch save the model
model_filename = "models/lm_epoch%d.dnn" % epoch_count
z.save(model_filename)
print("Saved model to '%s'" % model_filename)
return last_avg_ce
if __name__=='__main__':
# train the LM
train_lm()