https://github.com/Microsoft/CNTK
Raw File
Tip revision: fec5bd5fdcd76a76e11dc533dbb5bdf2d5b14a29 authored by Vadim Mazalov on 13 January 2017, 01:41:40 UTC
CTC: Further refactoring
Tip revision: fec5bd5
Config.h
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "Basics.h"
#include <cerrno>
#include <map>
#include <stdexcept>
#include <stdint.h>
#include <vector>

using namespace std;

#pragma warning(disable : 4996) // Caused by the TODO below (line ~1280)

// helper for numeric parameter arguments for multiple layers
// This parses arguments of the form a:b*n:c where b gets duplicated n times and c unlimited times.
namespace Microsoft { namespace MSR { namespace CNTK {

// Character sets used by the parser.
// NOTE(review): FUNCTIONOPEN is not referenced in the visible part of this file; presumably it is
// the character that opens a function-call argument list -- confirm at its use sites.
#define FUNCTIONOPEN "("
// OPENBRACES and CLOSINGBRACES are index-aligned: CLOSINGBRACES[i] is the closing symbol for OPENBRACES[i].
// The double quote is listed in both, i.e. a quoted string is handled like a brace pair.
#define OPENBRACES    "[{(\"" // all opening braces
#define CLOSINGBRACES "]})\"" // and matching closing ones

// "not found" marker for string positions; has the same value as std::string::npos
static const std::string::size_type npos = (std::string::size_type) -1;

// These are the constants associated with the "ResolveVariables" method.
// Note that the opening and closing markers are the same string, i.e. a variable is written as $name$.
static const char* openBraceVar    = "$"; // beginning of a var
static const char* closingBraceVar = "$"; // end of a var
// characters that must not occur in a variable name (includes space, tab and newline)
static const char* forbiddenCharactersInVarName = ",/<>?;':\"[]{}\\|!@#%^&*()+=~` \t\n";
// same set, but with tab and newline spelled out as the two-character sequences \t and \n
// (presumably so that the set can be printed readably in a message -- confirm at its use sites)
static const char* forbiddenCharactersInVarNameEscapeWhitespace = ",/<>?;':\"[]{}\\|!@#%^&*()+=~` \\t\\n";
// lengths of the two markers above, computed once during static initialization
static const std::size_t openBraceVarSize = strlen(openBraceVar);
static const std::size_t closingBraceVarSize = strlen(closingBraceVar);

// Trim - trim white space off the start and end of the string
// str - string to trim
// NOTE: if the entire string is empty, then the string will be set to an empty string
void Trim(std::string& str);

// TrimQuotes - trim surrounding quotation marks
// str - string to trim
void TrimQuotes(std::string& str);

class ConfigValue;
typedef std::map<std::string, ConfigValue, nocase_compare> ConfigDictionary;

class ConfigParameters;
std::string::size_type ParseKeyValue(const std::string& token,
                                     std::string::size_type pos,
                                     ConfigParameters& dict);

// ConfigValue - value of one configuration parameter
// Parses from string to resultant value on assignment. Basically a ConfigValue
// is a std::string with type casts to convert it to numeric types, boolean, etc.,
// by simply type-casting it or assigning it to a value of the desired type.
// ConfigParameters::ConfigDictionary is a collection of named ConfigValues,
// which know which collection they belong to (their 'parent').
// Often, they get constructed on the fly and passed around by value, e.g. in modified
// form or when falling back to a default value, without being added to the collection.
class ConfigValue : public std::string
{
    // name of this configuration, e.g. for error messages, optional
    std::string m_configName;

    // we belong to this collection of ConfigValues
    const ConfigParameters* m_parent;

public:
    std::string Name() const
    {
        return m_configName;
    }

    // Returns the collection this value is linked to; a null pointer if it has no parent.
    const ConfigParameters* Parent() const
    {
        return this->m_parent;
    }

    void SetParent(const ConfigParameters* parent)
    {
        m_parent = parent;
    }

protected:
    // Builds a value that is linked to the collection it belongs to.
    // NOTE: this MUST be used with care--only the raw 'parent' pointer is kept,
    // so the parent's lifetime must be longer than that of this ConfigValue.
    // val - the text of the value
    // name - configuration name, e.g. for error messages
    // parent - collection this value belongs to
    ConfigValue(const std::string& val, const std::string& name, const ConfigParameters* parent)
        : std::string(val), m_configName(name), m_parent(parent)
    {
    }

    // only allow these classes to construct ConfigValues with parent pointers,
    // they are meant as intermediate values only
    friend class ConfigParameters;
    friend class ConfigArray;

public:
    // named value that is not linked to any collection
    ConfigValue(const std::string& val, const std::string& name)
        : std::string(val), m_configName(name), m_parent(nullptr)
    {
    }

    // unnamed value that is not linked to any collection
    // (intentionally not 'explicit': plain strings convert to ConfigValue implicitly)
    ConfigValue(const std::string& val)
        : std::string(val), m_configName(), m_parent(nullptr)
    {
    }

    // default constructor: empty text, empty name, no parent.
    // Required so that ConfigValue can be the mapped type of a std::map.
    ConfigValue()
        : std::string(), m_configName(), m_parent(nullptr)
    {
    }

    // it auto-casts to the common types
    // Note: This is meant to read out a parameter once to assign it, instead of over again.
#if 0
    operator std::string() const { return *this; } // TODO: does not seem to work
#endif

    // Exposes the value's text as a C string. The pointer refers to this object's
    // own buffer, so it is only valid while this object is alive and unmodified.
    operator const char*() const
    {
        return this->c_str();
    }

    // Converts the value's text to a wide string by way of msra::strfun::utf16().
    operator std::wstring() const
    {
        std::wstring wide = msra::strfun::utf16(*this);
        return wide;
    }
    // Converts the value to a double.
    // Accepted input: anything that strtod() consumes completely, plus spellings of infinity
    // in which the parsed numeric prefix is followed by "#inf" (case-insensitive), such as
    // "1.#INF" and "-1.#INF". The sign of the numeric prefix determines the sign of the infinity.
    // Calls RuntimeError() if the string is empty or contains any other unparsed characters.
    operator double() const
    {
        // will be set to point to first character that failed parsing
        char* ep;

        double value = strtod(c_str(), &ep);
        if (empty() || *ep != 0)
        {
            // strtod() stops at the '#' of e.g. "1.#INF"; whatever it parsed up to there
            // ("1." or "-1.") still carries the sign, which must not be dropped
            if (*ep && _strnicmp("#inf", ep, 4) == 0)
            {
                const double inf = std::numeric_limits<double>::infinity();
                return (value < 0) ? -inf : inf;
            }
            RuntimeError("ConfigValue (double): invalid input string '%s'", c_str());
        }
        return value;
    }

    // Converts to float by parsing as double (see operator double) and narrowing the result.
    operator float() const
    {
        const double wide = static_cast<double>(*this);
        return static_cast<float>(wide);
    }

private:
    // Parses the value as a base-10 signed long.
    // Calls RuntimeError() if the string is empty, contains characters that are not part of
    // the number, or denotes a number that does not fit into a long.
    long tolong() const
    {
        // will be set to point to first character that failed parsing
        char* ep;

        // strtol() reports overflow only through errno (it returns LONG_MIN/LONG_MAX, which are
        // also valid results), so errno must be cleared before the call and checked afterwards
        errno = 0;
        long value = strtol(c_str(), &ep, 10);
        if (empty() || *ep != 0)
        {
            RuntimeError("ConfigValue (long): invalid input string '%s'", c_str());
        }
        if (errno == ERANGE)
        {
            RuntimeError("ConfigValue (long): value out of range '%s'", c_str());
        }

        return value;
    }
    // Parses the value as a base-10 unsigned long.
    // Calls RuntimeError() if the string is empty, contains characters that are not part of
    // the number, or denotes a number that does not fit into an unsigned long.
    unsigned long toulong() const
    {
        // will be set to point to first character that failed parsing
        char* ep;

        // strtoul() reports overflow only through errno (it returns ULONG_MAX, which is also
        // a valid result), so errno must be cleared before the call and checked afterwards
        errno = 0;
        unsigned long value = strtoul(c_str(), &ep, 10);
        if (empty() || *ep != 0)
        {
            RuntimeError("ConfigValue (unsigned long): invalid input string '%s'", c_str());
        }
        if (errno == ERANGE)
        {
            RuntimeError("ConfigValue (unsigned long): value out of range '%s'", c_str());
        }
        return value;
    }

public:
    // Converts to short: parses via tolong() and fails if the result does not survive narrowing.
    operator short() const
    {
        const long parsed = tolong();
        const short narrowed = static_cast<short>(parsed);
        const bool fits = (parsed == narrowed);
        if (!fits)
        {
            RuntimeError("ConfigValue (short): integer argument expected");
        }
        return narrowed;
    }

    // Converts to unsigned short: parses via toulong() and fails if the result does not survive narrowing.
    operator unsigned short() const
    {
        const unsigned long parsed = toulong();
        const unsigned short narrowed = static_cast<unsigned short>(parsed);
        const bool fits = (parsed == narrowed);
        if (!fits)
        {
            RuntimeError("ConfigValue (unsigned short): integer argument expected");
        }
        return narrowed;
    }

    // Converts to int: parses via tolong() and fails if the result does not survive narrowing.
    operator int() const
    {
        const long parsed = tolong();
        const int narrowed = static_cast<int>(parsed);
        const bool fits = (parsed == narrowed);
        if (!fits)
        {
            RuntimeError("ConfigValue (int): integer argument expected");
        }
        return narrowed;
    }
    // Converts to unsigned int: parses via toulong() and fails if the result does not survive narrowing.
    operator unsigned int() const
    {
        const unsigned long parsed = toulong();
        const unsigned int narrowed = static_cast<unsigned int>(parsed);
        const bool fits = (parsed == narrowed);
        if (!fits)
        {
            RuntimeError("ConfigValue (unsigned int): integer argument expected");
        }

        return narrowed;
    }
//#if (SIZE_MAX != ULONG_MAX)     // on x64 GCC unsigned long == size_t, i.e. we'd get an ambigous declaration
#ifdef _MSC_VER // somehow the above check does not work on GCC/Cygwin, causing an ambiguous declaration
    // Converts to unsigned long; all parsing and validation is done by toulong().
    // Only compiled when _MSC_VER is defined (see the #ifdef around this operator).
    operator unsigned long() const
    {
        const unsigned long parsed = toulong();
        return parsed;
    }

    // Converts to long; all parsing and validation is done by tolong().
    // Only compiled when _MSC_VER is defined (see the #ifdef around this operator).
    operator long() const
    {
        const long parsed = tolong();
        return parsed;
    }

#endif
    // Converts the value to a base-10 signed 64-bit integer.
    // Calls RuntimeError() if the string is empty, contains characters that are not part of
    // the number, or denotes a number that does not fit into an int64_t.
    operator int64_t() const
    {
        // will be set to point to first character that failed parsing
        char* ep;

        // overflow is reported only through errno (the returned limit value is also a valid
        // result), so errno must be cleared before the call and checked afterwards
        errno = 0;
        int64_t value = _strtoi64(c_str(), &ep, 10);
        if (empty() || *ep != 0)
        {
            RuntimeError("ConfigValue (int64_t): invalid input string '%s'", c_str());
        }
        if (errno == ERANGE)
        {
            RuntimeError("ConfigValue (int64_t): value out of range '%s'", c_str());
        }
        return value;
    }

    // Converts the value to a base-10 unsigned 64-bit integer.
    // Calls RuntimeError() if the string is empty, contains characters that are not part of
    // the number, or denotes a number that does not fit into a uint64_t.
    operator uint64_t() const
    {
        // will be set to point to first character that failed parsing
        char* ep;

        // overflow is reported only through errno (the returned limit value is also a valid
        // result), so errno must be cleared before the call and checked afterwards
        errno = 0;
        uint64_t value = _strtoui64(c_str(), &ep, 10);
        if (empty() || *ep != 0)
        {
            RuntimeError("ConfigValue (uint64_t): invalid input string '%s'", c_str());
        }
        if (errno == ERANGE)
        {
            RuntimeError("ConfigValue (uint64_t): value out of range '%s'", c_str());
        }

        return value;
    }

    // Converts the value to a bool.
    // Only a fixed set of spellings is accepted (compared exactly, i.e. case-sensitively);
    // note that the empty string counts as false. Anything else calls RuntimeError().
    operator bool() const
    {
        static const char* const trueSpellings[]  = {"t", "true", "T", "True", "TRUE", "1"};
        static const char* const falseSpellings[] = {"f", "false", "F", "False", "FALSE", "0", ""};

        // compare as a plain string
        const std::string& text = *this;

        for (const char* spelling : trueSpellings)
        {
            if (text == spelling)
                return true;
        }

        for (const char* spelling : falseSpellings)
        {
            if (text == spelling)
                return false;
        }

        RuntimeError("ConfigValue (bool): boolean argument expected");

        // TODO: do we want to allow accept non-empty strings and non-0 numerical values as 'true'?
    }

    // ReplaceAppend - merge a new value into this one.
    // If this value is non-empty and the new value begins with '[' (it appears to be a group, i.e. a "set" type),
    // the new value is appended, separated by a single space. In all other cases the new value replaces this one.
    // configValue - the new value
    // returns: reference to this (updated) value
    ConfigValue& ReplaceAppend(const std::string& configValue)
    {
        const bool incomingIsGroup = !configValue.empty() && configValue[0] == '[';

        if (!empty() && incomingIsGroup)
        {
            // keep what we have and add the group after a space separator
            append(1, ' ');
            append(configValue);
        }
        else
        {
            // plain value (or nothing to append to): overwrite
            assign(configValue);
        }
        return *this;
    }
};

// parse config parameters on separators, and keep track of configuration names
class ConfigParser
{
protected:
    char m_separator;
    mutable std::string m_configName; // name of this configuration, e.g. for error messages, optional

    // parse at the file level, can be overridden for "section of file" behavior
    virtual void FileParse(const std::string& stringParse)
    {
        Parse(stringParse);
    }

public:
    // separator character plus configuration name
    ConfigParser(char separator, const std::string& configname)
        : m_separator(separator), m_configName(configname)
    {
    }

    // separator character plus configuration name given as a wide string;
    // each wchar_t is converted to char individually (no character-set conversion is performed)
    ConfigParser(char separator, const std::wstring& configname)
        : m_separator(separator), m_configName(configname.begin(), configname.end())
    {
    }

    // separator character only; the configuration name is set to "unknown"
    ConfigParser(char separator)
        : ConfigParser(separator, std::string("unknown"))
    {
    }

    ConfigParser(const ConfigParser& configParser)
    {
        m_separator = configParser.m_separator;
        m_configName = configParser.m_configName;
    }

    ConfigParser(const ConfigParser&& configParser)
    {
        m_separator = configParser.m_separator;
        m_configName = move(configParser.m_configName);
    }

    ConfigParser& operator=(const ConfigParser& configParser) = default;

public:
    // FindBraces - parser helper function to find matching braces in a string starting at the
    // current position (any leading whitespace must have been consumed).
    // Basically this tokenizes an entire bracketed section into a single token.
    // This function assumes that all the underlying languages involved have matching braces.
    // Braces matched by this function are:
    //  - [ ... ]
    //  - { ... }
    //  - ( ... )
    //  - " ... "  (yes)
    // str - string to search
    // tokenStart - start location in the string to search
    // returns: character position of matching closing brace, string::npos if no brace present at start position
    static size_t FindBraces(const std::string& str, const size_t tokenStart)
    {
        const auto len = str.length();
        // start is outside (or rather, at end of string): no brace here
        if (tokenStart >= len)
            return npos;

        // open braces and quote
        static const std::string openBraces    = OPENBRACES;    // currently "[{(\""
        // close braces and quote
        static const std::string closingBraces = CLOSINGBRACES;

        static const auto charsToLookFor = closingBraces + openBraces; // all chars we match for

        // get brace index for first character of input string
        const auto braceFound = openBraces.find(str[tokenStart]);
        // no brace present at tokenStart
        if (braceFound == npos)
            return npos;
        // string begins with a brace--find the closing brace, while correctly handling nested braces
        std::string braceStack;                          // nesting stack; .back() is closing symbol for inner-most brace
        braceStack.push_back(closingBraces[braceFound]); // closing symbol for current
        // search for end brace or other nested layers of this brace type
        for (auto current = tokenStart; current + 1 < len;)
        {
            // look for closing brace and also for another opening brace
            // Inside strings we only accept the closing quote, and ignore any braces inside.
            current = str.find_first_of(braceStack.back() == '"' ? "\"" : charsToLookFor, current + 1);
            if (current == string::npos) // none found: error
                break;
            char brace = str[current];
            // found the expected closing brace?
            if (brace == braceStack.back())
            {
                braceStack.pop_back();  // yes: pop up and continue (or stop if stack is empty)
                if (braceStack.empty()) // fully closed: done
                    return current;
            }
            // or any other closing brace? That's an error.
            else if (brace != '"' && closingBraces.find(brace) != string::npos)
                RuntimeError("unmatched bracket found in parameters");
            // found another opening brace, push it on the stack
            else
            {
                const auto braceFound2 = openBraces.find(brace);  // index of brace
                braceStack.push_back(closingBraces[braceFound2]); // closing symbol for current
            }
        }
        // hit end before everything was closed: error
        RuntimeError("no closing %c found in parameters", braceStack.back());
        //RuntimeError("no closing bracket found in parameters (opening bracket at offset %d)\n%s", (int)tokenStart, str.substr(tokenStart).c_str());
    }

    // ParseValue - virtual function to parse a "token" as tokenized by Parse() below.
    // Parse() calls into ParseValue() which is a virtual function that implements how an assignment token is to be processed.
    virtual std::string::size_type ParseValue(const std::string& stringParse,
                                              std::string::size_type tokenStart,
                                              std::string::size_type tokenEnd) = 0;

    // Parse - Break a string into "records" and pass each to a user-specified function, where
    //  - record separator is newline and an optional record separator character (such as semicolon)
    //  - leading white space (space, tab) is skipped before a record
    //  - nested blocks (braces, string literals) are honored: record separators inside braces or quotes are ignored
    // In the simplest case, "records" are lines of text, e.g. the lines of a configuration file.
    // Any further parsing of these lines, e.g. of the form a=b, is up to the user-specified ParseValue()).
    // The above is subject to some special behaviors:
    //  - records that are complete brace expressions themselves are flattened,
    //    e.g. a ; [ x ; [ u ; v ] ] ; b emits the tokens "a", "x", "[ u ; v ]", and "b"
    //    This is meant for the case where the entire string is a brace expression (TODO: is that true? [fseide]).
    //  - the separator character can be changed inside a brace expression by appending the different separator
    //    right after the brace, e.g. [- a - b] will separate using '-' instead of ';'. TODO: document what this is used for.
    // This function is used at lots of places for various purposes.
    //  - (ConfigParameters from file) config-file parsing passes in expressions of the type a1=b1 \n a2=b2 \n ...,
    //    creates a ConfigDictionary entry for each top-level a=b expression, where b can be a block in braces
    //  - (ConfigParameters) right-hand side that is an array of parameters [ a1=b1; a2=b2 ...], with surrounding braces
    //  - (ConfigValue) individual values are also parsed
    //  - (ConfigArray) same as ConfigValue--the array syntax (':') is not parsed here
    //    The above all allow ';' or newline as a separator
    //  - (NDLScript)
    //  - more to be added
    // stringParse - string to parse
    // pos - position to start parsing at
    // m_separator - extra separator character between tokens, typically ';' (in addition to comma, carriage return and newline)
    // Each token is handed to the virtual ParseValue(), whose return value is used as the position
    // at which scanning continues; the loop ends when no further token is found or that position is npos.
    // Errors detected by FindBraces() (unbalanced braces) are raised from within this function.
    void Parse(const std::string& stringParse, std::string::size_type pos = 0)
    {
        // set of record separator characters
        std::string seps = ",\r\n"; // default separators

        // and one extra caller-specified one (typically ';'). Note that this gets temporarily changed
        // inside content level, see below. It is always kept as the LAST character of 'seps'.
        seps += m_separator;

        // set that includes both record separators and all open-brace characters
        // OPENBRACES includes anything that requires a closing, including "
        std::string sepsBraces = seps + OPENBRACES;

        // set up for token loop
        // current token's end (= where the search for the next token starts)
        auto tokenEnd = pos;
        const auto totalLength = stringParse.length();
        // position of the closing brace of the block we are currently inside of;
        // totalLength while we are not inside a block
        auto braceEnd = totalLength;

        // are we inside content? (=an outer level of braces)
        bool contentLevel = false;

        // loop over tokens
        do
        {
            // consume separators (newline, comma, semicolon)
            auto tokenStart = stringParse.find_first_not_of(seps, tokenEnd);
            if (tokenStart == npos)
            {
                // no more tokens
                break;
            }

            // consume any leading spaces
            tokenStart = stringParse.find_first_not_of(" \t", tokenStart);
            if (tokenStart == npos)
            {
                // only white space left
                break;
            }

            // lex one token--this determines 'tokenEnd' (we already got 'tokenStart')

            // First check whether we are in a braced condition (including ").
            // braceEndFound is npos if the token does not start with an opening brace.
            const auto braceEndFound = FindBraces(stringParse, tokenStart);
            bool quoteFound = false;

            // opening braces found
            if (braceEndFound != npos)
            {
                // consume one level of braces right here, enter "content level" mode
                if (!contentLevel && tokenStart + 1 < totalLength /*[fseide] why is this test necessary?*/)
                {
                    // consume the opening brace
                    tokenStart++;

                    // check for custom separator character
                    // If the opening brace is immediately followed by any of the customSeparators,
                    // change m_separator (inside seps) to that character.
                    // The parser lets you change the default separator to something else. For example the default separator for an array is usually the ':' (I think)
                    // (12:45:23:46)
                    // However if you are using strings, and one of those strings contains a ':', you might want to change the separator to something else:
                    // (;this;is;a;path:;c:\mydir\stuff)
                    //
                    // This will fail for
                    // (..\dirname,something else)
                    // Hence there is an ugly fix for it below. This will go away when we replace all configuration parsing by BrainScript.
                    const static std::string customSeperators = "`~!@$%^&*_-+|:;,?.";

                    if (customSeperators.find(stringParse[tokenStart]) != npos && stringParse.substr(tokenStart).find("..") != 0 && stringParse.substr(tokenStart).find(".\\") != 0 && stringParse.substr(tokenStart).find("./") != 0 && stringParse.substr(tokenStart).find("\\\\") != 0 // [fseide] otherwise this will nuke leading . or .. or \\ in a pathname... Aargh!
                        )
                    {
                        char separator = stringParse[tokenStart];
                        // this was m_separator; on content level, we change it to a custom separator (it gets changed back when we exit content level)
                        seps[seps.length() - 1] = separator;

                        sepsBraces = seps + OPENBRACES;

                        // consume the separator
                        tokenStart++;
                    }
                    // remember where this block ends, and restart token search right after the opening brace
                    braceEnd = braceEndFound;
                    tokenEnd = tokenStart;

                    // now at content level
                    contentLevel = true;

                    // this sort of "recursively" calls ourselves with contentLevel flag set. [fseide] does this make sense for strings??
                    // (note: 'continue' in a do-while jumps to the loop condition, which holds here since tokenEnd is a valid position)
                    continue;
                }

                // content level braces: just find the end of the braces, and that's our token
                if (stringParse[braceEndFound] == '"') // special case for strings
                {
                    // for quoted string we skip the quotes
                    tokenStart++;

                    // position of closing "
                    tokenEnd = braceEndFound;

                    // tells code below to consume the closing "
                    quoteFound = true;
                }
                else
                {
                    // a regular brace: the entire brace expression becomes the token, including the braces themselves
                    // tokenEnd is one past the character we want
                    tokenEnd = braceEndFound + 1;
                }
            }
            else
            {
                // not braces
                // find the end of the token: the next separator or opening brace, whichever comes first
                tokenEnd = stringParse.find_first_of(sepsBraces, tokenStart);

                // now look for contained braces before the next break
                if (tokenEnd != npos)
                {
                    // returns npos if the token ended at a separator rather than at an opening brace
                    const auto braceEndFound2 = FindBraces(stringParse, tokenEnd);

                    // found an embedded brace, extend token to the end of the braces
                    if (braceEndFound2 != npos)
                    {
                        // token includes the closing brace
                        tokenEnd = braceEndFound2 + 1;
                    }
                }

                // no more separators, or the token would extend beyond the current block
                if (tokenEnd == npos || tokenEnd > braceEnd)
                {
                    // use the end of the current block (or of the string, if not inside a block) as the boundary
                    tokenEnd = braceEnd;

                    if (tokenStart >= totalLength)
                    {
                        // if nothing left, we are done
                        break;
                    }
                }
            }
            // token has been determined to range from tokenStart to tokenEnd

            // now parse the value in a caller-specific fashion (through a virtual call into our host class)
            // Empty tokens are not passed on. ParseValue() returns the position to continue from.
            if (tokenEnd > tokenStart)
            {
                tokenEnd = ParseValue(stringParse, tokenStart, tokenEnd);
            }

            // prepare to advance to next token
            // if we hit the end of a brace block, move past the ending brace and reset
            if (tokenEnd == braceEnd)
            {
                // consume closing brace
                tokenEnd++;

                braceEnd = totalLength;

                // restore default separator
                seps[seps.length() - 1] = m_separator;
                sepsBraces = seps + OPENBRACES;

                // pop out of content level
                contentLevel = false;
            }

            if (quoteFound)
            {
                // skip the closing quote
                tokenEnd++;
            }
            // While we have tokens to parse
        } while (tokenEnd != npos);
    }

    // StripComments - This method removes the section of a config line corresponding to a comment.
    // A comment starts at a '#' that is either the first non-whitespace character of the line, or
    // that is directly preceded by a space or tab. A '#' glued to the preceding text (e.g. "a#b")
    // is ordinary text; scanning then continues with the next '#', so that a real comment
    // further to the right on the same line is still removed.
    // configLine - The line within a config file to pre-process.
    // returns:
    //      If the entire line is whitespace, or if the entire line is a comment, simply return an empty string.
    //      If there is no comment, simply return the original 'configLine' (leading whitespace included).
    //      If there is a comment, return the text from the first non-whitespace character up to (not
    //      including) the '#'. Whitespace directly in front of the '#' is retained.
    static std::string StripComments(const std::string& configLine)
    {
        const std::string::size_type firstContent = configLine.find_first_not_of(" \t");

        // entire line is whitespace, or it is a full line comment.
        if (firstContent == std::string::npos || configLine[firstContent] == '#')
        {
            return "";
        }

        // Examine every '#' to the right of the first content character. Since that character
        // itself is not a '#', every hit is at an index >= 1, so [hashPos - 1] is always valid.
        for (std::string::size_type hashPos = configLine.find('#', firstContent + 1);
             hashPos != std::string::npos;
             hashPos = configLine.find('#', hashPos + 1))
        {
            const char previous = configLine[hashPos - 1];
            if (previous == ' ' || previous == '\t')
            {
                // a mid-line comment: cut it off
                return configLine.substr(firstContent, hashPos - firstContent);
            }
            // otherwise this '#' is part of the text; keep looking
        }

        // no comment on this line
        return configLine;
    }

    std::string ReadConfigFile(const std::string& filePath);
    std::string ReadConfigFile(const std::wstring& filePath);
    std::string ReadConfigFiles(const std::string& filePaths);
    std::string ReadConfigFiles(const std::wstring& filePaths);
    std::string ResolveIncludeStatements(const std::string& configString, std::vector<std::string>& resolvedConfigFiles);
    void LoadConfigFile(const std::wstring& filePath);
    void LoadConfigFileAndResolveVariables(const std::wstring& filePath, const ConfigParameters& config);
    void LoadConfigFiles(const std::wstring& filePaths, const std::string* configStringToAppend = nullptr);

    // Sets the configuration name from a wide string; each wchar_t is converted
    // to char individually (no character-set conversion is performed).
    void SetName(const std::wstring& name)
    {
        m_configName.assign(name.begin(), name.end());
    }

    // Sets the configuration name.
    void SetName(const std::string& name)
    {
        m_configName.assign(name);
    }

    std::string Name() const
    {
        return m_configName;
    }
};

// dictionary of parameters
// Care should be taken when using this class: it has parent links to stack variables,
// which are assumed to exist and to have lifetimes that are allocated and freed in a FIFO manner.
// If this is not the case for a particular variable (stored in a class or something), you must
// call ClearParent() to disconnect it from its parents before they are freed.
// usage: This class is intended to be used as local variables where the "parent"
// parameters have lifetimes longer than the "child" parameters
// for example:
// int wmain(int argc, wchar_t* argv[]) {
//    ConfigParameters config = ConfigParameters::ParseCommandLine(argc, argv);
//    A(config);
// }
// void A(const ConfigParameters& config) {ConfigParameters subkey1 = config("a"); /* use the config params */ B(subkey1);}
// void B(const ConfigParameters& config) {ConfigParameters subkey2 = config("b"); /* use the config params */}
class ConfigParameters : public ConfigParser, public ConfigDictionary
{
    // WARNING: the parent pointer use requires parent lifetimes be longer than or equal to children.
    const ConfigParameters* m_parent;

public:
    // Default constructor: an empty dictionary that uses ';' as separator and has no parent.
    // This parameterless version is used for sub-config dictionaries.
    ConfigParameters()
        : ConfigParser(';'),
          m_parent(nullptr)
    {
    }

    // construct ConfigParameters from a ConfigValue, propagate the config Name, and parent pointer
    ConfigParameters(const ConfigValue& configValue)
        : ConfigParser(';', configValue.Name()), m_parent(configValue.Parent())
    {
        std::string configString = configValue;
        Parse(configString);
    }

    // private:
    // copy and move constructors
    // Copy constructor: copy-constructs the ConfigParser base, then delegates to the copy
    // assignment operator, which copies the parser state, the dictionary and the parent pointer.
    ConfigParameters(const ConfigParameters& configValue)
        : ConfigParser(configValue)
    {
        *this = configValue; // m_parent is not initialized before this assignment sets it
    }

    // "Move" constructor. NOTE: the parameter is a *const* rvalue reference, so move(configValue)
    // yields a const rvalue and nothing can be moved out of it. The assignment below selects
    // operator=(const ConfigParameters&&), which copies the dictionary and the parent pointer.
    ConfigParameters(const ConfigParameters&& configValue)
        : ConfigParser(move(configValue))
    {
        *this = move(configValue);
    }

    // Copy assignment: copies the ConfigParser state, the dictionary entries and the parent pointer.
    // The parent pointer is copied as-is, so the usual parent-lifetime requirement applies to the copy.
    ConfigParameters& operator=(const ConfigParameters& configValue)
    {
        this->ConfigParser::operator=(configValue);
        this->ConfigDictionary::operator=(configValue);
        this->m_parent = configValue.m_parent;
        return *this;
    }

    // "Move" assignment. NOTE: the parameter is a const rvalue reference and is passed on to the
    // base-class assignment operators as an lvalue, so this copies exactly like the copy
    // assignment above; the source object is left unchanged.
    ConfigParameters& operator=(const ConfigParameters&& configValue)
    {
        this->ConfigParser::operator=(configValue);
        this->ConfigDictionary::operator=(configValue);
        this->m_parent = configValue.m_parent;
        return *this;
    }

private:
    // hide new so only stack allocated   --TODO: Why do we care?
    void* operator new(size_t /*size*/);

public:
    // Returns a reference to a function-local static, default-constructed ConfigParameters.
    // Used as the default argument to operator(id, default) when retrieving a ConfigParameters.
    static const ConfigParameters& Record()
    {
        static ConfigParameters s_emptyRecord;
        return s_emptyRecord;
    }
    // To retrieve an array, pass e.g. Array(floatargvector()) as the default value.
    // This is an identity function: it returns the very reference it was given, so the
    // argument must outlive any use of the returned reference.
    template <class V>
    static const V& Array(const V& vec)
    {
        return vec;
    }

    // Get the names of all members in this record (but not including parent scopes).
    // Each key is widened element-wise from char to wchar_t.
    // returns: the keys of this dictionary, in the dictionary's iteration order.
    vector<wstring> GetMemberIds() const
    {
        vector<wstring> ids;
        ids.reserve(ConfigDictionary::size()); // one allocation instead of repeated growth
        for (const auto& entry : *this)
        {
            const auto& id = entry.first; // bind by reference: no per-entry copy of the key
            ids.push_back(wstring(id.begin(), id.end()));
        }
        return ids;
    }

    // Always returns true; the id argument is ignored.
    bool CanBeConfigRecord(const wstring& /*id*/) const
    {
        return true;
    }
    // Always returns true; the id argument is ignored.
    bool CanBeString(const wstring& /*id*/) const
    {
        return true;
    }

public:
    // Explicit copy function. Only to be used when a copy must be made.
    // The copy's parent pointer is cleared, so lookups on the copy only see its own entries.
    // copyTo - destination; its previous content is overwritten
    // returns: a reference to copyTo
    ConfigParameters& CopyTo(ConfigParameters& copyTo) const
    {
        copyTo = *this;
        copyTo.ClearParent();
        return copyTo;
    }

    // Detach this object from its parent. Important when a ConfigParameters is stored in a
    // class, where the parent's lifetime is not guaranteed.
    void ClearParent()
    {
        m_parent = nullptr;
    }

    // Returns the parent pointer (NULL if there is no parent or ClearParent() was called).
    const ConfigParameters* GetParent() const
    {
        return m_parent;
    }

    // ParseValue - parse one token of the form 'key=value' (or a bare 'key') and insert it in the ConfigDictionary
    // stringParse - the string being parsed
    // tokenStart  - index of the first character of the token
    // tokenEnd    - index one past the last character of the token
    // returns: the index where the token ended; if the value is a braced or quoted section,
    //          this is the index just past its closing brace.
    // A key without a value is treated as a boolean variable and stored as "true".
    // A token that consists only of whitespace is ignored.
    // Raises InvalidArgument if a key is given without any value after it.
    std::string::size_type ParseValue(const std::string& stringParse,
                                      std::string::size_type tokenStart,
                                      std::string::size_type tokenEnd)
    {
        // skip leading spaces
        tokenStart = stringParse.find_first_not_of(" \t", tokenStart);

        // Nothing but spaces (same guard as ConfigArray::ParseValue). Without it, an npos start
        // position would reach substr() below, which throws std::out_of_range.
        if (tokenStart == npos || tokenStart >= tokenEnd)
        {
            return tokenEnd;
        }

        auto keyEnd = stringParse.find_first_of("=" OPENBRACES, tokenStart);
        std::string value;

        // if no value is specified, it's a boolean variable and set to true
        if (keyEnd == npos || keyEnd >= tokenEnd)
        {
            auto key = stringParse.substr(tokenStart, tokenEnd - tokenStart);
            Trim(key);
            value = "true";
            if (!key.empty())
            {
                Insert(key, value);
            }
            return tokenEnd;
        }

        // get the key
        // As a special case, we will get an empty key when parsing a macro definition.
        auto key = stringParse.substr(tokenStart, keyEnd - tokenStart);
        Trim(key);
        tokenStart = keyEnd;
        if (stringParse[keyEnd] == '=')
        {
            ++tokenStart;
        }

        // skip any spaces before the second token
        tokenStart = stringParse.find_first_not_of(" \t", tokenStart);
        if (tokenStart == npos)
        {
            // Only whitespace up to the end of the string: report the missing value here rather
            // than passing npos on to FindBraces() and substr().
            InvalidArgument("ParseValue: No value given for '%s'.", key.c_str());
        }
        std::string::size_type substrSize = tokenEnd - tokenStart;

        auto bracesEnd = FindBraces(stringParse, tokenStart);

        // if braces found, we modify the token end according to braces
        if (bracesEnd != npos)
        {
            // include the trailing brace
            tokenEnd = bracesEnd + 1;
            substrSize = tokenEnd - tokenStart;

            // for quote delimited string remove quotes
            if (stringParse[tokenStart] == '"')
            {
                tokenStart++;
                substrSize -= 2; // take out the quotes
            }
            // Not a quoted string: an empty section is allowed here and is not treated as an error.
        }
        else if (substrSize == 0)
        {
            InvalidArgument("ParseValue: No value given for '%s'.", key.c_str());
        }

        // get the value
        value = stringParse.substr(tokenStart, substrSize);
        Trim(value);

        // add the value to the dictionary
        if (!key.empty()) // in NDL macro definitions, we get called with cursor on the '='
            Insert(key, value);

        return tokenEnd;
    }

    // Insert - insert a new name and value into the dictionary.
    // The wide name is converted with msra::strfun::utf8() and forwarded to the narrow-string overload.
    void Insert(const std::wstring& name, const std::string& val)
    {
        Insert(msra::strfun::utf8(name), val);
    }

    // Insert - add a name/value pair to the dictionary.
    // If the name is already present, the value is handed to the existing entry's ReplaceAppend().
    // Otherwise a new ConfigValue is created, named "<configName>:<name>", with this object as parent.
    void Insert(const std::string& name, const std::string& val)
    {
        auto existing = find(name);
        if (existing != end())
        {
            // replace or append the value
            existing->second.ReplaceAppend(val);
            return;
        }

        const std::string fullName = m_configName + ":" + name;
        const auto inserted = ConfigDictionary::insert(std::make_pair(name, ConfigValue(val, fullName, this)));
        if (!inserted.second)
        {
            // no insertion was made
            RuntimeError("configparameters: duplicate parameter definition for %s", fullName.c_str());
        }
    }

    // Insert - insert a 'name=value' string into the dictionary.
    // The whole string is parsed as a single token by ParseValue().
    void Insert(const std::string& str)
    {
        ParseValue(str, 0, str.length());
    }

    // Wide-string overload: converts the name with msra::strfun::utf8() and forwards to Exists(std::string).
    bool Exists(const std::wstring& name) const
    {
        return Exists(msra::strfun::utf8(name));
    }

    bool Exists(const std::string& name) const
    {
        if (find(name) != end())
        {
            return true;
        }

        // now check parent if we have one
        if (m_parent != NULL)
        {
            return m_parent->Exists(name);
        }
        return false;
    }

    // ExistsCurrent - check whether a key exists in THIS config only; parents are not consulted.
    bool ExistsCurrent(const std::string& name) const
    {
        const auto pos = find(name);
        return pos != end();
    }
    // Overload for a null-terminated wide string; the characters are narrowed element-wise.
    bool ExistsCurrent(const wchar_t* name) const
    {
        const wchar_t* const nameEnd = name + wcslen(name);
        return ExistsCurrent(string(name, nameEnd));
    }

    // dict(name, default) for strings: wide name, wide default.
    // The name is narrowed element-wise; the default is passed on unchanged.
    ConfigValue operator()(const std::wstring& name,
                           const wchar_t* defaultvalue) const
    {
        const string narrowName(name.begin(), name.end());
        return operator()(narrowName, defaultvalue);
    }

    // dict(name, default) for strings: narrow name, wide default.
    // The default is converted with msra::strfun::utf8().
    ConfigValue operator()(const std::string& name,
                           const wchar_t* defaultvalue) const
    {
        const std::string narrowDefault = msra::strfun::utf8(defaultvalue);
        return operator()(name, narrowDefault.c_str());
    }

    // dict(name, default) for strings: wide name, narrow default.
    ConfigValue operator()(const std::wstring& name,
                           const char* defaultvalue) const
    {
        const string narrowName(name.begin(), name.end());
        return operator()(narrowName, defaultvalue);
    }

    // dict(name, default) for strings: narrow name, narrow default.
    // This is the overload the three above end up in; it performs the lookup via Find().
    ConfigValue operator()(const std::string& name,
                           const char* defaultvalue) const
    {
        return Find(name, defaultvalue);
    }

    // Version for defaults with types.
    // Looks 'name' up in this dictionary and then in each parent in turn.
    // name         - null-terminated wide key; narrowed element-wise to a std::string
    // defaultValue - returned if no dictionary in the chain has the key, or if the first
    //                entry found has the literal string value "default"
    // returns: the entry converted to Type, or defaultValue.
    // Note: unlike TryFind(), this lookup does not call ResolveVariables() on the value.
    template <typename Type>
    Type operator()(const wchar_t* name,
                    const Type& defaultValue) const
    {
        // Convert the key once, up front, instead of once per dictionary in the chain.
        const string key(name, name + wcslen(name));

        // find the value
        // TODO: unify with the Find() function below
        for (auto* dict = this; dict; dict = dict->m_parent)
        {
            auto iter = dict->find(key);
            if (iter != dict->end())
            {
                if (iter->second == "default")
                    break; // use the default value
                return (Type) iter->second;
            }
        }
        // we get here if no dict in the chain contains the entry, or if the entry's string value says "default"
        return defaultValue;
    }

    // Look up a variable through the nested hierarchy. If not found, return false, and 'result' is untouched.
    // name         - key to look up
    // result       - receives the value when the function returns true
    // defaultvalue - optional fallback, used where a dictionary has no parent to defer to, or
    //                where the entry's string value is the literal "default"
    // A value found in this dictionary has its $var$ references resolved before it is returned.
    bool TryFind(const std::string& name, ConfigValue& result, const char* defaultvalue = NULL) const
    {
        auto iter = find(name);

        // if we aren't found, or they want the default value
        if (iter == end() || iter->second == "default")
        {
            // not found but the parent exists, check there
            if (iter == end() && m_parent != NULL)
            {
                // Pass the parent's verdict on. Reporting success unconditionally here would make a
                // key that is missing from the whole chain look like it was found with an empty value.
                return m_parent->TryFind(name, result, defaultvalue);
            }
            else if (defaultvalue != NULL)
            {
                // no parent, so use default value
                std::string fullName = m_configName + ":" + name;
                result = ConfigValue(defaultvalue, fullName, this);
                return true;
            }
        }
        else
        {
            std::string rhs = iter->second;
            rhs = this->ResolveVariables(rhs);
            std::string fullName = m_configName + ":" + name;
            result = ConfigValue(rhs, fullName, this);
            return true;
        }
        return false; // not found
    }

    // Look up a variable using TryFind(). If the lookup fails, a default-constructed
    // (empty) ConfigValue is returned.
    ConfigValue Find(const std::string& name, const char* defaultvalue = NULL) const
    {
        ConfigValue found; // left as constructed when TryFind() reports failure
        (void) TryFind(name, found, defaultvalue);
        return found;
    }

    // ResolveVariablesInSingleLine - replace all substrings of 'configLine' of the form "$varName$"
    //     (where varName is a variable name) with the value of the "varName" variable in config.
    //     The value is looked up with TryFind(); a RuntimeError is raised if TryFind() reports
    //     that the variable was not found.
    //     Note that this process is recursive.  Take the following example: A=1; B=$A$; C=$B$.
    //     In this example, calling ResolveVariables with $B$, would see B=$A$, then look up the value
    //     of A and see A=1, and it would then replace the string "$B$" with the string "1".
    //     Comments are removed from the line with StripComments() before any substitution.
    //     Variable names must begin with a letter or '_'; e.g. $1 does not trigger.
    //     A "$" that has no closing "$" on the line, or whose candidate name would contain a
    //     forbidden character, is left in place untouched.
    // BUGBUG: This does not allow to deliver a string value to a config parameter that contains a $ sign followed by a letter.
    // configLine - the single line to resolve variables in; a LogicError is raised if it contains a newline.
    // returns: A copy of 'configLine' (with comments stripped) with all the variables resolved.
    std::string ResolveVariablesInSingleLine(const std::string& configLine) const
    {
        // ensure that this method was called on a single line (eg, no newline characters exist in 'configLine').
        if (configLine.find_first_of("\n") != std::string::npos)
            LogicError("ResolveVariablesInSingleLine() should not be called with a string containing a newline character");

        // Characters that end a candidate variable name: the closing brace or any character that
        // is not allowed in a name. Built once rather than on every loop iteration.
        const std::string nameTerminators = std::string(closingBraceVar) + forbiddenCharactersInVarName;

        std::string newConfigLine = StripComments(configLine);
        std::size_t start = newConfigLine.find_first_of(openBraceVar);
        while (start != std::string::npos)
        {
            // variable names must begin with a letter or '_'
            if (start + 1 < newConfigLine.size() && !iscalpha(newConfigLine[start + 1]) && newConfigLine[start + 1] != '_')
            {
                start = newConfigLine.find_first_of(openBraceVar, start + 2);
                continue;
            }

            // search for whitespace or closing brace.
            const std::size_t nameEnd = newConfigLine.find_first_of(nameTerminators, start + openBraceVarSize);

            // If no end found on the line or the variable name would be invalid, we ignore the $.
            if (nameEnd == std::string::npos || newConfigLine[nameEnd] != '$')
            {
                start = newConfigLine.find_first_of(openBraceVar, start + 1);
                continue;
            }

            // nameEnd + 1 - start = the length of the string, including opening and closing braces.
            const std::size_t varLength = (nameEnd + 1 - start) - (openBraceVarSize + closingBraceVarSize);
            const std::string varName = newConfigLine.substr(start + openBraceVarSize, varLength);

            // It would be great to recognize $$ as $, but since stuff gets parsed and reparsed over again
            // in nested dictionaries, this is not working.
            if (varName.empty())
            {
                RuntimeError("$$ is not allowed. Parsing of string failed: %s:%s",
                             m_configName.c_str(),
                             newConfigLine.c_str());
            }

            // Note that this call to "TryFind" can trigger further substitutions of the form $varName2$ -> varValue2,
            // thus making this search process recursive.
            ConfigValue varConfigValue;
            const bool foundValue = this->TryFind(varName, varConfigValue);
            if (!foundValue)
            {
                RuntimeError("No variable found with the name %s. Parsing of string failed: %s:%s",
                             varName.c_str(), m_configName.c_str(),
                             newConfigLine.c_str());
            }

            std::string varValue = varConfigValue;
            if (varValue.find_first_of("\n") != std::string::npos)
                LogicError("Newline characters are not allowed in the value of a variable which is resolved using $varName$ feature");

            // Replace $varName$ with 'varValue'.  Then continue the search for
            // other variables in 'newConfigLine' string, starting at the point
            // in the 'newConfigLine' string right after 'varValue' (all variables prior to this point
            // have already been resolved, due to recursion)
            newConfigLine.replace(start, varLength + openBraceVarSize + closingBraceVarSize, varValue);
            start = newConfigLine.find_first_of(openBraceVar, start + varValue.size());
        }

        return newConfigLine;
    }

    // ResolveVariables - replace all instances of substrings of 'configString' of the form
    //     "$varName$" (where varName is a variable name) with the value of the "varName"
    //     variable in config, by calling 'ResolveVariablesInSingleLine' on every line.
    //     See 'ResolveVariablesInSingleLine' for more details.
    //     If 'configString' contains newlines, it is split at "\n" and a "\n" is appended after
    //     every resolved line. If it contains none, no newline is appended. This matters because
    //     when this function is called recursively (eg, from inside the "TryFind" method, in
    //     order to resolve something like "$A$" in a string like "$A$\$B$"),
    //     we shouldn't insert newlines where they didn't already exist.
    // configString - the string that you would like to resolve variables in.
    // returns: A copy of 'configString' with all the variables resolved.
    std::string ResolveVariables(const std::string& configString) const
    {
        std::string newConfigString;
        if (configString.find_first_of("\n") != std::string::npos)
        {
            // if 'configString' contains newlines, put them back after resolving each line.
            std::vector<std::string> configLines = msra::strfun::split(configString, "\n");
            for (const auto& configLine : configLines) // by reference: no copy of each line
            {
                newConfigString += ResolveVariablesInSingleLine(configLine) + "\n";
            }
        }
        else
        {
            // if 'configString' doesn't contain any newlines, don't append a newline.
            newConfigString = ResolveVariablesInSingleLine(configString);
        }

        return newConfigString;
    }

    // dict(name): read out a mandatory parameter value.
    // Wide-string overload: the name is narrowed element-wise and forwarded to the std::string overload.
    ConfigValue operator()(const std::wstring& name) const
    {
        return operator()(string(name.begin(), name.end()));
    }

    // dict(name): read out a mandatory parameter value.
    // Raises a RuntimeError if the value returned by Find() is empty.
    // The returned value has its parent pointer set to this dictionary.
    ConfigValue operator()(const std::string& name) const
    {
        ConfigValue found = Find(name);
        const bool missing = found.empty();
        if (missing)
        {
            RuntimeError("configparameters: required parameter missing: %s:%s",
                         m_configName.c_str(), name.c_str());
        }

        // re-parent the result to this dictionary
        found.SetParent(this);
        return found;
    }

    // Match - comparison function, case insensitive
    // key - key to get the value from
    // compareValue - string to compare against
    // returns - true if it matches
    bool Match(const std::string& key, const std::string& compareValue) const
    {
        std::string value = Find(key);
        return EqualCI(compareValue, value);
    }
    bool Match(const std::wstring& key, const std::wstring& compareValue) const
    {
        return Match(string(key.begin(), key.end()), msra::strfun::utf8(compareValue));
    }

    // Return the entire path to this config element (the stored config name).
    // NOTE: may get messed up if you use temporaries mid-stream.
    // The result is a reference to a member, so it is only valid while this object is alive.
    const std::string& ConfigPath() const
    {
        return m_configName;
    }

    // Return the name of this config element: the part of the config path after its last ':'.
    // Returns an empty string if the path has no ':' or if nothing follows the last ':'.
    const std::string ConfigName() const
    {
        const auto lastColon = m_configName.find_last_of(':');
        const bool hasTail = (lastColon != npos) && (lastColon + 1 < m_configName.size());
        return hasTail ? m_configName.substr(lastColon + 1) : std::string();
    }

    static std::string ParseCommandLine(int argc, wchar_t* argv[], ConfigParameters& config);

    // Support for BrainScriptNetworkBuilder: it needs the config directories in order to know where to include files from.
    // Returns a mutable reference to a single function-local static vector, so every caller
    // reads and modifies the same list.
    static vector<wstring>& GetBrainScriptNetworkBuilderIncludePaths()
    {
        static vector<wstring> allConfigDirs;
        return allConfigDirs;
    }

    // dump for debugging purposes
    void dump() const
    {
        for (auto iter = begin(); iter != end(); iter++)
        {
            fprintf(stderr, "configparameters: %s:%s=%s\n",
                    m_configName.c_str(), iter->first.c_str(), ((std::string) iter->second).c_str());
        }
    }

    void dumpWithResolvedVariables() const
    {
        for (auto iter = begin(); iter != end(); iter++)
        {
            fprintf(stderr, "configparameters: %s:%s=%s\n",
                    m_configName.c_str(), iter->first.c_str(), ResolveVariables(((std::string) iter->second)).c_str());
        }
    }

    // Convert this dictionary back to its string form so it can be returned as a ConfigValue.
    // The result is "[" followed by <separator><key>=<value> for every entry, then "]";
    // it carries this object's config name and parent pointer.
    operator ConfigValue()
    {
        std::string unparse("[");
        for (const auto& entry : *this)
        {
            // NOTE: the first entry is also preceded by a separator. This is by design:
            // a separator immediately following a brace "[," defines the separator for that block
            std::string value = entry.first + '=' + entry.second;
            unparse += m_separator + value;
        }
        unparse += "]";
        return ConfigValue(unparse, m_configName, m_parent);
    }
};

class ConfigArray : public ConfigParser, public std::vector<ConfigValue>
{
    bool m_repeatAsterisk;

public:
    // construct an array from a ConfigValue, propogate the configName
    ConfigArray(const ConfigValue& configValue, char separator = ':', bool repeatAsterisk = true)
        : ConfigParser(separator, configValue.Name())
    {
        m_repeatAsterisk = repeatAsterisk;
        std::string configString = configValue;
        Parse(configString);
    }

    // config aray from a string
    ConfigArray(const char* configValue, char separator = ':', bool repeatAsterisk = true)
        : ConfigParser(separator)
    {
        m_repeatAsterisk = repeatAsterisk;
        Parse(configValue);
    }

    // empty config array
    ConfigArray(char separator = ':', bool repeatAsterisk = true)
        : ConfigParser(separator)

    {
        m_repeatAsterisk = repeatAsterisk;
    }

    // copy and move constructors and assignment
    // Copy constructor: copy-constructs the ConfigParser base, then assigns the whole object
    // through operator=.
    ConfigArray(const ConfigArray& configValue)
        : ConfigParser(configValue)
    {
        m_repeatAsterisk = true; // placeholder; overwritten by the assignment on the next line
        *this = configValue;
    }

    // "Move" constructor. NOTE: the parameter is a *const* rvalue reference, so move(configValue)
    // yields a const rvalue; the only assignment operator declared in this class is the copy
    // assignment, so the content is copied rather than moved.
    ConfigArray(const ConfigArray&& configValue)
        : ConfigParser(move(configValue))
    {
        m_repeatAsterisk = true; // placeholder; overwritten by the assignment on the next line
        *this = move(configValue);
    }

    ConfigArray& operator=(const ConfigArray& configValue) = default;

    // Convert this array back to its string form so it can be returned as a ConfigValue.
    // The result is "{" followed by <separator><value> for every element, then "}";
    // it carries this object's config name.
    operator ConfigValue()
    {
        std::string unparse("{");
        for (const auto& element : *this)
        {
            // NOTE: the first element is also preceded by a separator. This is by design:
            // a separator immediately following a brace "{," defines the separator for that block
            std::string value = element;
            unparse += m_separator + value;
        }
        unparse += "}";
        return ConfigValue(unparse, m_configName);
    }

private:
    // ParseValue - parse a 'value*count' pair or just a 'value' and append it to the array
    // stringParse - the string being parsed
    // tokenStart  - index of the first character of the token
    // tokenEnd    - index one past the last character of the token
    // returns: tokenEnd on success (also for a token that is nothing but spaces), or npos if a
    //          '*' is found with nothing after it inside the token.
    // The '*' repeat syntax is only recognised when m_repeatAsterisk is set and the token does
    // not start a braced section. Each appended element is named "<configName>[<index>]".
    std::string::size_type ParseValue(const std::string& stringParse,
                                      std::string::size_type tokenStart,
                                      std::string::size_type tokenEnd)
    {
        // skip leading spaces
        tokenStart = stringParse.find_first_not_of(" \t", tokenStart);
        if (tokenStart >= tokenEnd)
        {
            // nothing but spaces
            return tokenEnd;
        }

        // check for an opening brace, if it exists, no need to parse
        // further, it's a nested element (and we don't allow counts)
        auto braceFound = FindBraces(stringParse, tokenStart);
        auto valueEnd = tokenEnd;

        // no braces, so search for repeat symbol
        if (braceFound == npos && m_repeatAsterisk)
        {
            valueEnd = stringParse.find_first_of("*", tokenStart);
        }

        std::string value;
        int count = 1;

        // no count found, just a value
        if (valueEnd >= tokenEnd || valueEnd == npos)
        {
            value = stringParse.substr(tokenStart, tokenEnd - tokenStart);
            Trim(value);
        }
        else
        {
            // if a count is specified (i.e. '1.23*5')

            // get the value
            value = stringParse.substr(tokenStart, valueEnd - tokenStart);
            Trim(value);
            tokenStart = valueEnd + 1;
            if (tokenStart >= tokenEnd)
            {
                return npos;
            }

            auto tokenLength = tokenEnd - tokenStart;

            // get the count
            auto countStr = stringParse.substr(tokenStart, tokenLength);
            Trim(countStr);

            // convert the count through ConfigValue's numeric conversion
            ConfigValue countVal(countStr);
            count = countVal;
        }

        // push the values into the vector, and determine their names
        for (int i = 0; i < count; ++i)
        {
            // The index is the element's position in the array. std::to_string replaces the
            // former sprintf into a 10-byte buffer, which could not hold every int value.
            const std::string name = m_configName + '[' + std::to_string(size()) + ']';
            push_back(ConfigValue(value, name));
        }
        return tokenEnd;
    }
};

// ConfigParamList - a ConfigArray for parameter lists: every constructor passes ',' as the
// separator and switches the '*' repeat handling off.
class ConfigParamList : public ConfigArray
{
public:
    // empty parameter list
    ConfigParamList()
        : ConfigArray(',', false) {}

    // parameter list parsed from a C string
    ConfigParamList(const char* configValue)
        : ConfigArray(configValue, ',', false) {}

    // parameter list parsed from a ConfigValue; the config name is propagated
    ConfigParamList(const ConfigValue& configValue)
        : ConfigArray(configValue, ',', false) {}
};

// get config sections that define files (used for readers)
template <class ConfigRecordType>
void GetFileConfigNames(const ConfigRecordType& readerConfig,
                        std::vector<std::wstring>& features,
                        std::vector<std::wstring>& labels);
template <class ConfigRecordType>
void FindConfigNames(const ConfigRecordType& config, std::string key,
                     std::vector<std::wstring>& names);

// Version of argument vectors that preparse everything instead of parse on demand
template <typename T>
class argvector : public std::vector<T>
{
    typedef std::vector<T> B;
    using B::clear;
    using B::reserve;
    using B::push_back;
    // Convert a token to float via msra::strfun::todouble().
    static void parse(const std::wstring& in, float& val)
    {
        val = (float) msra::strfun::todouble(in);
    }

    // Convert a token to double. Without this overload, argvector<double> (doubleargvector)
    // cannot be assigned from a string, because no other parse() overload accepts a double&.
    static void parse(const std::wstring& in, double& val)
    {
        val = msra::strfun::todouble(in);
    }

    // Convert a token to an integer type INT and check that it really is an integer:
    // the token is read as a double, cast to INT, and compared with the double again.
    // Raises a RuntimeError if the two differ (e.g. for a value with a fractional part).
    template <typename INT>
    static void parseint(const std::wstring& in, INT& val)
    {
        double dval = msra::strfun::todouble(in);
        val = (INT) dval;
        if (val != dval)
            RuntimeError("argvector: invalid arg value");
    }
    // Convert a token to size_t (integer-checked by parseint()).
    static void parse(const std::wstring& in, size_t& val)
    {
        parseint(in, val);
    }
    // Convert a token to int (integer-checked by parseint()).
    static void parse(const std::wstring& in, int& val)
    {
        parseint(in, val);
    }
    // String elements are taken over unchanged.
    static void parse(const std::wstring& in, std::wstring& val)
    {
        val = in;
    }

public:
    // Assign from a command-line style string (construct empty, then assign).
    // arg - elements separated by ':'; an element may carry a repeat factor as 'value*n'.
    // The previous content is discarded. An empty 'arg' leaves the vector empty.
    // Raises a RuntimeError if an element has no value part, if a value cannot be parsed
    // as T, or if a repeat factor is less than 1.
    void operator=(const std::wstring& arg)
    {
        clear();
        // separate the arguments
        std::vector<std::wstring> toks = msra::strfun::split(arg, L":");

        // NOTE: an empty argument list is allowed, because stringargvector needs to be able to be empty. [v-xieche]
        for (const auto& tok : toks)
        {
            // split off repeat factor
            std::vector<std::wstring> toks2 = msra::strfun::split(tok, L"*");

            // Guard the toks2[0] access below, in case split() yields no pieces for this element.
            if (toks2.empty())
            {
                RuntimeError("argvector: invalid arg value");
            }

            T val;

            // convert wstring toks2[0] to T val and check type
            parse(toks2[0], val);

            // repeat factor
            int rep = (toks2.size() > 1) ? msra::strfun::toint(toks2[1]) : 1;

            if (rep < 1)
            {
                RuntimeError("argvector: invalid repeat factor");
            }

            for (int j = 0; j < rep; j++)
            {
                push_back(val);
            }
        }
    }

    // constructor --use this for setting default values
    // Parses 'arg' through operator=(const std::wstring&).
    argvector(const std::wstring& arg)
    {
        *this = arg;
    }
    // empty constructor --for use in structs
    argvector()
    {
    }

    // constructor to convert from config array to constant array
    argvector(const ConfigArray& configArray)
    {
        reserve(configArray.size());
        foreach_index (i, configArray)
        {
            T val = configArray[i];
            push_back(val);
        }
    }

    // Constructor from ConfigValue: the value is first converted to a ConfigArray, which is
    // then handed to the ConfigArray constructor above.
    argvector(const ConfigValue& configValue)
        : argvector((ConfigArray) configValue)
    {
    }

    // Constructor from std::vector: copies the elements of 'configVector'.
    // Taken by const reference so the argument is copied once, not twice.
    argvector(const std::vector<T>& configVector)
        : std::vector<T>(configVector)
    {
    }

    // operator[] repeats the last value infinitely: any index at or beyond size() yields the
    // last element. Raises a RuntimeError if the vector is empty, since there is no last
    // element to repeat (size() - 1 would otherwise wrap around and index out of bounds).
    T operator[](size_t i) const
    {
        const size_t count = size();
        if (count == 0)
        {
            RuntimeError("argvector: cannot index an empty vector");
        }
        return std::vector<T>::operator[](i < count ? i : count - 1);
    }

    // Mutable variant of the above; an index at or beyond size() refers to the last element.
    T& operator[](size_t i)
    {
        const size_t count = size();
        if (count == 0)
        {
            RuntimeError("argvector: cannot index an empty vector");
        }
        return std::vector<T>::operator[](i < count ? i : count - 1);
    }

    // Returns a copy of the last element.
    // Raises a RuntimeError if the vector is empty (size() - 1 would otherwise wrap around).
    T last() const
    {
        if (size() == 0)
        {
            RuntimeError("argvector: last() called on an empty vector");
        }
        return (*this)[size() - 1];
    }

    // We give full read access to the vector, so we can use it bounded as well.
    // Returns this object as its std::vector<T> base, whose operator[] does not repeat the last value.
    const std::vector<T>& tovector() const
    {
        return *this;
    }

    // Number of elements actually stored (the size of the underlying std::vector<T>).
    size_t size() const
    {
        return std::vector<T>::size();
    }
};

typedef argvector<int> intargvector;
typedef argvector<float> floatargvector;
typedef argvector<double> doubleargvector;
typedef argvector<std::wstring> stringargvector;

}}}
back to top