// ConfigParser.cpp -- config parser (syntactic only, that is, source -> Expression tree) #define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings #include "BrainScriptParser.h" #include "File.h" #include #include #include #include #include #include #include #include #include #include #ifndef let #define let const auto #endif namespace Microsoft { namespace MSR { namespace BS { using namespace std; using namespace msra::strfun; using namespace Microsoft::MSR::CNTK; // --------------------------------------------------------------------------- // source files and text references (location) into them // --------------------------------------------------------------------------- // SourceFile constructors SourceFile::SourceFile(wstring location, wstring text) : path(location), lines(split(text, L"\r\n")) { } // from string, e.g. command line SourceFile::SourceFile(wstring path) : path(path) // from file { File(path, fileOptionsRead | fileOptionsText).GetLines(lines); } bool TextLocation::IsValid() const { return sourceFileAsIndex != SIZE_MAX; } // register a new source file and return a TextPosition that points to its start /*static*/ TextLocation TextLocation::NewSourceFile(SourceFile&& sourceFile) { TextLocation loc; loc.lineNo = 0; loc.charPos = 0; loc.sourceFileAsIndex = sourceFileMap.size(); // index under which we store the source file sourceFileMap.push_back(move(sourceFile)); // take ownership of the source file and give it a numeric index return loc; } // helper for pretty-printing errors: Show source-code line with ...^ under it to mark up the point of error struct Issue { TextLocation location; // using lineno and source file; char position only for printing the overall error loc wstring markup; // string with markup symbols at char positions and dots inbetween void AddMarkup(wchar_t symbol, size_t charPos) { if (charPos >= markup.size()) markup.resize(charPos + 1, L' '); // fill with '.' up to desired position if the string is not that long yet if (markup[charPos] == L' ') // don't overwrite markup[charPos] = symbol; } Issue(TextLocation location) : location(location) { } }; // trace /*static*/ void TextLocation::Trace(TextLocation location, const wchar_t* traceKind, const wchar_t* op, const wchar_t* exprPath) { fprintf(stderr, "%ls: %ls (path %ls)\n", traceKind, op, exprPath); const auto& lines = location.GetSourceFile().lines; const auto line = (location.lineNo == lines.size()) ? L"(end)" : lines[location.lineNo].c_str(); Issue issue(location); issue.AddMarkup(L'^', location.charPos); fprintf(stderr, " %ls\n %ls\n", line, issue.markup.c_str()); } // report an error // The source line is shown, and the position is marked as '^'. // Because it is often hard to recognize an issue only from the point where it occurred, we also report the history in compact visual form. // Since often multiple contexts are on the same source line, we only print each source line once in a consecutive row of contexts. /*static*/ void TextLocation::PrintIssue(const vector& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what) { wstring error = CreateIssueMessage(locations, errorKind, kind, what); fprintf(stderr, "%ls", error.c_str()); fflush(stderr); } /*static*/ wstring TextLocation::CreateIssueMessage(const vector& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what) { vector issues; // tracing the error backwards size_t symbolIndex = 0; wstring message; for (size_t n = 0; n < locations.size(); n++) { let& location = locations[n]; if (!location.IsValid()) // means thrower has no location, go up one context continue; // build the array if (symbolIndex == 0 || location.lineNo != issues.back().location.lineNo || location.sourceFileAsIndex != issues.back().location.sourceFileAsIndex) { if (issues.size() == 10) break; else issues.push_back(location); } // get the symbol to indicate how many steps back, in this sequence: ^ 0..9 a..z A..Z (we don't go further than this) wchar_t symbol; if (symbolIndex == 0) symbol = '^'; else if (symbolIndex < 1 + 10) symbol = '0' + (wchar_t) symbolIndex - 1; else if (symbolIndex < 1 + 10 + 26) symbol = 'a' + (wchar_t) symbolIndex - (1 + 10); else if (symbolIndex < 1 + 10 + 26 + 26) symbol = 'A' + (wchar_t) symbolIndex - (1 + 10 + 26); else break; symbolIndex++; // insert the markup issues.back().AddMarkup(symbol, location.charPos); } // print it backwards if (!locations.empty()) // (be resilient to some throwers not having a TextLocation; to be avoided) { let& firstLoc = issues.front().location; message += wstrprintf(L"[CALL STACK]\n"); for (auto i = issues.rbegin(); i != issues.rend(); i++) { let& issue = *i; auto& where = issue.location; const auto& lines = where.GetSourceFile().lines; const auto line = (where.lineNo == lines.size()) ? L"(end)" : lines[where.lineNo].c_str(); message += wstrprintf(L" %ls\n %ls\n", line, issue.markup.c_str()); } message += wstrprintf(L"%ls while %ls: %ls(%d)", errorKind, kind, firstLoc.GetSourceFile().path.c_str(), (int)firstLoc.lineNo + 1 /*report 1-based*/); } else { message += wstrprintf(L"%ls while %ls", errorKind, kind); } message += wstrprintf(L": %ls\n", what); return message; } /*static*/ vector TextLocation::sourceFileMap; // --------------------------------------------------------------------------- // reader -- reads source code, including loading from disk // --------------------------------------------------------------------------- class CodeSource { vector locationStack; // parent locations in case of included files TextLocation cursor; // current location const wchar_t* currentLine; // cache of cursor.GetSourceFile().lines[cursor.lineNo] // update currentLine from cursor void CacheCurrentLine() { let& lines = cursor.GetSourceFile().lines; if (cursor.lineNo == lines.size()) currentLine = nullptr; else currentLine = lines[cursor.lineNo].c_str(); } protected: // set a source file; only do that from constructor or inside PushSourceFile() void SetSourceFile(SourceFile&& sourceFile) { cursor = TextLocation::NewSourceFile(move(sourceFile)); // save source file and set the cursor to its start CacheCurrentLine(); // re-cache current line } public: class CodeSourceException : public ConfigException { public: CodeSourceException(const wstring& msg, TextLocation where) : ConfigException(msg, where) { } /*ConfigException::*/ const wchar_t* kind() const { return L"reading source"; } }; __declspec_noreturn static void Fail(wstring msg, TextLocation where) { //Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); throw CodeSourceException(msg, where); } // enter a source file, at start or as a result of an include statement void PushSourceFile(SourceFile&& sourceFile) { locationStack.push_back(cursor); SetSourceFile(move(sourceFile)); } // are we inside an include file? bool IsInInclude() { return locationStack.size() > 0; } // done with a source file. Only call this for nested files; the outermost one must not be popped. void PopSourceFile() { if (!IsInInclude()) LogicError("PopSourceFile: location stack empty"); cursor = locationStack.back(); // restore cursor we came from CacheCurrentLine(); // re-cache current line locationStack.pop_back(); } // get current cursor; this is remembered for each token, and also used when throwing errors TextLocation GetCursor() const { return cursor; } // get character at current position. // Special cases: // - end of line is returned as '\n' // - end of file is returned as 0 wchar_t GotChar() const { if (!currentLine) return 0; // end of file else if (!currentLine[cursor.charPos]) return '\n'; // end of line else return currentLine[cursor.charPos]; } // we chan also return the address of the current character, e.g. for passing it to a C stdlib function such as wcstod() const wchar_t* GotCharPtr() const { return currentLine + cursor.charPos; } // advance cursor by #chars (but across line boundaries) void ConsumeChars(size_t chars) { let ch = GotChar(); if (!ch) LogicError("Consume: cannot run beyond end of source file"); if (ch == '\n' && chars > 0) { if (chars != 1) LogicError("Consume: cannot run beyond end of line"); cursor.lineNo++; CacheCurrentLine(); // line no has changed: re-cache the line ptr cursor.charPos = 0; } else cursor.charPos += chars; } // get the next character wchar_t GetChar() { ConsumeChars(1); return GotChar(); } }; // --------------------------------------------------------------------------- // lexer -- iterates over the source code and returns token by token // --------------------------------------------------------------------------- class Lexer : public CodeSource { set keywords; set punctuations; vector includePaths; public: Lexer(vector&& includePaths) : CodeSource(), includePaths(includePaths), currentToken(TextLocation()) { keywords = set{ L"include", L"new", L"with", L"true", L"false", L"if", L"then", L"else", L"array", }; punctuations = set{ L"=", L";", L",", L"\n", L"[", L"]", L"(", L")", L"{", L"}", L"[|", L"|]", L"+", L"-", L"*", L"/", L"**", L".*", L"%", L"||", L"&&", L"<<", L">>", L"!", L"<", L"==", L">", L">=", L"!=", L"<=", L":", L"=>", L"..", L".", L"//", L"#", L"/*"}; } enum TokenKind { invalid, punctuation, numberliteral, stringliteral, booleanliter, identifier, keyword, eof // TODO: what are true and false? Literals or identifiers? }; struct Token { wstring symbol; // identifier, keyword, punctuation, or string literal double number; // number TokenKind kind; TextLocation beginLocation; // text loc of first character of this token bool isLineInitial; // this token is the first on the line (ignoring comments) Token(TextLocation loc) : beginLocation(loc), kind(invalid), number(0.0), isLineInitial(false) { } // diagnostic helper static wstring TokenKindToString(TokenKind kind) { switch (kind) { case invalid: return L"invalid"; case punctuation: return L"punctuation"; case numberliteral: return L"numberliteral"; case stringliteral: return L"stringliteral"; case identifier: return L"identifier"; case keyword: return L"keyword"; case eof: return L"eof"; default: return L"(unknown?)"; } } wstring ToString() const // string to show the content of token for debugging { let kindStr = TokenKindToString(kind); switch (kind) { case numberliteral: return kindStr + wstrprintf(L" %f", number); case stringliteral: return kindStr + L" '" + symbol + L"'"; case identifier: case keyword: case punctuation: return kindStr + L" " + symbol; default: return kindStr; } } }; class LexerException : public ConfigException { public: LexerException(const wstring& msg, TextLocation where) : ConfigException(msg, where) { } /*ConfigException::*/ const wchar_t* kind() const { return L"tokenizing"; } }; private: __declspec_noreturn static void Fail(wstring msg, Token where) { //Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); throw LexerException(msg, where.beginLocation); } // find a file either at given location or traverse include paths // TODO: also allow ... syntax, where ... refers to the directory of the enclosing file static wstring FindSourceFile(const wstring& path, const vector& includePaths) { if (File::Exists(path)) return path; // non-existent path: scan include paths // TODO: This is a little weird. Rather, this should be done by the call site. let fileName = File::FileNameOf(path); for (let& dir : includePaths) { // TODO: We should use the separator that matches the include path. let newPath = dir + L"/" + fileName; if (File::Exists(newPath)) return newPath; } // not in include path: try EXE directory let dir = File::DirectoryPathOf(File::GetExecutablePath()); let newPath = dir + L"/" + fileName; if (File::Exists(newPath)) return newPath; // not found: return unmodified, let caller fail return path; } Token currentToken; // consume input characters to form a next token // - this function mutates the cursor, but does not set currentToken // - white space and comments are skipped // - including files is handled here // - the cursor is left on the first character that does not belong to the token // TODO: need to know whether we want to see '\n' or not Token NextToken() { auto ch = GotChar(); // skip white space // We remember whether we crossed a line end. Dictionary assignments end at newlines if syntactically acceptable. bool crossedLineEnd = (GetCursor().lineNo == 0 && GetCursor().charPos == 0); while (iswblank(ch) || ch == '\n' || ch == '\r') { crossedLineEnd |= (ch == '\n' || ch == '\r'); ch = GetChar(); } Token t(GetCursor()); t.isLineInitial = crossedLineEnd; // handle end of (include) file if (ch == 0) { if (IsInInclude()) { includePaths.erase(includePaths.begin()); // pop dir of current include file PopSourceFile(); t = NextToken(); // tail call--the current 't' gets dropped/ignored t.isLineInitial = true; // eof is a line end return t; } // really end of all source code: we are done. If calling this function multiple times, we will keep returning this. t.kind = eof; } else if (iswdigit(ch) || (ch == L'.' && iswdigit(GotCharPtr()[1]))) // --- number { let beginPtr = GotCharPtr(); wchar_t* endPtr = nullptr; t.number = wcstod(beginPtr, &endPtr); // BUGBUG: this seems to honor locale settings. We need one that doesn't. With this, CNTK won't parse right in Germany. if (endPtr == beginPtr) Fail(L"parsing number", t); // should not really happen! t.kind = numberliteral; if (endPtr[0] == L'.' && endPtr[-1] == L'.') // prevent 1..2 from begin tokenized 1. .2 endPtr--; ConsumeChars(endPtr - beginPtr); } else if (iswalpha(ch) || ch == L'_') // --- identifier or keyword { while (iswalpha(ch) || ch == L'_' || iswdigit(ch)) // inside we also allow digits { t.symbol.push_back(ch); ch = GetChar(); } // check against keyword list if (keywords.find(t.symbol) != keywords.end()) t.kind = keyword; else t.kind = identifier; // special case: include "path" if (t.symbol == L"include") { let nameTok = NextToken(); // must be followed by a string literal if (nameTok.kind != stringliteral) Fail(L"'include' must be followed by a quoted string", nameTok); let path = FindSourceFile(nameTok.symbol, includePaths); PushSourceFile(SourceFile(path)); // current cursor is right after the pathname; that's where we will pick up later includePaths.insert(includePaths.begin(), File::DirectoryPathOf(path)); return NextToken(); } } else if (ch == L'"' || ch == 0x27) // --- string literal { t.kind = stringliteral; let q = ch; // remember quote character ch = GetChar(); // consume the quote character while (ch != 0 && ch != q) // note: our strings do not have any escape characters to consider { t.symbol.append(1, ch); ch = GetChar(); } if (ch == 0) // runaway string Fail(L"string without closing quotation mark", t); GetChar(); // consume the closing quote } else // --- punctuation { t.kind = punctuation; t.symbol = ch; t.symbol.append(1, GetChar()); // first try two-char punctuation if (punctuations.find(t.symbol) != punctuations.end()) GetChar(); // it is a two-char one: need to consume the second one of them else // try single-char one { t.symbol.pop_back(); // drop the last one & try again if (punctuations.find(t.symbol) == punctuations.end()) // unknown Fail(L"unexpected character: " + t.symbol, t); } // special case: comments if (t.symbol == L"#" || t.symbol == L"//") { ConsumeChars(wcslen(GotCharPtr())); return NextToken(); } else if (t.symbol == L"/*") { ch = GotChar(); while (ch != 0 && !(ch == L'*' && GetChar() == L'/')) // note: this test leverages short-circuit evaluation semantics of C ch = GetChar(); if (ch == 0) Fail(L"comment without closing */", t); GetChar(); // consume the final '/' return NextToken(); // and return the next token } } return t; } public: const Token& GotToken() { return currentToken; } void ConsumeToken() { currentToken = NextToken(); } const Token& GetToken() { ConsumeToken(); return GotToken(); } // some simple test function void Test() { let lexerTest = L"new CNTK [ do = (train:eval) # main\ntrain=/*test * */if eval include 'c:/me/test.txt' then 13 else array[1..10](i=>i*i); eval=\"a\"+'b' // line-end\n ] 'a\nb\nc' new"; PushSourceFile(SourceFile(L"(command line)", lexerTest)); while (GotToken().kind != Lexer::TokenKind::eof) { let& token = GotToken(); // get first token fprintf(stderr, "%ls\n", token.ToString().c_str()); ConsumeToken(); } Fail(L"error test", GetCursor()); } }; // --------------------------------------------------------------------------- // parser -- parses configurations // --------------------------------------------------------------------------- // diagnostics helper: print the content void Expression::DumpToStream(wstringstream & treeStream, int indent) { treeStream << std::setfill(L' ') << std::setw(indent) << L" "; treeStream << std::setw(0); if (op == L"s") treeStream << "'" << s.c_str() << "'"; else if (op == L"d") treeStream << std::fixed << std::setprecision(0) << d; else if (op == L"b") treeStream << b ? "true" : "false"; else if (op == L"id") treeStream << id.c_str(); else if (op == L"new" || op == L"array" || op == L".") treeStream << op.c_str() << " " << id.c_str(); else treeStream << op.c_str(); if (!args.empty()) { treeStream << std::endl; for (const auto& arg : args) { arg->DumpToStream(treeStream, indent + 1); } } if (!namedArgs.empty()) { treeStream << std::endl; for (const auto& arg : namedArgs) { treeStream << std::setfill(L' ') << std::setw(indent + 1) << L""; treeStream << arg.first.c_str() << L" =" << std::endl; arg.second.second->DumpToStream(treeStream, indent + 2); } } treeStream << std::endl; } class Parser : public Lexer { // errors class ParseException : public ConfigException { public: ParseException(const wstring& msg, TextLocation where) : ConfigException(msg, where) { } /*ConfigException::*/ const wchar_t* kind() const { return L"parsing"; } }; __declspec_noreturn static void Fail(const wstring& msg, Token where) { //Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); throw ParseException(msg, where.beginLocation); } //void Expected(const wstring & what) { Fail(strprintf("%ls expected", what.c_str()), GotToken().beginLocation); } // I don't know why this does not work void Expected(const wstring& what) { Fail(what + L" expected", GotToken().beginLocation); } // this token must be punctuation 's'; check and get the next void ConsumePunctuation(const wchar_t* s) { let& tok = GotToken(); if (tok.kind != punctuation || tok.symbol != s) Expected(L"'" + wstring(s) + L"'"); ConsumeToken(); } // this token must be keyword 's'; check and get the next void ConsumeKeyword(const wchar_t* s) { let& tok = GotToken(); if (tok.kind != keyword || tok.symbol != s) Expected(L"'" + wstring(s) + L"'"); ConsumeToken(); } // this token must be an identifier; check and get the next token. Return the identifier. wstring ConsumeIdentifier() { let& tok = GotToken(); if (tok.kind != identifier) Expected(L"identifier"); let id = tok.symbol; ConsumeToken(); return id; } map infixPrecedence; // precedence level of infix operators static const int unaryPrecedence = 90; // for unary "-" and "!". 90 is below x., x[, x(, and x{, but above all others // TODO: Would be more direct to fold this into the table below as well. public: Parser(SourceFile&& sourceFile, vector&& includePaths) : Lexer(move(includePaths)) { infixPrecedence = map{ {L".", 99}, {L"[", 99}, {L"(", 99}, {L"{", 99}, // (with LHS) these are also sort-of infix operands... {L"*", 10}, {L"/", 10}, {L".*", 10}, {L"**", 10}, {L"%", 10}, {L"+", 9}, {L"-", 9}, {L"with", 9}, {L"==", 8}, {L"!=", 8}, {L"<", 8}, {L"<=", 8}, {L">", 8}, {L">=", 8}, {L"&&", 7}, {L"||", 6}, {L">>", 5}, {L"<<", 5}, // TODO: do it as other languages {L":", 4}, {L"=>", 0}, }; SetSourceFile(move(sourceFile)); ConsumeToken(); // get the very first token } ExpressionPtr OperandFromTokenSymbol(const Token& tok) // helper to make an Operand expression with op==tok.symbol and then consume it { auto operand = make_shared(tok.beginLocation, tok.symbol); ConsumeToken(); return operand; } ExpressionPtr ParseOperand(bool stopAtNewline) { let& tok = GotToken(); ExpressionPtr operand; if (tok.kind == numberliteral) // === numeral literal { operand = make_shared(tok.beginLocation, L"d", tok.number, wstring(), false); ConsumeToken(); } else if (tok.kind == stringliteral) // === string literal { operand = make_shared(tok.beginLocation, L"s", 0.0, tok.symbol, false); ConsumeToken(); } else if (tok.symbol == L"true" || tok.symbol == L"false") // === boolean literal { operand = make_shared(tok.beginLocation, L"b", 0.0, wstring(), (tok.symbol == L"true")); ConsumeToken(); } else if (tok.kind == identifier) // === dict member (unqualified) { operand = make_shared(tok.beginLocation, L"id"); operand->id = ConsumeIdentifier(); } else if (tok.symbol == L"+" || tok.symbol == L"-" // === unary operators || tok.symbol == L"!") { operand = make_shared(tok.beginLocation, tok.symbol + L"("); // encoded as +( -( !( ConsumeToken(); operand->args.push_back(ParseExpression(unaryPrecedence, stopAtNewline)); } else if (tok.symbol == L"new") // === new class instance { operand = OperandFromTokenSymbol(tok); operand->id = ConsumeIdentifier(); operand->args.push_back(ParseOperand(stopAtNewline)); } else if (tok.symbol == L"if") // === conditional expression { operand = OperandFromTokenSymbol(tok); operand->args.push_back(ParseExpression(0, false)); // [0] condition ConsumeKeyword(L"then"); operand->args.push_back(ParseExpression(0, false)); // [1] then expression ConsumeKeyword(L"else"); operand->args.push_back(ParseExpression(0, false)); // [2] else expression } else if (tok.symbol == L"(") // === nested parentheses { ConsumeToken(); operand = ParseExpression(0, false /*go across newlines*/); // just return the content of the parens (they do not become part of the expression tree) ConsumePunctuation(L")"); } else if (tok.symbol == L"{" || tok.symbol == L"["/*soon to be deprecated*/) // === record constructor { let* closeSymbol = tok.symbol == L"{" ? L"}" : L"]"; operand = make_shared(tok.beginLocation, L"[]"); ConsumeToken(); operand->namedArgs = ParseRecordMembers(); ConsumePunctuation(closeSymbol); } #if 1 // the F# syntax is a stop-gap and meant for experimentation, and we will not recommend to use it // Rather, we must find a way to parse both Python-like array literals and BS dictionaries jointly, // and eventually deprecate [] for records. else if (tok.symbol == L"[|") // === array literal using F# syntax [| a; b; c |] (same as a:b:c, but also allows for 0- and 1-element arrays) { operand = make_shared(tok.beginLocation, L":"); ConsumeToken(); if (GotToken().symbol != L"|]") // {} defines an empty array { for (;;) { operand->args.push_back(ParseExpression(0, false)); // item. Precedence 0 means go until comma or closing parenthesis. if (GotToken().symbol != L";") break; ConsumeToken(); } } ConsumePunctuation(L"|]"); } #endif else if (tok.symbol == L"array") // === array constructor { operand = OperandFromTokenSymbol(tok); ConsumePunctuation(L"["); operand->args.push_back(ParseExpression(0, false)); // [0] first index ConsumePunctuation(L".."); operand->args.push_back(ParseExpression(0, false)); // [1] last index ConsumePunctuation(L"]"); ConsumePunctuation(L"("); operand->args.push_back(ParseExpression(0, false)); // [2] one-argument lambda to initialize ConsumePunctuation(L")"); } else Expected(L"operand"); return operand; // not using returns above to avoid "not all control paths return a value" } ExpressionPtr ParseExpression(int requiredPrecedence, bool stopAtNewline) { auto left = ParseOperand(stopAtNewline); // get first operand for (;;) { let& opTok = GotToken(); // BUGBUG: 'stopAtNewline' is broken. // It does not prevent "a = 13 b = 42" from being accepted. // On the other hand, it would prevent the totally valid "dict \n with dict2". // A correct solution should require "a = 13 ; b = 42", i.e. a semicolon or newline, // while continuing to parse across newlines when syntactically meaningful (there is no ambiguity in BrainScript). //if (stopAtNewline && opTok.isLineInitial) // break; let opIter = infixPrecedence.find(opTok.symbol); if (opIter == infixPrecedence.end()) // not an infix operator: we are done here, 'left' is our expression break; let opPrecedence = opIter->second; if (opPrecedence < requiredPrecedence) // operator below required precedence level: does not belong to this sub-expression break; let op = opTok.symbol; auto operation = make_shared(opTok.beginLocation, op, left); // [0] is left operand; we will add [1] except for macro application // deal with special cases first // We treat member lookup (.), macro application (a()), and indexing (a[i]) together with the true infix operators. if (op == L".") // === reference of a dictionary item { ConsumeToken(); operation->location = GotToken().beginLocation; // location of the identifier after the . operation->id = ConsumeIdentifier(); } else if (op == L"=>") { if (left->op != L"id") // currently only allow for a single argument Expected(L"identifier"); ConsumeToken(); let macroArgs = make_shared(left->location, L"()", left); // wrap identifier in a "()" macro-args expression // TODO: test parsing of i => j => i*j let body = ParseExpression(opPrecedence, stopAtNewline); // pass same precedence; this makes '=>' right-associative e.g.i=>j=>i*j operation->args[0] = macroArgs; // [0]: parameter list operation->args.push_back(body); // [1]: right operand } else if (op == L"(" || op == L"{") // === macro application { // op = "(" and "{" mean 'apply', where {} refers to experimental constructor syntax // args[0] = lambda expression (lambda: op="=>", args[0] = param list, args[1] = expression with unbound vars) // args[1] = arguments (arguments: op="()", args=vector of expressions, one per arg; and namedArgs) operation->args.push_back(ParseMacroArgs(false, op)); // [1]: all arguments } else if (op == L"[") // === array index { ConsumeToken(); operation->args.push_back(ParseExpression(0, false)); // [1]: index ConsumePunctuation(L"]"); } else if (op == L":") { // special case: (a : b : c) gets flattened into :(a,b,c) i.e. an operation with possibly >2 operands ConsumeToken(); let right = ParseExpression(opPrecedence + 1, stopAtNewline); // get right operand, or entire multi-operand expression with higher precedence if (left->op == L":") // appending to a list: flatten it { operation->args = left->args; operation->location = left->location; // location of first ':' (we need to choose some location) } operation->args.push_back(right); // form a list of multiple operands (not just two) } else // === regular infix operator { ConsumeToken(); let right = ParseExpression(opPrecedence + 1, stopAtNewline); // get right operand, or entire multi-operand expression with higher precedence operation->args.push_back(right); // [1]: right operand } left = operation; } return left; } // a macro-args expression lists position-dependent and optional parameters // This is used both for defining macros (LHS) and using macros (RHS). // Result: // op = "()" // args = vector of arguments (which are given comma-separated) // In case of macro definition, all arguments must be of type "id". Pass 'defining' to check for that. // namedArgs = dictionary of optional args // In case of macro definition, dictionary values are default values that are used if the argument is not given ExpressionPtr ParseMacroArgs(bool defining, wstring openSymbol) { ConsumePunctuation(openSymbol.c_str()); auto macroArgs = make_shared(GotToken().beginLocation, L"()"); let* closeSymbol = openSymbol == L"(" ? L")" : L"}"; if (GotToken().symbol != closeSymbol) // x() defines an empty argument list { for (;;) { let expr = ParseExpression(0, false); // this could be an optional arg (var = val) if (defining && expr->op != L"id") // when defining we only allow a single identifier Fail(L"argument identifier expected", expr->location); if (expr->op == L"id" && GotToken().symbol == L"=") { let id = expr->id; // 'expr' gets resolved (to 'id') and forgotten ConsumeToken(); let defValueExpr = ParseExpression(0, false); // default value let res = macroArgs->namedArgs.insert(make_pair(id, make_pair(expr->location, defValueExpr))); if (!res.second) Fail(L"duplicate optional parameter '" + id + L"'", expr->location); } else macroArgs->args.push_back(expr); // [0..]: position args if (GotToken().symbol != L",") break; ConsumeToken(); } } ConsumePunctuation(closeSymbol); return macroArgs; } map> ParseRecordMembers() { // A dictionary is a map // member identifier -> expression // Macro declarations are translated into lambdas, e.g. // F(A,B) = expr(A,B) // (and likewise F{A,B}) gets represented in the dictionary as // F = (A,B) => expr(A,B) // where a lambda expression has this structure: // op="=>" // args[0] = parameter list (op="()", with args (all of op="id") and namedArgs) // args[1] = expression with unbound arguments // An array constructor of the form // V[i:from..to] = expression of i // gets mapped to the explicit array operator // V = array[from..to] (i => expression of i) map> members; auto idTok = GotToken(); while (idTok.kind == identifier) { let location = idTok.beginLocation; // for error message let id = ConsumeIdentifier(); // the member's name // optional array constructor ExpressionPtr arrayIndexExpr, fromExpr, toExpr; if (GotToken().symbol == L"[") { // X[i:from..to] ConsumeToken(); arrayIndexExpr = ParseOperand(false); // 'i' name of index variable if (arrayIndexExpr->op != L"id") Expected(L"identifier"); ConsumePunctuation(L":"); fromExpr = ParseExpression(0, false); // 'from' start index ConsumePunctuation(L".."); toExpr = ParseExpression(0, false); // 'to' end index ConsumePunctuation(L"]"); } // optional macro args let& openParen = GotToken().symbol; let parameters = (openParen == L"(" || openParen == L"{") ? ParseMacroArgs(true /*defining*/, openParen) : ExpressionPtr(); // optionally, macro arguments ConsumePunctuation(L"="); auto rhs = ParseExpression(0, true /*can end at newline*/); // and the right-hand side // if macro then rewrite it as an assignment of a lambda expression if (parameters) rhs = make_shared(parameters->location, L"=>", parameters, rhs); // if array then rewrite it as an assignment of a array-constructor expression if (arrayIndexExpr) { // create a lambda expression over the index variable // BUGBUG: For {} constructor functions--we cannot declare constructor lambdas for now. let macroArgs = make_shared(arrayIndexExpr->location, L"()", arrayIndexExpr); // wrap identifier in a "()" macro-args expression let initLambdaExpr = make_shared(arrayIndexExpr->location, L"=>", macroArgs, rhs); // [0] is id, [1] is body rhs = make_shared(location, L"array"); rhs->args.push_back(fromExpr); // [0] first index rhs->args.push_back(toExpr); // [1] last index rhs->args.push_back(initLambdaExpr); // [2] one-argument lambda to initialize } // insert let res = members.insert(make_pair(id, make_pair(location, rhs))); if (!res.second) Fail(L"duplicate member definition '" + id + L"'", location); // advance idTok = GotToken(); if (idTok.symbol == L";") idTok = GetToken(); } return members; } void VerifyAtEnd() { if (GotToken().kind != eof) Fail(L"junk at end of source", GetCursor()); } // top-level parse function parses dictionary members without enclosing [ ... ] and returns it as a dictionary ExpressionPtr ParseRecordMembersToDict() { let topMembers = ParseRecordMembers(); VerifyAtEnd(); ExpressionPtr topDict = make_shared(GetCursor(), L"[]"); topDict->namedArgs = topMembers; return topDict; } }; // globally exported functions to execute the parser static ExpressionPtr Parse(SourceFile&& sourceFile, vector&& includePaths) { return Parser(move(sourceFile), move(includePaths)).ParseRecordMembersToDict(); } ExpressionPtr ParseConfigDictFromString(wstring text, wstring location, vector&& includePaths) { return Parse(SourceFile(location, text), move(includePaths)); } //ExpressionPtr ParseConfigDictFromFile(wstring path, vector includePaths) //{ // auto sourceFile = SourceFile(path); // note: no resolution against include paths done here // includePaths.insert(includePaths.begin(), File::DirectoryPathOf(path)); // must include our own path for nested include statements // return Parse(move(sourceFile), move(includePaths)); //} ExpressionPtr ParseConfigExpression(const wstring& sourceText, vector&& includePaths) { auto parser = Parser(SourceFile(L"(command line)", sourceText), move(includePaths)); auto expr = parser.ParseExpression(0, true /*can end at newline*/); parser.VerifyAtEnd(); return expr; } }}}