Content - 5954dc668bbb8f658b69833e42b5fd8eeeb8447e - 6b055d3/source/compiler-core/slang-lexer.cpp

visit type:
Tip revision: c5c8cfbb360d9a763f549df48636effde839eacd authored by Sai Praveen Bangaru on 27 September 2023, 00:50:13 UTC
Handle the case where the parent if-else region's after-block is unreachable. (#3241)
Tip revision: c5c8cfb
slang-lexer.cpp
// slang-lexer.cpp
#include "slang-lexer.h"

// This file implements the lexer/scanner, which is responsible for taking a raw stream of
// input bytes and turning it into semantically useful tokens.
//

#include "slang-name.h"
#include "slang-source-loc.h"

#include "slang-core-diagnostics.h"

namespace Slang
{
    // Build the sentinel token that stands for "no more input": an
    // `EndOfFile` token with empty text and a default-constructed location.
    Token TokenReader::getEndOfFileToken()
    {
        return Token(
            TokenType::EndOfFile,
            UnownedStringSlice::fromLiteral(""),
            SourceLoc());
    }

    // Pointer to the first token in the list; the list must not be empty.
    const Token* TokenList::begin() const
    {
        SLANG_ASSERT(m_tokens.getCount());

        const Token& firstToken = m_tokens[0];
        return &firstToken;
    }

    // Pointer to the last token in the list, which is asserted to be the
    // terminating end-of-file token. The list must not be empty.
    const Token* TokenList::end() const
    {
        SLANG_ASSERT(m_tokens.getCount());

        const Token& lastToken = m_tokens[m_tokens.getCount() - 1];
        SLANG_ASSERT(lastToken.type == TokenType::EndOfFile);
        return &lastToken;
    }

    // Construct an empty span: both end points start out null.
    TokenSpan::TokenSpan() : m_begin(nullptr), m_end(nullptr)
    {
    }

    // Construct a reader over no tokens at all. The lookahead token is still
    // initialized, so peeking at a default-constructed reader is well defined.
    TokenReader::TokenReader() : m_cursor(nullptr), m_end(nullptr)
    {
        _updateLookaheadToken();
    }

    // Access the cached lookahead token without consuming it.
    Token& TokenReader::peekToken()
    {
        Token& lookahead = m_nextToken;
        return lookahead;
    }

    // Type of the cached lookahead token.
    TokenType TokenReader::peekTokenType() const
    {
        const Token& lookahead = m_nextToken;
        return lookahead.type;
    }

    // Source location of the cached lookahead token.
    SourceLoc TokenReader::peekLoc() const
    {
        const Token& lookahead = m_nextToken;
        return lookahead.loc;
    }

    // Consume the current lookahead token and return it. The cursor never
    // moves past `m_end`; once there, the reader keeps yielding the
    // end-of-sequence lookahead.
    Token TokenReader::advanceToken()
    {
        // Capture the token being consumed before the lookahead is refreshed.
        Token consumed = m_nextToken;

        if (m_cursor != m_end)
        {
            ++m_cursor;
        }

        _updateLookaheadToken();
        return consumed;
    }

    // Refresh `m_nextToken` from the current cursor position.
    void TokenReader::_updateLookaheadToken()
    {
        // A non-null `m_cursor` is assumed to be readable *even* when
        // `m_cursor == m_end`: token lists are expected to be terminated with
        // an end-of-file token, so there is always a token "one past the end."
        // A null cursor (a reader over nothing) gets a synthesized EOF token.
        //
        if (m_cursor)
        {
            m_nextToken = *m_cursor;
        }
        else
        {
            m_nextToken = getEndOfFileToken();
        }

        // When the token came from the end of the sub-sequence being read,
        // force its type to end-of-file so that code expecting a terminating
        // EOF will find one.
        //
        // TODO: We might eventually want a way to look at the actual token type
        // and not just use EOF in all cases: e.g., when emitting diagnostic
        // messages that include the token that is seen.
        //
        if (m_cursor == m_end)
        {
            m_nextToken.type = TokenType::EndOfFile;
        }
    }

    // Lexer

    // Set the lexer up to scan the content of `sourceView` from its first
    // byte, remembering the sink, name pool and arena passed in.
    void Lexer::initialize(
        SourceView*     sourceView,
        DiagnosticSink* sink,
        NamePool*       namePool,
        MemoryArena*    memoryArena)
    {
        // Remember the objects supplied by the caller.
        m_memoryArena = memoryArena;
        m_namePool    = namePool;
        m_sink        = sink;
        m_sourceView  = sourceView;

        // Scan the view's content from its first byte.
        auto text = sourceView->getContent();
        m_begin  = text.begin();
        m_end    = text.end();
        m_cursor = m_begin;

        // Source locations are computed relative to the start of the view.
        m_startLoc = sourceView->getRange().begin;

        // The first token of a translation unit counts as being at the start
        // of a line *and* as following whitespace (conceptually both the
        // beginning-of-file and end-of-file pseudo-tokens are whitespace).
        //
        m_lexerFlags = 0;
        m_tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
    }

    // Nothing to release explicitly; members clean up after themselves.
    Lexer::~Lexer() = default;

    // Sentinel returned by the byte/code-point reading helpers in this file
    // when the cursor has reached the end of the input.
    enum { kEOF = -1 };

    // Look at the next input byte without consuming it. No processing of
    // escaped newlines, non-ASCII code points, source locations, etc. is done.
    // Yields `kEOF` when the cursor is at the end of the input.
    static int _peekRaw(Lexer* lexer)
    {
        if (lexer->m_cursor != lexer->m_end)
        {
            // There is at least one byte left; report it as-is.
            return *lexer->m_cursor;
        }

        return kEOF;
    }

    // Consume one input byte without any special handling. This mirrors
    // `_peekRaw`, except that the cursor moves forward when it is not
    // already at the end of the input. Yields `kEOF` at the end.
    static int _advanceRaw(Lexer* lexer)
    {
        if (lexer->m_cursor != lexer->m_end)
        {
            const int byte = *lexer->m_cursor;
            ++lexer->m_cursor;
            return byte;
        }

        return kEOF;
    }

    // Finish consuming an end-of-line sequence whose first byte `c` has
    // already been read by the caller.
    //
    // A newline is one of:
    //
    //  "\n"
    //  "\r"
    //  "\r\n"
    //  "\n\r"
    //
    // and the longest possible match is always taken, so the byte after `c`
    // is consumed as well when it is the *other* newline character.
    //
    static void _handleNewLineInner(Lexer* lexer, int c)
    {
        SLANG_ASSERT(c == '\n' || c == '\r');

        // XOR-ing one newline character with this mask yields the other one.
        const int kOtherNewLineMask = '\n' ^ '\r';

        const int following = _peekRaw(lexer);
        if (following == (c ^ kOtherNewLineMask))
        {
            _advanceRaw(lexer);
        }
    }

    // Look ahead one code point without consuming anything, dealing with
    // complications like escaped (backslash-prefixed) newlines.
    //
    // The value returned is the one `_advance` would return if it were called
    // next:
    //
    //  * any number of consecutive backslash-newline sequences is skipped
    //    (with the same "\n", "\r", "\r\n", "\n\r" newline forms that
    //    `_handleNewLineInner` recognizes), and
    //  * `kEOF` is returned when the input ends, including when it ends
    //    right after an escaped newline.
    //
    // The lexer state is not modified; a local copy of the cursor is walked.
    //
    static int _peek(Lexer* lexer)
    {
        auto cursor = lexer->m_cursor;
        const auto end = lexer->m_end;

        for (;;)
        {
            if (cursor == end)
                return kEOF;

            const int c = *cursor++;
            if (c != '\\')
            {
                // TODO: handle UTF-8 encoding for non-ASCII code points here

                // Default case is to just hand along the byte we read as an ASCII code point.
                return c;
            }

            // A backslash as the very last byte cannot start an escaped newline.
            if (cursor == end)
                return c;

            const int d = *cursor;
            if (d != '\r' && d != '\n')
                return c;

            // The newline was escaped: step over it (including the second
            // byte of a two-byte newline, if present) and look at whatever
            // comes after it.
            ++cursor;
            if (cursor != end)
            {
                const int e = *cursor;
                if ((d ^ e) == ('\r' ^ '\n'))
                    ++cursor;
            }
        }
    }

    // Consume and return the next code point of the input, transparently
    // skipping any backslash-escaped newlines in front of it. Yields `kEOF`
    // once the cursor reaches the end of the input.
    static int _advance(Lexer* lexer)
    {
        // Each trip around the loop either returns a code point or skips one
        // escaped line ending and tries again.
        while (lexer->m_cursor != lexer->m_end)
        {
            const int c = *lexer->m_cursor++;

            // TODO: Need to handle non-ASCII code points.

            // Anything other than a backslash is returned as the raw byte.
            if (c != '\\')
                return c;

            // A backslash might introduce an escaped newline; inspect the
            // byte after it.
            //
            // Note(tfoley): We are assuming a null-terminated input here,
            // so that we can safely look at the next byte without issue.
            const int d = *lexer->m_cursor;
            if (d != '\r' && d != '\n')
                return c;

            // Consume the end-of-line sequence for source location tracking.
            lexer->m_cursor++;
            _handleNewLineInner(lexer, d);

            // The token text now contains bytes that are not part of its value.
            lexer->m_tokenFlags |= TokenFlag::ScrubbingNeeded;
        }

        return kEOF;
    }

    // Consume a complete end-of-line sequence starting at the cursor: the
    // first byte is read here, and an optional second byte by the helper.
    static void _handleNewLine(Lexer* lexer)
    {
        const int firstByte = _advance(lexer);
        _handleNewLineInner(lexer, firstByte);
    }

    // Consume the rest of a line comment. The terminating newline (or the
    // end of the input) is left unconsumed.
    static void _lexLineComment(Lexer* lexer)
    {
        for (int c = _peek(lexer);
             c != '\n' && c != '\r' && c != kEOF;
             c = _peek(lexer))
        {
            _advance(lexer);
        }
    }

    // Consume the body of a block comment up to and including the closing
    // `*/`. Stops silently if the input ends first.
    static void _lexBlockComment(Lexer* lexer)
    {
        for(;;)
        {
            const int c = _peek(lexer);

            if(c == kEOF)
            {
                // TODO(tfoley) diagnostic!
                return;
            }

            if(c == '\n' || c == '\r')
            {
                _handleNewLine(lexer);
                continue;
            }

            // Any other character is consumed; a `*` followed directly by a
            // `/` closes the comment.
            _advance(lexer);
            if(c == '*' && _peek(lexer) == '/')
            {
                _advance(lexer);
                return;
            }
        }
    }

    // Consume a run of spaces and tabs, stopping at the first other character.
    static void _lexHorizontalSpace(Lexer* lexer)
    {
        for(;;)
        {
            const int c = _peek(lexer);
            if(c != ' ' && c != '\t')
                return;

            _advance(lexer);
        }
    }

    // Consume the characters of an identifier: ASCII letters, decimal digits
    // and underscores. Stops at the first character outside that set.
    static void _lexIdentifier(Lexer* lexer)
    {
        for(;;)
        {
            const int c = _peek(lexer);

            const bool isLowerCase = ('a' <= c) && (c <= 'z');
            const bool isUpperCase = ('A' <= c) && (c <= 'Z');
            const bool isDigit     = ('0' <= c) && (c <= '9');

            if(!(isLowerCase || isUpperCase || isDigit || (c == '_')))
                return;

            _advance(lexer);
        }
    }

    // Source location of the cursor: the start location plus the cursor's
    // distance from the beginning of the input.
    static SourceLoc _getSourceLoc(Lexer* lexer)
    {
        const auto offsetFromStart = lexer->m_cursor - lexer->m_begin;
        return lexer->m_startLoc + offsetFromStart;
    }

    // Consume a run of digits for a literal in the given `base`.
    //
    // Decimal digits are always consumed; one that is too large for `base`
    // is diagnosed but still consumed. The letters a-f / A-F are consumed
    // only when `base` is greater than 10, and otherwise end the run.
    static void _lexDigits(Lexer* lexer, int base)
    {
        for(;;)
        {
            const int c = _peek(lexer);

            int digitVal = 0;
            if(('0' <= c) && (c <= '9'))
            {
                digitVal = c - '0';
            }
            else if(('a' <= c) && (c <= 'f'))
            {
                if(base <= 10) return;
                digitVal = 10 + c - 'a';
            }
            else if(('A' <= c) && (c <= 'F'))
            {
                if(base <= 10) return;
                digitVal = 10 + c - 'A';
            }
            else
            {
                // No more digits!
                return;
            }

            // Report (but keep consuming) digits that do not fit the base.
            if(digitVal >= base)
            {
                if (auto sink = lexer->getDiagnosticSink())
                {
                    char buffer[] = { (char) c, 0 };
                    sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::invalidDigitForBase, buffer, base);
                }
            }

            _advance(lexer);
        }
    }

    // Consume an optional suffix following a numeric literal and return
    // `tokenType` unchanged.
    //
    // The lexer is deliberately liberal here: any run of alphanumeric
    // characters and underscores is accepted, leaving it to the parser and
    // semantic checking to decide what a suffix means.
    //
    static TokenType _maybeLexNumberSuffix(Lexer* lexer, TokenType tokenType)
    {
        for(;;)
        {
            const int c = _peek(lexer);

            const bool isLetter = (('a' <= c) && (c <= 'z')) || (('A' <= c) && (c <= 'Z'));
            const bool isDigit  = ('0' <= c) && (c <= '9');

            // The suffix ends at the first character that is neither
            // alphanumeric nor an underscore.
            if(!(isLetter || isDigit || (c == '_')))
                return tokenType;

            _advance(lexer);
        }
    }

    // Does `c` introduce an exponent for a literal written in `base`?
    // `e`/`E` do so for base 10 and `p`/`P` for base 16; nothing else does.
    static bool _isNumberExponent(int c, int base)
    {
        if(c == 'e' || c == 'E')
            return base == 10;

        if(c == 'p' || c == 'P')
            return base == 16;

        return false;
    }

    // Try to consume the exponent part of a numeric literal in `base`.
    //
    // Returns true when an exponent (or the special `#INF` marker) was
    // consumed. Returns false otherwise; note that a partial `#INF` match
    // leaves the matched prefix consumed.
    static bool _maybeLexNumberExponent(Lexer* lexer, int base)
    {
        const int lead = _peek(lexer);

        if (lead == '#')
        {
            // Special case #INF
            const auto inf = toSlice("#INF");
            for (auto c : inf)
            {
                if (_peek(lexer) != c)
                {
                    return false;
                }
                _advance(lexer);
            }

            return true;
        }

        if(!_isNumberExponent(lead, base))
            return false;

        // Consume the exponent marker itself.
        _advance(lexer);

        // An optional sign may precede the exponent digits.
        const int sign = _peek(lexer);
        if(sign == '+' || sign == '-')
        {
            _advance(lexer);
        }

        // TODO(tfoley): it would be an error to not see digits here...

        // The exponent is always written with decimal digits.
        _lexDigits(lexer, 10);

        return true;
    }

    // Lex the remainder of a floating-point literal once its decimal point
    // has been consumed: fraction digits, optional exponent, optional suffix.
    static TokenType _lexNumberAfterDecimalPoint(Lexer* lexer, int base)
    {
        // Fraction digits.
        _lexDigits(lexer, base);

        // Optional exponent; the token is floating-point either way.
        _maybeLexNumberExponent(lexer, base);

        const TokenType result = _maybeLexNumberSuffix(lexer, TokenType::FloatingPointLiteral);
        return result;
    }

    // Lex a numeric literal in `base`, starting at its first digit, and
    // report whether it turned out to be an integer or floating-point token.
    static TokenType _lexNumber(Lexer* lexer, int base)
    {
        // TODO(tfoley): Need to consider whether to allow any kind of digit separator character.

        // Integer part.
        _lexDigits(lexer, base);

        // A decimal point or an exponent makes the literal floating-point.
        bool isFloatingPoint = false;

        if( _peek(lexer) == '.' )
        {
            isFloatingPoint = true;

            _advance(lexer);
            _lexDigits(lexer, base);
        }

        if( _maybeLexNumberExponent(lexer, base))
        {
            isFloatingPoint = true;
        }

        const TokenType tokenType = isFloatingPoint
            ? TokenType::FloatingPointLiteral
            : TokenType::IntegerLiteral;

        _maybeLexNumberSuffix(lexer, tokenType);
        return tokenType;
    }

    // Try to read one digit at `*ioCursor`, skipping any `_` characters in
    // front of it.
    //
    // On success the digit's value is returned and the cursor is left just
    // past the digit. Otherwise -1 is returned and the cursor is left at the
    // offending character (any skipped `_` characters stay consumed).
    // The letters a-f / A-F count as digits only when `base` is above 10.
    static int _maybeReadDigit(char const** ioCursor, int base)
    {
        char const* position = *ioCursor;

        // TODO: need to decide on digit separator characters
        while(*position == '_')
        {
            ++position;
        }

        const int c = *position;

        int digit = -1;
        if(('0' <= c) && (c <= '9'))
        {
            digit = c - '0';
        }
        else if(base > 10)
        {
            if(('a' <= c) && (c <= 'f'))
            {
                digit = 10 + c - 'a';
            }
            else if(('A' <= c) && (c <= 'F'))
            {
                digit = 10 + c - 'A';
            }
        }

        // Only a successfully read digit is consumed.
        if(digit >= 0)
        {
            ++position;
        }

        *ioCursor = position;
        return digit;
    }

    // Determine the base of a numeric literal from its prefix, advancing
    // `*ioCursor` past whatever part of the prefix is consumed.
    //
    //  "0x"/"0X"              -> 16 (both characters consumed)
    //  "0b"/"0B"              -> 2  (both characters consumed)
    //  "0" followed by digit  -> 8  (only the "0" consumed)
    //  "0" followed by other  -> 10 (only the "0" consumed)
    //  anything else          -> 10 (nothing consumed)
    static int _readOptionalBase(char const** ioCursor)
    {
        char const*& position = *ioCursor;

        // Without a leading zero there is no prefix at all.
        if( *position != '0' )
            return 10;

        ++position;
        const char marker = *position;

        if( marker == 'x' || marker == 'X' )
        {
            ++position;
            return 16;
        }

        if( marker == 'b' || marker == 'B' )
        {
            ++position;
            return 2;
        }

        if( ('0' <= marker) && (marker <= '9') )
            return 8;

        return 10;
    }



    // Compute the value of an integer literal token.
    //
    // The base is taken from the literal's prefix and digits are accumulated
    // until the first character that is not a digit. When `outSuffix` is
    // non-null it receives the remaining text of the token.
    IntegerLiteralValue getIntegerLiteralValue(Token const& token, UnownedStringSlice* outSuffix)
    {
        const UnownedStringSlice content = token.getContent();

        char const* cursor = content.begin();
        char const* end = content.end();

        int base = _readOptionalBase(&cursor);

        IntegerLiteralValue value = 0;
        int digit = _maybeReadDigit(&cursor, base);
        while(digit >= 0)
        {
            value = value*base + digit;
            digit = _maybeReadDigit(&cursor, base);
        }

        // Whatever follows the digits is reported as the suffix.
        if(outSuffix)
        {
            *outSuffix = UnownedStringSlice(cursor, end);
        }

        return value;
    }

    // Compute the value of a floating-point literal token.
    //
    // The literal is read as: an optional base prefix, digits with an
    // optional `.`, then either the special `#INF` marker or an optional
    // exponent. When `outSuffix` is non-null it receives the text of the
    // token that follows the part that was read.
    //
    // NOTE(review): reads through `cursor` are never compared against `end`;
    // this presumably relies on the token text being followed by a byte that
    // is not a digit, `_` or `.` -- confirm against how tokens are produced.
    FloatingPointLiteralValue getFloatingPointLiteralValue(Token const& token, UnownedStringSlice* outSuffix)
    {
        FloatingPointLiteralValue value = 0;

        const UnownedStringSlice content = token.getContent();

        char const* cursor = content.begin();
        char const* end = content.end();

        int radix = _readOptionalBase(&cursor);

        // Accumulate all digits (before and after the `.`) into `value` as if
        // they formed one integer, and track in `divisor` the power of the
        // radix needed to scale the result back down for the fraction digits.
        bool seenDot = false;
        FloatingPointLiteralValue divisor = 1;
        for( ;;)
        {
            if(*cursor == '.')
            {
                cursor++;
                seenDot = true;
                continue;
            }

            int digit = _maybeReadDigit(&cursor, radix);
            if(digit < 0)
                break;

            value = value*radix + digit;

            // Each digit after the decimal point adds one factor of the radix.
            if(seenDot)
            {
                divisor *= radix;
            }
        }

        if (*cursor == '#')
        {
            // It must be INF
            const auto inf = toSlice("#INF");

            if (UnownedStringSlice(cursor, end).startsWith(inf))
            {
                // The suffix starts right after the `#INF` marker.
                if(outSuffix)
                {
                    *outSuffix = UnownedStringSlice(cursor + inf.getLength(), end);
                }

                value = INFINITY;

                return value;
            }
        }

        // Now read optional exponent
        if(_isNumberExponent(*cursor, radix))
        {
            // Skip the exponent marker.
            cursor++;

            // Optional sign of the exponent.
            bool exponentIsNegative = false;
            switch(*cursor)
            {
            default:
                break;

            case '-':
                exponentIsNegative = true;
                cursor++;
                break;

            case '+':
                cursor++;
                break;
            }

            // The exponent digits are always read as decimal.
            int exponentRadix = 10;
            int exponent = 0;

            for(;;)
            {
                int digit = _maybeReadDigit(&cursor, exponentRadix);
                if(digit < 0)
                    break;

                exponent = exponent*exponentRadix + digit;
            }

            // The exponent scales by powers of 2 for hexadecimal literals
            // and by powers of 10 otherwise.
            FloatingPointLiteralValue exponentBase = 10;
            if(radix == 16)
            {
                exponentBase = 2;
            }

            FloatingPointLiteralValue exponentValue = pow(exponentBase, exponent);

            // A negative exponent is folded into the divisor; a positive one
            // is applied to the value directly.
            if( exponentIsNegative )
            {
                divisor *= exponentValue;
            }
            else
            {
                value *= exponentValue;
            }
        }

        // Apply the scaling for the fraction digits and any negative exponent.
        value /= divisor;

        if(outSuffix)
        {
            *outSuffix = UnownedStringSlice(cursor, end);
        }

        return value;
    }

    // Consume the body of a string or character literal whose opening quote
    // has already been read, up to and including the closing `quote`.
    //
    // Hitting the end of the input or an unescaped newline first is
    // diagnosed (when a sink is available) and ends the literal without
    // consuming the offending character.
    static void _lexStringLiteralBody(Lexer* lexer, char quote)
    {
        for(;;)
        {
            const int next = _peek(lexer);

            // The matching quote closes the literal and is part of the token.
            if(next == quote)
            {
                _advance(lexer);
                return;
            }

            if(next == kEOF)
            {
                if (auto sink = lexer->getDiagnosticSink())
                {
                    sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::endOfFileInLiteral);
                }
                return;
            }

            if(next == '\n' || next == '\r')
            {
                if (auto sink = lexer->getDiagnosticSink())
                {
                    sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::newlineInLiteral);
                }
                return;
            }

            // Consume the character: either an ordinary one, or the backslash
            // that introduces an escape sequence.
            _advance(lexer);
            if(next != '\\')
                continue;

            // Need to handle various escape sequence cases
            const int escapeKind = _peek(lexer);

            if(escapeKind == 'x')
            {
                // hexadecimal escape: any number of characters
                _advance(lexer);
                for(;;)
                {
                    const int d = _peek(lexer);
                    const bool isHexDigit =
                        (('0' <= d) && (d <= '9'))
                        || (('a' <= d) && (d <= 'f'))
                        || (('A' <= d) && (d <= 'F'));
                    if(!isHexDigit)
                        break;

                    _advance(lexer);
                }
            }
            else if(('0' <= escapeKind) && (escapeKind <= '7'))
            {
                // octal escape: the leading digit, then up to 3 more
                _advance(lexer);
                for(int remaining = 3; remaining > 0; --remaining)
                {
                    const int d = _peek(lexer);
                    if((d < '0') || (d > '7'))
                        break;

                    _advance(lexer);
                }
            }
            else
            {
                switch(escapeKind)
                {
                // Single-character escapes: consume the character itself.
                case '\'':
                case '\"':
                case '\\':
                case '?':
                case 'a':
                case 'b':
                case 'f':
                case 'n':
                case 'r':
                case 't':
                case 'v':
                    _advance(lexer);
                    break;

                // TODO: Unicode escape sequences

                default:
                    // Anything else is left for the next iteration to handle.
                    break;
                }
            }
        }
    }

    // Consume the body of a raw string literal whose leading `R"` has already
    // been read. The body has the form `delimiter( ... )delimiter"`, and the
    // literal ends at the first `"` that is preceded by `)` and a copy of the
    // delimiter. Reaching the end of the input first is diagnosed.
    static void _lexRawStringLiteralBody(Lexer* lexer)
    {
        // The delimiter runs from here up to the first `(`.
        const char* delimiterBegin = lexer->m_cursor;
        const char* delimiterEnd = nullptr;

        for (;;)
        {
            const int c = _peek(lexer);

            // The first `(` seen marks the end of the delimiter.
            if (!delimiterEnd && c == '(')
                delimiterEnd = lexer->m_cursor;

            if (c == '\"')
            {
                if (delimiterEnd)
                {
                    // Check whether the text right before this quote is
                    // `)` followed by the delimiter.
                    const auto delimiterLength = delimiterEnd - delimiterBegin;
                    auto candidateBegin = lexer->m_cursor - delimiterLength;
                    if (candidateBegin > delimiterEnd)
                    {
                        auto candidate = UnownedStringSlice(candidateBegin, lexer->m_cursor);
                        auto expected = UnownedStringSlice(delimiterBegin, delimiterEnd);
                        if (*(candidateBegin - 1) == ')' && candidate == expected)
                        {
                            _advance(lexer);
                            return;
                        }
                    }
                }
                else if (auto sink = lexer->getDiagnosticSink())
                {
                    // A quote showed up before the delimiter was terminated.
                    sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::quoteCannotBeDelimiter);
                }
            }

            if (c == kEOF)
            {
                if (auto sink = lexer->getDiagnosticSink())
                {
                    sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::endOfFileInLiteral);
                }
                return;
            }

            _advance(lexer);
        }
    }

    // Extract the text between the parentheses of a raw string literal token
    // of the form `R"delimiter( ... )delimiter"`. An empty slice is returned
    // when the token is too short or has no text between the parentheses.
    UnownedStringSlice getRawStringLiteralTokenValue(Token const& token)
    {
        auto content = token.getContent();
        if (content.getLength() <= 5)
            return UnownedStringSlice();

        // Skip the leading `R"`, then scan the delimiter up to the `(`.
        auto delimiterBegin = content.begin() + 2;
        auto openParen = delimiterBegin;
        for (; openParen < content.end() && *openParen != '('; ++openParen)
        {
        }
        auto delimiterLength = openParen - delimiterBegin;

        // The value starts after the `(` and stops before the trailing
        // `)`, delimiter and `"`.
        auto valueBegin = delimiterBegin + delimiterLength + 1;
        auto valueEnd = content.end() - delimiterLength - 2;
        if (valueEnd <= valueBegin)
            return UnownedStringSlice();

        return UnownedStringSlice(valueBegin, valueEnd);
    }

    // Decode the text of a string or character literal token into the value
    // it denotes.
    //
    // Tokens whose text starts with `R` are treated as raw string literals
    // and handled by `getRawStringLiteralTokenValue`. For all others the
    // surrounding quotes are stripped and escape sequences are translated:
    // simple escapes (`\n`, `\t`, ...), octal escapes of up to three digits,
    // and hexadecimal escapes of any length. Octal and hexadecimal escapes
    // are truncated to a single `char`.
    //
    // A backslash followed by a character that is not a recognized escape
    // contributes nothing to the result (both characters are dropped).
    String getStringLiteralTokenValue(Token const& token)
    {
        SLANG_ASSERT(token.type == TokenType::StringLiteral
            || token.type == TokenType::CharLiteral);

        if (token.getContent().startsWith("R"))
            return getRawStringLiteralTokenValue(token);

        const UnownedStringSlice content = token.getContent();

        char const* cursor = content.begin();
        char const* end = content.end();
        SLANG_UNREFERENCED_VARIABLE(end);

        // The first character is the opening quote, which also tells us
        // which character closes the literal.
        auto quote = *cursor++;
        SLANG_ASSERT(quote == '\'' || quote == '"');

        StringBuilder valueBuilder;
        for(;;)
        {
            SLANG_ASSERT(cursor != end);

            auto c = *cursor++;

            // If we see a closing quote, then we are at the end of the string literal
            if(c == quote)
            {
                SLANG_ASSERT(cursor == end);
                return valueBuilder.produceString();
            }

            // Characters that don't begin escape sequences are easy;
            // just append them to the buffer and move on.
            if(c != '\\')
            {
                valueBuilder.append(c);
                continue;
            }

            // Now we look at another character to figure out the kind of
            // escape sequence we are dealing with:

            char d = *cursor++;

            switch(d)
            {
            // Simple characters that just needed to be escaped
            case '\'':
            case '\"':
            case '\\':
            case '?':
                valueBuilder.append(d);
                continue;

            // Traditional escape sequences for special characters
            case 'a': valueBuilder.append('\a'); continue;
            case 'b': valueBuilder.append('\b'); continue;
            case 'f': valueBuilder.append('\f'); continue;
            case 'n': valueBuilder.append('\n'); continue;
            case 'r': valueBuilder.append('\r'); continue;
            case 't': valueBuilder.append('\t'); continue;
            case 'v': valueBuilder.append('\v'); continue;

            // Octal escape: up to 3 characters
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7':
                {
                    // Back up so that the first digit is read by the loop too.
                    cursor--;
                    int value = 0;
                    for(int ii = 0; ii < 3; ++ii)
                    {
                        d = *cursor;
                        if(('0' <= d) && (d <= '7'))
                        {
                            value = value*8 + (d - '0');

                            cursor++;
                            continue;
                        }
                        else
                        {
                            break;
                        }
                    }

                    // TODO: add support for appending an arbitrary code point?
                    valueBuilder.append((char) value);
                }
                continue;

            // Hexadecimal escape: any number of characters
            case 'x':
                {
                    int value = 0;
                    for(;;)
                    {
                        d = *cursor++;
                        int digitValue = 0;
                        if(('0' <= d) && (d <= '9'))
                        {
                            digitValue = d - '0';
                        }
                        else if( ('a' <= d) && (d <= 'f') )
                        {
                            // Letters stand for the values 10 through 15.
                            digitValue = 10 + (d - 'a');
                        }
                        else if( ('A' <= d) && (d <= 'F') )
                        {
                            digitValue = 10 + (d - 'A');
                        }
                        else
                        {
                            // Not a hex digit: un-read it and finish the escape.
                            cursor--;
                            break;
                        }

                        value = value*16 + digitValue;
                    }

                    // TODO: add support for appending an arbitrary code point?
                    valueBuilder.append((char) value);
                }
                continue;

            // TODO: Unicode escape sequences

            }
        }
    }

    // Extract the file name from a token whose text is a quoted or
    // angle-bracketed file name, by dropping its first and last characters.
    String getFileNameTokenValue(Token const& token)
    {
        const UnownedStringSlice content = token.getContent();

        // A file name usually doesn't process escape sequences
        // (this is important on Windows, where `\\` is a valid
        // path separator character).

        // Trim off the first and last characters to remove the quotes
        // (whether they were `""` or `<>`).
        char const* nameBegin = content.begin() + 1;
        char const* nameEnd = content.end() - 1;
        return String(nameBegin, nameEnd);
    }

    static TokenType _lexTokenImpl(Lexer* lexer)
    {
        switch(_peek(lexer))
        {
        default:
            break;

        case kEOF:
            return TokenType::EndOfFile;

        case '\r': case '\n':
            _handleNewLine(lexer);
            return TokenType::NewLine;

        case ' ': case '\t':
            _lexHorizontalSpace(lexer);
            return TokenType::WhiteSpace;

        case '.':
            _advance(lexer);
            switch(_peek(lexer))
            {
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
                return _lexNumberAfterDecimalPoint(lexer, 10);

            case '.':
                // Note: consuming the second `.` here means that
                // we cannot back up and return a `.` token by itself
                // any more. We thus end up having distinct tokens for
                // `.`, `..`, and `...` even though the `..` case is
                // not part of HLSL.
                //
                _advance(lexer);
                switch(_peek(lexer))
                {
                case '.':
                    _advance(lexer);
                    return TokenType::Ellipsis;

                default:
                    return TokenType::DotDot;
                }

            default:
                return TokenType::Dot;
            }

        case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
            return _lexNumber(lexer, 10);

        case '0':
            {
                auto loc = _getSourceLoc(lexer);
                _advance(lexer);
                switch(_peek(lexer))
                {
                default:
                    return _maybeLexNumberSuffix(lexer, TokenType::IntegerLiteral);

                case '.':
                    _advance(lexer);
                    return _lexNumberAfterDecimalPoint(lexer, 10);

                case 'x': case 'X':
                    _advance(lexer);
                    return _lexNumber(lexer, 16);

                case 'b': case 'B':
                    _advance(lexer);
                    return _lexNumber(lexer, 2);

                case '0': case '1': case '2': case '3': case '4':
                case '5': case '6': case '7': case '8': case '9':
                    if (auto sink = lexer->getDiagnosticSink())
                    {
                        sink->diagnose(loc, LexerDiagnostics::octalLiteral);
                    }
                    return _lexNumber(lexer, 8);
                }
            }

        case 'a': case 'b': case 'c': case 'd': case 'e':
        case 'f': case 'g': case 'h': case 'i': case 'j':
        case 'k': case 'l': case 'm': case 'n': case 'o':
        case 'p': case 'q': case 'r': case 's': case 't':
        case 'u': case 'v': case 'w': case 'x': case 'y':
        case 'z':
        case 'A': case 'B': case 'C': case 'D': case 'E':
        case 'F': case 'G': case 'H': case 'I': case 'J':
        case 'K': case 'L': case 'M': case 'N': case 'O':
        case 'P': case 'Q': case 'S': case 'T':
        case 'U': case 'V': case 'W': case 'X': case 'Y':
        case 'Z':
        case '_':
            _lexIdentifier(lexer);
            return TokenType::Identifier;
        case 'R':
            _advance(lexer);
            switch (_peek(lexer))
            {
            default:
                _lexIdentifier(lexer);
                return TokenType::Identifier;
            case '\"':
                _advance(lexer);
                _lexRawStringLiteralBody(lexer);
                return TokenType::StringLiteral;
            }

        case '\"':
            _advance(lexer);
            _lexStringLiteralBody(lexer, '\"');
            return TokenType::StringLiteral;

        case '\'':
            _advance(lexer);
            _lexStringLiteralBody(lexer, '\'');
            return TokenType::CharLiteral;


        case '+':
            _advance(lexer);
            switch(_peek(lexer))
            {
            case '+': _advance(lexer); return TokenType::OpInc;
            case '=': _advance(lexer); return TokenType::OpAddAssign;
            default:
                return TokenType::OpAdd;
            }

        case '-':
            _advance(lexer);
            switch(_peek(lexer))
            {
            case '-': _advance(lexer); return TokenType::OpDec;
            case '=': _advance(lexer); return TokenType::OpSubAssign;
            case '>': _advance(lexer); return TokenType::RightArrow;
            default:
                return TokenType::OpSub;
            }

        case '*':
            _advance(lexer);
            switch(_peek(lexer))
            {
            case '=': _advance(lexer); return TokenType::OpMulAssign;
            default:
                return TokenType::OpMul;
            }

        case '/':
            _advance(lexer);
            switch(_peek(lexer))
            {
            case '=': _advance(lexer); return TokenType::OpDivAssign;
            case '/': _advance(lexer); _lexLineComment(lexer); return TokenType::LineComment;
            case '*': _advance(lexer); _lexBlockComment(lexer); return TokenType::BlockComment;
            default:
                return TokenType::OpDiv;
            }

        case '%':
            _advance(lexer);
            switch(_peek(lexer))
            {
            case '=': _advance(lexer); return TokenType::OpModAssign;
            default:
                return TokenType::OpMod;
            }

        case '|':
            _advance(lexer);
            switch(_peek(lexer))
            {
            case '|': _advance(lexer); return TokenType::OpOr;
            case '=': _advance(lexer); return TokenType::OpOrAssign;
            default:
                return TokenType::OpBitOr;
            }

        case '&':
            _advance(lexer);
            switch(_peek(lexer))
            {
            case '&': _advance(lexer); return TokenType::OpAnd;
            case '=': _advance(lexer); return TokenType::OpAndAssign;
            default:
                return TokenType::OpBitAnd;
            }

        case '^':
            _advance(lexer);
            switch(_peek(lexer))
            {
            case '=': _advance(lexer); return TokenType::OpXorAssign;
            default:
                return TokenType::OpBitXor;
            }

        case '>':
            _advance(lexer);
            switch(_peek(lexer))
            {
            case '>':
                _advance(lexer);
                switch(_peek(lexer))
                {
                case '=': _advance(lexer); return TokenType::OpShrAssign;
                default: return TokenType::OpRsh;
                }
            case '=': _advance(lexer); return TokenType::OpGeq;
            default:
                return TokenType::OpGreater;
            }

        case '<':
            _advance(lexer);
            switch(_peek(lexer))
            {
            case '<':
                _advance(lexer);
                switch(_peek(lexer))
                {
                case '=': _advance(lexer); return TokenType::OpShlAssign;
                default: return TokenType::OpLsh;
                }
            case '=': _advance(lexer); return TokenType::OpLeq;
            default:
                return TokenType::OpLess;
            }

        case '=':
            _advance(lexer);
            switch(_peek(lexer))
            {
            case '=': _advance(lexer); return TokenType::OpEql;
            default:
                return TokenType::OpAssign;
            }

        case '!':
            _advance(lexer);
            switch(_peek(lexer))
            {
            case '=': _advance(lexer); return TokenType::OpNeq;
            default:
                return TokenType::OpNot;
            }

        case '#':
            _advance(lexer);
            switch(_peek(lexer))
            {
            case '#': _advance(lexer); return TokenType::PoundPound;

            case '?': _advance(lexer); return TokenType::CompletionRequest;

            default:
                return TokenType::Pound;
            }

        case '~': _advance(lexer); return TokenType::OpBitNot;

        case ':': 
        {
            _advance(lexer);
            if (_peek(lexer) == ':')
            {
                _advance(lexer);
                return TokenType::Scope;
            }
            return TokenType::Colon;
        }
        case ';': _advance(lexer); return TokenType::Semicolon;
        case ',': _advance(lexer); return TokenType::Comma;

        case '{': _advance(lexer); return TokenType::LBrace;
        case '}': _advance(lexer); return TokenType::RBrace;
        case '[': _advance(lexer); return TokenType::LBracket;
        case ']': _advance(lexer); return TokenType::RBracket;
        case '(': _advance(lexer); return TokenType::LParent;
        case ')': _advance(lexer); return TokenType::RParent;

        case '?': _advance(lexer); return TokenType::QuestionMark;
        case '@': _advance(lexer); return TokenType::At;
        case '$':
        {
            _advance(lexer);
            if(_peek(lexer) == '$')
            {
                _advance(lexer);
                return TokenType::DollarDollar;
            }
            return TokenType::Dollar;
        }

        }

        // TODO(tfoley): If we ever wanted to support proper Unicode
        // in identifiers, etc., then this would be the right place
        // to perform a more expensive dispatch based on the actual
        // code point (and not just the first byte).

        {
            // If none of the above cases matched, then we have an
            // unexpected/invalid character.

            auto loc = _getSourceLoc(lexer);
            int c = _advance(lexer);

            if (auto sink = lexer->getDiagnosticSink())
            {
                if(c >= 0x20 && c <=  0x7E)
                {
                    char buffer[] = { (char) c, 0 };
                    sink->diagnose(loc, LexerDiagnostics::illegalCharacterPrint, buffer);
                }
                else
                {
                    // Fallback: print as hexadecimal
                    sink->diagnose(loc, LexerDiagnostics::illegalCharacterHex, String((unsigned char)c, 16));
                }
            }

            return TokenType::Invalid;
        }
    }

    // Copy the bytes in `[begin, end)` to `dst`, leaving out every
    // backslash-escaped line ending.
    //
    // An escaped line ending is a `\` immediately followed by `\r` or `\n`,
    // optionally followed by the *other* of those two bytes, so that `\r\n`
    // and `\n\r` are each removed as a single line ending. A repeated byte
    // (e.g. `\n\n`) is not merged: only the first one is removed.
    //
    // All look-ahead is bounded by `end`, so no byte at or past `end` is read.
    //
    // `dst` must have room for at least `end - begin` bytes. The return value
    // points one past the last byte written to `dst`.
    //
    static char* _copyWithoutEscapedNewlines(char const* begin, char const* end, char* dst)
    {
        char const* src = begin;
        while (src != end)
        {
            const char c = *src++;
            if (c == '\\' && src != end && (*src == '\r' || *src == '\n'))
            {
                const char first = *src++;

                // `\r ^ \n` only matches when the two bytes are different
                // line-ending characters, which is exactly the two-byte case.
                if (src != end && (first ^ *src) == ('\r' ^ '\n'))
                {
                    ++src;
                }
                continue;
            }
            *dst++ = c;
        }
        return dst;
    }

    // Lex and return the next token from the input.
    //
    // Every kind of token is returned, including whitespace, comments and
    // newlines; callers that do not want those must filter them out.
    //
    // The returned token carries:
    //  * `loc`     - the source location at which lexing of the token began
    //  * `type`    - the type produced by `_lexTokenImpl`
    //  * `flags`   - the lexer's token flags as they were right after the token
    //                was lexed (before they are updated for the next token)
    //  * content   - the token text, with escaped newlines removed when the
    //                `ScrubbingNeeded` flag is set
    //  * name      - for identifiers and completion requests, when a name pool
    //                is available
    //
    Token Lexer::lexToken()
    {
        Token token;
        token.loc = _getSourceLoc(this);

        // The raw text of the token is whatever `_lexTokenImpl` consumed.
        char const* textBegin = m_cursor;
        auto tokenType = _lexTokenImpl(this);
        char const* textEnd = m_cursor;

        // The flags on the token we just lexed are based on the current
        // state of the lexer.
        //
        auto tokenFlags = m_tokenFlags;

        // Depending on the kind of token, the flags that will apply to the
        // *next* token may need to change.
        //
        switch(tokenType)
        {
        case TokenType::NewLine:
            // After the end of a line, the next token is both at the start
            // of a line and after whitespace.
            //
            m_tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
            break;

        case TokenType::WhiteSpace:
        case TokenType::BlockComment:
        case TokenType::LineComment:
            // Horizontal whitespace and comments both count as whitespace.
            // A line comment does not include its terminating newline, so
            // `AtStartOfLine` is not set here.
            //
            // NOTE(review): because this ORs into the existing flags, any
            // other bit that is currently set (including `ScrubbingNeeded`)
            // stays set for the following token. For `ScrubbingNeeded` that
            // only costs a redundant copy of the next token's text.
            //
            m_tokenFlags |= TokenFlag::AfterWhitespace;
            break;

        default:
            // Any other token means the next one is neither after
            // whitespace nor at the start of a line.
            //
            m_tokenFlags = 0;
            break;
        }

        token.type = tokenType;
        token.flags = tokenFlags;

        // A token that consumed no input keeps its default content.
        if(textEnd != textBegin)
        {
            if (tokenFlags & TokenFlag::ScrubbingNeeded)
            {
                // The scrubbed text can only be shorter than the raw text,
                // so the raw length is always a sufficient allocation.
                char* scrubbedBegin = (char*)m_memoryArena->allocateUnaligned(textEnd - textBegin);
                char* scrubbedEnd = _copyWithoutEscapedNewlines(textBegin, textEnd, scrubbedBegin);
                token.setContent(UnownedStringSlice(scrubbedBegin, scrubbedEnd));
            }
            else
            {
                // No scrubbing required: the content can refer directly to
                // the source text.
                token.setContent(UnownedStringSlice(textBegin, textEnd));
            }
        }

        if (m_namePool)
        {
            if (tokenType == TokenType::Identifier || tokenType == TokenType::CompletionRequest)
            {
                token.setName(m_namePool->getName(token.getContent()));
            }
        }

        return token;
    }

    // Lex tokens until end-of-file and collect only the semantically
    // significant ones: whitespace, newlines, block comments and line
    // comments are discarded. The end-of-file token is the last entry of
    // the returned list.
    //
    TokenList Lexer::lexAllSemanticTokens()
    {
        TokenList semanticTokens;

        bool reachedEndOfFile = false;
        while (!reachedEndOfFile)
        {
            Token token = lexToken();

            // Skip everything that only separates or annotates tokens.
            const bool isTrivia =
                token.type == TokenType::WhiteSpace
                || token.type == TokenType::BlockComment
                || token.type == TokenType::LineComment
                || token.type == TokenType::NewLine;
            if (isTrivia)
                continue;

            semanticTokens.add(token);
            reachedEndOfFile = (token.type == TokenType::EndOfFile);
        }

        return semanticTokens;
    }

    // Lex tokens until end-of-file, discarding only whitespace and newline
    // tokens. Unlike `lexAllSemanticTokens`, block and line comments are
    // kept in the result. The end-of-file token is the last entry of the
    // returned list.
    //
    TokenList Lexer::lexAllMarkupTokens()
    {
        TokenList markupTokens;

        for (;;)
        {
            Token token = lexToken();

            if (token.type == TokenType::WhiteSpace || token.type == TokenType::NewLine)
                continue;

            markupTokens.add(token);

            if (token.type == TokenType::EndOfFile)
                break;
        }

        return markupTokens;
    }

    // Lex the first token of `in` and return the part of `in` it covers.
    //
    // The text is lexed through a temporary source manager, so the token's
    // position is mapped back to `in` by offset and the result refers to
    // the caller's memory. Returns an empty slice when the first token is
    // `TokenType::Invalid`.
    //
    /* static */UnownedStringSlice Lexer::sourceLocationLexer(const UnownedStringSlice& in)
    {
        // Declaration order is kept deliberately: it fixes the order in which
        // these objects are destroyed on return.
        Lexer lexer;

        SourceManager manager;
        manager.initialize(nullptr, nullptr);

        // Wrap the input in a source file/view so the lexer can track locations.
        auto file = manager.createSourceFileWithString(PathInfo::makeUnknown(), in);
        auto view = manager.createSourceView(file, nullptr, SourceLoc::fromRaw(0));

        DiagnosticSink sink(&manager, nullptr);

        MemoryArena arena;

        RootNamePool rootPool;
        NamePool pool;
        pool.setRootNamePool(&rootPool);

        lexer.initialize(view, &sink, &pool, &arena);

        // Only the very first token is of interest.
        Token tok = lexer.lexToken();
        if (tok.type == TokenType::Invalid)
            return UnownedStringSlice();

        // Translate the token's location into a byte offset within the view.
        const int offset = view->getRange().getOffset(tok.loc);

        SLANG_ASSERT(offset >= 0 && offset <= in.getLength());
        SLANG_ASSERT(Index(offset + tok.charsCount) <= in.getLength());

        char const* const tokenBegin = in.begin() + offset;
        return UnownedStringSlice(tokenBegin, tokenBegin + tok.charsCount);
    }

}
Browse the archive

https://github.com/shader-slang/slang