Content - 74ef678dea6b18c5b427409881020bf7345bf1d1 - f3c46c3/packages/compiler/src/ml_parser/lexer.ts

visit type:
Tip revision: 0461bff95f72107d55c8db7ae8a4154296565721 authored by Andrew Scott on 20 March 2024, 21:53:13 UTC
release: cut the v18.0.0-next.1 release
Tip revision: 0461bff
lexer.ts
/**
 * @license
 * Copyright Google LLC All Rights Reserved.
 *
 * Use of this source code is governed by an MIT-style license that can be
 * found in the LICENSE file at https://angular.io/license
 */

import * as chars from '../chars';
import {ParseError, ParseLocation, ParseSourceFile, ParseSourceSpan} from '../parse_util';

import {DEFAULT_INTERPOLATION_CONFIG, InterpolationConfig} from './defaults';
import {NAMED_ENTITIES} from './entities';
import {TagContentType, TagDefinition} from './tags';
import {IncompleteTagOpenToken, TagOpenStartToken, Token, TokenType} from './tokens';

/**
 * A parse error reported by the lexer.
 *
 * Besides the message and source span that are forwarded to `ParseError`, it records the type of
 * the token that was being produced when the error was created (`null` if there was none).
 */
export class TokenError extends ParseError {
  constructor(errorMsg: string, public tokenType: TokenType|null, span: ParseSourceSpan) {
    super(span, errorMsg);
  }
}

/**
 * The outcome of a call to `tokenize()`.
 *
 * - `tokens`: the tokens that were produced.
 * - `errors`: the errors collected while tokenizing.
 * - `nonNormalizedIcuExpressions`: ICU expression tokens whose text would have changed if line
 *   endings had been normalized, but which were emitted without normalization.
 */
export class TokenizeResult {
  constructor(
      public tokens: Token[], public errors: TokenError[],
      public nonNormalizedIcuExpressions: Token[]) {}
}

/**
 * Describes the portion of the source content that should be tokenized.
 *
 * When no range is given, the tokenizer uses
 * `{startPos: 0, startLine: 0, startCol: 0, endPos: <length of the content>}`.
 */
export interface LexerRange {
  /** Position within the content at which tokenizing starts. */
  startPos: number;
  /** Line associated with `startPos` (presumably zero-based, as in the default range). */
  startLine: number;
  /** Column associated with `startPos` (presumably zero-based, as in the default range). */
  startCol: number;
  /** Position within the content at which tokenizing stops. */
  endPos: number;
}

/**
 * Options that modify how the text is tokenized.
 */
export interface TokenizeOptions {
  /**
   * Whether to tokenize ICU messages (considered as text nodes when false).
   * Treated as `false` when omitted.
   */
  tokenizeExpansionForms?: boolean;
  /**
   * How to tokenize interpolation markers.
   * `DEFAULT_INTERPOLATION_CONFIG` is used when omitted.
   */
  interpolationConfig?: InterpolationConfig;
  /**
   * The start and end point of the text to parse within the `source` string.
   * The entire `source` string is parsed if this is not provided.
   */
  range?: LexerRange;
  /**
   * If this text is stored in a JavaScript string, then we have to deal with escape sequences.
   *
   * **Example 1:**
   *
   * ```
   * "abc\"def\nghi"
   * ```
   *
   * - The `\"` must be converted to `"`.
   * - The `\n` must be converted to a new line character in a token,
   *   but it should not increment the current line for source mapping.
   *
   * **Example 2:**
   *
   * ```
   * "abc\
   *  def"
   * ```
   *
   * The line continuation (`\` followed by a newline) should be removed from a token
   * but the new line should increment the current line for source mapping.
   */
  escapedString?: boolean;
  /**
   * If this text is stored in an external template (e.g. via `templateUrl`) then we need to decide
   * whether or not to normalize the line-endings (from `\r\n` to `\n`) when processing ICU
   * expressions.
   *
   * If `true` then we will normalize ICU expression line endings.
   * The default is `false`, but this will be switched in a future major release.
   */
  i18nNormalizeLineEndingsInICUs?: boolean;
  /**
   * An array of characters that should be considered as leading trivia.
   * Leading trivia are characters that are not important to the developer, and so should not be
   * included in source-map segments.  A common example is whitespace.
   *
   * Only the first code point of each entry is used.
   */
  leadingTriviaChars?: string[];
  /**
   * If true, do not convert CRLF to LF.
   * Treated as `false` when omitted.
   */
  preserveLineEndings?: boolean;

  /**
   * Whether to tokenize @ block syntax. Otherwise considered text,
   * or ICU tokens if `tokenizeExpansionForms` is enabled.
   * Treated as `true` when omitted.
   */
  tokenizeBlocks?: boolean;
}

/**
 * Splits `source` into lexer tokens.
 *
 * @param source The text to tokenize.
 * @param url The url used to build the `ParseSourceFile` for `source`.
 * @param getTagDefinition Looks up the tag definition for a tag name.
 * @param options Optional settings that tweak the tokenization.
 * @returns The tokens (with adjacent text tokens merged), the collected errors and the ICU
 *     expression tokens that were left without line-ending normalization.
 */
export function tokenize(
    source: string, url: string, getTagDefinition: (tagName: string) => TagDefinition,
    options: TokenizeOptions = {}): TokenizeResult {
  const sourceFile = new ParseSourceFile(source, url);
  const lexer = new _Tokenizer(sourceFile, getTagDefinition, options);
  lexer.tokenize();

  const mergedTokens = mergeTextTokens(lexer.tokens);
  return new TokenizeResult(mergedTokens, lexer.errors, lexer.nonNormalizedIcuExpressions);
}

// Matches every carriage return, together with the line feed that directly follows it (if any).
const _CR_OR_CRLF_REGEXP = /\r\n?/g;

/**
 * Builds the message used when the lexer encounters a character it did not expect.
 *
 * The character is rendered with `String.fromCodePoint()` so that values above U+FFFF are shown
 * as the real character instead of being truncated to 16 bits. Values that are not valid code
 * points fall back to `String.fromCharCode()`, so that building an error message never throws.
 *
 * @param charCode The unexpected character, or `chars.$EOF` for the end of the input.
 */
function _unexpectedCharacterErrorMsg(charCode: number): string {
  let char: string;
  if (charCode === chars.$EOF) {
    char = 'EOF';
  } else if (Number.isInteger(charCode) && charCode >= 0 && charCode <= 0x10FFFF) {
    char = String.fromCodePoint(charCode);
  } else {
    char = String.fromCharCode(charCode);
  }
  return `Unexpected character "${char}"`;
}

/**
 * Builds the message used when an entity cannot be resolved.
 *
 * @param entitySrc The entity text to show in the message.
 */
function _unknownEntityErrorMsg(entitySrc: string): string {
  const suggestion = 'use the "&#<decimal>;" or  "&#x<hex>;" syntax';
  return 'Unknown entity "' + entitySrc + '" - ' + suggestion;
}

/**
 * Builds the message used when a numeric character reference is not terminated by `;`.
 *
 * @param type The kind of numeric reference; its value is inserted into the message verbatim.
 * @param entityStr The entity text to show in the message.
 */
function _unparsableEntityErrorMsg(type: CharacterReferenceType, entityStr: string): string {
  const subject = `Unable to parse entity "${entityStr}"`;
  const reason = `${type} character reference entities must end with ";"`;
  return `${subject} - ${reason}`;
}

/**
 * The two kinds of numeric character reference.
 * The string values are inserted verbatim into error messages.
 */
enum CharacterReferenceType {
  HEX = 'hexadecimal',
  DEC = 'decimal',
}

/**
 * Wrapper that is thrown to abort the token currently being consumed.
 * The tokenizer catches it and records the wrapped `error` instead of letting it propagate.
 */
class _ControlFlowError {
  constructor(public error: TokenError) {}
}

// See https://www.w3.org/TR/html51/syntax.html#writing-html-documents
class _Tokenizer {
  // Cursor over the input; an escaped or a plain cursor depending on `options.escapedString`.
  private _cursor: CharacterCursor;
  // Whether ICU expansion forms are tokenized (`options.tokenizeExpansionForms`).
  private _tokenizeIcu: boolean;
  // The interpolation start/end markers in use.
  private _interpolationConfig: InterpolationConfig;
  // First code point of each `options.leadingTriviaChars` entry; passed on when building spans.
  private _leadingTriviaCodePoints: number[]|undefined;
  // Start position and type of the token in progress; both are `null` between tokens.
  private _currentTokenStart: CharacterCursor|null = null;
  private _currentTokenType: TokenType|null = null;
  // Stack of `EXPANSION_FORM_START` / `EXPANSION_CASE_EXP_START` markers for the open ICU nesting.
  private _expansionCaseStack: TokenType[] = [];
  // NOTE(review): within this class the flag is only ever reset to `false`.
  private _inInterpolation: boolean = false;
  // When true, carriage returns are left untouched in token text.
  private readonly _preserveLineEndings: boolean;
  // When true, the ICU switch expression is emitted with normalized line endings.
  private readonly _i18nNormalizeLineEndingsInICUs: boolean;
  // Whether `@` blocks (and their closing `}`) are tokenized.
  private readonly _tokenizeBlocks: boolean;
  // Tokens produced so far.
  tokens: Token[] = [];
  // Errors collected so far.
  errors: TokenError[] = [];
  // ICU expression tokens that would have differed if their line endings had been normalized.
  nonNormalizedIcuExpressions: Token[] = [];

  /**
   * Creates a tokenizer for `_file`, falling back to defaults for every option that is missing.
   * An error raised while initializing the cursor is recorded via `handleError()`.
   *
   * @param _file The html source file being tokenized.
   * @param _getTagDefinition A function that will retrieve a tag definition for a given tag name.
   * @param options Configuration of the tokenization.
   */
  constructor(
      _file: ParseSourceFile, private _getTagDefinition: (tagName: string) => TagDefinition,
      options: TokenizeOptions) {
    const triviaChars = options.leadingTriviaChars;
    // Without an explicit range the whole content of the file is tokenized.
    const range =
        options.range || {endPos: _file.content.length, startPos: 0, startLine: 0, startCol: 0};

    this._tokenizeIcu = options.tokenizeExpansionForms || false;
    this._interpolationConfig = options.interpolationConfig || DEFAULT_INTERPOLATION_CONFIG;
    this._leadingTriviaCodePoints = triviaChars && triviaChars.map(c => c.codePointAt(0) || 0);
    if (options.escapedString) {
      this._cursor = new EscapedCharacterCursor(_file, range);
    } else {
      this._cursor = new PlainCharacterCursor(_file, range);
    }
    this._preserveLineEndings = options.preserveLineEndings || false;
    this._i18nNormalizeLineEndingsInICUs = options.i18nNormalizeLineEndingsInICUs || false;
    this._tokenizeBlocks = options.tokenizeBlocks ?? true;

    try {
      this._cursor.init();
    } catch (e) {
      this.handleError(e);
    }
  }

  /**
   * Replaces each `\r\n` or lone `\r` in `content` with `\n`, unless line endings are preserved.
   */
  private _processCarriageReturns(content: string): string {
    // https://www.w3.org/TR/html51/syntax.html#preprocessing-the-input-stream
    // The source can not be pre-processed as a whole because that would shift the original
    // positions, so the replacement is applied to the text of each token instead.
    return this._preserveLineEndings ? content : content.replace(_CR_OR_CRLF_REGEXP, '\n');
  }

  /**
   * Tokenizes the input until the cursor reaches EOF, then appends an `EOF` token.
   *
   * Each iteration dispatches on the upcoming characters: markup starting with `<`, block syntax
   * (`@` and `}`), ICU expansion forms and finally (possibly interpolated) text. Errors thrown
   * while consuming a construct are passed to `handleError()` and the loop carries on.
   */
  tokenize(): void {
    while (this._cursor.peek() !== chars.$EOF) {
      const start = this._cursor.clone();
      try {
        if (this._attemptCharCode(chars.$LT)) {
          if (this._attemptCharCode(chars.$BANG)) {
            // `<![` starts CDATA, `<!-` starts a comment, any other `<!` is a doctype.
            if (this._attemptCharCode(chars.$LBRACKET)) {
              this._consumeCdata(start);
            } else if (this._attemptCharCode(chars.$MINUS)) {
              this._consumeComment(start);
            } else {
              this._consumeDocType(start);
            }
          } else if (this._attemptCharCode(chars.$SLASH)) {
            this._consumeTagClose(start);
          } else {
            this._consumeTagOpen(start);
          }
        } else if (this._tokenizeBlocks && this._attemptCharCode(chars.$AT)) {
          this._consumeBlockStart(start);
        } else if (
            this._tokenizeBlocks && !this._inInterpolation && !this._isInExpansionCase() &&
            !this._isInExpansionForm() && this._attemptCharCode(chars.$RBRACE)) {
          // A `}` outside of interpolations and ICU expansions closes a block.
          this._consumeBlockEnd(start);
        } else if (!(this._tokenizeIcu && this._tokenizeExpansionForm())) {
          // In (possibly interpolated) text the end of the text is given by `isTextEnd()`, while
          // the premature end of an interpolation is given by the start of a new HTML element.
          this._consumeWithInterpolation(
              TokenType.TEXT, TokenType.INTERPOLATION, () => this._isTextEnd(),
              () => this._isTagStart());
        }
      } catch (e) {
        this.handleError(e);
      }
    }
    this._beginToken(TokenType.EOF);
    this._endToken([]);
  }

  /**
   * Consumes the name of a block and returns it with surrounding whitespace trimmed.
   *
   * Whitespace only counts as part of the name after at least one name character was seen, which
   * allows capturing something like `@else if`, but not `@ if`.
   */
  private _getBlockName(): string {
    const nameStart = this._cursor.clone();
    let seenNameChar = false;

    const isNameBoundary = (code: number): boolean => {
      if (chars.isWhitespace(code)) {
        // Leading whitespace ends the name right away; inner whitespace is kept.
        return !seenNameChar;
      }
      if (!isBlockNameChar(code)) {
        return true;
      }
      seenNameChar = true;
      return false;
    };

    this._attemptCharCodeUntilFn(isNameBoundary);
    const rawName = this._cursor.getChars(nameStart);
    return rawName.trim();
  }

  /**
   * Consumes the opening of a block: its name, the optional parenthesized parameters and the
   * opening `{`.
   *
   * If the closing `)` or the opening `{` is missing, the already emitted start token is turned
   * into an `INCOMPLETE_BLOCK_OPEN` token.
   *
   * @param start Cursor pointing at the beginning of the block.
   */
  private _consumeBlockStart(start: CharacterCursor) {
    this._beginToken(TokenType.BLOCK_OPEN_START, start);
    const blockName = this._getBlockName();
    const openToken = this._endToken([blockName]);

    const hasParameters = this._cursor.peek() === chars.$LPAREN;
    if (hasParameters) {
      // Step over the `(`, read the parameters and tolerate whitespace before the `)`.
      this._cursor.advance();
      this._consumeBlockParameters();
      this._attemptCharCodeUntilFn(isNotWhitespace);

      if (!this._attemptCharCode(chars.$RPAREN)) {
        openToken.type = TokenType.INCOMPLETE_BLOCK_OPEN;
        return;
      }
      // Whitespace is also tolerated after the `)`.
      this._attemptCharCodeUntilFn(isNotWhitespace);
    }

    if (!this._attemptCharCode(chars.$LBRACE)) {
      openToken.type = TokenType.INCOMPLETE_BLOCK_OPEN;
      return;
    }
    this._beginToken(TokenType.BLOCK_OPEN_END);
    this._endToken([]);
  }

  /**
   * Emits a `BLOCK_CLOSE` token that spans from `start` to the current cursor position.
   * The caller has already consumed the closing `}`.
   */
  private _consumeBlockEnd(start: CharacterCursor) {
    this._beginToken(TokenType.BLOCK_CLOSE, start);
    this._endToken([]);
  }

  /**
   * Consumes the parameters of a block, emitting one `BLOCK_PARAMETER` token per parameter.
   *
   * Parameters are separated by semicolons. The list ends at the closing parenthesis that matches
   * the (already consumed) opening one, or at EOF. Semicolons and parentheses inside quotes are
   * not treated as delimiters.
   */
  private _consumeBlockParameters() {
    // Skip whitespace and semicolons in front of the first parameter.
    this._attemptCharCodeUntilFn(isBlockParameterChar);

    while (this._cursor.peek() !== chars.$RPAREN && this._cursor.peek() !== chars.$EOF) {
      this._beginToken(TokenType.BLOCK_PARAMETER);
      const paramStart = this._cursor.clone();
      let activeQuote: number|null = null;
      let parenDepth = 0;

      for (;;) {
        const code = this._cursor.peek();
        const atDelimiter = code === chars.$SEMICOLON || code === chars.$EOF;
        if (atDelimiter && activeQuote === null) {
          break;
        }

        if (code === chars.$BACKSLASH) {
          // The next character is escaped, so step over it as well.
          this._cursor.advance();
        } else if (code === activeQuote) {
          activeQuote = null;
        } else if (activeQuote === null && chars.isQuote(code)) {
          activeQuote = code;
        } else if (activeQuote === null) {
          if (code === chars.$LPAREN) {
            parenDepth++;
          } else if (code === chars.$RPAREN) {
            if (parenDepth === 0) {
              // This is the parenthesis that closes the parameter list.
              break;
            }
            parenDepth--;
          }
        }

        this._cursor.advance();
      }

      this._endToken([this._cursor.getChars(paramStart)]);

      // Move on to the start of the next parameter.
      this._attemptCharCodeUntilFn(isBlockParameterChar);
    }
  }

  /**
   * Tries to consume one piece of ICU syntax at the cursor: the start of an expansion form, the
   * start of an expansion case, or the `}` that ends a case or a form.
   *
   * @returns whether an ICU token has been created
   * @internal
   */
  private _tokenizeExpansionForm(): boolean {
    if (this.isExpansionFormStart()) {
      this._consumeExpansionFormStart();
      return true;
    }

    if (isExpansionCaseStart(this._cursor.peek()) && this._isInExpansionForm()) {
      this._consumeExpansionCaseStart();
      return true;
    }

    if (this._cursor.peek() !== chars.$RBRACE) {
      return false;
    }

    // A `}` closes the innermost open case, or otherwise the innermost open form.
    if (this._isInExpansionCase()) {
      this._consumeExpansionCaseEnd();
      return true;
    }
    if (this._isInExpansionForm()) {
      this._consumeExpansionFormEnd();
      return true;
    }
    return false;
  }

  /**
   * Marks the beginning of a new token.
   *
   * @param type The type of the token being started.
   * @param start Where the token starts; defaults to a copy of the current cursor.
   */
  private _beginToken(type: TokenType, start = this._cursor.clone()) {
    this._currentTokenStart = start;
    this._currentTokenType = type;
  }

  /**
   * Completes the token started by `_beginToken()`, appends it to `tokens` and clears the
   * in-progress token state.
   *
   * @param parts The parts to store on the token.
   * @param end Where the token ends; the current cursor position is used when omitted.
   * @returns The token that was created.
   * @throws TokenError if no token was started or the started token has no type.
   */
  private _endToken(parts: string[], end?: CharacterCursor): Token {
    const tokenStart = this._currentTokenStart;
    const tokenType = this._currentTokenType;

    if (tokenStart === null) {
      throw new TokenError(
          'Programming error - attempted to end a token when there was no start to the token',
          tokenType, this._cursor.getSpan(end));
    }
    if (tokenType === null) {
      throw new TokenError(
          'Programming error - attempted to end a token which has no token type', null,
          this._cursor.getSpan(tokenStart));
    }

    const endCursor = end ?? this._cursor;
    const sourceSpan = endCursor.getSpan(tokenStart, this._leadingTriviaCodePoints);
    const token = {type: tokenType, parts, sourceSpan} as Token;

    this.tokens.push(token);
    this._currentTokenStart = null;
    this._currentTokenType = null;
    return token;
  }

  /**
   * Creates the error that is thrown to abort the token in progress and clears the in-progress
   * token state.
   *
   * While inside an ICU expansion form a hint about unescaped `{` characters is appended to the
   * message.
   *
   * @param msg The error message.
   * @param span The source span that the error refers to.
   */
  private _createError(msg: string, span: ParseSourceSpan): _ControlFlowError {
    const unescapedBraceHint =
        ` (Do you have an unescaped "{" in your template? Use "{{ '{' }}") to escape it.)`;
    const fullMsg = this._isInExpansionForm() ? msg + unescapedBraceHint : msg;
    const tokenError = new TokenError(fullMsg, this._currentTokenType, span);

    this._currentTokenStart = null;
    this._currentTokenType = null;
    return new _ControlFlowError(tokenError);
  }

  /**
   * Records an error thrown while tokenizing.
   *
   * A `CursorError` is first converted with `_createError()`. Control-flow errors are stored in
   * `errors`; anything else is rethrown.
   */
  private handleError(e: any) {
    const failure =
        e instanceof CursorError ? this._createError(e.msg, this._cursor.getSpan(e.cursor)) : e;
    if (!(failure instanceof _ControlFlowError)) {
      throw failure;
    }
    this.errors.push(failure.error);
  }

  /**
   * Advances past the next character if it equals `charCode`.
   *
   * @returns whether the character was consumed.
   */
  private _attemptCharCode(charCode: number): boolean {
    if (this._cursor.peek() !== charCode) {
      return false;
    }
    this._cursor.advance();
    return true;
  }

  /**
   * Advances past the next character if it equals `charCode`, ignoring the case of `a`-`z`.
   *
   * @returns whether the character was consumed.
   */
  private _attemptCharCodeCaseInsensitive(charCode: number): boolean {
    const matches = compareCharCodeCaseInsensitive(this._cursor.peek(), charCode);
    if (matches) {
      this._cursor.advance();
    }
    return matches;
  }

  /**
   * Consumes the next character, which must equal `charCode`.
   *
   * @throws an "Unexpected character" control-flow error when a different character follows.
   */
  private _requireCharCode(charCode: number) {
    const location = this._cursor.clone();
    if (this._attemptCharCode(charCode)) {
      return;
    }
    const message = _unexpectedCharacterErrorMsg(this._cursor.peek());
    throw this._createError(message, this._cursor.getSpan(location));
  }

  /**
   * Consumes `chars` if the input continues with exactly that string.
   * The cursor is left where it was when the string does not match.
   *
   * @returns whether the string was consumed.
   */
  private _attemptStr(chars: string): boolean {
    const expectedLength = chars.length;
    if (this._cursor.charsLeft() < expectedLength) {
      return false;
    }

    const rewindTo = this._cursor.clone();
    let index = 0;
    while (index < expectedLength) {
      if (!this._attemptCharCode(chars.charCodeAt(index))) {
        // Undo the partial match so that the cursor is back where the attempt started.
        this._cursor = rewindTo;
        return false;
      }
      index++;
    }
    return true;
  }

  /**
   * Consumes `chars` if the input continues with that string, ignoring the case of `a`-`z`.
   *
   * Like `_attemptStr()`, the cursor is restored to its initial position when the string does not
   * match, so that a partial match never leaves the cursor in the middle of the attempt.
   *
   * @returns whether the string was consumed.
   */
  private _attemptStrCaseInsensitive(chars: string): boolean {
    const initialPosition = this._cursor.clone();
    for (let i = 0; i < chars.length; i++) {
      if (!this._attemptCharCodeCaseInsensitive(chars.charCodeAt(i))) {
        // If attempting to parse the string fails, we want to reset the parser
        // to where it was before the attempt
        this._cursor = initialPosition;
        return false;
      }
    }
    return true;
  }

  /**
   * Consumes `chars`, which must be the next characters of the input.
   *
   * @throws an "Unexpected character" control-flow error when the input does not match.
   */
  private _requireStr(chars: string) {
    const location = this._cursor.clone();
    if (this._attemptStr(chars)) {
      return;
    }
    const message = _unexpectedCharacterErrorMsg(this._cursor.peek());
    throw this._createError(message, this._cursor.getSpan(location));
  }

  /**
   * Advances the cursor until `predicate` returns true for the upcoming character.
   * That character itself is not consumed.
   */
  private _attemptCharCodeUntilFn(predicate: (code: number) => boolean) {
    for (let code = this._cursor.peek(); !predicate(code); code = this._cursor.peek()) {
      this._cursor.advance();
    }
  }

  /**
   * Advances the cursor until `predicate` returns true for the upcoming character and checks
   * that the cursor moved far enough.
   *
   * @param predicate Returns true for the character at which to stop.
   * @param len The minimum distance the cursor must have moved.
   * @throws an "Unexpected character" control-flow error when the cursor moved less than `len`.
   */
  private _requireCharCodeUntilFn(predicate: (code: number) => boolean, len: number) {
    const start = this._cursor.clone();
    this._attemptCharCodeUntilFn(predicate);

    const consumed = this._cursor.diff(start);
    if (consumed >= len) {
      return;
    }
    const message = _unexpectedCharacterErrorMsg(this._cursor.peek());
    throw this._createError(message, this._cursor.getSpan(start));
  }

  /**
   * Advances the cursor until the upcoming character equals `char`.
   * That character itself is not consumed.
   */
  private _attemptUntilChar(char: number) {
    for (;;) {
      if (this._cursor.peek() === char) {
        return;
      }
      this._cursor.advance();
    }
  }

  /**
   * Consumes the next character and returns it as a string.
   */
  private _readChar(): string {
    // Don't rely upon reading directly from `_input` as the actual char value
    // may have been generated from an escape sequence.
    // `fromCodePoint` is used so that code points above U+FFFF are converted correctly.
    const char = String.fromCodePoint(this._cursor.peek());
    this._cursor.advance();
    return char;
  }

  /**
   * Consumes an entity that starts at the cursor (which must point at the `&`).
   *
   * - Numeric references (`&#<decimal>;` / `&#x<hex>;`) produce an `ENCODED_ENTITY` token whose
   *   parts are the decoded character and the original entity text. The character is decoded with
   *   `String.fromCodePoint()`, so references above U+FFFF are decoded to the right character and
   *   references that are not a valid code point (including ones without digits) are reported as
   *   an unknown entity.
   * - Named references (`&name;`) are looked up in `NAMED_ENTITIES`.
   * - A name that is not terminated by `;` is not an entity: a token of `textTokenType` holding
   *   just the `&` is emitted and the cursor is placed right after the `&`.
   *
   * @param textTokenType The token type to use when the `&` turns out to be plain text.
   * @throws a control-flow error for numeric references without a `;` and for unknown entities.
   */
  private _consumeEntity(textTokenType: TokenType): void {
    this._beginToken(TokenType.ENCODED_ENTITY);
    const start = this._cursor.clone();
    // Step over the `&`.
    this._cursor.advance();
    if (this._attemptCharCode(chars.$HASH)) {
      const isHex = this._attemptCharCode(chars.$x) || this._attemptCharCode(chars.$X);
      const codeStart = this._cursor.clone();
      this._attemptCharCodeUntilFn(isDigitEntityEnd);
      if (this._cursor.peek() !== chars.$SEMICOLON) {
        // Advance cursor to include the peeked character in the string provided to the error
        // message.
        this._cursor.advance();
        const entityType = isHex ? CharacterReferenceType.HEX : CharacterReferenceType.DEC;
        throw this._createError(
            _unparsableEntityErrorMsg(entityType, this._cursor.getChars(start)),
            this._cursor.getSpan());
      }
      const strNum = this._cursor.getChars(codeStart);
      // Step over the `;`.
      this._cursor.advance();
      try {
        const charCode = parseInt(strNum, isHex ? 16 : 10);
        // `fromCodePoint` throws a `RangeError` for `NaN` and for values above 0x10FFFF, which is
        // turned into an "unknown entity" error below.
        this._endToken([String.fromCodePoint(charCode), this._cursor.getChars(start)]);
      } catch {
        throw this._createError(
            _unknownEntityErrorMsg(this._cursor.getChars(start)), this._cursor.getSpan());
      }
    } else {
      const nameStart = this._cursor.clone();
      this._attemptCharCodeUntilFn(isNamedEntityEnd);
      if (this._cursor.peek() !== chars.$SEMICOLON) {
        // No semicolon was found so abort the encoded entity token that was in progress, and treat
        // this as a text token
        this._beginToken(textTokenType, start);
        this._cursor = nameStart;
        this._endToken(['&']);
      } else {
        const name = this._cursor.getChars(nameStart);
        // Step over the `;`.
        this._cursor.advance();
        const char = NAMED_ENTITIES[name];
        if (!char) {
          throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
        }
        this._endToken([char, `&${name};`]);
      }
    }
  }

  /**
   * Consumes raw text up to (but not including) the end marker.
   *
   * @param consumeEntities When true the text is emitted as `ESCAPABLE_RAW_TEXT` and entities are
   *     tokenized separately; otherwise it is emitted as a single `RAW_TEXT` token.
   * @param endMarkerPredicate Returns true when the end marker starts at the cursor. It may move
   *     the cursor; the cursor is put back after each call.
   */
  private _consumeRawText(consumeEntities: boolean, endMarkerPredicate: () => boolean): void {
    const textType = consumeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT;
    this._beginToken(textType);
    const pending: string[] = [];

    for (;;) {
      // Probe for the end marker and rewind regardless of the outcome.
      const probeStart = this._cursor.clone();
      const atEndMarker = endMarkerPredicate();
      this._cursor = probeStart;
      if (atEndMarker) {
        break;
      }

      if (consumeEntities && this._cursor.peek() === chars.$AMPERSAND) {
        // Close the text collected so far, emit the entity and start a new text token.
        this._endToken([this._processCarriageReturns(pending.join(''))]);
        pending.length = 0;
        this._consumeEntity(TokenType.ESCAPABLE_RAW_TEXT);
        this._beginToken(TokenType.ESCAPABLE_RAW_TEXT);
      } else {
        pending.push(this._readChar());
      }
    }

    this._endToken([this._processCarriageReturns(pending.join(''))]);
  }

  /**
   * Consumes a comment, emitting `COMMENT_START`, `RAW_TEXT` and `COMMENT_END` tokens.
   * The caller has already consumed `<!-`; the second `-` is required here.
   *
   * @param start Cursor pointing at the beginning of the comment.
   */
  private _consumeComment(start: CharacterCursor) {
    const commentEnd = '-->';

    this._beginToken(TokenType.COMMENT_START, start);
    this._requireCharCode(chars.$MINUS);
    this._endToken([]);

    this._consumeRawText(false, () => this._attemptStr(commentEnd));

    this._beginToken(TokenType.COMMENT_END);
    this._requireStr(commentEnd);
    this._endToken([]);
  }

  /**
   * Consumes a CDATA section, emitting `CDATA_START`, `RAW_TEXT` and `CDATA_END` tokens.
   * The caller has already consumed `<![`; the `CDATA[` that follows is required here.
   *
   * @param start Cursor pointing at the beginning of the section.
   */
  private _consumeCdata(start: CharacterCursor) {
    const cdataEnd = ']]>';

    this._beginToken(TokenType.CDATA_START, start);
    this._requireStr('CDATA[');
    this._endToken([]);

    this._consumeRawText(false, () => this._attemptStr(cdataEnd));

    this._beginToken(TokenType.CDATA_END);
    this._requireStr(cdataEnd);
    this._endToken([]);
  }

  /**
   * Consumes a doctype, emitting a `DOC_TYPE` token whose single part is the text between the
   * already consumed `<!` and the closing `>`.
   *
   * @param start Cursor pointing at the beginning of the doctype.
   */
  private _consumeDocType(start: CharacterCursor) {
    this._beginToken(TokenType.DOC_TYPE, start);
    const content = this._readUntil(chars.$GT);
    // Step over the `>` so that it is part of the token's span.
    this._cursor.advance();
    this._endToken([content]);
  }

  /**
   * Consumes a name with an optional `prefix:` in front of it.
   *
   * @returns a `[prefix, name]` pair, where `prefix` is the empty string if there is none.
   * @throws a control-flow error if a prefix is present but no name follows it.
   */
  private _consumePrefixAndName(): string[] {
    const firstPartStart = this._cursor.clone();
    for (let code = this._cursor.peek(); code !== chars.$COLON && !isPrefixEnd(code);
         code = this._cursor.peek()) {
      this._cursor.advance();
    }

    let prefix = '';
    let nameStart = firstPartStart;
    if (this._cursor.peek() === chars.$COLON) {
      // What was read so far is the prefix; the name starts after the colon.
      prefix = this._cursor.getChars(firstPartStart);
      this._cursor.advance();
      nameStart = this._cursor.clone();
    }

    const minLength = prefix === '' ? 0 : 1;
    this._requireCharCodeUntilFn(isNameEnd, minLength);
    return [prefix, this._cursor.getChars(nameStart)];
  }

  /**
   * Consumes an opening tag: its name, its attributes and the closing `>` or `/>`.
   *
   * If the tag can not be consumed, either the already emitted start token is changed to
   * `INCOMPLETE_TAG_OPEN`, or (when not even the start token could be created) the `<` is emitted
   * as text. For tags whose content type is raw text or escapable raw text, the content and the
   * matching closing tag are consumed as well.
   *
   * @param start Cursor pointing at the `<` of the tag.
   */
  private _consumeTagOpen(start: CharacterCursor) {
    let tagName: string;
    let prefix: string;
    let openTagToken: TagOpenStartToken|IncompleteTagOpenToken|undefined;
    try {
      // A tag name has to start with an ASCII letter.
      if (!chars.isAsciiLetter(this._cursor.peek())) {
        throw this._createError(
            _unexpectedCharacterErrorMsg(this._cursor.peek()), this._cursor.getSpan(start));
      }

      openTagToken = this._consumeTagOpenStart(start);
      prefix = openTagToken.parts[0];
      tagName = openTagToken.parts[1];
      this._attemptCharCodeUntilFn(isNotWhitespace);
      // Consume attributes until the end of the tag, the start of another tag or EOF.
      while (this._cursor.peek() !== chars.$SLASH && this._cursor.peek() !== chars.$GT &&
             this._cursor.peek() !== chars.$LT && this._cursor.peek() !== chars.$EOF) {
        this._consumeAttributeName();
        this._attemptCharCodeUntilFn(isNotWhitespace);
        if (this._attemptCharCode(chars.$EQ)) {
          this._attemptCharCodeUntilFn(isNotWhitespace);
          this._consumeAttributeValue();
        }
        this._attemptCharCodeUntilFn(isNotWhitespace);
      }
      this._consumeTagOpenEnd();
    } catch (e) {
      if (e instanceof _ControlFlowError) {
        if (openTagToken) {
          // We errored before we could close the opening tag, so it is incomplete.
          openTagToken.type = TokenType.INCOMPLETE_TAG_OPEN;
        } else {
          // When the start tag is invalid, assume we want a "<" as text.
          // Back to back text tokens are merged at the end.
          this._beginToken(TokenType.TEXT, start);
          this._endToken(['<']);
        }
        return;
      }

      throw e;
    }

    const contentTokenType = this._getTagDefinition(tagName).getContentType(prefix);

    // Raw text content is consumed right away, up to and including the closing tag.
    if (contentTokenType === TagContentType.RAW_TEXT) {
      this._consumeRawTextWithTagClose(prefix, tagName, false);
    } else if (contentTokenType === TagContentType.ESCAPABLE_RAW_TEXT) {
      this._consumeRawTextWithTagClose(prefix, tagName, true);
    }
  }

  /**
   * Consumes the raw text content of an element together with its closing tag.
   *
   * The content ends at `</`, optional whitespace, `tagName` (compared case-insensitively),
   * optional whitespace and `>`. A `TAG_CLOSE` token is emitted for the closing tag.
   *
   * @param prefix The prefix of the element, stored on the `TAG_CLOSE` token.
   * @param tagName The name of the element.
   * @param consumeEntities Whether entities inside the content are tokenized.
   */
  private _consumeRawTextWithTagClose(prefix: string, tagName: string, consumeEntities: boolean) {
    const atClosingTag = (): boolean => {
      if (!this._attemptCharCode(chars.$LT) || !this._attemptCharCode(chars.$SLASH)) {
        return false;
      }
      this._attemptCharCodeUntilFn(isNotWhitespace);
      if (!this._attemptStrCaseInsensitive(tagName)) {
        return false;
      }
      this._attemptCharCodeUntilFn(isNotWhitespace);
      return this._attemptCharCode(chars.$GT);
    };
    this._consumeRawText(consumeEntities, atClosingTag);

    this._beginToken(TokenType.TAG_CLOSE);
    // At least three characters have to precede the closing `>`.
    const isClosingBracket = (code: number) => code === chars.$GT;
    this._requireCharCodeUntilFn(isClosingBracket, 3);
    this._cursor.advance();  // Consume the `>`
    this._endToken([prefix, tagName]);
  }

  /**
   * Consumes the (optionally prefixed) name of an opening tag and emits a `TAG_OPEN_START` token
   * whose parts are the prefix and the name.
   *
   * @param start Cursor pointing at the `<` of the tag.
   */
  private _consumeTagOpenStart(start: CharacterCursor): TagOpenStartToken {
    this._beginToken(TokenType.TAG_OPEN_START, start);
    return this._endToken(this._consumePrefixAndName()) as TagOpenStartToken;
  }

  /**
   * Consumes an attribute name and emits an `ATTR_NAME` token whose parts are the prefix and the
   * name.
   *
   * @throws an "Unexpected character" control-flow error if the name starts with a quote.
   */
  private _consumeAttributeName() {
    const firstChar = this._cursor.peek();
    const startsWithQuote = firstChar === chars.$SQ || firstChar === chars.$DQ;
    if (startsWithQuote) {
      throw this._createError(_unexpectedCharacterErrorMsg(firstChar), this._cursor.getSpan());
    }

    this._beginToken(TokenType.ATTR_NAME);
    this._endToken(this._consumePrefixAndName());
  }

  /**
   * Consumes an attribute value, which may contain interpolations.
   *
   * A quoted value is surrounded by `ATTR_QUOTE` tokens and runs until the matching quote; an
   * unquoted value runs until the next character for which `isNameEnd()` is true.
   */
  private _consumeAttributeValue() {
    const firstChar = this._cursor.peek();
    const isQuoted = firstChar === chars.$SQ || firstChar === chars.$DQ;

    if (!isQuoted) {
      const atValueEnd = () => isNameEnd(this._cursor.peek());
      this._consumeWithInterpolation(
          TokenType.ATTR_VALUE_TEXT, TokenType.ATTR_VALUE_INTERPOLATION, atValueEnd, atValueEnd);
      return;
    }

    this._consumeQuote(firstChar);
    // The closing quote ends the value as well as any interpolation that is still open.
    const atClosingQuote = () => this._cursor.peek() === firstChar;
    this._consumeWithInterpolation(
        TokenType.ATTR_VALUE_TEXT, TokenType.ATTR_VALUE_INTERPOLATION, atClosingQuote,
        atClosingQuote);
    this._consumeQuote(firstChar);
  }

  /**
   * Consumes the given quote character and emits an `ATTR_QUOTE` token for it.
   *
   * @param quoteChar The quote character that must come next.
   * @throws a control-flow error (via `_requireCharCode()`) if a different character follows.
   */
  private _consumeQuote(quoteChar: number) {
    this._beginToken(TokenType.ATTR_QUOTE);
    this._requireCharCode(quoteChar);
    this._endToken([String.fromCodePoint(quoteChar)]);
  }

  /**
   * Consumes the end of an opening tag: `>` emits `TAG_OPEN_END`, `/>` emits `TAG_OPEN_END_VOID`.
   * When a `/` is present, the token starts after it.
   *
   * @throws a control-flow error (via `_requireCharCode()`) if the `>` is missing.
   */
  private _consumeTagOpenEnd() {
    let tokenType = TokenType.TAG_OPEN_END;
    if (this._attemptCharCode(chars.$SLASH)) {
      tokenType = TokenType.TAG_OPEN_END_VOID;
    }
    this._beginToken(tokenType);
    this._requireCharCode(chars.$GT);
    this._endToken([]);
  }

  /**
   * Consumes a closing tag and emits a `TAG_CLOSE` token whose parts are the prefix and the name.
   * The caller has already consumed `</`. Whitespace is allowed before and after the name.
   *
   * @param start Cursor pointing at the `<` of the tag.
   * @throws a control-flow error (via `_requireCharCode()`) if the `>` is missing.
   */
  private _consumeTagClose(start: CharacterCursor) {
    this._beginToken(TokenType.TAG_CLOSE, start);
    this._attemptCharCodeUntilFn(isNotWhitespace);
    const prefixAndName = this._consumePrefixAndName();
    this._attemptCharCodeUntilFn(isNotWhitespace);
    this._requireCharCode(chars.$GT);
    this._endToken(prefixAndName);
  }

  /**
   * Consumes the start of an ICU expansion form: `{`, the switch expression, a comma, the
   * expansion type and another comma.
   *
   * Emits `EXPANSION_FORM_START` followed by two `RAW_TEXT` tokens and records the open form on
   * the expansion stack.
   */
  private _consumeExpansionFormStart() {
    this._beginToken(TokenType.EXPANSION_FORM_START);
    this._requireCharCode(chars.$LBRACE);
    this._endToken([]);

    this._expansionCaseStack.push(TokenType.EXPANSION_FORM_START);

    // The switch expression.
    this._beginToken(TokenType.RAW_TEXT);
    const rawCondition = this._readUntil(chars.$COMMA);
    const normalizedCondition = this._processCarriageReturns(rawCondition);
    const normalize = this._i18nNormalizeLineEndingsInICUs;
    const conditionToken = this._endToken([normalize ? normalizedCondition : rawCondition]);
    if (!normalize && normalizedCondition !== rawCondition) {
      // Keep track of expressions that would have changed had they been normalized.
      this.nonNormalizedIcuExpressions.push(conditionToken);
    }
    this._requireCharCode(chars.$COMMA);
    this._attemptCharCodeUntilFn(isNotWhitespace);

    // The expansion type.
    this._beginToken(TokenType.RAW_TEXT);
    const expansionType = this._readUntil(chars.$COMMA);
    this._endToken([expansionType]);
    this._requireCharCode(chars.$COMMA);
    this._attemptCharCodeUntilFn(isNotWhitespace);
  }

  /**
   * Consumes the start of an ICU expansion case: the case value and the `{` that opens the case
   * expression.
   *
   * Emits `EXPANSION_CASE_VALUE` (with the trimmed value) and `EXPANSION_CASE_EXP_START`, and
   * records the open case on the expansion stack.
   */
  private _consumeExpansionCaseStart() {
    this._beginToken(TokenType.EXPANSION_CASE_VALUE);
    const value = this._readUntil(chars.$LBRACE).trim();
    this._endToken([value]);
    this._attemptCharCodeUntilFn(isNotWhitespace);

    this._beginToken(TokenType.EXPANSION_CASE_EXP_START);
    this._requireCharCode(chars.$LBRACE);
    this._endToken([]);
    this._attemptCharCodeUntilFn(isNotWhitespace);

    this._expansionCaseStack.push(TokenType.EXPANSION_CASE_EXP_START);
  }

  /**
   * Consumes the `}` that ends an ICU expansion case, emits `EXPANSION_CASE_EXP_END`, skips the
   * whitespace that follows and removes the case from the expansion stack.
   */
  private _consumeExpansionCaseEnd() {
    this._beginToken(TokenType.EXPANSION_CASE_EXP_END);
    this._requireCharCode(chars.$RBRACE);
    this._endToken([]);
    this._attemptCharCodeUntilFn(isNotWhitespace);

    this._expansionCaseStack.pop();
  }

  /**
   * Consumes the `}` that ends an ICU expansion form, emits `EXPANSION_FORM_END` and removes the
   * form from the expansion stack.
   */
  private _consumeExpansionFormEnd() {
    this._beginToken(TokenType.EXPANSION_FORM_END);
    this._requireCharCode(chars.$RBRACE);
    this._endToken([]);

    this._expansionCaseStack.pop();
  }

  /**
   * Consume a string that may contain interpolation expressions and entities.
   *
   * The first token emitted is of `textTokenType`. Each interpolation or entity that is found
   * closes the current text token, is emitted as its own token and is followed by a new
   * `textTokenType` token, until `endPredicate()` returns true.
   *
   * If an interpolation token ends prematurely it will have no end marker in its `parts` array.
   *
   * @param textTokenType the kind of tokens to interleave around interpolation tokens.
   * @param interpolationTokenType the kind of tokens that contain interpolation.
   * @param endPredicate a function that should return true when we should stop consuming.
   * @param endInterpolation a function that should return true if there is a premature end to an
   *     interpolation expression - i.e. before we get to the normal interpolation closing marker.
   */
  private _consumeWithInterpolation(
      textTokenType: TokenType, interpolationTokenType: TokenType, endPredicate: () => boolean,
      endInterpolation: () => boolean) {
    const pendingText: string[] = [];
    // Emits the text collected so far as one token and starts collecting afresh.
    const endTextToken = (end?: CharacterCursor) => {
      this._endToken([this._processCarriageReturns(pendingText.join(''))], end);
      pendingText.length = 0;
    };

    this._beginToken(textTokenType);
    while (!endPredicate()) {
      const segmentStart = this._cursor.clone();
      if (this._interpolationConfig && this._attemptStr(this._interpolationConfig.start)) {
        // The text token ends where the interpolation start marker begins.
        endTextToken(segmentStart);
        this._consumeInterpolation(interpolationTokenType, segmentStart, endInterpolation);
        this._beginToken(textTokenType);
      } else if (this._cursor.peek() === chars.$AMPERSAND) {
        endTextToken();
        this._consumeEntity(textTokenType);
        this._beginToken(textTokenType);
      } else {
        pendingText.push(this._readChar());
      }
    }

    // It is possible that an interpolation was started but not ended inside this text token.
    // Make sure that we reset the state of the lexer correctly.
    this._inInterpolation = false;

    endTextToken();
  }

  /**
   * Consume a block of text that has been interpreted as an Angular interpolation.
   *
   * The start marker has already been consumed by the caller. The emitted token's parts are the
   * start marker, the expression text and, only if it was found, the end marker. End markers
   * inside quoted strings are ignored.
   *
   * @param interpolationTokenType the type of the interpolation token to generate.
   * @param interpolationStart a cursor that points to the start of this interpolation.
   * @param prematureEndPredicate a function that should return true if the next characters indicate
   *     an end to the interpolation before its normal closing marker.
   */
  private _consumeInterpolation(
      interpolationTokenType: TokenType, interpolationStart: CharacterCursor,
      prematureEndPredicate: (() => boolean)|null): void {
    const parts: string[] = [];
    this._beginToken(interpolationTokenType, interpolationStart);
    parts.push(this._interpolationConfig.start);

    // Find the end of the interpolation, ignoring content inside quotes.
    const expressionStart = this._cursor.clone();
    // The quote character of the string we are currently inside of, if any.
    let inQuote: number|null = null;
    let inComment = false;
    while (this._cursor.peek() !== chars.$EOF &&
           (prematureEndPredicate === null || !prematureEndPredicate())) {
      const current = this._cursor.clone();

      if (this._isTagStart()) {
        // We are starting what looks like an HTML element in the middle of this interpolation.
        // Reset the cursor to before the `<` character and end the interpolation token.
        // (This is actually wrong but here for backward compatibility).
        this._cursor = current;
        parts.push(this._getProcessedChars(expressionStart, current));
        this._endToken(parts);
        return;
      }

      if (inQuote === null) {
        if (this._attemptStr(this._interpolationConfig.end)) {
          // We are not in a string, and we hit the end interpolation marker
          parts.push(this._getProcessedChars(expressionStart, current));
          parts.push(this._interpolationConfig.end);
          this._endToken(parts);
          return;
        } else if (this._attemptStr('//')) {
          // Once we are in a comment we ignore any quotes
          inComment = true;
        }
      }

      const char = this._cursor.peek();
      this._cursor.advance();
      if (char === chars.$BACKSLASH) {
        // Skip the next character because it was escaped.
        this._cursor.advance();
      } else if (char === inQuote) {
        // Exiting the current quoted string
        inQuote = null;
      } else if (!inComment && inQuote === null && chars.isQuote(char)) {
        // Entering a new quoted string
        inQuote = char;
      }
    }

    // We hit EOF (or a premature end) without finding a closing interpolation marker
    parts.push(this._getProcessedChars(expressionStart, this._cursor));
    this._endToken(parts);
  }

  /**
   * Returns the text between `start` and `end` with carriage returns processed by
   * `_processCarriageReturns()`.
   */
  private _getProcessedChars(start: CharacterCursor, end: CharacterCursor): string {
    const rawChars = end.getChars(start);
    return this._processCarriageReturns(rawChars);
  }

  /**
   * Returns true if the text being consumed ends at the current cursor position: at a tag start,
   * at EOF, at ICU syntax (when ICU tokenizing is on) or at block syntax (when block tokenizing
   * is on).
   */
  private _isTextEnd(): boolean {
    if (this._isTagStart() || this._cursor.peek() === chars.$EOF) {
      return true;
    }

    const outsideInterpolation = !this._inInterpolation;

    if (this._tokenizeIcu && outsideInterpolation) {
      // Start of an expansion form.
      if (this.isExpansionFormStart()) {
        return true;
      }
      // End of an expansion case.
      if (this._cursor.peek() === chars.$RBRACE && this._isInExpansionCase()) {
        return true;
      }
    }

    if (!this._tokenizeBlocks || !outsideInterpolation || this._isInExpansion()) {
      return false;
    }
    // Start or end of a block.
    const next = this._cursor.peek();
    return next === chars.$AT || next === chars.$RBRACE;
  }

  /**
   * Returns true if the current cursor is pointing to the start of a tag
   * (opening/closing/comments/cdata/etc).
   *
   * That is the case for a `<` that is directly followed by an ASCII letter, `/` or `!`. In
   * particular, `<` followed by whitespace is not considered the start of an HTML element.
   */
  private _isTagStart(): boolean {
    if (this._cursor.peek() !== chars.$LT) {
      return false;
    }

    // Look at the character after the `<` without moving the real cursor.
    const lookahead = this._cursor.clone();
    lookahead.advance();
    const next = lookahead.peek();

    const isLowerCaseLetter = chars.$a <= next && next <= chars.$z;
    const isUpperCaseLetter = chars.$A <= next && next <= chars.$Z;
    return isLowerCaseLetter || isUpperCaseLetter || next === chars.$SLASH ||
        next === chars.$BANG;
  }

  /**
   * Advances the cursor until the upcoming character equals `char` and returns the text that was
   * skipped. The `char` itself is not consumed.
   */
  private _readUntil(char: number): string {
    const start = this._cursor.clone();
    this._attemptUntilChar(char);
    return this._cursor.getChars(start);
  }

  /** Whether the innermost open ICU construct is an expansion case or an expansion form. */
  private _isInExpansion(): boolean {
    if (this._isInExpansionCase()) {
      return true;
    }
    return this._isInExpansionForm();
  }

  /** Whether the innermost open ICU construct is an expansion case. */
  private _isInExpansionCase(): boolean {
    const stack = this._expansionCaseStack;
    if (stack.length === 0) {
      return false;
    }
    return stack[stack.length - 1] === TokenType.EXPANSION_CASE_EXP_START;
  }

  /** Whether the innermost open ICU construct is an expansion form. */
  private _isInExpansionForm(): boolean {
    const stack = this._expansionCaseStack;
    if (stack.length === 0) {
      return false;
    }
    return stack[stack.length - 1] === TokenType.EXPANSION_FORM_START;
  }

  /**
   * Whether an ICU expansion form starts at the cursor, i.e. the next character is a `{` that is
   * not the beginning of the interpolation start marker. The cursor is left untouched.
   */
  private isExpansionFormStart(): boolean {
    if (this._cursor.peek() !== chars.$LBRACE) {
      return false;
    }
    if (!this._interpolationConfig) {
      return true;
    }

    // Probe for the interpolation start marker, then rewind.
    const saved = this._cursor.clone();
    const startsInterpolation = this._attemptStr(this._interpolationConfig.start);
    this._cursor = saved;
    return !startsInterpolation;
  }
}

/** Predicate that accepts EOF and every character that `chars.isWhitespace` rejects. */
function isNotWhitespace(code: number): boolean {
  if (code === chars.$EOF) {
    return true;
  }
  return !chars.isWhitespace(code);
}

/**
 * True for characters that terminate a name: whitespace, `>`, `<`, `/`, `'`, `"`, `=`
 * and EOF.
 */
function isNameEnd(code: number): boolean {
  switch (code) {
    case chars.$GT:
    case chars.$LT:
    case chars.$SLASH:
    case chars.$SQ:
    case chars.$DQ:
    case chars.$EQ:
    case chars.$EOF:
      return true;
    default:
      return chars.isWhitespace(code);
  }
}

/** True when `code` is neither an ASCII letter (either case) nor an ASCII digit. */
function isPrefixEnd(code: number): boolean {
  const outsideLowerCase = code < chars.$a || code > chars.$z;
  const outsideUpperCase = code < chars.$A || code > chars.$Z;
  const outsideDigits = code < chars.$0 || code > chars.$9;
  return outsideLowerCase && outsideUpperCase && outsideDigits;
}

/** True for `;`, EOF, or any character that `chars.isAsciiHexDigit` rejects. */
function isDigitEntityEnd(code: number): boolean {
  if (code === chars.$SEMICOLON || code === chars.$EOF) {
    return true;
  }
  return !chars.isAsciiHexDigit(code);
}

/** True for `;`, EOF, or any character that `chars.isAsciiLetter` rejects. */
function isNamedEntityEnd(code: number): boolean {
  if (code === chars.$SEMICOLON || code === chars.$EOF) {
    return true;
  }
  return !chars.isAsciiLetter(code);
}

/** Any character except `}` is treated as the start of an expansion case. */
function isExpansionCaseStart(peek: number): boolean {
  const isClosingBrace = peek === chars.$RBRACE;
  return !isClosingBrace;
}

/** Compares two char codes after folding ASCII lowercase letters to uppercase. */
function compareCharCodeCaseInsensitive(code1: number, code2: number): boolean {
  const folded1 = toUpperCaseCharCode(code1);
  const folded2 = toUpperCaseCharCode(code2);
  return folded1 === folded2;
}

/** Maps ASCII `a`-`z` onto `A`-`Z`; every other char code is returned unchanged. */
function toUpperCaseCharCode(code: number): number {
  const isLowerCaseLetter = code >= chars.$a && code <= chars.$z;
  if (isLowerCaseLetter) {
    return code - chars.$a + chars.$A;
  }
  return code;
}

/** Block names may contain ASCII letters, digits and underscores. */
function isBlockNameChar(code: number): boolean {
  if (chars.isAsciiLetter(code)) {
    return true;
  }
  if (chars.isDigit(code)) {
    return true;
  }
  return code === chars.$_;
}

/** A block parameter character is anything `isNotWhitespace` accepts, except `;`. */
function isBlockParameterChar(code: number): boolean {
  if (code === chars.$SEMICOLON) {
    return false;
  }
  return isNotWhitespace(code);
}

/**
 * Collapses runs of adjacent `TEXT` tokens, and runs of adjacent `ATTR_VALUE_TEXT` tokens,
 * into a single token each.
 *
 * The first token of a run is kept (and mutated): the text of each following token is appended
 * to its `parts[0]` and its `sourceSpan.end` is moved to the end of that following token. All
 * other tokens are passed through in their original order.
 */
function mergeTextTokens(srcTokens: Token[]): Token[] {
  const merged: Token[] = [];
  let previous: Token|undefined = undefined;

  for (const current of srcTokens) {
    if (previous &&
        ((previous.type === TokenType.TEXT && current.type === TokenType.TEXT) ||
         (previous.type === TokenType.ATTR_VALUE_TEXT &&
          current.type === TokenType.ATTR_VALUE_TEXT))) {
      // Same kind of text as the token we emitted last: fold it into that token.
      previous.parts[0]! += current.parts[0];
      previous.sourceSpan.end = current.sourceSpan.end;
      continue;
    }

    previous = current;
    merged.push(current);
  }

  return merged;
}


/**
 * The _Tokenizer uses objects of this type to move through the input text,
 * extracting "parsed characters". A single parsed character may correspond to
 * more than one character of the underlying text when that text contains
 * escape sequences.
 */
interface CharacterCursor {
  /**
   * Initialize the cursor so that `peek()` reflects the character at the start position.
   * Must be called before the cursor is first read.
   */
  init(): void;
  /** The parsed character (as a char code) at the current cursor position. */
  peek(): number;
  /** Advance the cursor by one parsed character. */
  advance(): void;
  /**
   * Get a span from the marked start point to the current point.
   *
   * @param start The cursor marking the start of the span. The implementation in this file
   *     falls back to the current position when it is omitted.
   * @param leadingTriviaCodePoints Char codes to skip at the start of the span. The
   *     implementation in this file keeps the un-skipped position as the span's full start.
   */
  getSpan(start?: this, leadingTriviaCodePoints?: number[]): ParseSourceSpan;
  /** Get the parsed characters from the marked start point to the current point. */
  getChars(start: this): string;
  /** The number of characters left before the end of the cursor. */
  charsLeft(): number;
  /** The number of characters between `this` cursor and `other` cursor. */
  diff(other: this): number;
  /** Make an independent copy of this cursor; advancing the copy leaves this cursor untouched. */
  clone(): CharacterCursor;
}

/** The mutable position data tracked by the cursors in this file. */
interface CursorState {
  /** Char code exposed by `peek()`; set to `chars.$EOF` once `offset` reaches the end. */
  peek: number;
  /** Index into the input string of the character at this position. */
  offset: number;
  /** Line of this position; incremented each time a line feed is consumed. */
  line: number;
  /** Column of this position; reset to 0 each time a line feed is consumed. */
  column: number;
}

/**
 * A cursor over raw text in which every UTF-16 code unit of the input counts as exactly one
 * parsed character (no escape sequence handling).
 *
 * The cursor covers the part of the file's content that starts at `range.startPos` and stops
 * before `range.endPos`, and keeps track of the line and column as it advances.
 */
class PlainCharacterCursor implements CharacterCursor {
  protected state: CursorState;
  protected file: ParseSourceFile;
  protected input: string;
  protected end: number;

  /** Creates an independent copy of an existing cursor. */
  constructor(fileOrCursor: PlainCharacterCursor);
  /** Creates a cursor over `range` of the given file. `init()` must be called before use. */
  constructor(fileOrCursor: ParseSourceFile, range: LexerRange);
  constructor(fileOrCursor: ParseSourceFile|PlainCharacterCursor, range?: LexerRange) {
    if (!(fileOrCursor instanceof PlainCharacterCursor)) {
      if (!range) {
        throw new Error(
            'Programming error: the range argument must be provided with a file argument.');
      }
      this.file = fileOrCursor;
      this.input = fileOrCursor.content;
      this.end = range.endPos;
      // `peek` holds a placeholder until `init()` reads the first character.
      this.state = {
        peek: -1,
        offset: range.startPos,
        line: range.startLine,
        column: range.startCol,
      };
    } else {
      this.file = fileOrCursor.file;
      this.input = fileOrCursor.input;
      this.end = fileOrCursor.end;

      // Copy the state field by field rather than with object spread. In ES5 bundles spread is
      // compiled to the `__assign` helper, which VMs optimize less well than a plain object
      // literal, and this constructor runs in tight loops.
      const source = fileOrCursor.state;
      this.state = {
        peek: source.peek,
        offset: source.offset,
        line: source.line,
        column: source.column,
      };
    }
  }

  /** Returns a new cursor at the same position that can be advanced independently. */
  clone(): PlainCharacterCursor {
    return new PlainCharacterCursor(this);
  }

  /** The char code at the current position (`chars.$EOF` at the end of the range). */
  peek() {
    const current = this.state;
    return current.peek;
  }

  /** How many characters remain between the current position and the end of the range. */
  charsLeft() {
    const remaining = this.end - this.state.offset;
    return remaining;
  }

  /** Offset of this cursor minus the offset of `other`. */
  diff(other: this) {
    const here = this.state.offset;
    const there = other.state.offset;
    return here - there;
  }

  /** Moves one character forward; throws a `CursorError` when already at the end. */
  advance(): void {
    this.advanceState(this.state);
  }

  /** Loads the character at the start position so that `peek()` becomes meaningful. */
  init(): void {
    this.updatePeek(this.state);
  }

  /**
   * Builds the source span that runs from `start` (or from this cursor when omitted) up to
   * this cursor.
   *
   * When `leadingTriviaCodePoints` is given, characters at the beginning of the span whose
   * char code is in that list are skipped for the span's start, while the original start is
   * still recorded as the span's full start. `start` itself is never moved; a clone is
   * advanced instead.
   */
  getSpan(start?: this, leadingTriviaCodePoints?: number[]): ParseSourceSpan {
    const fullStart = start || this;
    let trimmedStart = fullStart;

    if (leadingTriviaCodePoints) {
      while (this.diff(trimmedStart) > 0 &&
             leadingTriviaCodePoints.indexOf(trimmedStart.peek()) !== -1) {
        if (trimmedStart === fullStart) {
          // Clone lazily, the first time we actually need to move past trivia.
          trimmedStart = trimmedStart.clone() as this;
        }
        trimmedStart.advance();
      }
    }

    const startLocation = this.locationFromCursor(trimmedStart);
    const endLocation = this.locationFromCursor(this);
    const fullStartLocation =
        fullStart === trimmedStart ? startLocation : this.locationFromCursor(fullStart);
    return new ParseSourceSpan(startLocation, endLocation, fullStartLocation);
  }

  /** Returns the raw input text between `start` and this cursor. */
  getChars(start: this): string {
    const from = start.state.offset;
    const to = this.state.offset;
    return this.input.substring(from, to);
  }

  /** The char code of the input at absolute index `pos`. */
  charAt(pos: number): number {
    return this.input.charCodeAt(pos);
  }

  /**
   * Moves `state` forward by one raw character, updating offset, line, column and peek.
   *
   * If `state` is already at the end, it becomes this cursor's state and a `CursorError`
   * is thrown.
   */
  protected advanceState(state: CursorState) {
    if (state.offset >= this.end) {
      this.state = state;
      throw new CursorError('Unexpected character "EOF"', this);
    }

    const consumed = this.charAt(state.offset);
    if (consumed === chars.$LF) {
      state.line += 1;
      state.column = 0;
    } else if (!chars.isNewLine(consumed)) {
      // Other new line characters (those that are not a line feed) change neither the line
      // nor the column.
      state.column += 1;
    }
    state.offset += 1;
    this.updatePeek(state);
  }

  /** Refreshes `state.peek` from the character at `state.offset`, or EOF past the end. */
  protected updatePeek(state: CursorState): void {
    if (state.offset >= this.end) {
      state.peek = chars.$EOF;
    } else {
      state.peek = this.charAt(state.offset);
    }
  }

  /** Converts the position of `cursor` into a `ParseLocation`. */
  private locationFromCursor(cursor: this): ParseLocation {
    const position = cursor.state;
    return new ParseLocation(cursor.file, position.offset, position.line, position.column);
  }
}

/**
 * A cursor for text that is stored in a JavaScript string, where escape sequences (`\n`,
 * `\x2F`, `\u1234`, `\u{1F600}`, octal codes, line continuations, ...) have to be decoded
 * while moving through the text.
 *
 * Two states are tracked:
 * - `state` is what callers observe. Its `peek` is the decoded character, and its
 *   offset/line/column stay at the start of the escape sequence.
 * - `internalState` tracks the raw position. After an escape sequence has been decoded it
 *   points at the last raw character of that sequence, so the next `advance()` continues
 *   right after it. While no escape sequence is pending it is the same object as `state`
 *   (or, in a clone, a copy holding the same values).
 */
class EscapedCharacterCursor extends PlainCharacterCursor {
  protected internalState: CursorState;

  /** Creates an independent copy of an existing cursor. */
  constructor(fileOrCursor: EscapedCharacterCursor);
  /** Creates a cursor over `range` of the given file. `init()` must be called before use. */
  constructor(fileOrCursor: ParseSourceFile, range: LexerRange);
  constructor(fileOrCursor: ParseSourceFile|EscapedCharacterCursor, range?: LexerRange) {
    if (fileOrCursor instanceof EscapedCharacterCursor) {
      super(fileOrCursor);
      // Copy field by field instead of using object spread. Spread has a noticeable cost on
      // this hot path (cursors are cloned in tight loops); this mirrors the base class.
      const internal = fileOrCursor.internalState;
      this.internalState = {
        peek: internal.peek,
        offset: internal.offset,
        line: internal.line,
        column: internal.column,
      };
    } else {
      super(fileOrCursor, range!);
      this.internalState = this.state;
    }
  }

  /**
   * Moves to the next parsed character: the visible state first jumps to the raw position
   * (the end of any escape sequence just consumed), then one raw character is consumed, and
   * finally an escape sequence starting at the new position is decoded.
   */
  override advance(): void {
    this.state = this.internalState;
    super.advance();
    this.processEscapeSequence();
  }

  /** Loads the first character, decoding it if the text starts with an escape sequence. */
  override init(): void {
    super.init();
    this.processEscapeSequence();
  }

  /** Returns a new cursor at the same position that can be advanced independently. */
  override clone(): EscapedCharacterCursor {
    return new EscapedCharacterCursor(this);
  }

  /**
   * Returns the decoded text between `start` and this cursor by walking a clone of `start`
   * forward one parsed character at a time.
   */
  override getChars(start: this): string {
    const cursor = start.clone();
    // Not named `chars`, to avoid shadowing the `chars` module import.
    let result = '';
    while (cursor.internalState.offset < this.internalState.offset) {
      result += String.fromCodePoint(cursor.peek());
      cursor.advance();
    }
    return result;
  }

  /**
   * Process the escape sequence that starts at the current position in the text.
   *
   * This method is called to ensure that `peek` has the unescaped value of escape sequences.
   * It does nothing when the current raw character is not a backslash.
   *
   * @throws CursorError if the text ends in the middle of the sequence or if a hexadecimal
   *     sequence is malformed.
   */
  protected processEscapeSequence(): void {
    const peek = () => this.internalState.peek;

    if (peek() === chars.$BACKSLASH) {
      // We have hit an escape sequence so we need the internal state to become independent
      // of the external state. (Field-by-field copy; see the note in the constructor.)
      const visible = this.state;
      this.internalState = {
        peek: visible.peek,
        offset: visible.offset,
        line: visible.line,
        column: visible.column,
      };

      // Move past the backslash
      this.advanceState(this.internalState);

      // First check for standard control char sequences
      if (peek() === chars.$n) {
        this.state.peek = chars.$LF;
      } else if (peek() === chars.$r) {
        this.state.peek = chars.$CR;
      } else if (peek() === chars.$v) {
        this.state.peek = chars.$VTAB;
      } else if (peek() === chars.$t) {
        this.state.peek = chars.$TAB;
      } else if (peek() === chars.$b) {
        this.state.peek = chars.$BSPACE;
      } else if (peek() === chars.$f) {
        this.state.peek = chars.$FF;
      }

      // Now consider more complex sequences
      else if (peek() === chars.$u) {
        // Unicode code-point sequence
        this.advanceState(this.internalState);  // advance past the `u` char
        if (peek() === chars.$LBRACE) {
          // Variable length Unicode, e.g. `\u{123}`
          this.advanceState(this.internalState);  // advance past the `{` char
          // Advance past the variable number of hex digits until we hit a `}` char.
          // Running out of text first makes `advanceState` throw an EOF `CursorError`.
          const digitStart = this.clone();
          let length = 0;
          while (peek() !== chars.$RBRACE) {
            this.advanceState(this.internalState);
            length++;
          }
          this.state.peek = this.decodeHexDigits(digitStart, length);
        } else {
          // Fixed length Unicode, e.g. `\u1234`. Advance three times so that the internal
          // state ends up on the last of the four digits.
          const digitStart = this.clone();
          this.advanceState(this.internalState);
          this.advanceState(this.internalState);
          this.advanceState(this.internalState);
          this.state.peek = this.decodeHexDigits(digitStart, 4);
        }
      }

      else if (peek() === chars.$x) {
        // Hex char code, e.g. `\x2F`
        this.advanceState(this.internalState);  // advance past the `x` char
        const digitStart = this.clone();
        this.advanceState(this.internalState);  // now on the second (last) digit
        this.state.peek = this.decodeHexDigits(digitStart, 2);
      }

      else if (chars.isOctalDigit(peek())) {
        // Octal char code, e.g. `\012`: up to three octal digits.
        let octal = '';
        let length = 0;
        let previous = this.clone();
        while (chars.isOctalDigit(peek()) && length < 3) {
          previous = this.clone();
          octal += String.fromCodePoint(peek());
          this.advanceState(this.internalState);
          length++;
        }
        this.state.peek = parseInt(octal, 8);
        // Backup one char, so that the internal state sits on the last octal digit.
        this.internalState = previous.internalState;
      }

      else if (chars.isNewLine(this.internalState.peek)) {
        // Line continuation `\` followed by a new line
        this.advanceState(this.internalState);  // advance over the newline
        // Nothing is emitted for a continuation: expose the character after it directly.
        this.state = this.internalState;
      }

      else {
        // If none of the `if` blocks were executed then we just have an escaped normal character.
        // In that case we just, effectively, skip the backslash from the character.
        this.state.peek = this.internalState.peek;
      }
    }
  }

  /**
   * Decodes `length` hexadecimal digits of the input, starting at the raw position of `start`.
   *
   * The digits are validated strictly: every character must be a hexadecimal digit and the
   * value must be a valid Unicode code point. `parseInt` alone is too lenient here, since it
   * stops at the first invalid character (`1Z` -> 1) and accepts signs (`-1` -> -1), which
   * would hand an invalid value to `String.fromCodePoint` in `getChars`.
   *
   * @param start Cursor whose internal position marks the first digit.
   * @param length Number of characters to decode.
   * @returns The decoded code point.
   * @throws CursorError positioned at the first digit when the sequence is not valid.
   */
  protected decodeHexDigits(start: EscapedCharacterCursor, length: number): number {
    const from = start.internalState.offset;
    const hex = this.input.slice(from, from + length);

    let isValid = hex.length > 0;
    for (let i = 0; isValid && i < hex.length; i++) {
      isValid = chars.isAsciiHexDigit(hex.charCodeAt(i));
    }

    if (isValid) {
      const charCode = parseInt(hex, 16);
      // 0x10FFFF is the largest Unicode code point; anything above cannot be a character.
      if (charCode <= 0x10FFFF) {
        return charCode;
      }
    }

    start.state = start.internalState;
    throw new CursorError('Invalid hexadecimal escape sequence', start);
  }
}

/** Thrown by the cursors in this file; carries a message and the cursor it relates to. */
export class CursorError {
  public msg: string;
  public cursor: CharacterCursor;

  constructor(msg: string, cursor: CharacterCursor) {
    this.msg = msg;
    this.cursor = cursor;
  }
}
// Browse the archive: https://github.com/angular/angular