https://github.com/antlr/grammars-v4
Raw File
Tip revision: 3b26af5ec5cfc28779e609b3f20fc7e6454c51d4 authored by Ken Domino on 15 March 2024, 02:44:56 UTC
[build] Fix for #4008 -- add testing of TypeScript target back (#4010)
Tip revision: 3b26af5
regexLexer.g4
/*
 * [The "BSD license"]
 *  Copyright (c) 2019 PANTHEON.tech
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. The name of the author may not be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Lexer grammar for https://www.w3.org/TR/2004/REC-xmlschema-2-20041028/#regexs.
 *
 * This grammar is modified in following ways:
 * - we use lexer modes to disambiguate between Char, XmlChar and QuantExact
 * - we use separate lexer tokens to disambiguate positive and negative character groups
 * - XmlCharIncDash is removed in favor of DASH token, which is handled in parser
 */

// $antlr-format alignTrailingComments true, columnLimit 150, maxEmptyLinesToKeep 1, reflowComments false, useTab false
// $antlr-format allowShortRulesOnASingleLine true, allowShortBlocksOnASingleLine true, minEmptyLines 0, alignSemicolons ownLine
// $antlr-format alignColons trailing, singleLineOverrulesHangingColon true, alignLexerCommands true, alignLabels true, alignTrailers true

lexer grammar regexLexer;

LPAREN      : '(';
RPAREN      : ')';
PIPE        : '|';
PLUS        : '+';
QUESTION    : '?';
STAR        : '*';
WildcardEsc : '.';
Char        : ~('.' | '\\' | '?' | '*' | '+' | '(' | ')' | '|' | '[' | ']');

// Quantifier's quantity rule support
StartQuantity: '{' -> pushMode(QUANTITY);

// Single Character Escape
SingleCharEsc: SINGLE_ESC;

// Multi-Character Escape
MultiCharEsc: MULTI_ESC;

// Category Escape
CatEsc   : CAT_ESC   -> pushMode(CATEGORY);
ComplEsc : COMPL_ESC -> pushMode(CATEGORY);

// Positive/Negative Character Group
NegCharGroup : '[^' -> pushMode(CHARGROUP);
PosCharGroup : '['  -> pushMode(CHARGROUP);

mode QUANTITY;
EndQuantity : '}' -> popMode;
QuantExact  : [0-9]+;
COMMA       : ',';

mode CATEGORY;
EndCategory: '}' -> popMode;

// Categories
IsCategory  : Letters | Marks | Numbers | Punctuation | Separators | Symbols | Others;
Letters     : 'L' [ultmo]?;
Marks       : 'M' [nce]?;
Numbers     : 'N' [dlo]?;
Punctuation : 'P' [cdseifo]?;
Separators  : 'Z' [slp]?;
Symbols     : 'S' [mcko]?;
Others      : 'C' [cfon]?;

// Block Escape
IsBlock: 'Is' ([a-z0-9A-Z] | '-')+;

mode CHARGROUP;
NestedSingleCharEsc : SINGLE_ESC;
NestedMultiCharEsc  : MULTI_ESC;
NestedCatEsc        : CAT_ESC   -> pushMode(CATEGORY);
NestedComplEsc      : COMPL_ESC -> pushMode(CATEGORY);
NestedNegCharGroup  : '[^'      -> pushMode(CHARGROUP);
NestedPosCharGroup  : '['       -> pushMode(CHARGROUP);
EndCharGroup        : ']'       -> popMode;
DASH                : '-';
XmlChar             : ~('-' | '[' | ']');

fragment CAT_ESC    : '\\p{';
fragment COMPL_ESC  : '\\P{';
fragment MULTI_ESC  : '\\' [sSiIcCdDwW];
fragment SINGLE_ESC : '\\' [nrt\\|.?*+(){}\u002D\u005B\u005D\u005E];
back to top