Revision 3711749292ba9c29ad2e3b9eaee90995f8c8290a authored by Keno Fischer on 11 October 2023, 14:41:22 UTC, committed by GitHub on 11 October 2023, 14:41:22 UTC
This should be NFC (no functional change) and is intended to allow the optimizer to delete
:enter statements (by replacing them with `nothing`), without leaving
dangling `:leave`s around. This is accomplished by having `leave` take
(a variable number of) `:enter` tokens (that are already being used by
`:pop_exception`). The semantics are that a literal `nothing` or an
SSAValue pointing to a `nothing` statement are ignored, and one
exception handler is popped for each remaining argument. The actual
value of the token is ignored, except that the verifier asserts that it
belongs to an `:enter`.

Note that we don't need to do the same for :pop_exception, because the
token generated by an `:enter` is semantically only in scope for
:pop_exception during its catch block. If we determine the `:enter` is
dead, then its catch block is guaranteed to not be executed and will be
deleted wholesale by cfg liveness.

I was considering doing something fancier where :leave is changed back
to taking an integer after optimization, but the case where the IR size
is bigger after this change (when we are `:leave`ing many handlers) is
fairly rare and likely not worth the additional complexity or time cost
to do anything special. If it does show up in size benchmarks, I'd
rather give `:leave` a special, compact encoding.
1 parent 8180240
Raw File
pcre.jl
# This file is a part of Julia. License is MIT: https://julialang.org/license

## low-level pcre2 interface ##

module PCRE

import ..RefValue

# include($BUILDROOT/base/pcre_h.jl)
include(string(length(Core.ARGS) >= 2 ? Core.ARGS[2] : "", "pcre_h.jl"))

# Name of the shared library that every `ccall` in this module resolves against.
# All entry points used below carry the `_8` suffix.
const PCRE_LIB = "libpcre2-8"

# Build a fresh PCRE2 match context that owns its own JIT stack.
#
# A JIT stack is created with a start size of 32768 bytes and a maximum size of
# 1048576 bytes, a match context is created with the default (NULL) general
# context, and the stack is then assigned to the context (with a NULL callback).
# Returns the raw context pointer; none of the returned pointers are checked here.
function create_match_context()
    start_size = 32 * 1024
    max_size = 1024 * 1024
    stack = ccall((:pcre2_jit_stack_create_8, PCRE_LIB), Ptr{Cvoid},
                  (Csize_t, Csize_t, Ptr{Cvoid}),
                  start_size, max_size, C_NULL)
    context = ccall((:pcre2_match_context_create_8, PCRE_LIB),
                    Ptr{Cvoid}, (Ptr{Cvoid},), C_NULL)
    ccall((:pcre2_jit_stack_assign_8, PCRE_LIB), Cvoid,
          (Ptr{Cvoid}, Ptr{Cvoid}, Ptr{Cvoid}), context, C_NULL, stack)
    return context
end

# Table of match contexts, indexed by `_tid()`. A `C_NULL` entry means no context
# has been created for that slot yet; `get_local_match_context` fills slots lazily
# and replaces this global with a larger vector when a thread index does not fit.
THREAD_MATCH_CONTEXTS::Vector{Ptr{Cvoid}} = [C_NULL]

# Lock taken by `get_local_match_context` while growing `THREAD_MATCH_CONTEXTS`.
# It starts out as `nothing`; `get_local_match_context` type-asserts it to
# `Threads.SpinLock`, so it must be assigned before that slow path runs
# (the assignment is not visible in this file).
PCRE_COMPILE_LOCK = nothing

# Index of the calling thread as used for `THREAD_MATCH_CONTEXTS`: the id reported
# by the runtime's `jl_threadid`, widened to `Int`, plus one.
function _tid()
    raw_id = ccall(:jl_threadid, Int16, ())
    return Int(raw_id) + 1
end
# Current value of the runtime's `jl_n_threads` counter, read with an atomic
# acquire load and widened to `Int`.
function _mth()
    counter_ptr = cglobal(:jl_n_threads, Cint)
    return Int(Core.Intrinsics.atomic_pointerref(counter_ptr, :acquire))
end

# Return the PCRE2 match context stored for the calling thread's slot (`_tid()`)
# in `THREAD_MATCH_CONTEXTS`, growing the table and/or creating the context on
# first use.
function get_local_match_context()
    tid = _tid()
    ctxs = THREAD_MATCH_CONTEXTS
    if length(ctxs) < tid
        # slow path to allocate it
        # (the table has no slot for this thread index, so grow it under the lock;
        # the type assertion throws if the lock is still the initial `nothing`)
        l = PCRE_COMPILE_LOCK::Threads.SpinLock
        lock(l)
        try
            # Re-read and re-check the global now that the lock is held, in case
            # it was already replaced by a larger table in the meantime.
            ctxs = THREAD_MATCH_CONTEXTS
            if length(ctxs) < tid
                # New table = old length + `_mth()` slots, pre-filled with C_NULL,
                # with the existing entries copied to the front; it replaces the global.
                global THREAD_MATCH_CONTEXTS = ctxs = copyto!(fill(C_NULL, length(ctxs) + _mth()), ctxs)
            end
        finally
            unlock(l)
        end
    end
    ctx = @inbounds ctxs[tid]
    if ctx == C_NULL
        # slow path to allocate it
        # (no lock is taken here; the store goes through the global binding,
        # not through the local `ctxs`)
        ctx = create_match_context()
        THREAD_MATCH_CONTEXTS[tid] = ctx
    end
    return ctx
end

# supported options for different use cases

# arguments to pcre2_compile
# Bitwise OR of the option constants listed below (the constants themselves are
# defined outside this section, presumably in pcre_h.jl — confirm). Entries that
# are commented out (AUTO_CALLOUT, DUPNAMES) are not part of the mask.
const COMPILE_MASK      =
      ALT_BSUX          |
      ALT_CIRCUMFLEX    |
      ALT_VERBNAMES     |
      ANCHORED          |
      # AUTO_CALLOUT    |
      CASELESS          |
      DOLLAR_ENDONLY    |
      DOTALL            |
      # DUPNAMES        |
      ENDANCHORED       |
      EXTENDED          |
      EXTENDED_MORE     |
      FIRSTLINE         |
      LITERAL           |
      MATCH_INVALID_UTF |
      MATCH_UNSET_BACKREF |
      MULTILINE         |
      NEVER_BACKSLASH_C |
      NEVER_UCP         |
      NEVER_UTF         |
      NO_AUTO_CAPTURE   |
      NO_AUTO_POSSESS   |
      NO_DOTSTAR_ANCHOR |
      NO_START_OPTIMIZE |
      NO_UTF_CHECK      |
      UCP               |
      UNGREEDY          |
      USE_OFFSET_LIMIT  |
      UTF

# arguments to pcre2_set_newline
# Unlike the other *_MASK constants in this file, this is a tuple of the
# individual NEWLINE_* values rather than a bitwise OR of flags.
const COMPILE_NEWLINE_MASK = (
      NEWLINE_CR,
      NEWLINE_LF,
      NEWLINE_CRLF,
      NEWLINE_ANY,
      NEWLINE_ANYCRLF,
      NEWLINE_NUL)

# arguments to pcre2_set_compile_extra_options
# Bitwise OR of the EXTRA_* option constants listed below.
const COMPILE_EXTRA_MASK            =
      EXTRA_ALLOW_SURROGATE_ESCAPES |
      EXTRA_ALT_BSUX                |
      EXTRA_BAD_ESCAPE_IS_LITERAL   |
      EXTRA_ESCAPED_CR_IS_LF        |
      EXTRA_MATCH_LINE              |
      EXTRA_MATCH_WORD

# arguments to match
# Bitwise OR of the option constants listed below. Entries that are commented
# out (ANCHORED, COPY_MATCHED_SUBJECT, ENDANCHORED, NO_JIT) are not part of the
# mask.
const EXECUTE_MASK      =
      # ANCHORED        |
      # COPY_MATCHED_SUBJECT |
      # ENDANCHORED     |
      NOTBOL            |
      NOTEMPTY          |
      NOTEMPTY_ATSTART  |
      NOTEOL            |
      # NO_JIT          |
      NO_START_OPTIMIZE |
      NO_UTF_CHECK      |
      PARTIAL_HARD      |
      PARTIAL_SOFT


# Sentinel with every bit set (`~Csize_t(0)`, i.e. `typemax(Csize_t)`).
const UNSET = ~Csize_t(0)  # Indicates that an output vector element is unset

# Query the compiled pattern `regex` through `pcre2_pattern_info` for the item
# selected by `what`, and return the result as a value of type `T`.
#
# Throws (via `error`) when the call returns a non-zero status; the message
# distinguishes ERROR_NULL, ERROR_BADMAGIC and ERROR_BADOPTION and falls back to
# reporting the raw status code for anything else.
function info(regex::Ptr{Cvoid}, what::Integer, ::Type{T}) where T
    out = RefValue{T}()
    ret = ccall((:pcre2_pattern_info_8, PCRE_LIB), Cint,
                (Ptr{Cvoid}, UInt32, Ptr{Cvoid}),
                regex, what, out)
    ret == 0 && return out[]
    msg = if ret == ERROR_NULL
        "PCRE error: NULL regex object"
    elseif ret == ERROR_BADMAGIC
        "PCRE error: invalid regex object"
    elseif ret == ERROR_BADOPTION
        "PCRE error: invalid option flags"
    else
        "PCRE error: unknown error ($ret)"
    end
    error(msg)
end

# Twice the count reported by `pcre2_get_ovector_count` for `match_data`,
# returned as an `Int`.
function ovec_length(match_data)
    pair_count = ccall((:pcre2_get_ovector_count_8, PCRE_LIB), UInt32,
                       (Ptr{Cvoid},), match_data)
    return 2 * Int(pair_count)
end

# Pointer (as `Ptr{Csize_t}`) returned by `pcre2_get_ovector_pointer` for
# `match_data`.
function ovec_ptr(match_data)
    return ccall((:pcre2_get_ovector_pointer_8, PCRE_LIB), Ptr{Csize_t},
                 (Ptr{Cvoid},), match_data)
end

# Compile `pattern` with the given `options` bits through `pcre2_compile` and
# return the pointer to the compiled code.
#
# Patterns that are not already a `String` or `SubString{String}` are converted
# to `String` first. The pattern length is passed in code units. Throws (via
# `error`) with the PCRE message and error offset when compilation returns NULL.
function compile(pattern::AbstractString, options::Integer)
    src = pattern isa Union{String,SubString{String}} ? pattern : String(pattern)
    errno = RefValue{Cint}(0)
    erroff = RefValue{Csize_t}(0)
    code = ccall((:pcre2_compile_8, PCRE_LIB), Ptr{Cvoid},
                 (Ptr{UInt8}, Csize_t, UInt32, Ref{Cint}, Ref{Csize_t}, Ptr{Cvoid}),
                 src, ncodeunits(src), options, errno, erroff, C_NULL)
    if code == C_NULL
        error("PCRE compilation error: $(err_message(errno[])) at offset $(erroff[])")
    end
    return code
end

# Request JIT compilation (JIT_COMPLETE) of the compiled pattern `regex`.
#
# Returns `true` when `pcre2_jit_compile` reports success (0), `false` when it
# reports ERROR_JIT_BADOPTION, and throws (via `error`) for any other status.
function jit_compile(regex::Ptr{Cvoid})
    errno = ccall((:pcre2_jit_compile_8, PCRE_LIB), Cint,
                  (Ptr{Cvoid}, UInt32), regex, JIT_COMPLETE)
    if errno == 0
        return true
    elseif errno == ERROR_JIT_BADOPTION
        return false
    end
    error("PCRE JIT error: $(err_message(errno))")
end

# Hand `match_data` to `pcre2_match_data_free`.
function free_match_data(match_data)
    return ccall((:pcre2_match_data_free_8, PCRE_LIB), Cvoid, (Ptr{Cvoid},), match_data)
end

# Hand the compiled pattern `re` to `pcre2_code_free`.
function free_re(re)
    return ccall((:pcre2_code_free_8, PCRE_LIB), Cvoid, (Ptr{Cvoid},), re)
end

# Hand the JIT stack `stack` to `pcre2_jit_stack_free`.
function free_jit_stack(stack)
    return ccall((:pcre2_jit_stack_free_8, PCRE_LIB), Cvoid, (Ptr{Cvoid},), stack)
end

# Hand the match context `context` to `pcre2_match_context_free`.
function free_match_context(context)
    return ccall((:pcre2_match_context_free_8, PCRE_LIB), Cvoid, (Ptr{Cvoid},), context)
end

# Return the text that `pcre2_get_error_message` writes for error code `errno`.
#
# The message is written into a 1024-byte scratch buffer and read back as a
# NUL-terminated string. Throws (via `error`) when the call reports
# ERROR_BADDATA; any other return value is not inspected.
function err_message(errno::Integer)
    scratch = Vector{UInt8}(undef, 1024)
    status = ccall((:pcre2_get_error_message_8, PCRE_LIB), Cint,
                   (Cint, Ptr{UInt8}, Csize_t), errno, scratch, length(scratch))
    if status == ERROR_BADDATA
        error("PCRE error: invalid errno ($errno)")
    end
    # TODO: seems like there should be a better way to get this string
    # Keep `scratch` rooted while its raw pointer is being read.
    GC.@preserve scratch begin
        return unsafe_string(pointer(scratch))
    end
end

# Run a match of `re` against `subject`, forwarding to `_exec`.
# `String` and `SubString{String}` subjects are passed through unchanged.
function exec(re, subject::Union{String,SubString{String}}, offset, options, match_data)
    return _exec(re, subject, offset, options, match_data)
end

# Fallback for any other subject: convert it to `String` before forwarding.
function exec(re, subject, offset, options, match_data)
    return _exec(re, String(subject), offset, options, match_data)
end

# Call `pcre2_match` for `re` on `subject` (length given in code units), starting
# at `offset` with the given `options`, writing results into `match_data` and
# using the calling thread's match context.
#
# Returns `true` when the return code is non-negative and `false` for -1 or -2;
# throws (via `error`) for any return code below -2.
function _exec(re, subject, offset, options, match_data)
    context = get_local_match_context()
    rc = ccall((:pcre2_match_8, PCRE_LIB), Cint,
               (Ptr{Cvoid}, Ptr{UInt8}, Csize_t, Csize_t, UInt32, Ptr{Cvoid}, Ptr{Cvoid}),
               re, subject, ncodeunits(subject), offset, options, match_data, context)
    # -1 is "no match" and -2 is "partial match"; anything lower is a real error.
    if rc < -2
        error("PCRE.exec error: $(err_message(rc))")
    end
    return rc >= 0
end

# Run a one-off match of `re` against `subject` and return whether it matched.
#
# A temporary match-data block is created for the call and always released
# again, including when `exec` throws (it raises via `error` on PCRE failures
# and may also throw while converting `subject`), so the block is never leaked.
function exec_r(re, subject, offset, options)
    match_data = create_match_data(re)
    try
        return exec(re, subject, offset, options, match_data)
    finally
        free_match_data(match_data)
    end
end

# Run a match of `re` against `subject` and return `(matched, match_data)`.
#
# On success, ownership of the freshly created match-data block passes to the
# caller, who is responsible for releasing it with `free_match_data`. If `exec`
# throws, the caller never receives the block, so it is freed here before the
# exception is rethrown.
function exec_r_data(re, subject, offset, options)
    match_data = create_match_data(re)
    try
        matched = exec(re, subject, offset, options, match_data)
        return matched, match_data
    catch
        free_match_data(match_data)
        rethrow()
    end
end

# Create a match-data block for the compiled pattern `re` via
# `pcre2_match_data_create_from_pattern` (NULL general context).
# Throws (via `error`) when the library returns NULL.
function create_match_data(re)
    block = ccall((:pcre2_match_data_create_from_pattern_8, PCRE_LIB),
                  Ptr{Cvoid}, (Ptr{Cvoid}, Ptr{Cvoid}), re, C_NULL)
    if block == C_NULL
        error("PCRE error: could not allocate memory")
    end
    return block
end

# Return, as an `Int`, whatever `pcre2_substring_number_from_name` reports for
# the group called `name` in the compiled pattern `re`. The value is passed
# through unchecked, so interpreting it is left to the caller.
function substring_number_from_name(re, name)
    return Int(ccall((:pcre2_substring_number_from_name_8, PCRE_LIB), Cint,
                     (Ptr{Cvoid}, Cstring), re, name))
end

# Length, as an `Int`, that `pcre2_substring_length_bynumber` reports for capture
# group `number` in `match_data`.
#
# Returns 0 when the call reports ERROR_UNSET and throws (via `error`) for any
# other negative return code.
function substring_length_bynumber(match_data, number)
    len = RefValue{Csize_t}()
    rc = ccall((:pcre2_substring_length_bynumber_8, PCRE_LIB), Cint,
               (Ptr{Cvoid}, Cint, Ref{Csize_t}), match_data, number, len)
    rc >= 0 && return Int(len[])
    rc == ERROR_UNSET && return 0
    error("PCRE error: $(err_message(rc))")
end

# Copy capture group `number` from `match_data` into `buf` using
# `pcre2_substring_copy_bynumber`.
#
# `buf_size` is passed in as the initial value of the length argument; the value
# the library leaves there afterwards is returned as an `Int`. Throws (via
# `error`) when the call returns a negative code.
function substring_copy_bynumber(match_data, number, buf, buf_size)
    len = RefValue{Csize_t}(buf_size)
    rc = ccall((:pcre2_substring_copy_bynumber_8, PCRE_LIB), Cint,
               (Ptr{Cvoid}, UInt32, Ptr{UInt8}, Ref{Csize_t}),
               match_data, number, buf, len)
    if rc < 0
        error("PCRE error: $(err_message(rc))")
    end
    return Int(len[])
end

# Build a `Dict{Int,String}` mapping capture-group index to group name for the
# compiled pattern `re`, by walking the pattern's name table.
#
# The table holds INFO_NAMECOUNT entries of INFO_NAMEENTRYSIZE bytes each. Every
# entry starts with the group index as a big-endian 16-bit value, followed
# directly by the NUL-terminated group name.
function capture_names(re)
    entry_count = info(re, INFO_NAMECOUNT, UInt32)
    entry_size = info(re, INFO_NAMEENTRYSIZE, UInt32)
    table = info(re, INFO_NAMETABLE, Ptr{UInt8})
    names = Dict{Int,String}()
    for entry = 1:entry_count
        # 1-based position of this entry's first byte, as used by `unsafe_load`.
        pos = (entry-1)*entry_size + 1
        hi = UInt16(unsafe_load(table, pos))
        lo = UInt16(unsafe_load(table, pos+1))
        group = (hi << 8) | lo
        # Pointer arithmetic is 0-based, so `table+pos+1` is the byte right after
        # the two index bytes, i.e. the start of the name.
        names[group] = unsafe_string(table+pos+1)
    end
    return names
end

end # module
back to top