swh:1:snp:a72e953ecd624a7df6e6196bbdd05851996c5e40
Raw File
Tip revision: e844f0c0b17af40321c3dce34d0beaf5ce871729 authored by Elliot Saba on 14 August 2014, 18:12:26 UTC
Tag v0.3.0-rc4
Tip revision: e844f0c
regex.jl
## object-oriented Regex interface ##

include("pcre.jl")

# Option bits given to every Regex that is built without an explicit options
# integer (a bare pattern, or a pattern plus a flag string). `show` also
# compares against this value to decide whether a Regex can be printed in
# its r"..." literal form.
const DEFAULT_OPTS = PCRE.JAVASCRIPT_COMPAT | PCRE.UTF8 | PCRE.NO_UTF8_CHECK

# A regular expression together with its compiled PCRE state.
#
# Fields:
#   pattern - the pattern text, stored as a ByteString
#   options - option bits; checked against PCRE.OPTIONS_MASK on construction
#   regex   - pointer set from PCRE.compile (C_NULL until compiled)
#   extra   - pointer set from PCRE.study   (C_NULL until compiled)
#   ovec    - Int32 buffer handed to PCRE.exec; sized by `compile`
type Regex
    pattern::ByteString
    options::Uint32
    regex::Ptr{Void}
    extra::Ptr{Void}
    ovec::Vector{Int32}

    # Build and compile a Regex from `pattern` and an integer option mask.
    # Raises an error when `options` has bits outside PCRE.OPTIONS_MASK;
    # anything thrown by `compile` propagates to the caller.
    function Regex(pattern::String, options::Integer)
        pattern = bytestring(pattern)
        options = uint32(options)
        if (options & ~PCRE.OPTIONS_MASK) != 0
            error("invalid regex options: $options")
        end
        re = new(pattern, options, C_NULL, C_NULL, Array(Int32, 0))
        # Register the finalizer *before* compiling: if `compile` throws
        # part-way (after `re.regex` has already been assigned), the pointers
        # stored in `re` are still released when the object is collected.
        # The C_NULL guards make the finalizer safe for a never-compiled `re`.
        finalizer(re,
            function(re::Regex)
                re.extra != C_NULL && PCRE.free_study(re.extra)
                re.regex != C_NULL && PCRE.free(re.regex)
            end)
        compile(re)
    end
end

# Build a Regex from a pattern and a string of single-character flags.
# Each flag adds one option bit on top of DEFAULT_OPTS:
#   'i' => PCRE.CASELESS     'm' => PCRE.MULTILINE
#   's' => PCRE.DOTALL       'x' => PCRE.EXTENDED
# Any other character raises an "unknown regex flag" error.
function Regex(pattern::String, flags::String)
    opts = DEFAULT_OPTS
    for f in flags
        if f == 'i'
            opts |= PCRE.CASELESS
        elseif f == 'm'
            opts |= PCRE.MULTILINE
        elseif f == 's'
            opts |= PCRE.DOTALL
        elseif f == 'x'
            opts |= PCRE.EXTENDED
        else
            error("unknown regex flag: $f")
        end
    end
    return Regex(pattern, opts)
end
# Build a Regex from a pattern alone, using DEFAULT_OPTS as the option mask.
function Regex(pattern::String)
    return Regex(pattern, DEFAULT_OPTS)
end

# Make sure `regex` carries compiled PCRE state, and return it.
# If `regex.regex` is C_NULL, the pattern is compiled and studied and `ovec`
# is resized to three Int32 slots for the whole match plus three per capture
# group. A Regex whose pointer is already set is returned untouched.
function compile(regex::Regex)
    regex.regex == C_NULL || return regex

    compile_opts = regex.options & PCRE.COMPILE_MASK
    regex.regex = PCRE.compile(regex.pattern, compile_opts)
    regex.extra = PCRE.study(regex.regex, PCRE.STUDY_JIT_COMPILE)

    ncap = PCRE.info(regex.regex, regex.extra, PCRE.INFO_CAPTURECOUNT, Int32)
    resize!(regex.ovec, 3 * (ncap + 1))
    return regex
end

# String macro for regex literals. The macro body calls `Regex` directly, so
# the Regex object is constructed while the macro is being expanded, from
# the literal pattern and any trailing flag string.
macro r_str(pattern, flags...)
    Regex(pattern, flags...)
end

# Same construction as `r_str` (presumably the triple-quoted literal
# counterpart -- confirm against the parser's string-macro naming).
macro r_mstr(pattern, flags...)
    Regex(pattern, flags...)
end

# Copying a Regex does not duplicate it: the very same object is returned.
function copy(r::Regex)
    return r
end

# Print a Regex. When the options, ignoring the i/m/s/x bits, equal
# DEFAULT_OPTS, the literal form is used: `r`, the quoted pattern, then one
# letter per flag bit that is set. Otherwise the constructor form
# `Regex(pattern,options)` is printed.
function show(io::IO, re::Regex)
    flagbits = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED

    if (re.options & ~flagbits) != DEFAULT_OPTS
        # Non-default base options cannot be expressed as a literal.
        print(io, "Regex(")
        show(io, re.pattern)
        print(io, ',')
        show(io, re.options)
        print(io, ')')
        return
    end

    print(io, 'r')
    print_quoted_literal(io, re.pattern)
    # Flag letters are emitted in the fixed order i, m, s, x.
    for (bit, letter) in ((PCRE.CASELESS,  'i'),
                          (PCRE.MULTILINE, 'm'),
                          (PCRE.DOTALL,    's'),
                          (PCRE.EXTENDED,  'x'))
        if (re.options & bit) != 0
            print(io, letter)
        end
    end
end

# TODO: map offsets into non-ByteStrings back to original indices.
# or maybe it's better to just fail since that would be quite slow

# Result of a successful `match`.
immutable RegexMatch
    # The matched text, as a substring of the searched string.
    match::SubString{UTF8String}
    # One entry per capture group; `nothing` for a group whose recorded
    # start position is negative (the group did not take part in the match).
    captures::Vector{Union(Nothing,SubString{UTF8String})}
    # 1-based start index of `match` within the searched string.
    offset::Int
    # Per-group start positions plus one, in the same order as `captures`;
    # computed the same way for groups whose capture is `nothing`.
    offsets::Vector{Int}
end

# Print a RegexMatch as `RegexMatch(<match>, 1=<cap1>, 2=<cap2>, ...)`.
# The capture list is omitted entirely when there are no capture groups.
function show(io::IO, m::RegexMatch)
    print(io, "RegexMatch(")
    show(io, m.match)
    # Every capture is preceded by a separator, which yields the same text as
    # "separator after the match, then separators between captures".
    for (i, cap) in enumerate(m.captures)
        print(io, ", ", i, "=")
        show(io, cap)
    end
    print(io, ")")
end

# Test whether `r` matches `s`, which is converted with `bytestring` first.
# `offset` is handed to PCRE.exec unchanged (default 0). Returns whatever
# PCRE.exec returns. Note that the call writes into `r.ovec`.
function ismatch(r::Regex, s::String, offset::Integer=0)
    compile(r)
    exec_opts = r.options & PCRE.EXECUTE_MASK
    PCRE.exec(r.regex, r.extra, bytestring(s), offset, exec_opts, r.ovec)
end

# Test whether `r` matches the substring `s`. Unlike the general String
# method, `s` is passed to PCRE.exec as-is, without a `bytestring` conversion.
# `offset` is handed to PCRE.exec unchanged (default 0).
function ismatch(r::Regex, s::SubString, offset::Integer=0)
    compile(r)
    exec_opts = r.options & PCRE.EXECUTE_MASK
    PCRE.exec(r.regex, r.extra, s, offset, exec_opts, r.ovec)
end

# Search `str` for `re`, starting at the 1-based index `idx`.
#
# `add_opts` is OR-ed into the regex's execute-time options for this call.
# Returns `nothing` when PCRE.exec reports no match, otherwise a RegexMatch
# holding the matched substring, the captures (`nothing` for groups whose
# recorded start is negative), the 1-based match start, and the per-group
# start offsets.
function match(re::Regex, str::UTF8String, idx::Integer, add_opts::Uint32=uint32(0))
    opts = re.options & PCRE.EXECUTE_MASK | add_opts
    compile(re)
    # PCRE.exec takes a 0-based offset, hence idx-1.
    if !PCRE.exec(re.regex, re.extra, str, idx-1, opts, re.ovec)
        return nothing
    end
    # `compile` sizes ovec to 3*(ncap+1), so this recovers the group count.
    # Integer division keeps `n` an Int; `/` would produce a Float64 and turn
    # the ranges and array indices below into floating-point values.
    n = div(length(re.ovec), 3) - 1
    mat = SubString(str, re.ovec[1]+1, re.ovec[2])
    cap = Union(Nothing,SubString{UTF8String})[
            re.ovec[2i+1] < 0 ? nothing : SubString(str, re.ovec[2i+1]+1, re.ovec[2i+2]) for i=1:n ]
    off = Int[ re.ovec[2i+1]::Int32+1 for i=1:n ]
    RegexMatch(mat, cap, re.ovec[1]+1, off)
end

# Matching on other byte strings and on substrings: convert with `utf8` and
# delegate to the UTF8String method.
function match(re::Regex, str::Union(ByteString,SubString), idx::Integer, add_opts::Uint32=uint32(0))
    return match(re, utf8(str), idx, add_opts)
end

# Match from the beginning of the string.
function match(r::Regex, s::String)
    return match(r, s, start(s))
end

# Fallback for string types that have no more specific method: refuse, and
# tell the caller how to convert.
function match(r::Regex, s::String, i::Integer)
    error("regex matching is only available for bytestrings; use bytestring(s) to convert")
end

# Collect every match of `re` in `str` as a vector of substrings.
#
# With `overlap=false` (the default) each search resumes at the end of the
# previous match. With `overlap=true` it resumes one character after the
# *start* of the previous match, so overlapping matches are reported too.
#
# After an empty match the same position is retried with
# PCRE.ANCHORED | PCRE.NOTEMPTY_ATSTART; if that fails, the search moves on
# by one character with the normal options.
function matchall(re::Regex, str::UTF8String, overlap::Bool=false)
    regex = compile(re).regex
    extra = re.extra
    n = length(str.data)
    matches = SubString{UTF8String}[]
    offset = int32(0)
    opts = re.options & PCRE.EXECUTE_MASK
    opts_nonempty = opts | PCRE.ANCHORED | PCRE.NOTEMPTY_ATSTART
    prevempty = false
    # Private three-slot vector: only the overall match bounds are read.
    ovec = Array(Int32, 3)
    while true
        result = ccall((:pcre_exec, :libpcre), Int32,
                       (Ptr{Void}, Ptr{Void}, Ptr{Uint8}, Int32,
                       Int32, Int32, Ptr{Int32}, Int32),
                       regex, extra, str, n,
                       offset, prevempty ? opts_nonempty : opts, ovec, 3)

        if result < 0
            if prevempty && offset < n
                # The non-empty retry failed: step one character forward and
                # go back to a normal search.
                offset = int32(nextind(str, offset + 1) - 1)
                prevempty = false
                continue
            else
                break
            end
        end

        push!(matches, SubString(str, ovec[1]+1, ovec[2]))
        # Emptiness is a property of the match itself. Comparing the search
        # offset with the match end misses empty matches that begin after
        # `offset`, which were then found a second time on the next pass.
        prevempty = ovec[1] == ovec[2]
        if overlap && !prevempty
            # Advance from the start of the match, not from the previous
            # search offset; otherwise a match that begins later than
            # `offset` is reported again for every skipped character.
            offset = int32(nextind(str, ovec[1] + 1) - 1)
        else
            # Non-overlapping: continue after the match. Empty match: stay
            # at its position (start == end) for the non-empty retry.
            offset = ovec[2]
        end
    end
    matches
end

# matchall on other byte strings and on substrings: convert with `utf8` and
# delegate to the UTF8String method.
function matchall(re::Regex, str::Union(ByteString,SubString), overlap::Bool=false)
    return matchall(re, utf8(str), overlap)
end

# Find the first match of `re` in `str` at or after the 1-based index `idx`.
# Returns the index range of the match, or the empty range 0:-1 when there
# is none. Throws BoundsError if `idx` lies beyond one position past the end
# of `str`.
function search(str::Union(ByteString,SubString), re::Regex, idx::Integer)
    idx > nextind(str,endof(str)) && throw(BoundsError())

    opts = re.options & PCRE.EXECUTE_MASK
    compile(re)
    # PCRE.exec takes a 0-based offset, hence idx-1.
    if PCRE.exec(re.regex, re.extra, str, idx-1, opts, re.ovec)
        first_idx = re.ovec[1]+1
        # Step back from one past the match end to the index of its last
        # character.
        last_idx = prevind(str,re.ovec[2]+1)
        return first_idx:last_idx
    end
    return 0:-1
end
# Fallback for string types that have no more specific method: refuse, and
# tell the caller how to convert.
function search(s::String, r::Regex, idx::Integer)
    error("regex search is only available for bytestrings; use bytestring(s) to convert")
end

# Search from the beginning of the string.
function search(s::String, r::Regex)
    return search(s, r, start(s))
end

# Iterator over the successive matches of `regex` in `string`.
#   regex   - the pattern to apply
#   string  - the text to search, stored as a UTF8String
#   overlap - whether matches may overlap (see `next`)
immutable RegexMatchIterator
    regex::Regex
    string::UTF8String
    overlap::Bool

    # Accept any String; `ovr` defaults to non-overlapping iteration.
    RegexMatchIterator(regex::Regex, string::String, ovr::Bool=false) =
        new(regex, string, ovr)
end
# Compile the wrapped regex and return the iterator itself.
compile(itr::RegexMatchIterator) = (compile(itr.regex); itr)
# The iterator yields RegexMatch values.
eltype(itr::RegexMatchIterator) = RegexMatch
# The iteration state is the upcoming match (or `nothing`): begin with the
# first match found from index 1.
start(itr::RegexMatchIterator) = match(itr.regex, itr.string, 1, uint32(0))
# Finished once no further match was found. Identity comparison is the
# idiomatic test for `nothing` and agrees with the check in `next`.
done(itr::RegexMatchIterator, prev_match) = (prev_match === nothing)

# Assumes prev_match is not nothing
# Return `(prev_match, following)`: the match to yield now, together with
# the next iteration state (the following match, or `nothing` at the end).
#
# The search for the following match resumes
#   - after the end of `prev_match` when not overlapping;
#   - one character after its start when overlapping and it was non-empty;
#   - at its start when overlapping and it was empty.
# After an empty match the same position is first retried with
# PCRE.ANCHORED | PCRE.NOTEMPTY_ATSTART, then the search steps one character
# forward and continues with no extra options.
function next(itr::RegexMatchIterator, prev_match)
    prevempty = isempty(prev_match.match)

    offset = if itr.overlap
        prevempty ? prev_match.offset : nextind(itr.string, prev_match.offset)
    else
        prev_match.offset + endof(prev_match.match)
    end

    opts_nonempty = uint32(PCRE.ANCHORED | PCRE.NOTEMPTY_ATSTART)
    while true
        extra_opts = prevempty ? opts_nonempty : uint32(0)
        mat = match(itr.regex, itr.string, offset, extra_opts)
        mat === nothing || return (prev_match, mat)

        # Only a failed non-empty retry that is still inside the string
        # earns another attempt.
        (prevempty && offset <= length(itr.string.data)) || break
        offset = nextind(itr.string, offset)
        prevempty = false
    end
    (prev_match, nothing)
end

# Return an iterator over the matches of `re` in `str`.
# `ovr` selects overlapping iteration and defaults to false. The default
# argument already provides the two-argument call `eachmatch(re, str)`, so
# no separate two-argument method is defined.
function eachmatch(re::Regex, str::String, ovr::Bool=false)
    RegexMatchIterator(re,str,ovr)
end

# miscellaneous methods that depend on Regex being defined

# Keep only the elements of `v` that `r` matches, modifying `v` in place.
function filter!(r::Regex, v)
    filter!(v) do x
        ismatch(r, x)
    end
end

# Non-mutating counterpart of `filter!(r, v)`.
function filter(r::Regex, v)
    filter(v) do x
        ismatch(r, x)
    end
end

# Keep only the entries of `d` whose key is matched by `r`, in place.
function filter!(r::Regex, d::Dict)
    filter!(d) do k, v
        ismatch(r, k)
    end
end

# Same key filter, applied to a copy so that `d` itself is left alone.
function filter(r::Regex, d::Dict)
    return filter!(r, copy(d))
end


# Don't serialize the pointers
# Write a Regex to `s` as its type tag followed by the pattern and the
# options. The `regex`, `extra` and `ovec` fields are not written.
function serialize(s, r::Regex)
    serialize_type(s, Regex)
    serialize(s, r.pattern)
    serialize(s, r.options)
end

# Rebuild a Regex from `s`: read the pattern and then the options, in the
# order `serialize` wrote them, and construct (and thereby recompile) a
# fresh Regex.
function deserialize(s, t::Type{Regex})
    pat = deserialize(s)
    opts = deserialize(s)
    return Regex(pat, opts)
end
back to top