search.jl
# This file is a part of Julia. License is MIT: https://julialang.org/license
"""
An abstract type representing any sort of pattern matching expression
(typically a regular expression). `AbstractPattern` objects can be used to
match strings with [`match`](@ref).
!!! compat "Julia 1.6"
This type is available in Julia 1.6 and later.
"""
abstract type AbstractPattern end
nothing_sentinel(i) = i == 0 ? nothing : i
function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
s::Union{String, SubString{String}}, i::Integer)
if i < 1 || i > sizeof(s)
i == sizeof(s) + 1 && return nothing
throw(BoundsError(s, i))
end
@inbounds isvalid(s, i) || string_index_err(s, i)
c = pred.x
c ≤ '\x7f' && return nothing_sentinel(_search(s, c % UInt8, i))
while true
i = _search(s, first_utf8_byte(c), i)
i == 0 && return nothing
pred(s[i]) && return i
i = nextind(s, i)
end
end
findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray) =
nothing_sentinel(_search(a, pred.x))
findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray, i::Integer) =
nothing_sentinel(_search(a, pred.x, i))
findfirst(::typeof(iszero), a::ByteArray) = nothing_sentinel(_search(a, zero(UInt8)))
findnext(::typeof(iszero), a::ByteArray, i::Integer) = nothing_sentinel(_search(a, zero(UInt8), i))
function _search(a::Union{String,SubString{String},ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1)
if i < 1
throw(BoundsError(a, i))
end
n = sizeof(a)
if i > n
return i == n+1 ? 0 : throw(BoundsError(a, i))
end
p = pointer(a)
q = GC.@preserve a ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1)
return q == C_NULL ? 0 : Int(q-p+1)
end
function _search(a::ByteArray, b::AbstractChar, i::Integer = 1)
if isascii(b)
_search(a,UInt8(b),i)
else
_search(a,codeunits(string(b)),i).start
end
end
function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
s::String, i::Integer)
c = pred.x
c ≤ '\x7f' && return nothing_sentinel(_rsearch(s, c % UInt8, i))
b = first_utf8_byte(c)
while true
i = _rsearch(s, b, i)
i == 0 && return nothing
pred(s[i]) && return i
i = prevind(s, i)
end
end
findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray) =
nothing_sentinel(_rsearch(a, pred.x))
findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray, i::Integer) =
nothing_sentinel(_rsearch(a, pred.x, i))
findlast(::typeof(iszero), a::ByteArray) = nothing_sentinel(_rsearch(a, zero(UInt8)))
findprev(::typeof(iszero), a::ByteArray, i::Integer) = nothing_sentinel(_rsearch(a, zero(UInt8), i))
function _rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = sizeof(a))
if i < 1
return i == 0 ? 0 : throw(BoundsError(a, i))
end
n = sizeof(a)
if i > n
return i == n+1 ? 0 : throw(BoundsError(a, i))
end
p = pointer(a)
q = GC.@preserve a ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i)
return q == C_NULL ? 0 : Int(q-p+1)
end
function _rsearch(a::ByteArray, b::AbstractChar, i::Integer = length(a))
if isascii(b)
_rsearch(a,UInt8(b),i)
else
_rsearch(a,codeunits(string(b)),i).start
end
end
"""
findfirst(pattern::AbstractString, string::AbstractString)
findfirst(pattern::AbstractPattern, string::String)
Find the first occurrence of `pattern` in `string`. Equivalent to
[`findnext(pattern, string, firstindex(s))`](@ref).
# Examples
```jldoctest
julia> findfirst("z", "Hello to the world") # returns nothing, but not printed in the REPL
julia> findfirst("Julia", "JuliaLang")
1:5
```
"""
findfirst(pattern::AbstractString, string::AbstractString) =
findnext(pattern, string, firstindex(string))
"""
findfirst(ch::AbstractChar, string::AbstractString)
Find the first occurrence of character `ch` in `string`.
!!! compat "Julia 1.3"
This method requires at least Julia 1.3.
# Examples
```jldoctest
julia> findfirst('a', "happy")
2
julia> findfirst('z', "happy") === nothing
true
```
"""
findfirst(ch::AbstractChar, string::AbstractString) = findfirst(==(ch), string)
"""
findfirst(pattern::AbstractVector{<:Union{Int8,UInt8}},
A::AbstractVector{<:Union{Int8,UInt8}})
Find the first occurrence of sequence `pattern` in vector `A`.
!!! compat "Julia 1.6"
This method requires at least Julia 1.6.
# Examples
```jldoctest
julia> findfirst([0x52, 0x62], [0x40, 0x52, 0x62, 0x63])
2:3
```
"""
findfirst(pattern::AbstractVector{<:Union{Int8,UInt8}},
A::AbstractVector{<:Union{Int8,UInt8}}) =
_search(A, pattern, firstindex(A))
# AbstractString implementation of the generic findnext interface
function findnext(testf::Function, s::AbstractString, i::Integer)
i = Int(i)
z = ncodeunits(s) + 1
1 ≤ i ≤ z || throw(BoundsError(s, i))
@inbounds i == z || isvalid(s, i) || string_index_err(s, i)
e = lastindex(s)
while i <= e
testf(@inbounds s[i]) && return i
i = @inbounds nextind(s, i)
end
return nothing
end
in(c::AbstractChar, s::AbstractString) = (findfirst(isequal(c),s)!==nothing)
function _searchindex(s::Union{AbstractString,ByteArray},
t::Union{AbstractString,AbstractChar,Int8,UInt8},
i::Integer)
x = Iterators.peel(t)
if isnothing(x)
return 1 <= i <= nextind(s,lastindex(s))::Int ? i :
throw(BoundsError(s, i))
end
t1, trest = x
while true
i = findnext(isequal(t1),s,i)
if i === nothing return 0 end
ii = nextind(s, i)::Int
a = Iterators.Stateful(trest)
matched = all(splat(==), zip(SubString(s, ii), a))
(isempty(a) && matched) && return i
i = ii
end
end
_searchindex(s::AbstractString, t::AbstractChar, i::Integer) = something(findnext(isequal(t), s, i), 0)
function _search_bloom_mask(c)
UInt64(1) << (c & 63)
end
_nthbyte(s::Union{String, SubString{String}}, i) = codeunit(s, i)
_nthbyte(t::AbstractVector, index) = t[index + (firstindex(t)-1)]
function _searchindex(s::Union{String, SubString{String}}, t::Union{String, SubString{String}}, i::Integer)
# Check for fast case of a single byte
lastindex(t) == 1 && return something(findnext(isequal(t[1]), s, i), 0)
_searchindex(codeunits(s), codeunits(t), i)
end
function _searchindex(s::AbstractVector{<:Union{Int8,UInt8}},
t::AbstractVector{<:Union{Int8,UInt8}},
_i::Integer)
sentinel = firstindex(s) - 1
n = length(t)
m = length(s)
i = Int(_i) - sentinel
(i < 1 || i > m+1) && throw(BoundsError(s, _i))
if n == 0
return 1 <= i <= m+1 ? max(1, i) : sentinel
elseif m == 0
return sentinel
elseif n == 1
return something(findnext(isequal(_nthbyte(t,1)), s, i), sentinel)
end
w = m - n
if w < 0 || i - 1 > w
return sentinel
end
bloom_mask = UInt64(0)
skip = n - 1
tlast = _nthbyte(t,n)
for j in 1:n
bloom_mask |= _search_bloom_mask(_nthbyte(t,j))
if _nthbyte(t,j) == tlast && j < n
skip = n - j - 1
end
end
i -= 1
while i <= w
if _nthbyte(s,i+n) == tlast
# check candidate
j = 0
while j < n - 1
if _nthbyte(s,i+j+1) != _nthbyte(t,j+1)
break
end
j += 1
end
# match found
if j == n - 1
# restore in case `s` is an OffSetArray
return i+firstindex(s)
end
# no match, try to rule out the next character
if i < w && bloom_mask & _search_bloom_mask(_nthbyte(s,i+n+1)) == 0
i += n
else
i += skip
end
elseif i < w
if bloom_mask & _search_bloom_mask(_nthbyte(s,i+n+1)) == 0
i += n
end
end
i += 1
end
sentinel
end
function _search(s::Union{AbstractString,AbstractVector{<:Union{Int8,UInt8}}},
t::Union{AbstractString,AbstractChar,AbstractVector{<:Union{Int8,UInt8}}},
i::Integer)
idx = _searchindex(s,t,i)
if isempty(t)
idx:idx-1
elseif idx >= firstindex(s)
idx:(idx + lastindex(t) - 1)
else
nothing
end
end
"""
findnext(pattern::AbstractString, string::AbstractString, start::Integer)
findnext(pattern::AbstractPattern, string::String, start::Integer)
Find the next occurrence of `pattern` in `string` starting at position `start`.
`pattern` can be either a string, or a regular expression, in which case `string`
must be of type `String`.
The return value is a range of indices where the matching sequence is found, such that
`s[findnext(x, s, i)] == x`:
`findnext("substring", string, i)` == `start:stop` such that
`string[start:stop] == "substring"` and `i <= start`, or `nothing` if unmatched.
# Examples
```jldoctest
julia> findnext("z", "Hello to the world", 1) === nothing
true
julia> findnext("o", "Hello to the world", 6)
8:8
julia> findnext("Lang", "JuliaLang", 2)
6:9
```
"""
findnext(t::AbstractString, s::AbstractString, start::Integer) = _search(s, t, Int(start))
"""
findnext(ch::AbstractChar, string::AbstractString, start::Integer)
Find the next occurrence of character `ch` in `string` starting at position `start`.
!!! compat "Julia 1.3"
This method requires at least Julia 1.3.
# Examples
```jldoctest
julia> findnext('z', "Hello to the world", 1) === nothing
true
julia> findnext('o', "Hello to the world", 6)
8
```
"""
findnext(ch::AbstractChar, string::AbstractString, start::Integer) =
findnext(==(ch), string, start)
"""
findnext(pattern::AbstractVector{<:Union{Int8,UInt8}},
A::AbstractVector{<:Union{Int8,UInt8}},
start::Integer)
Find the next occurrence of the sequence `pattern` in vector `A` starting at position `start`.
!!! compat "Julia 1.6"
This method requires at least Julia 1.6.
# Examples
```jldoctest
julia> findnext([0x52, 0x62], [0x52, 0x62, 0x72], 3) === nothing
true
julia> findnext([0x52, 0x62], [0x40, 0x52, 0x62, 0x52, 0x62], 3)
4:5
```
"""
findnext(pattern::AbstractVector{<:Union{Int8,UInt8}},
A::AbstractVector{<:Union{Int8,UInt8}},
start::Integer) =
_search(A, pattern, start)
"""
findlast(pattern::AbstractString, string::AbstractString)
Find the last occurrence of `pattern` in `string`. Equivalent to
[`findprev(pattern, string, lastindex(string))`](@ref).
# Examples
```jldoctest
julia> findlast("o", "Hello to the world")
15:15
julia> findfirst("Julia", "JuliaLang")
1:5
```
"""
findlast(pattern::AbstractString, string::AbstractString) =
findprev(pattern, string, lastindex(string))
"""
findlast(pattern::AbstractVector{<:Union{Int8,UInt8}},
A::AbstractVector{<:Union{Int8,UInt8}})
Find the last occurrence of `pattern` in array `A`. Equivalent to
[`findprev(pattern, A, lastindex(A))`](@ref).
# Examples
```jldoctest
julia> findlast([0x52, 0x62], [0x52, 0x62, 0x52, 0x62])
3:4
```
"""
findlast(pattern::AbstractVector{<:Union{Int8,UInt8}},
A::AbstractVector{<:Union{Int8,UInt8}}) =
findprev(pattern, A, lastindex(A))
"""
findlast(ch::AbstractChar, string::AbstractString)
Find the last occurrence of character `ch` in `string`.
!!! compat "Julia 1.3"
This method requires at least Julia 1.3.
# Examples
```jldoctest
julia> findlast('p', "happy")
4
julia> findlast('z', "happy") === nothing
true
```
"""
findlast(ch::AbstractChar, string::AbstractString) = findlast(==(ch), string)
"""
findall(
pattern::Union{AbstractString,AbstractPattern},
string::AbstractString;
overlap::Bool = false,
)
findall(
pattern::Vector{UInt8}
A::Vector{UInt8};
overlap::Bool = false,
)
Return a `Vector{UnitRange{Int}}` of all the matches for `pattern` in `string`.
Each element of the returned vector is a range of indices where the
matching sequence is found, like the return value of [`findnext`](@ref).
If `overlap=true`, the matching sequences are allowed to overlap indices in the
original string, otherwise they must be from disjoint character ranges.
# Examples
```jldoctest
julia> findall("a", "apple")
1-element Vector{UnitRange{Int64}}:
1:1
julia> findall("nana", "banana")
1-element Vector{UnitRange{Int64}}:
3:6
julia> findall("a", "banana")
3-element Vector{UnitRange{Int64}}:
2:2
4:4
6:6
julia> findall(UInt8[1,2], UInt8[1,2,3,1,2])
2-element Vector{UnitRange{Int64}}:
1:2
4:5
```
!!! compat "Julia 1.3"
This method requires at least Julia 1.3.
"""
function findall(t::Union{AbstractString, AbstractPattern, AbstractVector{<:Union{Int8,UInt8}}},
s::Union{AbstractString, AbstractPattern, AbstractVector{<:Union{Int8,UInt8}}},
; overlap::Bool=false)
found = UnitRange{Int}[]
i, e = firstindex(s), lastindex(s)
while true
r = findnext(t, s, i)
isnothing(r) && break
push!(found, r)
j = overlap || isempty(r) ? first(r) : last(r)
j > e && break
@inbounds i = nextind(s, j)
end
return found
end
# AbstractString implementation of the generic findprev interface
function findprev(testf::Function, s::AbstractString, i::Integer)
i = Int(i)
z = ncodeunits(s) + 1
0 ≤ i ≤ z || throw(BoundsError(s, i))
i == z && return nothing
@inbounds i == 0 || isvalid(s, i) || string_index_err(s, i)
while i >= 1
testf(@inbounds s[i]) && return i
i = @inbounds prevind(s, i)
end
return nothing
end
function _rsearchindex(s::AbstractString,
t::Union{AbstractString,AbstractChar,Int8,UInt8},
i::Integer)
if isempty(t)
return 1 <= i <= nextind(s, lastindex(s))::Int ? i :
throw(BoundsError(s, i))
end
t1, trest = Iterators.peel(Iterators.reverse(t))::NTuple{2,Any}
while true
i = findprev(isequal(t1), s, i)
i === nothing && return 0
ii = prevind(s, i)::Int
a = Iterators.Stateful(trest)
b = Iterators.Stateful(Iterators.reverse(
pairs(SubString(s, 1, ii))))
matched = all(splat(==), zip(a, (x[2] for x in b)))
if matched && isempty(a)
isempty(b) && return firstindex(s)
return nextind(s, popfirst!(b)[1])::Int
end
i = ii
end
end
function _rsearchindex(s::String, t::String, i::Integer)
# Check for fast case of a single byte
if lastindex(t) == 1
return something(findprev(isequal(t[1]), s, i), 0)
elseif lastindex(t) != 0
j = i ≤ ncodeunits(s) ? nextind(s, i)-1 : i
return _rsearchindex(codeunits(s), codeunits(t), j)
elseif i > sizeof(s)
return 0
elseif i == 0
return 1
else
return i
end
end
function _rsearchindex(s::AbstractVector{<:Union{Int8,UInt8}}, t::AbstractVector{<:Union{Int8,UInt8}}, _k::Integer)
sentinel = firstindex(s) - 1
n = length(t)
m = length(s)
k = Int(_k) - sentinel
k < 0 && throw(BoundsError(s, _k))
if n == 0
return 0 <= k <= m ? max(k, 1) : sentinel
elseif m == 0
return sentinel
elseif n == 1
return something(findprev(isequal(_nthbyte(t,1)), s, k), sentinel)
end
w = m - n
if w < 0 || k <= 0
return sentinel
end
bloom_mask = UInt64(0)
skip = n - 1
tfirst = _nthbyte(t,1)
for j in n:-1:1
bloom_mask |= _search_bloom_mask(_nthbyte(t,j))
if _nthbyte(t,j) == tfirst && j > 1
skip = j - 2
end
end
i = min(k - n + 1, w + 1)
while i > 0
if _nthbyte(s,i) == tfirst
# check candidate
j = 1
while j < n
if _nthbyte(s,i+j) != _nthbyte(t,j+1)
break
end
j += 1
end
# match found, restore in case `s` is an OffsetArray
if j == n
return i + sentinel
end
# no match, try to rule out the next character
if i > 1 && bloom_mask & _search_bloom_mask(_nthbyte(s,i-1)) == 0
i -= n
else
i -= skip
end
elseif i > 1
if bloom_mask & _search_bloom_mask(_nthbyte(s,i-1)) == 0
i -= n
end
end
i -= 1
end
sentinel
end
function _rsearch(s::Union{AbstractString,AbstractVector{<:Union{Int8,UInt8}}},
t::Union{AbstractString,AbstractChar,AbstractVector{<:Union{Int8,UInt8}}},
i::Integer)
idx = _rsearchindex(s,t,i)
if isempty(t)
idx:idx-1
elseif idx > firstindex(s) - 1
idx:(idx + lastindex(t) - 1)
else
nothing
end
end
"""
findprev(pattern::AbstractString, string::AbstractString, start::Integer)
Find the previous occurrence of `pattern` in `string` starting at position `start`.
The return value is a range of indices where the matching sequence is found, such that
`s[findprev(x, s, i)] == x`:
`findprev("substring", string, i)` == `start:stop` such that
`string[start:stop] == "substring"` and `stop <= i`, or `nothing` if unmatched.
# Examples
```jldoctest
julia> findprev("z", "Hello to the world", 18) === nothing
true
julia> findprev("o", "Hello to the world", 18)
15:15
julia> findprev("Julia", "JuliaLang", 6)
1:5
```
"""
findprev(t::AbstractString, s::AbstractString, i::Integer) = _rsearch(s, t, Int(i))
"""
findprev(ch::AbstractChar, string::AbstractString, start::Integer)
Find the previous occurrence of character `ch` in `string` starting at position `start`.
!!! compat "Julia 1.3"
This method requires at least Julia 1.3.
# Examples
```jldoctest
julia> findprev('z', "Hello to the world", 18) === nothing
true
julia> findprev('o', "Hello to the world", 18)
15
```
"""
findprev(ch::AbstractChar, string::AbstractString, start::Integer) =
findprev(==(ch), string, start)
"""
findprev(pattern::AbstractVector{<:Union{Int8,UInt8}},
A::AbstractVector{<:Union{Int8,UInt8}},
start::Integer)
Find the previous occurrence of the sequence `pattern` in vector `A` starting at position `start`.
!!! compat "Julia 1.6"
This method requires at least Julia 1.6.
# Examples
```jldoctest
julia> findprev([0x52, 0x62], [0x40, 0x52, 0x62, 0x52, 0x62], 3)
2:3
```
"""
findprev(pattern::AbstractVector{<:Union{Int8,UInt8}},
A::AbstractVector{<:Union{Int8,UInt8}},
start::Integer) =
_rsearch(A, pattern, start)
"""
occursin(needle::Union{AbstractString,AbstractPattern,AbstractChar}, haystack::AbstractString)
Determine whether the first argument is a substring of the second. If `needle`
is a regular expression, checks whether `haystack` contains a match.
# Examples
```jldoctest
julia> occursin("Julia", "JuliaLang is pretty cool!")
true
julia> occursin('a', "JuliaLang is pretty cool!")
true
julia> occursin(r"a.a", "aba")
true
julia> occursin(r"a.a", "abba")
false
```
See also [`contains`](@ref).
"""
occursin(needle::Union{AbstractString,AbstractChar}, haystack::AbstractString) =
_searchindex(haystack, needle, firstindex(haystack)) != 0
"""
occursin(haystack)
Create a function that checks whether its argument occurs in `haystack`, i.e.
a function equivalent to `needle -> occursin(needle, haystack)`.
The returned function is of type `Base.Fix2{typeof(occursin)}`.
!!! compat "Julia 1.6"
This method requires Julia 1.6 or later.
# Examples
```jldoctest
julia> search_f = occursin("JuliaLang is a programming language");
julia> search_f("JuliaLang")
true
julia> search_f("Python")
false
```
"""
occursin(haystack) = Base.Fix2(occursin, haystack)
in(::AbstractString, ::AbstractString) = error("use occursin(needle, haystack) for string containment")