Revision 343657a5cdfa7b23c9ad0636fa5c7a5d4434532b authored by Viral B. Shah on 14 April 2015, 19:24:42 UTC, committed by Viral B. Shah on 14 April 2015, 19:26:00 UTC
squeeze of a sparse matrix throws an error. (cherry picked from commit f8e343eb80bb2936ffc81064d4bdc197ba26598f)
1 parent a18af00
utf8.jl
## from base/boot.jl:
#
# immutable UTF8String <: String
# data::Array{Uint8,1}
# end
#
## basic UTF-8 decoding & iteration ##
# Correction constants used by `next`: entry `n+1` is subtracted from the
# 32-bit accumulator after a lead byte and `n` trailing bytes have been
# folded in with `c = (c << 6) + byte`, to obtain the code point.
const utf8_offset = [
    0x00000000,  # 0 trailing bytes
    0x00003080,  # 1 trailing byte
    0x000e2080,  # 2 trailing bytes
    0x03c82080,  # 3 trailing bytes
    0xfa082080,  # 4 trailing bytes
    0x82082080,  # 5 trailing bytes
]
# Lookup table indexed by `byte + 1`: the number of trailing bytes that
# follow a given first byte. The 256 entries are laid out as runs:
#   0x00-0xbf -> 0, 0xc0-0xdf -> 1, 0xe0-0xef -> 2,
#   0xf0-0xf7 -> 3, 0xf8-0xfb -> 4, 0xfc-0xff -> 5
const utf8_trailing = vcat(
    zeros(Int, 192),
    fill(1, 32),
    fill(2, 16),
    fill(3, 8),
    fill(4, 4),
    fill(5, 4),
)
# True unless `byte` has the bit pattern 10xxxxxx, i.e. unless it is a
# UTF-8 continuation byte.
function is_utf8_start(byte::Uint8)
    topbits = byte & 0xc0
    return topbits != 0x80
end
## required core functionality ##
# Byte index of the last start byte in `s`, or 0 when `s` holds no bytes.
function endof(s::UTF8String)
    bytes = s.data
    pos = length(bytes)
    if pos == 0
        return pos
    end
    # Walk backwards over continuation bytes until a start byte is reached.
    while true
        is_utf8_start(bytes[pos]) && return pos
        pos -= 1
    end
end
# Length of `s` as reported by the C routine `u8_strlen` (presumably the
# character count rather than the byte count), converted from `Csize_t`
# with `int`.
function length(s::UTF8String)
    n = ccall(:u8_strlen, Csize_t, (Ptr{Uint8},), s.data)
    return int(n)
end
# Decode the character starting at byte index `i` of `s`.
#
# Returns a tuple `(character, index of the byte after it)`.
#
# * If `i` points at a continuation byte that lies inside a complete
#   multi-byte sequence, an error is raised.
# * If `i` points at a stray continuation byte, or at a start byte whose
#   sequence would run past the end of the data, `'\ufffd'` is returned and
#   only one byte is consumed.
function next(s::UTF8String, i::Int)
    bytes = s.data
    lead = bytes[i]
    if !is_utf8_start(lead)
        # Search backwards for the closest preceding start byte.
        k = i - 1
        while k > 0 && !is_utf8_start(bytes[k])
            k -= 1
        end
        if k > 0
            seqend = k + utf8_trailing[bytes[k]+1]
            if i <= seqend && seqend <= length(bytes)
                # `lead` is a continuation byte of a valid UTF-8 character
                error("invalid UTF-8 character index")
            end
        end
        # move past 1 byte in case the data is actually Latin-1
        return '\ufffd', i+1
    end
    ntrail = utf8_trailing[lead+1]
    if i + ntrail > length(bytes)
        # Not enough bytes left for the sequence announced by `lead`.
        return '\ufffd', i+1
    end
    code::Uint32 = 0
    stop = i + ntrail
    # Fold the lead byte and its trailing bytes into the accumulator,
    # shifting by 6 bits per byte.
    while i <= stop
        code = (code << 6) + bytes[i]
        i += 1
    end
    # Remove the accumulated marker bits to obtain the code point.
    code -= utf8_offset[ntrail+1]
    return char(code), i
end
# First byte of the UTF-8 encoding of `c`, chosen by the size class of the
# code point (1-, 2-, 3- or 4-byte form).
function first_utf8_byte(c::Char)
    if c < 0x80
        return uint8(c)
    elseif c < 0x800
        return uint8((c >> 6) | 0xc0)
    elseif c < 0x10000
        return uint8((c >> 12) | 0xe0)
    else
        return uint8((c >> 18) | 0xf0)
    end
end
## overload methods for efficiency ##
# Size of `s` in bytes, taken from its underlying byte array.
function sizeof(s::UTF8String)
    return sizeof(s.data)
end
# True when `i` is within the byte array of `s` and the byte at `i` is a
# start byte (not a continuation byte).
function isvalid(s::UTF8String, i::Integer)
    bytes = s.data
    if i < 1 || i > endof(bytes)
        return false
    end
    return is_utf8_start(bytes[i])
end
# Shared empty string, returned by `getindex` for empty ranges.
const empty_utf8 = UTF8String(Array(Uint8,0))
# Substring of `s` covering the byte range `r`.
#
# An empty range yields the shared `empty_utf8`. If `first(r)` points at a
# continuation byte, the start is moved to `nextind(s, first(r))`. A
# `BoundsError` is thrown when `last(r)` exceeds `endof(s)`; otherwise the
# end is set to one byte before `nextind(s, last(r))`. The selected bytes
# are copied into a new `UTF8String`.
function getindex(s::UTF8String, r::UnitRange{Int})
    if isempty(r)
        return empty_utf8
    end
    lo = first(r)
    hi = last(r)
    bytes = s.data
    is_utf8_start(bytes[lo]) || (lo = nextind(s, lo))
    hi > endof(s) && throw(BoundsError())
    hi = nextind(s, hi) - 1
    return UTF8String(bytes[lo:hi])
end
# Find `c` in `s`, searching forward from byte index `i`. Returns the index
# found, or 0 when the underlying byte search reports no match.
#
# For `c < 0x80` this is a plain byte search. Otherwise the first byte of
# the encoding of `c` is searched for, and each hit is confirmed by
# comparing the decoded character at that index against `c`.
function search(s::UTF8String, c::Char, i::Integer)
    bytes = s.data
    c < 0x80 && return search(bytes, uint8(c), i)
    lead = first_utf8_byte(c)
    i = search(bytes, lead, i)
    while i != 0 && s[i] != c
        # Same lead byte but a different character: resume after it.
        i = search(bytes, lead, next(s, i)[2])
    end
    return i
end
# Find `c` in `s`, searching backward from byte index `i`. Returns the index
# found, or 0 when the underlying byte search reports no match.
#
# For `c < 0x80` this is a plain reverse byte search. Otherwise the first
# byte of the encoding of `c` is searched for, and each hit is confirmed by
# comparing the decoded character at that index against `c`.
function rsearch(s::UTF8String, c::Char, i::Integer)
    bytes = s.data
    c < 0x80 && return rsearch(bytes, uint8(c), i)
    lead = first_utf8_byte(c)
    i = rsearch(bytes, lead, i)
    while i != 0 && s[i] != c
        # Same lead byte but a different character: resume before it.
        i = rsearch(bytes, lead, prevind(s, i))
    end
    return i
end
# Concatenate byte strings into one `UTF8String`.
#
# A single argument is returned as-is (and asserted to be a `UTF8String`);
# otherwise the byte arrays of all arguments are appended, in order, to a
# fresh buffer.
function string(a::ByteString...)
    # at least one argument must be UTF-8, or the ASCII-only method would
    # have been called instead
    length(a) == 1 && return a[1]::UTF8String
    buf = Uint8[]
    for piece in a
        append!(buf, piece.data)
    end
    return UTF8String(buf)
end
# Reverse `s` via the C routine `u8_reverse`, which is given a destination
# buffer, the source bytes and the buffer length. A return value of 1 from
# that routine is reported as an error.
function reverse(s::UTF8String)
    src = s.data
    dst = similar(src)
    status = ccall(:u8_reverse, Cint, (Ptr{Uint8}, Ptr{Uint8}, Csize_t),
                   dst, src, length(dst))
    status == 1 && error("invalid UTF-8 data")
    return UTF8String(dst)
end
## outputting UTF-8 strings ##
# Write the raw bytes of `s` to `io`; always returns `nothing`.
function print(io::IO, s::UTF8String)
    write(io, s.data)
    return nothing
end
# Write the raw bytes of `s` to `io`, returning whatever the byte-array
# `write` returns.
function write(io::IO, s::UTF8String)
    return write(io, s.data)
end
## transcoding to UTF-8 ##
# Shorthand for converting `x` to a `UTF8String`.
function utf8(x)
    return convert(UTF8String, x)
end
# A `UTF8String` converts to itself; no copy is made.
function convert(::Type{UTF8String}, s::UTF8String)
    return s
end
# Wrap the byte array of an `ASCIIString` in a `UTF8String`; the bytes are
# passed through without copying here.
function convert(::Type{UTF8String}, s::ASCIIString)
    return UTF8String(s.data)
end
# Wrap a byte vector in a `UTF8String` after checking it with
# `is_valid_utf8`; raises an error when the check fails.
function convert(::Type{UTF8String}, a::Array{Uint8,1})
    if is_valid_utf8(a)
        return UTF8String(a)
    else
        return error("invalid UTF-8 sequence")
    end
end
# Build a `UTF8String` from the byte vector `a`, substituting `invalids_as`
# for byte runs that cannot start or complete a character.
#
# Two kinds of run are replaced:
#   * a maximal run of continuation bytes found where a start byte is
#     expected, and
#   * a start byte whose announced sequence would extend past the end of `a`.
# Trailing bytes of a sequence that fits are skipped without being inspected.
#
# `a` is copied only when the first replacement is needed, so the caller's
# array is never modified; when nothing is replaced the result wraps `a`
# itself.
#
# `invalids_as` may be any `String`: it is encoded once through `bytestring`
# instead of reading a `.data` field, which only byte strings provide as
# `Uint8` data.
function convert(::Type{UTF8String}, a::Array{Uint8,1}, invalids_as::String)
    repl = bytestring(invalids_as).data
    l = length(a)
    idx = 1
    iscopy = false
    while idx <= l
        if is_utf8_start(a[idx])
            nextidx = idx+1+utf8_trailing[a[idx]+1]
            # The whole sequence fits: skip over it.
            (nextidx <= (l+1)) && (idx = nextidx; continue)
        end
        # Copy lazily, just before the first modification.
        !iscopy && (a = copy(a); iscopy = true)
        # Extend the invalid run up to (not including) the next start byte.
        endn = idx
        while endn <= l
            is_utf8_start(a[endn]) && break
            endn += 1
        end
        (endn > idx) && (endn -= 1)
        splice!(a, idx:endn, repl)
        l = length(a)
        # Resume after the inserted replacement. Re-scanning it is wasted
        # work and would never terminate if the replacement began with a
        # continuation byte.
        idx += length(repl)
    end
    return UTF8String(a)
end
# Fallback for any other `String`: go through `bytestring` first, then
# convert the result with `utf8`.
function convert(::Type{UTF8String}, s::String)
    bytes = bytestring(s)
    return utf8(bytes)
end
# Number of bytes for `c` by code point size class (1 to 4 bytes).
# Values of 0x110000 and above are sized as the replacement character
# 0xfffd, which takes 3 bytes.
function utf8sizeof(c::Char)
    c < 0x80     && return 1
    c < 0x800    && return 2
    c < 0x10000  && return 3
    c < 0x110000 && return 4
    return 3
end
![swh spinner](/static/img/swh-spinner.gif)
Computing file changes ...