# https://github.com/JuliaLang/julia
# Tip revision: 66201737c1f429d94af690f75bb132c7e987dd03 authored by Curtis Vogt on 21 December 2020, 16:58:02 UTC
# Use closure to reduce code duplication
# Use closure to reduce code duplication
# Tip revision: 6620173
# toml_parser.jl
# This file is a part of Julia. License is MIT: https://julialang.org/license
module TOML
using Base: IdSet
# In case we do not have the Dates stdlib available
# we parse DateTime into these internal structs,
# note that these do not do any argument checking
# Calendar date stored as three plain integers; fields are not validated.
struct Date
    year::Int
    month::Int
    day::Int
end

# Time of day (with milliseconds) stored as plain integers; fields are not
# validated.
struct Time
    hour::Int
    minute::Int
    second::Int
    ms::Int
end

# A date paired with a time of day.
struct DateTime
    date::Date
    time::Time
end

# Convenience constructor taking the seven components individually.
function DateTime(y, m, d, h, mi, s, ms)
    date = Date(y, m, d)
    time = Time(h, mi, s, ms)
    return DateTime(date, time)
end
const EOF_CHAR = typemax(Char)
const TOMLDict = Dict{String, Any}
##########
# Parser #
##########
# Mutable state of the TOML parser: the input text, a one character
# look-ahead, position bookkeeping and the tables built so far.
mutable struct Parser
# The text being parsed
str::String
# 1 character look ahead
current_char::Char
# Index into `str` from which the next character will be read
pos::Int
# prevpos equals the startbyte of the look ahead character
# prevpos-1 is therefore the end byte of the character we last ate
prevpos::Int
# File info
column::Int
line::Int
# The function `take_substring` takes the substring from `marker` up
# to `prevpos-1`.
marker::Int
# The current table that `key = value` entries are inserted into
active_table::TOMLDict
# As we parse dotted keys we store each part of the key in this cache
# A future improvement would be to also store the spans of the keys
# so that in error messages we could also show the previous key
# definition in case of duplicated keys
dotted_keys::Vector{String}
# Strings in TOML can have line continuations ('\' as the last character
# on a line). We store the byte ranges for each of these "chunks" in here
chunks::Vector{UnitRange{Int}}
# We need to keep track of those tables / arrays that are defined
# inline since we are not allowed to add keys to those
inline_tables::IdSet{TOMLDict}
static_arrays::IdSet{Any}
# [a.b.c.d] doesn't "define" the table [a]
# so keys can later be added to [a], therefore
# we need to keep track of what tables are
# actually defined
defined_tables::IdSet{TOMLDict}
# The table we will finally return to the user
root::TOMLDict
# Filled in in case we are parsing a file to improve error messages
filepath::Union{String, Nothing}
# Gets populated with the Dates stdlib if it exists
Dates::Union{Module, Nothing}
end
const DATES_PKGID = Base.PkgId(Base.UUID("ade2ca70-3891-5945-98fb-dc099432e06a"), "Dates")

# Create a parser over `str` with empty bookkeeping and the look-ahead already
# populated. `filepath` is only stored so error messages can mention it.
function Parser(str::String; filepath=nothing)
    # Pick up the Dates module only when it is already among the loaded modules
    dates_module = if isdefined(Base, :loaded_modules)
        get(Base.loaded_modules, DATES_PKGID, nothing)
    else
        nothing
    end
    root_table = TOMLDict()
    parser = Parser(
        str,                  # str
        EOF_CHAR,             # current_char
        firstindex(str),      # pos
        0,                    # prevpos
        0,                    # column
        1,                    # line
        0,                    # marker
        root_table,           # active_table
        String[],             # dotted_keys
        UnitRange{Int}[],     # chunks
        IdSet{TOMLDict}(),    # inline_tables
        IdSet{Any}(),         # static_arrays
        IdSet{TOMLDict}(),    # defined_tables
        root_table,           # root
        filepath,             # filepath
        dates_module,         # Dates
    )
    startup(parser)
    return parser
end
# Prime the parser: load the first character of the input into the one
# character look-ahead and skip a leading byte order mark (BOM) if present.
function startup(l::Parser)
    # Populate our one character look-ahead. `eat_char` returns the *previous*
    # look-ahead (the `EOF_CHAR` placeholder at this point), so its return
    # value carries no information about the input.
    eat_char(l)
    # Skip BOM: it has to be detected on the freshly loaded look-ahead
    if peek(l) === '\ufeff'
        # Undo the column increment caused by consuming the BOM
        l.column -= 1
        eat_char(l)
    end
    return nothing
end
# Parser over an empty document
function Parser()
    return Parser("")
end

# Parser over the complete remaining contents of `io`
function Parser(io::IO)
    contents = read(io, String)
    return Parser(contents)
end
# Reset `p` so that it can be reused to parse `str`: all bookkeeping is
# cleared, a fresh root table is installed and the look-ahead is re-populated.
# Returns `p`.
function reinit!(p::Parser, str::String; filepath::Union{Nothing, String}=nothing)
    # New input
    p.str = str
    p.filepath = filepath

    # Position bookkeeping back to the initial state
    p.current_char = EOF_CHAR
    p.pos = firstindex(str)
    p.prevpos = 0
    p.marker = 0
    p.line = 1
    p.column = 0

    # Fresh result table, which is also the active table
    fresh_root = TOMLDict()
    p.root = fresh_root
    p.active_table = fresh_root

    # Drop everything collected during a previous parse
    empty!(p.dotted_keys)
    empty!(p.chunks)
    empty!(p.inline_tables)
    empty!(p.static_arrays)
    empty!(p.defined_tables)

    startup(p)
    return p
end
##########
# Errors #
##########
# Raise an `ErrorException` flagged as an internal parser error.
function throw_internal_error(msg)
    return error("internal TOML parser error: $msg")
end
# Many functions return a ParserError. We want this to bubble up
# all the way and have this error be returned to the user
# if the parse is called with `raise=false`. This macro
# makes that easier
#
# `@try expr` evaluates `expr`; if the result is a `ParserError` the
# enclosing function returns it immediately, otherwise the result is the
# value of the `@try` expression.
# The macro is defined through `@eval` with `var"try"` because `try` is a
# reserved word.
@eval macro $(:var"try")(expr)
return quote
v = $(esc(expr))
v isa ParserError && return v
v
end
end
# Every kind of error the parser can report. Each member must have an entry
# in `err_message` (checked right after that dictionary is defined).
# TODO: Check all of these are used
@enum ErrorType begin
# Toplevel #
############
ErrRedefineTableArray
ErrExpectedNewLineKeyValue
ErrAddKeyToInlineTable
ErrAddArrayToStaticArray
ErrArrayTreatedAsDictionary
ErrExpectedEndOfTable
ErrExpectedEndArrayOfTable
# Keys #
########
ErrExpectedEqualAfterKey
# TODO: Check whether these two are the same
ErrDuplicatedKey
ErrKeyAlreadyHasValue
ErrInvalidBareKeyCharacter
ErrEmptyBareKey
# Values #
##########
ErrUnexpectedEofExpectedValue
ErrUnexpectedStartOfValue
ErrGenericValueError
# Arrays
ErrExpectedCommaBetweenItemsArray
# Inline tables
ErrExpectedCommaBetweenItemsInlineTable
ErrTrailingCommaInlineTable
# Numbers
ErrUnderscoreNotSurroundedByDigits
ErrLeadingZeroNotAllowedInteger
ErrOverflowError
ErrLeadingDot
ErrNoTrailingDigitAfterDot
ErrTrailingUnderscoreNumber
# DateTime
ErrParsingDateTime
ErrOffsetDateNotSupported
# Strings
ErrNewLineInString
ErrUnexpectedEndString
ErrInvalidEscapeCharacter
ErrInvalidUnicodeScalar
end
# Human readable message for every `ErrorType`; looked up when a
# `ParserError` is formatted for display.
const err_message = Dict(
    ErrTrailingCommaInlineTable             => "trailing comma not allowed in inline table",
    ErrExpectedCommaBetweenItemsArray       => "expected comma between items in array",
    ErrExpectedCommaBetweenItemsInlineTable => "expected comma between items in inline table",
    ErrExpectedEndArrayOfTable              => "expected array of table to end with ']]'",
    ErrInvalidBareKeyCharacter              => "invalid bare key character",
    ErrRedefineTableArray                   => "tried to redefine an existing table as an array",
    ErrDuplicatedKey                        => "key already defined",
    ErrKeyAlreadyHasValue                   => "key already has a value",
    ErrEmptyBareKey                         => "bare key cannot be empty",
    ErrExpectedNewLineKeyValue              => "expected newline after key value pair",
    ErrNewLineInString                      => "newline character in single quoted string",
    ErrUnexpectedEndString                  => "string literal ended unexpectedly",
    ErrExpectedEndOfTable                   => "expected end of table ']'",
    ErrAddKeyToInlineTable                  => "tried to add a new key to an inline table",
    ErrArrayTreatedAsDictionary             => "tried to add a key to an array",
    ErrAddArrayToStaticArray                => "tried to append to a statically defined array",
    ErrGenericValueError                    => "failed to parse value",
    ErrLeadingZeroNotAllowedInteger         => "leading zero in integer not allowed",
    ErrUnderscoreNotSurroundedByDigits      => "underscore is not surrounded by digits",
    ErrUnexpectedStartOfValue               => "unexpected start of value",
    ErrOffsetDateNotSupported               => "offset date-time is not supported",
    ErrParsingDateTime                      => "parsing date/time value failed",
    ErrTrailingUnderscoreNumber             => "trailing underscore in number",
    ErrLeadingDot                           => "floats require a leading zero",
    ErrExpectedEqualAfterKey                => "expected equal sign after key",
    ErrNoTrailingDigitAfterDot              => "expected digit after dot",
    ErrOverflowError                        => "overflowed when parsing integer",
    ErrInvalidUnicodeScalar                 => "invalid unicode scalar",
    ErrInvalidEscapeCharacter               => "invalid escape character",
    ErrUnexpectedEofExpectedValue           => "unexpected end of file, expected a value"
)

# Internal invariant, checked when the module is loaded: no error type may be
# left without a message.
for err in instances(ErrorType)
    @assert haskey(err_message, err) "$err does not have an error message"
end
# Error value produced by the parser. Parsing functions *return* it (see the
# `@try` macro) rather than throwing; `parse` throws it at the very end.
mutable struct ParserError <: Exception
type::ErrorType
# Arbitrary data to store at the
# call site to be used when formatting
# the error
data
# These are filled in before returning from parse function
str ::Union{String, Nothing}
filepath ::Union{String, Nothing}
line ::Union{Int, Nothing}
column ::Union{Int, Nothing}
pos ::Union{Int, Nothing} # position of parser when the error was returned
table ::Union{TOMLDict, Nothing} # result parsed until error
end
# Convenience constructors: the location fields start out as `nothing`
ParserError(type, data) = ParserError(type, data, nothing, nothing, nothing, nothing, nothing, nothing)
ParserError(type) = ParserError(type, nothing)
# Defining these below can be useful when debugging code that erroneously returns a
# ParserError because you get a stacktrace to where the ParserError was created
#ParserError(type) = error(type)
#ParserError(type, data) = error(type,data)
# Many functions return either a T or a ParserError
const Err{T} = Union{T, ParserError}
# Look up the message for `error.type`. For invalid bare key characters the
# offending character (stored in `error.data`) is appended in escaped form.
function format_error_message_for_err_type(error::ParserError)
    base_msg = err_message[error.type]
    error.type == ErrInvalidBareKeyCharacter || return base_msg
    offending = escape_string(string(error.data)::String)
    return base_msg * ": '$offending'"
end
# Used in error formatting to show the interval where an error happened.
# For example, point_to_line("aa\nfoobar\nbb", 4, 6, io) returns the strings
#   "foobar"
#   "^^^    "
# i.e. the line containing index `a` and a marker line with `^` under the
# indices `a` to `b` (a space is emitted for every other position, including
# the one at the end of the line).
# `context` supplies the IO properties (such as color) for both outputs.
# Right now, it is only called with a == b
function point_to_line(str::AbstractString, a::Int, b::Int, context)
    @assert b >= a
    a = thisind(str, a)
    b = thisind(str, b)
    # Start right after the newline preceding `a`, or at the first index
    prev_newline = findprev('\n', str, prevind(str, a))
    pos = prev_newline === nothing ? 1 : prev_newline + 1
    line_io = IOContext(IOBuffer(), context)
    marker_io = IOContext(IOBuffer(), context)
    while true
        if pos < a || pos > b
            print(marker_io, " ")
        else
            printstyled(marker_io, "^"; color=:light_green)
        end
        next = iterate(str, pos)
        next === nothing && break
        c, pos = next
        c == '\n' && break
        print(line_io, c)
    end
    return String(take!(line_io.io)), String(take!(marker_io.io))
end
# Print a `ParserError` as a header, a `file:line:column` location, the
# message for the error type and finally the offending source line with a
# marker underneath.
function Base.showerror(io::IO, err::ParserError)
    printstyled(io, "TOML Parser error:\n"; color=Base.error_color())
    location = something(err.filepath, "none")
    printstyled(io, location, ':', err.line, ':', err.column; bold=true)
    printstyled(io, " error: "; color=Base.error_color())
    println(io, format_error_message_for_err_type(err))
    # For an unexpected end of file the arrow is moved one position further
    arrow_pos = err.pos
    if err.type == ErrUnexpectedEofExpectedValue
        arrow_pos += 1
    end
    source_line, arrow_line = point_to_line(err.str, arrow_pos, arrow_pos, io)
    @static if VERSION <= v"1.6.0-DEV.121"
        # See https://github.com/JuliaLang/julia/issues/36015
        format_fixer = get(io, :color, false) == true ? "\e[0m" : ""
        println(io, "$format_fixer ", source_line)
        print(io, "$format_fixer ", arrow_line)
    else
        println(io, " ", source_line)
        print(io, " ", arrow_line)
    end
end
################
# Parser utils #
################
# Read the character at `l.pos` and advance the position, line and column
# bookkeeping. Returns `EOF_CHAR` once the input is exhausted.
@inline function next_char(l::Parser)::Char
    read_from = l.pos
    l.prevpos = read_from
    l.column += 1
    next = iterate(l.str, read_from)
    if next === nothing
        return EOF_CHAR
    end
    c, following = next
    l.pos = following
    if c == '\n'
        # A newline starts a new line and resets the column
        l.line += 1
        l.column = 0
    end
    return c
end
# Consume the look-ahead character: it is returned and replaced by the next
# character of the input.
@inline function eat_char(l::Parser)::Char
    eaten = l.current_char
    l.current_char = next_char(l)
    return eaten
end

# The look-ahead character, without consuming it
@inline function peek(l::Parser)
    return l.current_char
end
# Return true if the character was accepted. When a character
# is accepted it gets eaten and we move to the next character.
# `f` is either a predicate on the look-ahead character or the exact
# character to match. `EOF_CHAR` is never accepted.
@inline function accept(l::Parser, f::Union{Function, Char})::Bool
    c = peek(l)
    c == EOF_CHAR && return false
    ok = f isa Char ? c === f : f(c)
    if ok
        eat_char(l)
    end
    return ok
end
# Keep accepting characters for as long as `f` matches.
# Return true if any character was accepted
function accept_batch(l::Parser, f::F)::Bool where {F}
    accept(l, f) || return false
    while accept(l, f)
    end
    return true
end

# Return true if `f` was accepted `n` times. Stops at the first character
# that is not accepted; the characters accepted before it stay consumed.
@inline function accept_n(l::Parser, n, f::F)::Bool where {F}
    for _ in 1:n
        accept(l, f) || return false
    end
    return true
end
# Space or tab
@inline function iswhitespace(c::Char)
    return c in (' ', '\t')
end

# Line feed or carriage return
@inline function isnewline(c::Char)
    return c in ('\n', '\r')
end
# Skip spaces and tabs; true if anything was skipped
function skip_ws(l::Parser)
    return accept_batch(l, iswhitespace)
end

# Skip spaces, tabs and newline characters (but no comments); true if
# anything was skipped
function skip_ws_nl_no_comment(l::Parser)::Bool
    return accept_batch(l, c -> iswhitespace(c) || isnewline(c))
end
# Skip any mix of whitespace, newlines and comments. Returns true if
# anything at all was skipped.
function skip_ws_nl(l::Parser)::Bool
    any_skipped = false
    while true
        # Both steps run on every round; stop once neither made progress
        blanks = accept_batch(l, c -> iswhitespace(c) || isnewline(c))
        comment = skip_comment(l)
        (blanks || comment) || return any_skipped
        any_skipped = true
    end
end
# Returns true if a comment was skipped. A comment starts at '#' and
# extends up to (not including) the next newline character.
function skip_comment(l::Parser)::Bool
    accept(l, '#') || return false
    accept_batch(l, !isnewline)
    return true
end

# Skip whitespace followed by a comment.
# NOTE: the comment is only looked for when at least one whitespace
# character was skipped first.
function skip_ws_comment(l::Parser)
    return skip_ws(l) && skip_comment(l)
end
# Remember the start byte of the look-ahead character as the beginning of
# the substring that `take_substring` will return.
@inline function set_marker!(l::Parser)
    l.marker = l.prevpos
    return l.marker
end

# The text from the marker up to the end of the character last eaten
function take_substring(l::Parser)
    last_byte = l.prevpos - 1
    return SubString(l.str, l.marker:last_byte)
end
############
# Toplevel #
############
# Driver: run `tryparse` and turn a returned `ParserError` into a thrown
# exception; otherwise return the parsed table.
function parse(l::Parser)::TOMLDict
    result = tryparse(l)
    if result isa ParserError
        throw(result)
    end
    return result
end
# Keep parsing toplevel statements until we either get a `ParserError` or
# reach the end of the input. A `ParserError` is annotated with the location
# information of the parser and returned instead of the table.
function tryparse(l::Parser)::Err{TOMLDict}
    skip_ws_nl(l)
    while peek(l) != EOF_CHAR
        err = parse_toplevel(l)
        if err isa ParserError
            # Fill in where the error happened and what was parsed so far
            err.filepath = l.filepath
            err.str = l.str
            err.line = l.line
            err.column = l.column - 1
            err.pos = l.prevpos - 1
            err.table = l.root
            return err
        end
        skip_ws_nl(l)
    end
    return l.root
end
# Top level can be either a table key, an array of table statement
# or a key/value entry.
# Returns `nothing` on success and a `ParserError` otherwise.
function parse_toplevel(l::Parser)::Err{Nothing}
    if accept(l, '[')
        # A table header always starts from the root table
        l.active_table = l.root
        @try parse_table(l)
    else
        @try parse_entry(l, l.active_table)
    end
    # Skip trailing whitespace and an optional comment. This is done in two
    # independent steps since `skip_ws_comment` only skips a comment that is
    # preceded by whitespace, which would reject e.g. `a = 1# comment`.
    skip_ws(l)
    skip_comment(l)
    # SPEC: "There must be a newline (or EOF) after a key/value pair."
    if !(peek(l) == '\n' || peek(l) == '\r' || peek(l) == EOF_CHAR)
        eat_char(l)
        return ParserError(ErrExpectedNewLineKeyValue)
    end
    return nothing
end
# Walk from `d` along `dotted_keys`, creating missing tables on the way, and
# return the table reached at the end. When a key refers to an array the
# walk continues in the last element of that array.
# With `check=true` every visited value is validated with
# `check_allowed_add_key`; only the last one is checked against the set of
# already defined tables.
# Returns a `ParserError` when the path runs into something that is not a
# table (a plain value, or an array with no table to descend into).
function recurse_dict!(l::Parser, d::Dict, dotted_keys::AbstractVector{String}, check=true)::Err{TOMLDict}
    nkeys = length(dotted_keys)
    for (i, key) in enumerate(dotted_keys)
        d = get!(TOMLDict, d::TOMLDict, key)
        if d isa Vector
            # An empty array has no last element to descend into
            isempty(d) && return ParserError(ErrKeyAlreadyHasValue)
            d = d[end]
        end
        check && @try check_allowed_add_key(l, d, i == nkeys)
        # Without `check` nothing else guards against descending into a
        # plain value; report it instead of failing a type assertion
        d isa TOMLDict || return ParserError(ErrKeyAlreadyHasValue)
    end
    return d::TOMLDict
end
# Check that keys may be added below `d`:
#  - `d` must be a table,
#  - `d` must not be an inline table,
#  - with `check_defined`, `d` must not be an already defined table.
# Returns `nothing` when allowed and the matching `ParserError` otherwise.
function check_allowed_add_key(l::Parser, d, check_defined=true)::Err{Nothing}
    d isa Dict || return ParserError(ErrKeyAlreadyHasValue)
    d in l.inline_tables && return ParserError(ErrAddKeyToInlineTable)
    if check_defined && d in l.defined_tables
        return ParserError(ErrDuplicatedKey)
    end
    return nothing
end
# Can only enter here from toplevel, after the opening '[' has been eaten.
# Parses a `[table]` header (or hands over to `parse_array_table` for `[[`),
# makes that table the active one and records it as defined.
function parse_table(l)
    # A second '[' means this is an array of tables
    accept(l, '[') && return parse_array_table(l)
    table_key = @try parse_key(l)
    skip_ws(l)
    accept(l, ']') || return ParserError(ErrExpectedEndOfTable)
    table = @try recurse_dict!(l, l.root, table_key)
    l.active_table = table
    push!(l.defined_tables, table)
    return
end
# Parse the rest of a `[[array.of.tables]]` header (both opening brackets
# have been eaten), append a new table to the array and make that table the
# active one.
function parse_array_table(l)::Union{Nothing, ParserError}
    table_key = @try parse_key(l)
    skip_ws(l)
    closed = accept(l, ']') && accept(l, ']')
    closed || return ParserError(ErrExpectedEndArrayOfTable)
    # Walk to the parent table without the "already defined" checks
    parent = @try recurse_dict!(l, l.root, @view(table_key[1:end-1]), false)
    name = table_key[end]
    tables = get!(() -> [], parent, name)
    tables isa Vector || return ParserError(ErrArrayTreatedAsDictionary)
    # Arrays written inline as values can not be appended to
    tables in l.static_arrays && return ParserError(ErrAddArrayToStaticArray)
    fresh = TOMLDict()
    push!(tables, fresh)
    push!(l.defined_tables, fresh)
    l.active_table = fresh
    return
end
# Parse one `key = value` entry and store it in `d`. For a dotted key the
# value ends up in the sub-table addressed by all but the last key part.
function parse_entry(l::Parser, d)::Union{Nothing, ParserError}
    key = @try parse_key(l)
    skip_ws(l)
    accept(l, '=') || return ParserError(ErrExpectedEqualAfterKey)
    if length(key) > 1
        d = @try recurse_dict!(l, d, @view(key[1:end-1]))
    end
    # Grab the last key part now: the key cache is emptied again by any key
    # parsed while reading the value
    last_key_part = l.dotted_keys[end]
    existing = get(d, last_key_part, nothing)
    if existing !== nothing
        @try check_allowed_add_key(l, existing)
    end
    skip_ws(l)
    value = @try parse_value(l)
    # TODO: Performance, hashing `last_key_part` again here
    d[last_key_part] = value
    return
end
########
# Keys #
########
# SPEC: "Bare keys may only contain ASCII letters, ASCII digits, underscores,
# and dashes (A-Za-z0-9_-).
# Note that bare keys are allowed to be composed of only ASCII digits, e.g. 1234,
# but are always interpreted as strings."
# True for the characters allowed in a bare key: A-Z, a-z, digits, '-', '_'
@inline function isvalid_barekey_char(c::Char)
    c == '-' && return true
    c == '_' && return true
    return isdigit(c) || ('a' <= c <= 'z') || ('A' <= c <= 'Z')
end
# Parse a (possibly dotted) key. The key cache `l.dotted_keys` is emptied
# first and then filled with the parts of the key; that same vector is
# returned on success.
function parse_key(l::Parser)
    empty!(l.dotted_keys)
    return _parse_key(l)
end
# Recursively add dotted keys to `l.dotted_keys`.
# One key part (a basic string, a literal string or a bare key) is parsed per
# call; after a '.' the function calls itself for the next part.
# Returns `l.dotted_keys` on success and a `ParserError` otherwise.
function _parse_key(l::Parser)
    skip_ws(l)
    # SPEC: "A bare key must be non-empty,"
    if isempty(l.dotted_keys) && accept(l, '=')
        return ParserError(ErrEmptyBareKey)
    end
    keyval = if accept(l, '"')
        @try parse_string_start(l, false)
    elseif accept(l, '\'')
        @try parse_string_start(l, true)
    else
        set_marker!(l)
        if accept_batch(l, isvalid_barekey_char)
            # A bare key must be followed by a dot, whitespace (space or
            # tab), the end of a table header or the equal sign
            if !(peek(l) == '.' || iswhitespace(peek(l)) || peek(l) == ']' || peek(l) == '=')
                c = eat_char(l)
                return ParserError(ErrInvalidBareKeyCharacter, c)
            end
            String(take_substring(l))
        else
            c = eat_char(l)
            return ParserError(ErrInvalidBareKeyCharacter, c)
        end
    end
    push!(l.dotted_keys, keyval)
    # SPEC: "Whitespace around dot-separated parts is ignored."
    skip_ws(l)
    if accept(l, '.')
        skip_ws(l)
        @try _parse_key(l)
    end
    return l.dotted_keys
end
##########
# Values #
##########
# Parse a value, picking the concrete parser from the first character.
# A parser that yields `nothing` is reported as a generic value error.
function parse_value(l::Parser)
    parsed = if accept(l, '[')
        parse_array(l)
    elseif accept(l, '{')
        parse_inline_table(l)
    elseif accept(l, '"')
        parse_string_start(l, false)
    elseif accept(l, '\'')
        parse_string_start(l, true)
    elseif accept(l, 't')
        parse_bool(l, true)
    elseif accept(l, 'f')
        parse_bool(l, false)
    else
        parse_number_or_date_start(l)
    end
    return parsed === nothing ? ParserError(ErrGenericValueError) : parsed
end
#########
# Array #
#########
# Push `el` onto `v`, widening the element type when necessary.
# If `el` fits the element type of `v` it is pushed in place and `v` itself
# is returned. Otherwise a new vector is returned whose element type is the
# union of the old element type and `typeof(el)`, or `Any` if the old element
# type already was a `Union`.
function push!!(v::Vector, el)
    T = eltype(v)
    if el isa T
        push!(v, el::T)
        return v
    end
    widened_type = T isa Union ? Any : Union{T, typeof(el)}
    widened = Array{widened_type}(undef, length(v))
    copy!(widened, v)
    return push!(widened, el)
end
# Parse an array value; the opening '[' has already been eaten. The result
# is registered in `l.static_arrays`.
function parse_array(l::Parser)::Err{Vector}
    skip_ws_nl(l)
    array = Vector{Union{}}()
    if !accept(l, ']')
        while true
            v = @try parse_value(l)
            # TODO: Worth to function barrier this?
            array = push!!(array, v)
            # There can be an arbitrary number of newlines and comments before a value and before the closing bracket.
            skip_ws_nl(l)
            comma = accept(l, ',')
            skip_ws_nl(l)
            accept(l, ']') && break
            comma || return ParserError(ErrExpectedCommaBetweenItemsArray)
        end
    end
    push!(l.static_arrays, array)
    return array
end
################
# Inline table #
################
# Parse an inline table; the opening '{' has already been eaten. The table
# is registered in `l.inline_tables` before its entries are parsed.
function parse_inline_table(l::Parser)::Err{TOMLDict}
    dict = TOMLDict()
    push!(l.inline_tables, dict)
    skip_ws(l)
    accept(l, '}') && return dict
    while true
        @try parse_entry(l, dict)
        # SPEC: No newlines are allowed between the curly braces unless they are valid within a value.
        skip_ws(l)
        accept(l, '}') && return dict
        accept(l, ',') || return ParserError(ErrExpectedCommaBetweenItemsInlineTable)
        skip_ws(l)
        # A closing brace right after a comma is a trailing comma
        accept(l, '}') && return ParserError(ErrTrailingCommaInlineTable)
    end
end
###########
# Numbers #
###########
# The 'i' has been eaten; expect "nf" and return the signed infinity,
# `nothing` otherwise.
function parse_inf(l::Parser, sgn::Int)
    if accept(l, 'n') && accept(l, 'f')
        return sgn * Inf
    end
    return nothing
end

# The 'n' has been eaten; expect "an" and return `NaN`, `nothing` otherwise.
function parse_nan(l::Parser)
    if accept(l, 'a') && accept(l, 'n')
        return NaN
    end
    return nothing
end

# Have eaten a 't' if `v` is true, otherwise have eaten a `f`.
# Accept the remaining letters of "true" / "false" one by one and return
# `v` if all of them matched, `nothing` otherwise.
function parse_bool(l::Parser, v::Bool)::Union{Bool, Nothing}
    remaining = v ? ('r', 'u', 'e') : ('a', 'l', 's', 'e')
    for c in remaining
        accept(l, c) || return nothing
    end
    return v
end
# Digit predicates for hexadecimal, octal and binary literals
function isvalid_hex(c::Char)
    isdigit(c) && return true
    return (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
end
isvalid_oct(c::Char) = c >= '0' && c <= '7'
isvalid_binary(c::Char) = c == '0' || c == '1'

# The digit predicates that `accept_batch_underscore` can be called with
const ValidSigs = Union{
    typeof(isvalid_hex),
    typeof(isvalid_oct),
    typeof(isvalid_binary),
    typeof(isdigit),
}
# This function eats things accepted by `f` but also allows eating `_` in between
# digits. Returns whether it ate at least one character accepted by `f` and
# whether it ate an underscore, or a `ParserError` for a misplaced underscore.
# `fail_if_underscore` tells whether an underscore as the very first
# character is an error.
function accept_batch_underscore(l::Parser, f::ValidSigs, fail_if_underscore=true)::Err{Tuple{Bool, Bool}}
contains_underscore = false
at_least_one = false
last_underscore = false
while true
c = peek(l)
if c == '_'
contains_underscore = true
if fail_if_underscore
return ParserError(ErrUnderscoreNotSurroundedByDigits)
end
eat_char(l)
# Two underscores in a row are not allowed
fail_if_underscore = true
last_underscore = true
else
# SPEC: "Each underscore must be surrounded by at least one digit on each side."
fail_if_underscore = false
if f(c)
at_least_one = true
eat_char(l)
else
# The batch ends here; it must not end right after an underscore
if last_underscore
return ParserError(ErrTrailingUnderscoreNumber)
end
return at_least_one, contains_underscore
end
last_underscore = false
end
end
end
# Parse a value that starts like a number: an integer (decimal, hex, octal or
# binary), a float (including inf / nan), or the beginning of a date, a
# date-time or a local time, which are handed over to the dedicated parsers.
# The marker is set at the first character so that the text of the whole
# value can later be taken with `take_substring`.
# Returns the parsed value, a `ParserError`, or `nothing` when "inf" / "nan"
# is not spelled out completely.
function parse_number_or_date_start(l::Parser)
    set_marker!(l)
    sgn = 1
    if accept(l, '+')
        # do nothing
    elseif accept(l, '-')
        sgn = -1
    end
    if accept(l, 'i')
        return parse_inf(l, sgn)
    elseif accept(l, 'n')
        return parse_nan(l)
    end

    if accept(l, '.')
        return ParserError(ErrLeadingDot)
    end

    # Zero is allowed to follow by a end value char, a base x, o, b or a dot
    readed_zero = false
    if accept(l, '0')
        readed_zero = true # Intentional bad grammar to remove the ambiguity in "read"...
        if ok_end_value(peek(l))
            return Int64(0)
        elseif accept(l, 'x')
            ate, contains_underscore = @try accept_batch_underscore(l, isvalid_hex)
            ate && return parse_int(l, contains_underscore)
        elseif accept(l, 'o')
            ate, contains_underscore = @try accept_batch_underscore(l, isvalid_oct)
            ate && return parse_int(l, contains_underscore)
        elseif accept(l, 'b')
            ate, contains_underscore = @try accept_batch_underscore(l, isvalid_binary)
            ate && return parse_int(l, contains_underscore)
        elseif accept(l, isdigit)
            # A zero followed by a digit is treated as the hour of a local time
            return parse_local_time(l)
        elseif peek(l) !== '.'
            return ParserError(ErrLeadingZeroNotAllowedInteger)
        end
    end

    # `read_underscore` accumulates whether *any* part of the number
    # (integer part, fraction, exponent) contains an underscore
    read_underscore = false
    read_digit = accept(l, isdigit)
    if !readed_zero && !read_digit
        if peek(l) == EOF_CHAR
            return ParserError(ErrUnexpectedEofExpectedValue)
        else
            return ParserError(ErrUnexpectedStartOfValue)
        end
    end
    ate, contains_underscore = @try accept_batch_underscore(l, isdigit, readed_zero)
    read_underscore |= contains_underscore
    if (read_digit || ate) && ok_end_value(peek(l))
        return parse_int(l, contains_underscore)
    end
    # Done with integers here

    if !read_underscore
        # No underscores in date / times
        if peek(l) == '-'
            return parse_datetime(l)
        elseif peek(l) == ':'
            return parse_local_time(l)
        end
    end
    # Done with datetime / localtime here

    # can optionally read a . + digits and then exponent
    ate_dot = accept(l, '.')
    ate, contains_underscore = @try accept_batch_underscore(l, isdigit, true)
    if ate_dot && !ate
        return ParserError(ErrNoTrailingDigitAfterDot)
    end
    read_underscore |= contains_underscore

    if accept(l, x -> x == 'e' || x == 'E')
        accept(l, x -> x == '+' || x == '-')
        # SPEC: (which follows the same rules as decimal integer values but may include leading zeros)
        read_digit = accept_batch(l, isdigit)
        ate, exponent_underscore = @try accept_batch_underscore(l, isdigit, !read_digit)
        # Accumulate instead of overwriting: an underscore seen before the
        # exponent (e.g. `1_000e3`) still has to be filtered out
        read_underscore |= exponent_underscore
    end
    if !ok_end_value(peek(l))
        eat_char(l)
        return ParserError(ErrGenericValueError)
    end
    return parse_float(l, read_underscore)
end
# The text between the marker and the current position. If it contains
# underscores they are removed, which requires materializing a new string.
function take_string_or_substring(l, contains_underscore)::SubString
    subs = take_substring(l)
    contains_underscore || return subs
    return SubString(filter(!=('_'), subs))
end

# Interpret the marked text as a `Float64`
function parse_float(l::Parser, contains_underscore)::Err{Float64}
    text = take_string_or_substring(l, contains_underscore)
    parsed = Base.tryparse(Float64, text)
    return parsed === nothing ? ParserError(ErrGenericValueError) : parsed
end
# Interpret the marked text as an `Int64` in the given `base` (`nothing`
# leaves the choice of base to `Base.parse`). An overflow is reported as a
# `ParserError`; any other failure is raised as an internal error since the
# text is expected to have been validated before getting here.
function parse_int(l::Parser, contains_underscore, base=nothing)::Err{Int64}
    s = take_string_or_substring(l, contains_underscore)
    try
        return Base.parse(Int64, s; base=base)
    catch e
        if e isa Base.OverflowError
            return ParserError(ErrOverflowError)
        end
        error("internal parser error: did not correctly discredit $(repr(s)) as an int")
    end
end
##########################
# Date / Time / DateTime #
##########################
# True if `c` may directly follow a value: whitespace, the start of a
# comment, end of input, a closing bracket / brace, a comma or a newline
# character.
function ok_end_value(c::Char)
    iswhitespace(c) && return true
    c == EOF_CHAR && return true
    return c in ('#', ']', '}', ',', '\n', '\r')
end
#=
# https://tools.ietf.org/html/rfc3339
# Internet Protocols MUST generate four digit years in dates.
date-fullyear = 4DIGIT
date-month = 2DIGIT ; 01-12
date-mday = 2DIGIT ; 01-28, 01-29, 01-30, 01-31 based on
; month/year
time-hour = 2DIGIT ; 00-23
time-minute = 2DIGIT ; 00-59
time-second = 2DIGIT ; 00-58, 00-59, 00-60 based on leap second
; rules
time-secfrac = "." 1*DIGIT
time-numoffset = ("+" / "-") time-hour ":" time-minute
time-offset = "Z" / time-numoffset
partial-time = time-hour ":" time-minute ":" time-second
[time-secfrac]
full-date = date-fullyear "-" date-month "-" date-mday
full-time = partial-time time-offset
date-time = full-date "T" full-time
=#
# Accept exactly two characters matching `f`; returns `true` on success and
# a date/time `ParserError` otherwise.
function accept_two(l, f::F) where {F}
    accept_n(l, 2, f) && return true
    return ParserError(ErrParsingDateTime)
end
# Parse a date or a date-time. Called with the look-ahead at the '-' that
# follows the year; the year digits are taken from the marker set by the
# caller. Returns a date, a date-time or a `ParserError`.
function parse_datetime(l)
# Year has already been eaten when we reach here
year = @try parse_int(l, false)
year in 0:9999 || return ParserError(ErrParsingDateTime)
# Month
accept(l, '-') || return ParserError(ErrParsingDateTime)
set_marker!(l)
@try accept_two(l, isdigit)
month = @try parse_int(l, false)
month in 1:12 || return ParserError(ErrParsingDateTime)
accept(l, '-') || return ParserError(ErrParsingDateTime)
# Day
set_marker!(l)
@try accept_two(l, isdigit)
day = @try parse_int(l, false)
# Verify the real range in the constructor below
day in 1:31 || return ParserError(ErrParsingDateTime)
# We might have a local date now
read_space = false
if ok_end_value(peek(l))
# A single space may separate the date from a time; if no digit follows
# the space the value is just a date
if (read_space = accept(l, ' '))
if !isdigit(peek(l))
return try_return_date(l, year, month, day)
end
else
return try_return_date(l, year, month, day)
end
end
# Without a space, the date and time must be separated by 'T' or 't'
if !read_space
accept(l, 'T') || accept(l, 't') || return ParserError(ErrParsingDateTime)
end
h, m, s, ms = @try _parse_local_time(l)
# Julia doesn't support offset times
if !accept(l, 'Z')
if accept(l, '+') || accept(l, '-')
return ParserError(ErrOffsetDateNotSupported)
end
end
if !ok_end_value(peek(l))
return ParserError(ErrParsingDateTime)
end
# The DateTime parser verifies things like leap year for us
return try_return_datetime(l, year, month, day, h, m, s, ms)
end
# Build the date-time value. With the Dates module available its `DateTime`
# is used and a failing constructor is reported as a `ParserError`;
# otherwise the internal `DateTime` struct is returned.
function try_return_datetime(p, year, month, day, h, m, s, ms)
    Dates = p.Dates
    Dates === nothing && return DateTime(year, month, day, h, m, s, ms)
    try
        return Dates.DateTime(year, month, day, h, m, s, ms)
    catch
        return ParserError(ErrParsingDateTime)
    end
end

# Build the date value. With the Dates module available its `Date` is used
# and a failing constructor is reported as a `ParserError`; otherwise the
# internal `Date` struct is returned.
function try_return_date(p, year, month, day)
    Dates = p.Dates
    Dates === nothing && return Date(year, month, day)
    try
        return Dates.Date(year, month, day)
    catch
        return ParserError(ErrParsingDateTime)
    end
end
# Parse a local time whose hour digits have already been eaten; the hour is
# taken from the marked text and the rest is read by `_parse_local_time`.
function parse_local_time(l::Parser)
    hour = @try parse_int(l, false)
    if !(hour in 0:23)
        return ParserError(ErrParsingDateTime)
    end
    # The hour slot of the tuple is a placeholder when the hour is skipped
    _, minute, second, ms = @try _parse_local_time(l, true)
    # TODO: Could potentially parse greater accuracy for the
    # fractional seconds here.
    return try_return_time(l, hour, minute, second, ms)
end
# Build the time value. With the Dates module available its `Time` is used
# and a failing constructor is reported as a `ParserError`; otherwise the
# internal `Time` struct is returned.
function try_return_time(p, h, m, s, ms)
    Dates = p.Dates
    Dates === nothing && return Time(h, m, s, ms)
    try
        return Dates.Time(h, m, s, ms)
    catch
        return ParserError(ErrParsingDateTime)
    end
end
# Parse `HH:MM:SS[.fraction]` and return `(hour, minute, second, millisecond)`
# or a `ParserError`.
# With `skip_hour=true` the hour digits are expected to have been consumed by
# the caller already; parsing then starts at the first ':' and the returned
# hour is a placeholder `0`.
# Only the first three fractional digits are used (millisecond precision);
# further digits are consumed and dropped.
function _parse_local_time(l::Parser, skip_hour=false)::Err{NTuple{4, Int64}}
    # Hour has potentially been already parsed in
    # `parse_number_or_date_start` already
    if skip_hour
        hour = Int64(0)
    else
        set_marker!(l)
        @try accept_two(l, isdigit)
        hour = @try parse_int(l, false)
        hour in 0:23 || return ParserError(ErrParsingDateTime)
    end

    accept(l, ':') || return ParserError(ErrParsingDateTime)

    # minute
    set_marker!(l)
    @try accept_two(l, isdigit)
    minute = @try parse_int(l, false)
    minute in 0:59 || return ParserError(ErrParsingDateTime)

    accept(l, ':') || return ParserError(ErrParsingDateTime)

    # second
    set_marker!(l)
    @try accept_two(l, isdigit)
    second = @try parse_int(l, false)
    second in 0:59 || return ParserError(ErrParsingDateTime)

    # optional fractional second
    fractional_second = Int64(0)
    if accept(l, '.')
        set_marker!(l)
        # DateTime in base only manages 3 significant digits in fractional
        # second
        ndigits_read = 0
        while ndigits_read < 3 && accept(l, isdigit)
            ndigits_read += 1
        end
        if ndigits_read == 0
            return ParserError(ErrParsingDateTime)
        end
        fraction = @try parse_int(l, false)
        # Scale to milliseconds: ".5" is 500 ms and ".05" is 50 ms
        fractional_second = fraction * 10^(3 - ndigits_read)
        # Truncate off the rest eventual digits
        accept_batch(l, isdigit)
    end
    return hour, minute, second, fractional_second
end
##########
# String #
##########
# Entry point for parsing a string once the opening quote has been eaten.
# Detects the empty string and the three-quote multiline form, then hands
# over to `parse_string_continue`.
function parse_string_start(l::Parser, quoted::Bool)::Err{String}
# Have eaten a `'` if `quoted` is true, otherwise have eaten a `"`
multiline = false
c = quoted ? '\'' : '"'
if accept(l, c) # Eat second quote
if !accept(l, c) # Try to eat third quote
# Only two quotes in a row: the empty string
return ""
end
# A newline (optionally preceded by a carriage return) directly after the
# opening quotes is dropped
accept(l, '\r')
accept(l, '\n')
multiline = true
end
return parse_string_continue(l, multiline, quoted)
end
# Predicates for the characters that can be consumed in bulk while scanning
# a string; they return false for the characters the scanner has to stop at
# (the closing quote, a backslash and, for single line strings, a line feed).
@inline stop_candidates_multiline(x) = !(x == '"' || x == '\\')
@inline stop_candidates_singleline(x) = !(x == '"' || x == '\\' || x == '\n')
@inline stop_candidates_multiline_quoted(x) = !(x == '\'' || x == '\\')
@inline stop_candidates_singleline_quoted(x) = !(x == '\'' || x == '\\' || x == '\n')
# Scan the body of a string up to its closing quote(s) and return its value.
# `multiline` selects the three-quote form and `quoted` the literal (single
# quote) form, in which backslashes have no special meaning.
# The raw text is collected as byte ranges in `l.chunks`; a new chunk is
# started after every line continuation. `take_chunks` joins the chunks and
# unescapes the result if an escape sequence was seen.
function parse_string_continue(l::Parser, multiline::Bool, quoted::Bool)::Err{String}
    start_chunk = l.prevpos
    q = quoted ? '\'' : '"'
    contains_backslash = false
    # Number of closing quote bytes to leave out of the final chunk
    offset = multiline ? 3 : 1
    while true
        if peek(l) == EOF_CHAR
            return ParserError(ErrUnexpectedEndString)
        end
        # Consume everything up to the next character that needs a closer look
        if quoted
            accept_batch(l, multiline ? stop_candidates_multiline_quoted : stop_candidates_singleline_quoted)
        else
            accept_batch(l, multiline ? stop_candidates_multiline : stop_candidates_singleline)
        end
        if !multiline && peek(l) == '\n'
            return ParserError(ErrNewLineInString)
        end
        next_slash = peek(l) == '\\'
        if !next_slash
            # TODO: Doesn't handle values with e.g. format `""""str""""`
            if accept(l, q) && (!multiline || (accept(l, q) && accept(l, q)))
                push!(l.chunks, start_chunk:(l.prevpos-offset-1))
                return take_chunks(l, contains_backslash)
            end
        end
        c = eat_char(l) # eat the character we stopped at
        next_slash = c == '\\'
        if next_slash && !quoted
            if peek(l) == '\n' || peek(l) == '\r'
                # Line continuation: close the current chunk before the
                # backslash and resume after the following whitespace
                push!(l.chunks, start_chunk:(l.prevpos-1-1)) # -1 due to eating the slash
                skip_ws_nl_no_comment(l)
                start_chunk = l.prevpos
            else
                c = eat_char(l) # eat the escaped character
                if c == 'u' || c == 'U'
                    # SPEC: \uXXXX takes four and \UXXXXXXXX eight hex digits
                    n = c == 'u' ? 4 : 8
                    set_marker!(l)
                    if !accept_n(l, n, isvalid_hex)
                        return ParserError(ErrInvalidUnicodeScalar)
                    end
                    codepoint = parse_int(l, false, 16)::Int64
                    #=
                    Unicode Scalar Value
                    ---------------------
                    Any Unicode code point except high-surrogate and
                    low-surrogate code points. In other words, the ranges of
                    integers 0 to D7FF16 and E00016 to 10FFFF16 inclusive.
                    =#
                    if !(codepoint <= 0xD7FF || 0xE000 <= codepoint <= 0x10FFFF)
                        return ParserError(ErrInvalidUnicodeScalar)
                    end
                elseif c != 'b' && c != 't' && c != 'n' && c != 'f' && c != 'r' && c != '"' && c != '\\'
                    return ParserError(ErrInvalidEscapeCharacter)
                end
                contains_backslash = true
            end
        end
    end
end
# Concatenate the byte ranges stored in `l.chunks` into a new string, empty
# the chunk list and, if `unescape` is true, run the result through
# `unescape_string`.
function take_chunks(l::Parser, unescape::Bool)::String
# Total number of bytes over all chunks
nbytes = sum(length, l.chunks)
str = Base._string_n(nbytes)
offset = 1
for chunk in l.chunks
# The chunks are byte ranges into `l.str`; each one is copied raw to the
# next free position of the new string
n = length(chunk)
GC.@preserve str begin
unsafe_copyto!(pointer(str, offset), pointer(l.str, first(chunk)), n)
end
offset += n
end
empty!(l.chunks)
return unescape ? unescape_string(str) : str
end
end