swh:1:snp:a72e953ecd624a7df6e6196bbdd05851996c5e40
Tip revision: 5b2a6cfcb4173a0c2af62a3ca14d2d555be71316 authored by tan on 24 April 2015, 11:55:49 UTC
readdlm: use column vectors, estimate dims & types
readdlm: use column vectors, estimate dims & types
Tip revision: 5b2a6cf
datafmt.jl
## file formats ##
module DataFmt
importall Base
import Base: _default_delims, tryparse_internal
export countlines, readdlm, readcsv, writedlm, writecsv
# Sentinel delimiter meaning "the caller did not specify a delimiter".
# The readdlm methods without a `dlm` argument pass it down; readdlm_string and dlm_parse
# compare against it to enable the default-delimiter handling.
const invalid_dlm = Char(0xfffffffe)
# Chunk size constant. It is not referenced by any code in this file
# (presumably left over from the offset-based handler -- TODO confirm before removing).
const offs_chunk_size = 5000
# Count lines in `nameorfile` (a file name or an IO stream) with '\n' as the terminator.
function countlines(nameorfile)
    return countlines(nameorfile, '\n')
end

# Open the file `filename` and count its lines, using `eol` as the terminator.
function countlines(filename::AbstractString, eol::Char)
    return open(io -> countlines(io, eol), filename)
end

# Count lines read from `io`, using `eol` as the terminator.
#
# A line is counted when the first non-`eol` byte is seen at the start of the stream or
# after an `eol`, so runs of consecutive terminators (empty lines) do not add to the count
# and a final line without a trailing `eol` is still counted.
#
# Throws ArgumentError if `eol` is not an ASCII character.
function countlines(io::IO, eol::Char)
    isascii(eol) || throw(ArgumentError("only ASCII line terminators are supported"))
    eolbyte = UInt8(eol)
    buf = Array(UInt8, 8192)
    count = 0
    at_line_start = true
    while !eof(io)
        nread = readbytes!(io, buf)
        for k in 1:nread
            if buf[k] == eolbyte
                at_line_start = true
            elseif at_line_start
                count += 1
                at_line_start = false
            end
        end
    end
    return count
end
# readdlm entry points.
#
# The shorter signatures fill in the missing arguments and forward to the longer ones:
#   * a missing delimiter becomes `invalid_dlm`
#   * a missing end-of-line character becomes '\n'
# The two complete signatures call readdlm_auto: without an element type it is called with
# `Any` and `auto = true`; with an explicit `T` it is called with `auto = false`.
function readdlm(input, T::Type; opts...)
    return readdlm(input, invalid_dlm, T, '\n'; opts...)
end

function readdlm(input, dlm::Char, T::Type; opts...)
    return readdlm(input, dlm, T, '\n'; opts...)
end

function readdlm(input; opts...)
    return readdlm(input, invalid_dlm, '\n'; opts...)
end

function readdlm(input, dlm::Char; opts...)
    return readdlm(input, dlm, '\n'; opts...)
end

function readdlm(input, dlm::Char, eol::Char; opts...)
    return readdlm_auto(input, dlm, Any, eol, true; opts...)
end

function readdlm(input, dlm::Char, T::Type, eol::Char; opts...)
    return readdlm_auto(input, dlm, T, eol, false; opts...)
end
# Validate the keyword options, load `input` into a string buffer and parse it.
#
# `input` may be:
#   * a file name (AbstractString): the file is memory mapped when `use_mmap` is set, the
#     file is non-empty and its size is below typemax(Int); otherwise it is read with readall
#   * a Vector{UInt8}: converted with bytestring
#   * an IO: read completely with readall
#   * anything else: handed to readdlm_string unchanged
#
# `use_mmap` defaults to false on Windows and true elsewhere.
function readdlm_auto(input, dlm::Char, T::Type, eol::Char, auto::Bool; opts...)
    optsd = val_opts(opts)
    use_mmap = get(optsd, :use_mmap, @windows ? false : true)

    if isa(input, AbstractString)
        fsz = filesize(input)
        if use_mmap && (fsz > 0) && (fsz < typemax(Int))
            input = as_mmap(input, fsz)
        else
            input = readall(input)
        end
    end

    if isa(input, Vector{UInt8})
        sinp = bytestring(input)
    elseif isa(input, IO)
        sinp = readall(input)
    else
        sinp = input
    end

    return readdlm_string(sinp, dlm, T, eol, auto, optsd)
end
# Memory map the first `fsz` bytes of the file `fname` and return them as a UInt8 array.
# The file is opened only for the duration of the mmap_array call.
function as_mmap(fname::AbstractString, fsz::Int64)
    dims = (Int(fsz),)
    return open(io -> mmap_array(UInt8, dims, io), fname)
end
# Return `sbuff` converted to an ASCIIString when it contains only ASCII characters;
# otherwise return `sbuff` itself.
function ascii_if_possible(sbuff::AbstractString)
    if isascii(sbuff)
        return convert(ASCIIString, sbuff)
    end
    return sbuff
end
#
# Handlers act on events generated by the parser.
# Parser calls store_cell on the handler to pass events.
#
# New handler
# DLMStore2: Store values into column vectors, expand as required
#
# Previous handlers:
# DLMStore: Store values directly into a result store (when result dimensions are known)
# DLMOffsets: Keep offsets (when result dimensions are not known)
# Supertype of the handlers that dlm_parse reports parsed cells to (through store_cell).
abstract DLMHandler
# Handler that stores parsed cells into one vector per column.
# `S` is the type of the string buffer being parsed.
type DLMStore2{S<:ByteString} <: DLMHandler
# header cells; store_cell writes here for rows that fall inside the header offset
hdr::Vector{AbstractString}
# one vector per column; add_col appends to it and changetype_col replaces entries
datacols::Vector
# element type requested by the caller (Any when the column types are to be detected)
coltype::DataType
# largest data row and column index passed to store_cell so far
nrows::Int
ncols::Int
# row and column of the data cell stored most recently by store_cell
lastrow::Int
lastcol::Int
# 1 when the input has a header row, 0 otherwise; subtracted from the parser's row number
hdr_offset::Int
# the text being parsed; cell positions passed to store_cell index into it
sbuff::S
# the `auto` flag given by readdlm (true when no element type was specified);
# selects TypeError instead of a plain error for unparseable or missing numeric cells
auto::Bool
# line terminator; when it is '\n' store_cell also drops a trailing '\r' from a cell
eol::Char
end
# Create an empty store for column type `T` over the buffer `sbuff`.
# No header cells, no columns and all row/column counters at zero; the header offset is 1
# when `has_header` is true and 0 otherwise.
function DLMStore2{S<:ByteString}(T::DataType, has_header::Bool, sbuff::S, auto::Bool, eol::Char)
    header_cells = Array(SubString, 0)
    columns = Array(Vector, 0)
    hdr_rows = 0
    has_header && (hdr_rows = 1)
    return DLMStore2{S}(header_cells, columns, T, 0, 0, 0, 0, hdr_rows, sbuff, auto, eol)
end
# Add a new column vector to the store.
# Called when the parser invokes store_cell with a column not encountered before.
#
# * While at most one data row has been seen (this includes columns discovered on a header
#   row, where `nrows` is still 0) the column is created empty with room for at least 100
#   rows. For `coltype == Any` it starts as an Int column and is widened later by
#   changetype_col; otherwise it has the requested element type.
# * Once more than one data row has been seen, earlier rows have no value for the new
#   column. That is only allowed for AbstractString and Any column types, where the missing
#   cells are set to an empty string. For Number/Char types a TypeError is thrown when
#   `auto` is set, and a plain error otherwise.
function add_col(dlmstore::DLMStore2)
    nrows = dlmstore.nrows
    cells::Vector = dlmstore.datacols
    T = dlmstore.coltype

    if nrows <= 1 || isempty(cells)
        # initialize with 100 rows. re-estimate rows after reading 100 rows.
        if T == Any
            # start with Int. change to Float64, Any in that order
            push!(cells, Array(Int, max(nrows, 100)))
        else
            # start with the specific type (fixed. unlike when T == Any)
            push!(cells, Array(T, max(nrows, 100)))
        end
    else
        # Adding a column at row > 1 implies empty cells in the column for past rows
        if (T <: AbstractString) || (T == Any)
            cfill = Array(T, length(cells[1]))
            cfill[1:nrows] = SubString(dlmstore.sbuff, 1, 0)
            push!(cells, cfill)
        elseif ((T <: Number) || (T <: Char)) && dlmstore.auto
            throw(TypeError(:store_cell, "", Any, T))
        else
            # The first data row is the earliest row that lacks the new column.
            error("missing value at row 1 column $(length(cells)+1)")
        end
    end
end
# Make sure column `colidx` of the store has element type `T`.
# Returns the existing vector when it already has that element type; otherwise replaces it
# in `dlmstore.datacols` with a converted copy and returns the copy.
function changetype_col{T}(::Type{T}, dlmstore::DLMStore2, colidx::Int)
    col = dlmstore.datacols[colidx]
    (eltype(col) == T) && return col
    # convert copies element by element without splatting the whole column into varargs
    dlmstore.datacols[colidx] = convert(Vector{T}, col)
end
# Widen column `colidx` by one step. Used when DLMStore2.coltype == Any and a value does
# not fit the current column type: an Int column becomes Float64, a Float64 column becomes
# Any. Columns of any other element type are returned unchanged.
function changetype_col(dlmstore::DLMStore2, colidx::Int)
    current = dlmstore.datacols[colidx]
    et = eltype(current)
    (et == Int) && return changetype_col(Float64, dlmstore, colidx)
    (et == Float64) && return changetype_col(Any, dlmstore, colidx)
    return current
end
# Return true when the byte `chr` occurs in `sbuff` between the byte positions `startpos`
# and `endpos` (inclusive). An empty range (endpos < startpos) yields false.
function _chrinstr(sbuff::ByteString, chr::UInt8, startpos::Int, endpos::Int)
    (endpos >= startpos) || return false
    hit = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), pointer(sbuff.data)+startpos-1, chr, endpos-startpos+1)
    return C_NULL != hit
end
# Store one parsed cell. Called by dlm_parse for every field it finds.
#
# Arguments:
#   row, col         - position reported by the parser (row includes the header row, if any)
#   quoted           - true when the field was enclosed in quotes; the quotes are stripped
#                      and doubled quotes ("") inside the field are unescaped
#   startpos, endpos - byte positions of the field within `dlmstore.sbuff`
#
# Rows inside the header offset go to `dlmstore.hdr`. Data cells go to the column vectors,
# which are created (add_col), enlarged (resize!) and, for `coltype == Any`, widened
# (changetype_col) as needed. Cells skipped by the parser since the last stored cell are
# filled with an empty string for AbstractString/Any column types.
#
# Throws TypeError (when `auto` is set) or a plain error for a missing value in a
# Number/Char column and for a value that cannot be converted to the column type.
function store_cell{S<:ByteString}(dlmstore::DLMStore2{S}, row::Int, col::Int, quoted::Bool, startpos::Int, endpos::Int)
    drow = row - dlmstore.hdr_offset

    (col > dlmstore.ncols) && (dlmstore.ncols = col)
    ncols = dlmstore.ncols
    (drow > dlmstore.nrows) && (dlmstore.nrows = drow)
    lastcol = dlmstore.lastcol
    lastrow = dlmstore.lastrow
    cells::Vector{Vector} = dlmstore.datacols
    sbuff::S = dlmstore.sbuff
    T = dlmstore.coltype

    while length(cells) < ncols
        # discovered new column, fill default values
        add_col(dlmstore)
    end

    if length(cells[1]) < drow
        # re-estimate and resize the column vectors: extrapolate the total number of rows
        # from the fraction of the buffer consumed so far
        newsize = max(drow, round(Int, length(sbuff)*drow/endpos))
        for cell in cells
            resize!(cell, newsize)
        end
    end

    # move endpos to the start of the character it falls in
    endpos = prevind(sbuff, nextind(sbuff,endpos))
    # with '\n' line endings, drop a trailing '\r' from the cell
    (endpos > 0) && ('\n' == dlmstore.eol) && ('\r' == Char(sbuff[endpos])) && (endpos = prevind(sbuff, endpos))
    if quoted
        # exclude the enclosing quote characters
        startpos += 1
        endpos -= 1
    end

    if drow > 0
        # fill missing cells if there were empty cells/rows encountered by the parser
        while ((drow - lastrow) > 1) || ((drow > lastrow > 0) && (lastcol < ncols))
            if (lastcol == ncols) || (lastrow == 0)
                lastcol = 0
                lastrow += 1
            end
            for cidx in (lastcol+1):ncols
                if (T <: AbstractString)
                    (cells[cidx])[lastrow] = SubString(sbuff, 1, 0)
                elseif (T == Any)
                    changetype_col(Any, dlmstore, cidx)
                    (cells[cidx])[lastrow] = SubString(sbuff, 1, 0)
                elseif ((T <: Number) || (T <: Char)) && dlmstore.auto
                    throw(TypeError(:store_cell, "", Any, T))
                else
                    error("missing value at row $lastrow column $cidx")
                end
            end
            lastcol = ncols
        end

        # fill current cell; when the column type is being detected (T == Any) a failed
        # parse widens the column one step and the cell is parsed again
        fail = false
        while true
            if quoted && _chrinstr(sbuff, UInt8('"'), startpos, endpos)
                unescaped = replace(SubString(sbuff,startpos,endpos), r"\"\"", "\"")
                # endof, not length: colval takes byte positions, and the two differ for
                # non-ASCII text
                fail = colval(unescaped, 1, endof(unescaped), cells[col], drow)
            else
                fail = colval(sbuff, startpos, endpos, cells[col], drow)
            end
            if fail && T == Any && eltype(cells[col]) != Any
                changetype_col(dlmstore, col)
            else
                break
            end
        end
        if fail
            sval = SubString(sbuff,startpos,endpos)
            ((T <: Number) && dlmstore.auto) ? throw(TypeError(:store_cell, "", Any, T)) : error("file entry \"$(sval)\" cannot be converted to $T")
        end

        dlmstore.lastrow = drow
        dlmstore.lastcol = col
    else
        # fill header
        resize!(dlmstore.hdr, col)
        if quoted && _chrinstr(sbuff, UInt8('"'), startpos, endpos)
            unescaped = replace(SubString(sbuff,startpos,endpos), r"\"\"", "\"")
            colval(unescaped, 1, endof(unescaped), dlmstore.hdr, col)
        else
            colval(sbuff, startpos,endpos, dlmstore.hdr, col)
        end
    end

    nothing
end
# Build the final result from the column vectors collected in `dlmstore`.
#
# Every column is trimmed to the number of data rows seen. If the last row is shorter than
# the widest row, its missing cells are filled the same way store_cell fills gaps: an empty
# string for AbstractString/Any column types, otherwise a TypeError (when `auto` is set) or
# a plain error.
#
# Returns the columns concatenated into a matrix, or the tuple `(matrix, header)` when the
# store was created with a header row.
function result{S}(dlmstore::DLMStore2{S})
    nrows = dlmstore.nrows
    ncols = dlmstore.ncols
    lastcol = dlmstore.lastcol
    lastrow = dlmstore.lastrow
    cells = dlmstore.datacols
    sbuff = dlmstore.sbuff
    # The fill rules depend on the requested column type, not on the buffer type `S`.
    T = dlmstore.coltype

    for cell in cells
        resize!(cell, nrows)
    end

    # check for missing columns in the last row
    # (nothing to fill when no data row was stored, e.g. header-only input)
    if (nrows > 0) && ((lastcol < ncols) || (lastrow < nrows))
        while lastrow <= nrows
            (lastcol == ncols) && (lastcol = 0; lastrow += 1)
            for cidx in (lastcol+1):ncols
                if (T <: AbstractString)
                    (cells[cidx])[lastrow] = SubString(sbuff, 1, 0)
                elseif (T == Any)
                    changetype_col(Any, dlmstore, cidx)
                    (cells[cidx])[lastrow] = SubString(sbuff, 1, 0)
                elseif ((T <: Number) || (T <: Char)) && dlmstore.auto
                    throw(TypeError(:store_cell, "", Any, T))
                else
                    error("missing value at row $lastrow column $cidx")
                end
            end
            lastcol = ncols
            (lastrow == nrows) && break
        end
        dlmstore.lastrow = lastrow
        dlmstore.lastcol = ncols
    end

    # hcat all column vecs into a matrix
    data = hcat(cells...)
    (dlmstore.hdr_offset > 0) ? (data, dlmstore.hdr) : data
end
# Parse the delimited text in `sbuff` and return the result built by the DLMStore2 handler
# (a matrix, or `(matrix, header)` when a header row was requested).
#
# Options read from `optsd` (defaults in parentheses): :quotes (true), :comments (true),
# :comment_char ('#'), :header / deprecated :has_header (false), :skipstart (0) and
# :skipblanks (true). The :dims option is not consulted here.
# When `dlm` is `invalid_dlm`, the parser is asked to ignore adjacent delimiters.
#
# Throws ArgumentError when :header and :has_header disagree or when :skipstart is negative.
function readdlm_string(sbuff::ByteString, dlm::Char, T::Type, eol::Char, auto::Bool, optsd::Dict)
    ign_empty = (dlm == invalid_dlm)
    quotes = get(optsd, :quotes, true)
    comments = get(optsd, :comments, true)
    comment_char = get(optsd, :comment_char, '#')

    has_header = get(optsd, :header, get(optsd, :has_header, false))
    haskey(optsd, :has_header) && (optsd[:has_header] != has_header) && throw(ArgumentError("conflicting values for header and has_header"))

    skipstart = get(optsd, :skipstart, 0)
    (skipstart >= 0) || throw(ArgumentError("skipstart must be ≥ 0, got $skipstart"))

    skipblanks = get(optsd, :skipblanks, true)

    handler = DLMStore2(T, has_header, sbuff, auto, eol)
    dlm_parse(sbuff, eol, dlm, '"', comment_char, ign_empty, quotes, comments, skipstart, skipblanks, handler)
    return result(handler)
end
# Keyword options accepted by val_opts. The two arrays are parallel:
# valid_opt_types[i] is the type a value for valid_opts[i] must have.
const valid_opts = [:header, :has_header, :ignore_invalid_chars, :use_mmap, :quotes, :comments, :dims, :comment_char, :skipstart, :skipblanks]
const valid_opt_types = [Bool, Bool, Bool, Bool, Bool, Bool, NTuple{2,Integer}, Char, Integer, Bool]
# Deprecated option name => its replacement; val_opts warns when a key of this Dict is used.
const deprecated_opts = Dict(:has_header => :header)
# Validate keyword options and collect them into a Dict keyed by option name.
#
# Throws ArgumentError for an option that is not listed in `valid_opts` or whose value does
# not have the type listed for it in `valid_opt_types`. Emits a warning for options listed
# in `deprecated_opts`.
function val_opts(opts)
    validated = Dict{Symbol,Union(Bool,NTuple{2,Integer},Char,Integer)}()
    for (opt_name, opt_val) in opts
        if !in(opt_name, valid_opts)
            throw(ArgumentError("unknown option $opt_name"))
        end
        opt_typ = valid_opt_types[findfirst(valid_opts, opt_name)]
        if !isa(opt_val, opt_typ)
            throw(ArgumentError("$opt_name should be of type $opt_typ, got $(typeof(opt_val))"))
        end
        validated[opt_name] = opt_val
        if haskey(deprecated_opts, opt_name)
            warn("$opt_name is deprecated, use $(deprecated_opts[opt_name]) instead")
        end
    end
    return validated
end
# colval parses the text between the byte positions `startpos` and `endpos` of `sbuff` and,
# on success, stores the value in `cells[row]`.
# Returns true when parsing FAILED and false when the value was stored.

# Bool column
function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Bool,1}, row::Int)
    parsed = tryparse_internal(Bool, sbuff, startpos, endpos, false)
    if isnull(parsed)
        return true
    end
    cells[row] = get(parsed)
    return false
end

# Integer column of element type T
function colval{T<:Integer, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,1}, row::Int)
    parsed = tryparse_internal(T, sbuff, startpos, endpos, 0, false)
    if isnull(parsed)
        return true
    end
    cells[row] = get(parsed)
    return false
end
# Float64 column: parse the bytes startpos..endpos of `sbuff` with jl_try_substrtod and
# store the value in `cells[row]`. Returns true when parsing failed, false otherwise.
function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Float64,1}, row::Int)
    parsed = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Cint), sbuff, startpos-1, endpos-startpos+1)
    failed = isnull(parsed)
    if !failed
        cells[row] = get(parsed)
    end
    return failed
end

# Float32 column: same as above, using jl_try_substrtof.
function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Float32,1}, row::Int)
    parsed = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Cint), sbuff, startpos-1, endpos-startpos+1)
    failed = isnull(parsed)
    if !failed
        cells[row] = get(parsed)
    end
    return failed
end
# String column: store the substring startpos..endpos of `sbuff` in `cells[row]`.
# Never fails, so it always returns false.
function colval{T<:AbstractString, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,1}, row::Int)
    cells[row] = SubString(sbuff, startpos, endpos)
    return false
end
# Any column: for a non-empty cell try the most common types in the order Int, Bool,
# Float64 and store the first that parses; otherwise (or for an empty cell) store the
# substring itself. Never fails, so it always returns false.
function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Any,1}, row::Int)
    if endpos >= startpos
        # integer?
        as_int = tryparse_internal(Int, sbuff, startpos, endpos, 0, false)
        if !isnull(as_int)
            cells[row] = get(as_int)
            return false
        end

        # boolean?
        as_bool = tryparse_internal(Bool, sbuff, startpos, endpos, false)
        if !isnull(as_bool)
            cells[row] = get(as_bool)
            return false
        end

        # floating point?
        as_f64 = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Cint), sbuff, startpos-1, endpos-startpos+1)
        if !isnull(as_f64)
            cells[row] = get(as_f64)
            return false
        end
    end
    cells[row] = SubString(sbuff, startpos, endpos)
    return false
end
# Char column: a cell is accepted only when startpos == endpos; the character at that
# position is stored in `cells[row]`. Returns true (failure) for any other cell.
function colval{T<:Char, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,1}, row::Int)
    if startpos == endpos
        cells[row] = next(sbuff, startpos)[1]
        return false
    end
    return true
end

# Fallback for any other element type: nothing can be parsed, always report failure.
function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array, row::Int)
    return true
end
# ASCII input: parse the raw bytes of `s`, with every special character truncated to a
# single byte, using the generic dlm_parse method.
function dlm_parse(s::ASCIIString, eol::Char, dlm::Char, qchar::Char, cchar::Char, ign_adj_dlm::Bool, allow_quote::Bool, allow_comments::Bool, skipstart::Int, skipblanks::Bool, dh::DLMHandler)
    eol_byte = UInt32(eol)%UInt8
    dlm_byte = UInt32(dlm)%UInt8
    quote_byte = UInt32(qchar)%UInt8
    comment_byte = UInt32(cchar)%UInt8
    return dlm_parse(s.data, eol_byte, dlm_byte, quote_byte, comment_byte,
                     ign_adj_dlm, allow_quote, allow_comments, skipstart, skipblanks, dh)
end
# Parse the delimited text in `dbuff` with a character-by-character state machine and
# report every field to the handler `dh` through
#     store_cell(dh, row, col, quoted, start_index, end_index)
#
# Arguments:
#   eol, dlm, qchar, cchar - line terminator, delimiter, quote and comment characters
#   ign_adj_dlm    - when true, a delimiter seen at the beginning of a field does not
#                    produce an (empty) cell
#   allow_quote    - enables quoted fields
#   allow_comments - enables comments (text from `cchar` up to the end of the line)
#   skipstart      - number of leading lines to skip
#   skipblanks     - when true, lines without any cell are not counted as rows
#
# Returns the tuple (nrows, ncols).
# Raises an error for an unexpected character after a quoted field and for a quoted field
# that is still open at the end of the input.
function dlm_parse{T,D}(dbuff::T, eol::D, dlm::D, qchar::D, cchar::D, ign_adj_dlm::Bool, allow_quote::Bool, allow_comments::Bool, skipstart::Int, skipblanks::Bool, dh::DLMHandler)
# When all special characters in use are ASCII, a UTF8String buffer is parsed through its raw bytes instead.
all_ascii = (D <: UInt8) || (isascii(eol) && isascii(dlm) && (!allow_quote || isascii(qchar)) && (!allow_comments || isascii(cchar)))
(T <: UTF8String) && all_ascii && (return dlm_parse(dbuff.data, eol%UInt8, dlm%UInt8, qchar%UInt8, cchar%UInt8, ign_adj_dlm, allow_quote, allow_comments, skipstart, skipblanks, dh))
ncols = nrows = col = 0
# the sentinel delimiter selects matching against Base._default_delims
is_default_dlm = (dlm == UInt32(invalid_dlm) % D)
error_str = ""
# 0: begin field, 1: quoted field, 2: unquoted field, 3: second quote (could either be end of field or escape character), 4: comment, 5: skipstart
state = (skipstart > 0) ? 5 : 0
is_eol = is_dlm = is_cr = is_quote = is_comment = expct_col = false
idx = 1
try
slen = sizeof(dbuff)
col_start_idx = 1
# was_cr: the previous character was a '\r' (only tracked when eol is '\n')
was_cr = false
while idx <= slen
# after this call idx already points past the current character
val,idx = next(dbuff, idx)
# classify the current character; at most one of the is_* flags is set
if (is_eol = (Char(val) == Char(eol)))
is_dlm = is_comment = is_cr = is_quote = false
elseif (is_dlm = (is_default_dlm ? in(Char(val), _default_delims) : (Char(val) == Char(dlm))))
is_comment = is_cr = is_quote = false
elseif (is_quote = (Char(val) == Char(qchar)))
is_comment = is_cr = false
elseif (is_comment = (Char(val) == Char(cchar)))
is_cr = false
else
is_cr = (Char(eol) == '\n') && (Char(val) == '\r')
end
if 2 == state # unquoted field
if is_dlm
state = 0
col += 1
# the cell ends just before the delimiter
store_cell(dh, nrows+1, col, false, col_start_idx, idx-2)
col_start_idx = idx
!ign_adj_dlm && (expct_col = true)
elseif is_eol
nrows += 1
col += 1
# end index steps back one more position when a '\r' preceded the line terminator
store_cell(dh, nrows, col, false, col_start_idx, idx - (was_cr ? 3 : 2))
col_start_idx = idx
ncols = max(ncols, col)
col = 0
state = 0
elseif (is_comment && allow_comments)
# a comment character ends both the cell and the row
nrows += 1
col += 1
store_cell(dh, nrows, col, false, col_start_idx, idx - 2)
ncols = max(ncols, col)
col = 0
state = 4
end
elseif 1 == state # quoted field
is_quote && (state = 3)
elseif 4 == state # comment line
if is_eol
col_start_idx = idx
state = 0
end
elseif 0 == state # begin field
if is_quote
state = (allow_quote && !was_cr) ? 1 : 2
expct_col = false
elseif is_dlm
if !ign_adj_dlm
# delimiter at the beginning of a field: store the (empty) cell
expct_col = true
col += 1
store_cell(dh, nrows+1, col, false, col_start_idx, idx-2)
end
col_start_idx = idx
elseif is_eol
if (col > 0) || !skipblanks
nrows += 1
if expct_col
# the line ended right after a delimiter: store the trailing empty cell
col += 1
store_cell(dh, nrows, col, false, col_start_idx, idx - (was_cr ? 3 : 2))
end
ncols = max(ncols, col)
col = 0
end
col_start_idx = idx
expct_col = false
elseif is_comment && allow_comments
if col > 0
nrows += 1
if expct_col
col += 1
store_cell(dh, nrows, col, false, col_start_idx, idx - 2)
end
ncols = max(ncols, col)
col = 0
end
expct_col = false
state = 4
elseif !is_cr
state = 2
expct_col = false
end
elseif 3 == state # second quote
if is_quote && !was_cr
# doubled quote: escaped quote character, still inside the quoted field
state = 1
elseif is_dlm && !was_cr
state = 0
col += 1
store_cell(dh, nrows+1, col, true, col_start_idx, idx-2)
col_start_idx = idx
!ign_adj_dlm && (expct_col = true)
elseif is_eol
nrows += 1
col += 1
store_cell(dh, nrows, col, true, col_start_idx, idx - (was_cr ? 3 : 2))
col_start_idx = idx
ncols = max(ncols, col)
col = 0
state = 0
elseif is_comment && allow_comments && !was_cr
nrows += 1
col += 1
store_cell(dh, nrows, col, true, col_start_idx, idx - 2)
ncols = max(ncols, col)
col = 0
state = 4
elseif (is_cr && was_cr) || !is_cr
# anything else after the closing quote is an error; a single '\r' is tolerated
error_str = escape_string("unexpected character '$(Char(val))' after quoted field at row $(nrows+1) column $(col+1)")
break
end
elseif 5 == state # skip start
if is_eol
col_start_idx = idx
skipstart -= 1
(0 == skipstart) && (state = 0)
end
end
was_cr = is_cr
end
# end of input: flush the last cell when the text did not end with a line terminator
if isempty(error_str)
if 1 == state # quoted field
error_str = "truncated column at row $(nrows+1) column $(col+1)"
elseif (2 == state) || (3 == state) || ((0 == state) && is_dlm) # unquoted field, second quote, or begin field with last character as delimiter
col += 1
nrows += 1
store_cell(dh, nrows, col, (3 == state), col_start_idx, idx-1)
ncols = max(ncols, col)
end
end
catch ex
# errors raised while parsing (including those from store_cell) are passed on unchanged;
# the disabled code below would wrap other errors with the current row and column
rethrow(ex)
#if isa(ex, TypeError) && (ex.func == :store_cell)
# rethrow(ex)
#else
# error("at row $(nrows+1), column $col : $ex)")
#end
end
!isempty(error_str) && error(error_str)
return (nrows, ncols)
end
# readcsv is readdlm with ',' as the delimiter.
function readcsv(io; opts...)
    return readdlm(io, ','; opts...)
end

# readcsv with an explicit element type `T`.
function readcsv(io, T::Type; opts...)
    return readdlm(io, ',', T; opts...)
end
# writedlm_cell writes a single element `elt` to `io`.

# Floating point values are written with print_shortest.
# todo: keyword argument for # of digits to print
function writedlm_cell(io::IO, elt::FloatingPoint, dlm, quotes)
    return print_shortest(io, elt)
end

# Strings: when `quotes` is set and the non-empty string contains a double quote, a newline
# or the delimiter, it is written enclosed in double quotes with embedded quotes doubled.
# Otherwise it is written as is.
function writedlm_cell{T}(io::IO, elt::AbstractString, dlm::T, quotes::Bool)
    needs_quoting = false
    if quotes && !isempty(elt)
        if ('"' in elt) || ('\n' in elt)
            needs_quoting = true
        elseif T <: Char
            needs_quoting = dlm in elt
        else
            needs_quoting = contains(elt, dlm)
        end
    end

    if needs_quoting
        print(io, '"', replace(elt, r"\"", "\"\""), '"')
    else
        print(io, elt)
    end
end

# Everything else is written with print.
function writedlm_cell(io::IO, elt, dlm, quotes)
    return print(io, elt)
end
# Write the vector or matrix `a` to `io`, one row per line, with `dlm` between the elements
# of a row. The only option consulted is :quotes (default true), which is passed on to
# writedlm_cell.
# Output is collected in a PipeBuffer and flushed to `io` whenever more than 16 KiB are
# pending after a row, and once more at the end.
function writedlm(io::IO, a::AbstractVecOrMat, dlm; opts...)
    optsd = val_opts(opts)
    quotes = get(optsd, :quotes, true)
    pb = PipeBuffer()
    nrows, ncols = size(a, 1), size(a, 2)
    for r in 1:nrows
        for c in 1:ncols
            writedlm_cell(pb, a[r,c], dlm, quotes)
            if c == ncols
                write(pb, '\n')
            else
                print(pb, dlm)
            end
        end
        if nb_available(pb) > (16*1024)
            write(io, takebuf_array(pb))
        end
    end
    write(io, takebuf_array(pb))
    return nothing
end
# Zero-dimensional arrays are written as a vector with a single element.
function writedlm{T}(io::IO, a::AbstractArray{T,0}, dlm; opts...)
    return writedlm(io, reshape(a,1), dlm; opts...)
end
#=
function writedlm_ndarray(io::IO, a::AbstractArray, dlm; opts...)
tail = size(a)[3:end]
function print_slice(idxs...)
writedlm(io, sub(a, 1:size(a,1), 1:size(a,2), idxs...), dlm; opts...)
if idxs != tail
print(io, "\n")
end
end
cartesianmap(print_slice, tail)
end
=#
# Write an iterable of rows to `io`: every element of `itr` is itself iterated and written
# as one line, with `dlm` between its elements. A row without elements writes nothing.
# The only option consulted is :quotes (default true), which is passed on to writedlm_cell.
# Output is collected in a PipeBuffer and flushed to `io` whenever more than 16 KiB are
# pending after a row, and once more at the end.
function writedlm(io::IO, itr, dlm; opts...)
    optsd = val_opts(opts)
    quotes = get(optsd, :quotes, true)
    pb = PipeBuffer()
    for row in itr
        st = start(row)
        while !done(row, st)
            (elt, st) = next(row, st)
            writedlm_cell(pb, elt, dlm, quotes)
            if done(row, st)
                write(pb, '\n')
            else
                print(pb, dlm)
            end
        end
        if nb_available(pb) > (16*1024)
            write(io, takebuf_array(pb))
        end
    end
    write(io, takebuf_array(pb))
    return nothing
end
# Write `a` to the file `fname` (opened in "w" mode), using `dlm` as the delimiter.
function writedlm(fname::AbstractString, a, dlm; opts...)
    return open(io -> writedlm(io, a, dlm; opts...), fname, "w")
end
# Without an explicit delimiter, writedlm separates elements with a tab.
function writedlm(io, a; opts...)
    return writedlm(io, a, '\t'; opts...)
end

# writecsv is writedlm with ',' as the delimiter.
function writecsv(io, a; opts...)
    return writedlm(io, a, ','; opts...)
end

# MIME output of vectors and matrices: comma separated for text/csv ...
function writemime(io::IO, ::MIME"text/csv", a::AbstractVecOrMat)
    return writedlm(io, a, ',')
end

# ... and tab separated for text/tab-separated-values.
function writemime(io::IO, ::MIME"text/tab-separated-values", a::AbstractVecOrMat)
    return writedlm(io, a, '\t')
end
end # module DataFmt