Content - 48c3951506d086a22b28f9c52aac0823060c5de5 - 23d7c9c/src/json_stream.ml

json_stream.ml
(*****************************************************************************)
(*                                                                           *)
(* Open Source License                                                       *)
(* Copyright (c) 2020 Nomadic Labs <contact@nomadic-labs.com>                *)
(*                                                                           *)
(* Permission is hereby granted, free of charge, to any person obtaining a   *)
(* copy of this software and associated documentation files (the "Software"),*)
(* to deal in the Software without restriction, including without limitation *)
(* the rights to use, copy, modify, merge, publish, distribute, sublicense,  *)
(* and/or sell copies of the Software, and to permit persons to whom the     *)
(* Software is furnished to do so, subject to the following conditions:      *)
(*                                                                           *)
(* The above copyright notice and this permission notice shall be included   *)
(* in all copies or substantial portions of the Software.                    *)
(*                                                                           *)
(* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR*)
(* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,  *)
(* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL   *)
(* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER*)
(* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING   *)
(* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER       *)
(* DEALINGS IN THE SOFTWARE.                                                 *)
(*                                                                           *)
(*****************************************************************************)

type jsonm_lexeme =
  [ `Null
  | `Bool of bool
  | `String of string
  | `Float of float
  | `Name of string
  | `As
  | `Ae
  | `Os
  | `Oe ]

let string_of_float f =
  let (fract, intr) = modf f in
  if fract = 0.0 then Format.asprintf "%.0f" intr else Format.asprintf "%g" f

let string_needs_escaping_at index s =
  let exception At of int in
  try
    for i = index to String.length s - 1 do
      match s.[i] with
      | '\"' | '\n' | '\r' | '\b' | '\t' | '\\' | '\x00' .. '\x1F' ->
          raise (At i)
      | _ ->
          ()
    done ;
    -1
  with At i -> i

let do_escape_string s =
  let buff = Buffer.create (String.length s) in
  for i = 0 to String.length s - 1 do
    match s.[i] with
    | '\"' ->
        Buffer.add_string buff "\\\""
    | '\n' ->
        Buffer.add_string buff "\\n"
    | '\r' ->
        Buffer.add_string buff "\\r"
    | '\b' ->
        Buffer.add_string buff "\\b"
    | '\t' ->
        Buffer.add_string buff "\\t"
    | '\\' ->
        Buffer.add_string buff "\\\\"
    | '\x00' .. '\x1F' as c ->
        Format.kasprintf (Buffer.add_string buff) "\\u%04x" (Char.code c)
    | c ->
        Buffer.add_char buff c
  done ;
  Buffer.contents buff

let escape_string s =
  if string_needs_escaping_at 0 s >= 0 then do_escape_string s else s

(** small_string_seq_of_jsonm_lexeme_seq: converts a seq of lexeme into a naive
    seq of small strings. This may or may not be appripriate depending on the
    way the resulting seq is consumed. *)
let small_string_seq_of_jsonm_lexeme_seq ~newline (s : jsonm_lexeme Seq.t) :
    string Seq.t =
  let rec sseq first depth seq () =
    match seq () with
    | Seq.Nil ->
        assert (depth = 0) ;
        if newline then Seq.Cons ("\n", Seq.empty) else Seq.Nil
    | Seq.Cons (`Null, seq) ->
        let tail = Seq.Cons ("null", sseq false depth seq) in
        if (not first) && depth > 0 then Seq.Cons (",", fun () -> tail)
        else tail
    | Seq.Cons (`Bool true, seq) ->
        let tail = Seq.Cons ("true", sseq false depth seq) in
        if (not first) && depth > 0 then Seq.Cons (",", fun () -> tail)
        else tail
    | Seq.Cons (`Bool false, seq) ->
        let tail = Seq.Cons ("false", sseq false depth seq) in
        if (not first) && depth > 0 then Seq.Cons (",", fun () -> tail)
        else tail
    | Seq.Cons (`As, seq) ->
        let tail = Seq.Cons ("[", sseq true (depth + 1) seq) in
        if (not first) && depth > 0 then Seq.Cons (",", fun () -> tail)
        else tail
    | Seq.Cons (`Ae, seq) ->
        Seq.Cons ("]", sseq false (depth - 1) seq)
    | Seq.Cons (`Os, seq) ->
        let tail = Seq.Cons ("{", sseq true (depth + 1) seq) in
        if (not first) && depth > 0 then Seq.Cons (",", fun () -> tail)
        else tail
    | Seq.Cons (`Oe, seq) ->
        Seq.Cons ("}", sseq false (depth - 1) seq)
    | Seq.Cons (`String s, seq) ->
        let tail =
          Seq.Cons
            ( "\"",
              fun () ->
                Seq.Cons
                  ( escape_string s,
                    fun () -> Seq.Cons ("\"", sseq false depth seq) ) )
        in
        if (not first) && depth > 0 then Seq.Cons (",", fun () -> tail)
        else tail
    | Seq.Cons (`Float f, seq) ->
        let f = string_of_float f in
        let tail = Seq.Cons (f, sseq false depth seq) in
        if (not first) && depth > 0 then Seq.Cons (",", fun () -> tail)
        else tail
    | Seq.Cons (`Name n, seq) ->
        let tail =
          Seq.Cons
            ( "\"",
              fun () ->
                Seq.Cons
                  ( escape_string n,
                    fun () ->
                      Seq.Cons
                        ("\"", fun () -> Seq.Cons (":", sseq true depth seq))
                  ) )
        in
        if (not first) && depth > 0 then Seq.Cons (",", fun () -> tail)
        else tail
  in
  sseq false 0 s

let dump_unescaped_string chunk_size_hint buff str index seq =
  let rec aux bytes_left_in_buff index =
    if bytes_left_in_buff > String.length str - index then (
      Buffer.add_substring buff str index (String.length str - index) ;
      seq () )
    else (
      Buffer.add_substring buff str index bytes_left_in_buff ;
      let s = Buffer.contents buff in
      Buffer.clear buff ;
      Seq.Cons (s, fun () -> aux chunk_size_hint (index + bytes_left_in_buff))
      )
  in
  aux (chunk_size_hint - Buffer.length buff) index

(* This is written with the assumption that there aren't many characters that
   need escaping: a few here and there, maybe just a couple of newlines in a
   block of text.

   Breaking this assumption does not cause errors. However, the performances
   might degrade somewhat. *)
let dump_escaped_string chunk_size_hint buff str seq =
  let rec aux_outer bytes_left_in_buff index =
    let next_escape = string_needs_escaping_at index str in
    if bytes_left_in_buff <= 6 then (
      let s = Buffer.contents buff in
      Buffer.clear buff ;
      Seq.Cons (s, fun () -> aux_outer chunk_size_hint index) )
    else if next_escape < 0 then
      (* string does not need escaping: dump the rest of the string as is *)
      dump_unescaped_string chunk_size_hint buff str index seq
    else if next_escape = 0 then (
      (* index is at character that needs escaping *)
      ( match str.[index] with
      | '\"' ->
          Buffer.add_string buff "\\\""
      | '\n' ->
          Buffer.add_string buff "\\n"
      | '\r' ->
          Buffer.add_string buff "\\r"
      | '\b' ->
          Buffer.add_string buff "\\b"
      | '\t' ->
          Buffer.add_string buff "\\t"
      | '\\' ->
          Buffer.add_string buff "\\\\"
      | '\x00' .. '\x1F' as c ->
          Format.kasprintf (Buffer.add_string buff) "\\u%04x" (Char.code c)
      | c ->
          Buffer.add_char buff c ) ;
      aux_outer (chunk_size_hint - Buffer.length buff) (index + 1) )
    else
      (* string needs escaping but later: write the non-empty non-escaped
            prefix and loop back *)
      let to_write_unescaped = next_escape - index in
      if bytes_left_in_buff > to_write_unescaped then (
        Buffer.add_substring buff str index to_write_unescaped ;
        aux_outer
          (bytes_left_in_buff - to_write_unescaped)
          (index + to_write_unescaped) )
      else
        let rec aux_inner bytes_left_in_buff index to_write_unescaped continue
            =
          if bytes_left_in_buff < to_write_unescaped then (
            Buffer.add_substring buff str index bytes_left_in_buff ;
            let s = Buffer.contents buff in
            Seq.Cons
              ( s,
                fun () ->
                  aux_inner
                    chunk_size_hint
                    (index + bytes_left_in_buff)
                    (to_write_unescaped - bytes_left_in_buff)
                    continue ) )
          else (
            Buffer.add_substring buff str index to_write_unescaped ;
            continue (index + to_write_unescaped) )
        in
        aux_inner bytes_left_in_buff index to_write_unescaped (fun index ->
            aux_outer (chunk_size_hint - Buffer.length buff) index)
  in
  aux_outer (chunk_size_hint - Buffer.length buff) 0

let dump_string_literal chunk_size_hint buff literal seq =
  Buffer.add_char buff '"' ;
  dump_escaped_string chunk_size_hint buff literal (fun () ->
      Buffer.add_char buff '"' ; seq ())

let string_seq_of_jsonm_lexeme_seq ~newline ~chunk_size_hint
    (s : jsonm_lexeme Seq.t) : string Seq.t =
  (* we need chunk_size_hint to be reasonably high to accommodate all small
     literals *)
  let chunk_size_hint = min chunk_size_hint 16 in
  (* we occasionally print several characters before checking the length of the
     buffer (e.g., in the case of a key-value name with non-printable character
     at the end: 6 for one hex-encoded non-printable character in a string + 1
     for the string closing double-quote + 1 for the key-value colon separator
     = 8 in total) (e.g., [Float.pred 0.] is ["-4.94066e-324"] which is 12
     characters long).
     So we allocate just above chunk_size +8 to avoid the need to resize. *)
  let buff_size = chunk_size_hint + 16 in
  (* single buffer for the whole serialisation *)
  let buff = Buffer.create buff_size in
  let rec sseq first depth seq () =
    if Buffer.length buff >= buff_size then (
      (* emit buffer content if we have reached the chunk size *)
      let b = Buffer.contents buff in
      Buffer.clear buff ;
      Seq.Cons (b, fun () -> (sseq [@ocaml.tailcall]) first depth seq ()) )
    else
      match seq () with
      (* termination *)
      | Seq.Nil ->
          assert (depth = 0) ;
          if newline then Buffer.add_char buff '\n' ;
          (* value terminator: newline *)
          if Buffer.length buff = 0 then
            (* corner case: we just flushed and haven't added a newline *)
            Seq.Nil
          else
            let b = Buffer.contents buff in
            Buffer.clear buff ;
            Seq.Cons (b, Seq.empty)
      (* fixed length, small lexemes *)
      | Seq.Cons (`Null, seq) ->
          (* if we are inside an object/array (i.e., depth is > 0) and we are
             not the first item (i.e., first is false) then we put a delimiter
             character. *)
          if (not first) && depth > 0 then Buffer.add_char buff ',' ;
          (* then the value *)
          Buffer.add_string buff "null" ;
          (* and we continue with the rest. Note that depth is unchanged
             but first is false whatever it's original value (because whatever
             follows _follows_) *)
          (sseq [@ocaml.tailcall]) false depth seq ()
      | Seq.Cons (`Bool true, seq) ->
          if (not first) && depth > 0 then Buffer.add_char buff ',' ;
          Buffer.add_string buff "true" ;
          (sseq [@ocaml.tailcall]) false depth seq ()
      | Seq.Cons (`Bool false, seq) ->
          if (not first) && depth > 0 then Buffer.add_char buff ',' ;
          Buffer.add_string buff "false" ;
          (sseq [@ocaml.tailcall]) false depth seq ()
      | Seq.Cons (`As, seq) ->
          if (not first) && depth > 0 then Buffer.add_char buff ',' ;
          Buffer.add_char buff '[' ;
          (* We increase the depth and mark the next value as being the first
             value of an array. *)
          (sseq [@ocaml.tailcall]) true (depth + 1) seq ()
      | Seq.Cons (`Ae, seq) ->
          assert (depth > 0) ;
          Buffer.add_char buff ']' ;
          (sseq [@ocaml.tailcall]) false (depth - 1) seq ()
      | Seq.Cons (`Os, seq) ->
          if (not first) && depth > 0 then Buffer.add_char buff ',' ;
          Buffer.add_char buff '{' ;
          (sseq [@ocaml.tailcall]) true (depth + 1) seq ()
      | Seq.Cons (`Oe, seq) ->
          assert (depth > 0) ;
          Buffer.add_char buff '}' ;
          (sseq [@ocaml.tailcall]) false (depth - 1) seq ()
      | Seq.Cons (`String s, seq) ->
          if (not first) && depth > 0 then Buffer.add_char buff ',' ;
          (* we delegate string literals to [dump_string_literal]. Note that we
             pass the rest of the sequence as a kind of continuation. This is
             because [dump_string_literal] may fill up the buffer and then some
             (depending on the size of the literal) and so it needs to be able
             to stick a few things in front. *)
          dump_string_literal chunk_size_hint buff s (fun () ->
              (sseq [@ocaml.tailcall]) false depth seq ())
      | Seq.Cons (`Float f, seq) ->
          if (not first) && depth > 0 then Buffer.add_char buff ',' ;
          let f = string_of_float f in
          Buffer.add_string buff f ;
          (sseq [@ocaml.tailcall]) false depth seq ()
      | Seq.Cons (`Name n, seq) ->
          if (not first) && depth > 0 then Buffer.add_char buff ',' ;
          dump_string_literal chunk_size_hint buff n (fun () ->
              (* set first to true to avoid printing of separator *)
              Buffer.add_char buff ':' ;
              (sseq [@ocaml.tailcall]) true depth seq ())
  in
  sseq false 0 s

let biseq_escaped_string_content buffer offset s k =
  let can_be_written = Bytes.length buffer - offset in
  if String.length s + 1 > can_be_written + Bytes.length buffer then
    if
      (* large string and.. *)
      (* TODO: present the string as a sequence of of blit instructions with
        increasing offsets *)
      offset < Bytes.length buffer / 2
    then (
      (* ..and the current buffer is almost empty:
             dump as much as we can on the current buffer to avoid sending a
             small buffer in the seq, and then use the rest of the string as its
             own buffer *)
      Bytes.blit_string s 0 buffer offset can_be_written ;
      let offset = offset + can_be_written in
      assert (offset = Bytes.length buffer) ;
      Seq.Cons
        ( (buffer, 0, offset),
          fun () ->
            let s = Bytes.unsafe_of_string s in
            Seq.Cons
              ( (s, can_be_written, Bytes.length s - can_be_written),
                fun () -> k 0 ) ) )
    else
      (* ..and the buffer is reasonably full:
             put the current buffer in the seq and then the string as a single
             chunk *)
      Seq.Cons
        ( (buffer, 0, offset),
          fun () ->
            let s = Bytes.unsafe_of_string s in
            Seq.Cons ((s, 0, Bytes.length s), fun () -> k 0) )
  else if String.length s + 1 <= can_be_written then (
    (* we [+ 1] to account for the closing quote that will be added by [k] *)
    (* small string: we dump it on the buffer and continue *)
    Bytes.blit_string s 0 buffer offset (String.length s) ;
    let offset = offset + String.length s in
    k offset )
  else (
    (* medium string: we blit two parts onto the buffer *)
    Bytes.blit_string s 0 buffer offset can_be_written ;
    let offset = offset + can_be_written in
    assert (offset = Bytes.length buffer) ;
    Seq.Cons
      ( (buffer, 0, offset),
        fun () ->
          let remain_to_be_written = String.length s - can_be_written in
          Bytes.blit_string s can_be_written buffer 0 remain_to_be_written ;
          let offset = remain_to_be_written in
          k offset ) )

let biseq_string_literal buffer offset s k =
  Bytes.set buffer offset '"' ;
  let offset = offset + 1 in
  let first_escape = string_needs_escaping_at 0 s in
  if first_escape < 0 then
    biseq_escaped_string_content buffer offset s (fun offset ->
        Bytes.set buffer offset '"' ;
        k (offset + 1))
  else
    (* NOTE: offset can't be 0 because we just wrote '"', also the string cannot
             be empty because this is matched-for earlier *)
    (* TODO: optimise by escaping using the available buffer *)
    let s = do_escape_string s in
    biseq_escaped_string_content buffer offset s (fun offset ->
        Bytes.set buffer offset '"' ;
        k (offset + 1))

let blit_instructions_seq_of_jsonm_lexeme_seq ~newline ~buffer lexeme_seq =
  let buffer_size = Bytes.length buffer in
  if buffer_size < 32 then
    raise
      (Invalid_argument
         "Data_encoding.blit_instructions_seq_of_jsonm_lexeme_seq") ;
  let flush_at = buffer_size - 16 in
  let[@ocaml.inline] sep first depth offset =
    (* if we are inside an object/array (i.e., depth is > 0) and we are
       not the first item (i.e., first is false) then we put a delimiter
       character. *)
    if (not first) && depth > 0 then (
      Bytes.set buffer offset ',' ;
      offset + 1 )
    else offset
  in
  let rec biseq first depth offset seq () =
    if offset >= flush_at then
      (* emit buffer content if we have reached the chunk size *)
      Seq.Cons
        ( (buffer, 0, offset),
          fun () -> (biseq [@ocaml.tailcall]) first depth 0 seq () )
    else
      match seq () with
      (* termination *)
      | Seq.Nil ->
          assert (depth = 0) ;
          let offset =
            if newline then (
              Bytes.set buffer offset '\n' ;
              offset + 1 )
            else offset
          in
          if offset = 0 then
            (* corner case: we just flushed (and haven't added a newline) *)
            Seq.Nil
          else Seq.Cons ((buffer, 0, offset), fun () -> Seq.Nil)
      (* fixed length, small lexemes *)
      | Seq.Cons (`Null, seq) ->
          let offset = sep first depth offset in
          Bytes.blit_string "null" 0 buffer offset 4 ;
          let offset = offset + 4 in
          (biseq [@ocaml.tailcall]) false depth offset seq ()
      | Seq.Cons (`Bool true, seq) ->
          let offset = sep first depth offset in
          Bytes.blit_string "true" 0 buffer offset 4 ;
          let offset = offset + 4 in
          (biseq [@ocaml.tailcall]) false depth offset seq ()
      | Seq.Cons (`Bool false, seq) ->
          let offset = sep first depth offset in
          Bytes.blit_string "false" 0 buffer offset 5 ;
          let offset = offset + 5 in
          (biseq [@ocaml.tailcall]) false depth offset seq ()
      | Seq.Cons (`As, seq) ->
          let offset = sep first depth offset in
          Bytes.set buffer offset '[' ;
          let offset = offset + 1 in
          (biseq [@ocaml.tailcall]) true (depth + 1) offset seq ()
      | Seq.Cons (`Ae, seq) ->
          Bytes.set buffer offset ']' ;
          let offset = offset + 1 in
          (biseq [@ocaml.tailcall]) false (depth - 1) offset seq ()
      | Seq.Cons (`Os, seq) ->
          let offset = sep first depth offset in
          Bytes.set buffer offset '{' ;
          let offset = offset + 1 in
          (biseq [@ocaml.tailcall]) true (depth + 1) offset seq ()
      | Seq.Cons (`Oe, seq) ->
          Bytes.set buffer offset '}' ;
          let offset = offset + 1 in
          (biseq [@ocaml.tailcall]) false (depth - 1) offset seq ()
      | Seq.Cons (`String "", seq) ->
          let offset = sep first depth offset in
          Bytes.blit_string "\"\"" 0 buffer offset 2 ;
          let offset = offset + 2 in
          (biseq [@ocaml.tailcall]) false depth offset seq ()
      | Seq.Cons (`String s, seq) ->
          let offset = sep first depth offset in
          (* we delegate string literals to [dump_string_literal]. Note that we
             pass the rest of the sequence as a kind of continuation. This is
             because [dump_string_literal] may fill up the buffer and then some
             (depending on the size of the literal) and so it needs to be able
             to stick a few things in front. *)
          biseq_string_literal buffer offset s (fun offset ->
              (biseq [@ocaml.tailcall]) false depth offset seq ())
      | Seq.Cons (`Float f, seq) ->
          let offset = sep first depth offset in
          let f = string_of_float f in
          biseq_escaped_string_content buffer offset f (fun offset ->
              (biseq [@ocaml.tailcall]) false depth offset seq ())
      | Seq.Cons (`Name n, seq) ->
          let offset = sep first depth offset in
          biseq_string_literal buffer offset n (fun offset ->
              if offset = buffer_size then
                Seq.Cons
                  ( (buffer, 0, offset),
                    fun () ->
                      Bytes.set buffer 0 ':' ;
                      let offset = 1 in
                      (* set first to true to avoid printing of separator *)
                      (biseq [@ocaml.tailcall]) true depth offset seq () )
              else (
                (* set first to true to avoid printing of separator *)
                Bytes.set buffer offset ':' ;
                let offset = offset + 1 in
                (biseq [@ocaml.tailcall]) true depth offset seq () ))
  in
  biseq false 0 0 lexeme_seq