https://hal.archives-ouvertes.fr/hal-01897572
Tip revision: ad0c16675d221938530269610308cd5a2c142687 authored by Software Heritage on 17 October 2018, 13:20:37 UTC
hal: Deposit 205 in collection hal
hal: Deposit 205 in collection hal
Tip revision: ad0c166
prelexerState.ml
open CST
open ExtPervasives
(**specification:
The shell breaks the input into tokens: words and operators; see
Token Recognition.
*)
type atom =
| WordComponent of (string * word_component)
| QuotingMark of quote_kind
| AssignmentMark
and quote_kind = SingleQuote | DoubleQuote | OpeningBrace
type lexing_context =
| Default
| AssignmentRHS of name
type prelexer_state = {
lexing_context : lexing_context;
nesting_context : Nesting.t list;
buffer : atom list;
}
type t = prelexer_state
let initial_state = {
lexing_context = Default;
nesting_context = [];
buffer = [];
}
let at_toplevel current =
match current.nesting_context with
| [Nesting.HereDocument _] | [] -> true
| _ -> false
let enter_assignment_rhs current name =
{ current with lexing_context = AssignmentRHS name }
let push_string b s =
(* FIXME: Is string concatenation too slow here? *)
match b.buffer with
| WordComponent (s', WordLiteral l) :: csts ->
{ b with buffer = WordComponent (s' ^ s, WordLiteral (l ^ s)) :: csts }
| _ ->
{ b with buffer = WordComponent (s, WordLiteral s) :: b.buffer }
let push_character b c =
push_string b (String.make 1 c)
let push_separated_string b s =
{ b with buffer = WordComponent (s, WordLiteral s) :: b.buffer }
let rec pop_character = function
| WordComponent (s, WordLiteral c) :: buffer ->
let sequel = String.(sub s 0 (length s - 1)) in
if sequel = "" then
buffer
else
WordComponent (sequel, WordLiteral sequel) :: buffer
| _ ->
assert false
(** [push_word_closing_character b c] push a character [c] to mark it
as part of the string representing the current word literal but
with no interpretation as a word CSTs. Typically, if the word
is "$(1)", the string representing the current word is "$(1)"
so the character ')' must be pushed as part of this string
representation but ')' is already taken care of in the word
CST [WordSubshell (_, _)] associated to this word so we do not
push ')' as a WordLiteral CST. *)
let push_word_closing_character b c =
{ b with buffer = WordComponent (String.make 1 c, WordEmpty) :: b.buffer }
let string_of_word (Word (s, _)) = s
let string_of_attribute = function
| NoAttribute -> ""
| UseDefaultValues w -> "-" ^ string_of_word w
| AssignDefaultValues w -> "=" ^ string_of_word w
| IndicateErrorifNullorUnset w -> "?" ^ string_of_word w
| UseAlternativeValue w -> "+" ^ string_of_word w
| RemoveSmallestSuffixPattern w -> "%" ^ string_of_word w
| RemoveLargestSuffixPattern w -> "%%" ^ string_of_word w
| RemoveSmallestPrefixPattern w -> "#" ^ string_of_word w
| RemoveLargestPrefixPattern w -> "##" ^ string_of_word w
let push_parameter ?(with_braces=false) ?(attribute=NoAttribute) b id =
let v = VariableAtom (id, attribute) in
let p =
if with_braces then
"${" ^ id ^ string_of_attribute attribute ^ "}"
else
"$" ^ id
in
{ b with buffer = WordComponent (p, WordVariable v) :: b.buffer }
let string_of_atom = function
| WordComponent (s, _) -> s
| AssignmentMark -> "|=|"
| QuotingMark _ -> "|Q|"
let contents_of_atom_list atoms =
String.concat "" (List.rev_map string_of_atom atoms)
let string_of_atom_list atoms =
String.concat "#" (List.rev_map string_of_atom atoms)
let contents b =
contents_of_atom_list b.buffer
let components_of_atom_list atoms =
let rec aux accu = function
| [] -> accu
| (WordComponent (_, WordEmpty)) :: b -> aux accu b
| (WordComponent (_, c)) :: b -> aux (c :: accu) b
| _ :: b -> aux accu b
in
aux [] atoms
let components b =
components_of_atom_list b.buffer
let push_quoting_mark k b =
{ b with buffer = QuotingMark k :: b.buffer }
let pop_quotation k b =
let rec aux squote quote = function
| [] ->
(squote, quote, [])
| QuotingMark k' :: buffer when k = k' ->
(squote, quote, buffer)
| (AssignmentMark | QuotingMark _) :: buffer ->
aux squote quote buffer (* FIXME: Check twice. *)
| WordComponent (w, WordEmpty) :: buffer ->
aux (w ^ squote) quote buffer
| WordComponent (w, c) :: buffer ->
aux (w ^ squote) (c :: quote) buffer
in
(* The last character is removed from the quote since it is the
closing character. *)
(* let buffer = pop_character b.buffer in *)
let squote, quote, buffer = aux "" [] b.buffer in
let word = Word (squote, quote) in
let quoted_word =
match k with
| SingleQuote -> WordSingleQuoted word
| DoubleQuote -> WordDoubleQuoted word
| OpeningBrace -> WordDoubleQuoted word
in
let squote =
match k with
| SingleQuote -> "'" ^ squote ^ "'"
| DoubleQuote -> "\"" ^ squote ^ "\""
| OpeningBrace -> squote
in
let quote = WordComponent (squote, quoted_word) in
{ b with buffer = quote :: buffer }
let push_assignment_mark current =
{ current with buffer = AssignmentMark :: current.buffer }
let is_assignment_mark = function
| AssignmentMark -> true
| _ -> false
let recognize_assignment current =
let rhs, prefix = take_until is_assignment_mark current.buffer in
if prefix = current.buffer then (
current
) else
let current' = { current with buffer = rhs @ List.tl prefix } in
match prefix with
| AssignmentMark :: WordComponent (s, _) :: prefix ->
assert (s.[String.length s - 1] = '='); (* By after_equal unique call. *)
(* [s] is a valid name. We have an assignment here. *)
let lhs = String.(sub s 0 (length s - 1)) in
(* FIXME: The following check could be done directly with
ocamllex rules, right?*)
if Name.is_name lhs then (
let rhs_string = contents_of_atom_list rhs in
{ current with buffer =
WordComponent (s ^ rhs_string,
WordAssignmentWord (Name lhs, Word (rhs_string,
components_of_atom_list rhs)))
:: prefix
}
) else
(*
If [lhs] is not a name, then the corresponding word
literal must be merged with the preceding one, if it exists.
*) (
begin match List.rev rhs with
| WordComponent (s_rhs, WordLiteral s_rhs') :: rev_rhs ->
let word = WordComponent (s ^ s_rhs, WordLiteral (s ^ s_rhs')) in
{ current with buffer = List.rev rev_rhs @ word :: prefix }
| _ ->
current'
end)
| _ -> current'
(** [(return ?with_newline lexbuf current tokens)] returns a list of
pretokens consisting of, in that order:
- WORD(w), where w is the contents of the buffer [current] in case the
buffer [current] is non-empty;
- all the elements of [tokens];
- NEWLINE, in case ?with_newline is true (default: false).
We know that [tokens] does not contain any Word pretokens. In fact, the
prelexer produces Word pretokens only from contents he has collected in
the buffer.
*)
let return ?(with_newline=false) lexbuf (current : prelexer_state) tokens =
assert (
not (List.exists (function (Pretoken.PreWord _)->true |_-> false) tokens)
);
let current = recognize_assignment current in
let flush_word b =
(* FIXME: Optimise! *)
let rec aux accu = function
| WordComponent (s, _) :: b -> aux (s ^ accu) b
| AssignmentMark :: b -> aux accu b
| QuotingMark _ :: b -> aux accu b
| [] -> accu
in
aux "" b.buffer
and produce token =
(* FIXME: Positions are not updated properly. *)
(token, lexbuf.Lexing.lex_start_p, lexbuf.Lexing.lex_curr_p)
in
let is_digit d =
Str.(string_match (regexp "^[0-9]+$") d 0)
in
let followed_by_redirection = Parser.(function
| Pretoken.Operator (LESSAND | GREATAND | DGREAT | DLESS _
| CLOBBER | LESS | GREAT | LESSGREAT) :: _ ->
true
| _ ->
false
) in
(*specification
2.10.1 Shell Grammar Lexical Conventions
The input language to the shell must be first recognized at the
character level. The resulting tokens shall be classified by
their immediate context according to the following rules (applied
in order). These rules shall be used to determine what a "token"
is that is subject to parsing at the token level. The rules for
token recognition in Token Recognition shall apply.
If the token is an operator, the token identifier for that
operator shall result.
If the string consists solely of digits and the delimiter character is
one of '<' or '>', the token identifier IO_NUMBER shall be
returned.
Otherwise, the token identifier TOKEN results.
*)
let buffered =
match flush_word current with
| "" ->
[]
| w when is_digit w && followed_by_redirection tokens ->
[Pretoken.IoNumber w]
| w ->
let csts =
List.(flatten (rev_map (function
| WordComponent (_, WordEmpty) -> []
| WordComponent (_, s) -> [s]
| AssignmentMark -> []
| QuotingMark _ -> []
) current.buffer))
in
[Pretoken.PreWord (w, csts)]
in
let tokens = if with_newline then tokens @ [Pretoken.NEWLINE] else tokens in
let tokens = buffered @ tokens in
List.map produce tokens
exception NotAWord of string
let word_of = function
| ((Pretoken.PreWord (w, cst), _, _)) :: _ -> Word (w, cst)
| (p, _, _) :: _ -> raise (NotAWord (Pretoken.string_of_pretoken p))
| [] -> raise (NotAWord "empty")
let located_word_of = function
| ((Pretoken.PreWord (w, cst), p1, p2)) :: _ -> (Word (w, cst), p1, p2)
| (p, _, _) :: _ -> raise (NotAWord (Pretoken.string_of_pretoken p))
| [] -> raise (NotAWord "empty")
let provoke_error current lexbuf =
return lexbuf current [Pretoken.Operator Parser.INTENDED_ERROR]
(**
A double quote can be escaped if we are already inside (at least)
two levels of quotation. For instance, if the input is <dquote>
<dquote> <backslash><backslash> <dquote> <dquote> <dquote>, the
escaped backslash is used to escape the quote character.
*)
let escape_analysis ?(for_backquote=false) level current =
let current =
List.map
(function
| WordComponent (s, _) -> s
| _ -> "")
current.buffer
in
let number_of_backslashes_to_escape = Nesting.(
(* FIXME: We will be looking for the general pattern here. *)
match level with
| Backquotes ('`', _) :: Backquotes ('`', _) :: Backquotes ('`', _) :: _ ->
[3]
| Backquotes ('`', _) :: Backquotes ('`', _) :: _ ->
[2]
| DQuotes :: Backquotes ('`', _) :: [] ->
[1; 2]
| DQuotes :: Backquotes ('`', _) :: DQuotes :: _ ->
[2]
| DQuotes :: Backquotes ('`', _) :: _ :: DQuotes :: _ ->
[2]
| Backquotes ('`', _) :: DQuotes :: _ ->
[2]
| Backquotes ('`', _) :: _ :: DQuotes :: _ ->
[2]
| [Backquotes ('`', _)] ->
if for_backquote then
[3]
else
[1; 2]
| _ ->
[1]
)
in
let current' = List.(concat (map rev (map string_to_char_list current))) in
if MorbigOptions.debug () then
Printf.eprintf "N = %s | %s\n"
(String.concat " "
(List.map string_of_int number_of_backslashes_to_escape)
)
(string_of_char_list current');
let backslashes_before = preceding '\\' current' in
if List.exists (fun k ->
backslashes_before >= k && (k - backslashes_before) mod (k + 1) = 0
) number_of_backslashes_to_escape
then (
(** There is no special meaning for this character. It is
escaped. *)
None
) else
(**
The character preceded by this sequence is not escaped.
In the case of `, the interpretation of this character
depends on the number of backslashes the precedes it.
Typically, in:
echo `echo \`foo\``
The second <backquote> is not escaped BUT it is not
closing the current subshell, it is opening a new
one.
*)
Some backslashes_before
let escape_analysis_predicate ?(for_backquote=false) level current =
escape_analysis ~for_backquote level current = None
let escaped_double_quote = escape_analysis_predicate
let escaped_single_quote = escape_analysis_predicate
let escaped_backquote = escape_analysis_predicate ~for_backquote:true
let escaped_backquote current =
escaped_backquote current.nesting_context current
let escaped_single_quote current =
escaped_single_quote current.nesting_context current
let escaped_double_quote current =
escaped_double_quote current.nesting_context current
let nesting_context current =
current.nesting_context
let enter what current =
let nesting_context = what :: current.nesting_context in
{ current with nesting_context }
let enter_double_quote =
enter Nesting.DQuotes
let enter_here_document dashed delimiter =
enter (Nesting.HereDocument (dashed, delimiter))
let enter_braces =
enter Nesting.Braces
let quit_double_quote current =
match current.nesting_context with
| Nesting.DQuotes :: nesting_context -> { current with nesting_context }
| _ -> assert false
let quit_braces current =
match current.nesting_context with
| Nesting.Braces :: nesting_context -> { current with nesting_context }
| _ -> assert false
let enter_backquotes op escaping_level current =
let nesting_context =
Nesting.Backquotes (op, escaping_level) :: current.nesting_context
in
{ current with nesting_context }
let under_backquote current =
match list_hd_opt current.nesting_context with
| Some (Nesting.Backquotes ('`', _)) -> true
| _ -> false
let under_braces current =
match list_hd_opt current.nesting_context with
| Some Nesting.Braces -> true
| _ -> false
let under_backquoted_style_command_substitution current =
Nesting.under_backquoted_style_command_substitution current.nesting_context
let under_double_quote current =
match current.nesting_context with
| (Nesting.DQuotes | Nesting.HereDocument _) :: _ -> true
| _ -> false
let under_real_double_quote current =
match current.nesting_context with
| Nesting.DQuotes :: _ -> true
| _ -> false
let under_here_document current =
match current.nesting_context with
| Nesting.HereDocument _ :: _ -> true
| _ -> false
let is_escaping_backslash current lexbuf c =
match c with
| '"' -> escaped_double_quote current
| '\'' -> escaped_single_quote current
| '`' -> escaped_backquote current
| _ -> escape_analysis_predicate current.nesting_context current
let rec closest_backquote_depth = function
| [] -> -1
| Nesting.Backquotes ('`', depth) :: _ -> depth
| _ :: nesting -> closest_backquote_depth nesting
let backquote_depth current =
let current_depth =
escape_analysis ~for_backquote:true current.nesting_context current
|> function
| Some d -> d
| None -> assert false (* By usage of backquote_depth. *)
in
if MorbigOptions.debug () then
Printf.eprintf "Backquote depth: %d =?= %d\n"
current_depth
(closest_backquote_depth current.nesting_context);
if current_depth = closest_backquote_depth current.nesting_context then
None
else
Some current_depth
let found_current_here_document_delimiter current =
match current.nesting_context with
| Nesting.HereDocument (dashed, delimiter) :: _ ->
let last_chunk = contents current in
let open QuoteRemoval in
let preprocess = if dashed then remove_tabs_at_linestart else fun x -> x in
let last_line = option_map (string_last_line last_chunk) preprocess in
last_line = Some delimiter
| _ ->
false
let remove_contents_suffix end_marker (contents : string) (cst : CST.word_cst) =
let contents = string_remove_suffix end_marker contents in
let rec aux cst =
match cst with
| (WordLiteral contents) :: cst ->
begin match lines contents with
| [] | [_] ->
aux cst
| rest ->
let rest = List.(rev (tl (rev rest))) in
let suffix = String.concat "\n" rest ^ "\n" in
WordLiteral suffix :: cst
end
| _ :: cst ->
aux cst
| [] ->
[]
in
contents, List.(rev (aux (rev cst)))
let debug ?(rule="") lexbuf current = Lexing.(
if MorbigOptions.debug () then
let curr_pos =
min lexbuf.lex_curr_pos lexbuf.lex_buffer_len
in
Printf.eprintf "\
%s [ ] %s { %s } %s @ %s #\n[%s]\n"
(Bytes.(to_string (sub lexbuf.lex_buffer 0 curr_pos)))
(let k = lexbuf.lex_buffer_len - curr_pos - 1 in
if k > 0 then
Bytes.(to_string (sub lexbuf.lex_buffer curr_pos k))
else "")
(Lexing.lexeme lexbuf)
rule
(String.concat " " (List.map Nesting.to_string current.nesting_context))
(string_of_atom_list current.buffer)
)