Content - bdacfe252523460b50288c6c0d011a8c60023dcc - 140b06e/m2const.pl

visit type:
https://github.com/RichardMoot/GrailLight

05 April 2024, 19:33:37 UTC
Tip revision: 67fbacd0e365d9008c021016377c0cfe1f1e309d authored by Richard Moot on 27 April 2021, 15:02:38 UTC
Update Supertag.tcl
Tip revision: 67fbacd
m2const.pl

:- module(m2const, [start/0,
		    start/1,
		    update_all/0,
		    export_all/0,
		    export/1,
		    export_text/1,
		    verify_sentences/0,
		    compute_penalties/0,
		    compute_penalties1/2]).

% constituent calculations

% start/0   computes word/4 and constituent/4 declarations from all files declared by xml_files/1
%
% export(File) all-in-one predicate combining *head.pl and the current XML files into a single *.pl file
%
% compute_penalties/0   computes crosses/4 declarations from constituent/4 declarations for all sentences
%
% cp(List)            as compute_penalties for all sentences in List
%
% update_crosses(Sent, Left, Right, Diff)    update given crosses/4 information by adding Diff to all
%                                            constituents crossing Left-Right
%
% update_crosses(Sent, Left, Right)          equivalent to update_crosses(Sent, Left, Right, 1).
%
% verify_sentences/0   verify if the words used in sent/2 declarations correspond to the given word/4
%                      declarations
%
% verify_lemmas/0

verbose(false).

xml_files(File) :-
	xml_files(File, _).

xml_files('flmf7aa1ep.cat.xml', aa1).
xml_files('flmf7aa2ep.cat.xml', aa2).
xml_files('flmf7ab2ep.xml', ab2).
xml_files('flmf7ae1ep.cat.xml', ae1).
xml_files('flmf7af2ep.cat.xml', af2).
xml_files('flmf7ag1exp.cat.xml', ag1).
xml_files('flmf7ag2ep.cat.xml', ag2).
xml_files('flmf7ah1ep.aa.xml', ah1).
xml_files('flmf7ah2ep.aa.xml', ah2).
xml_files('flmf7ai1exp.cat.xml', ai1).
xml_files('flmf7ai2ep.aa.cat.xml', ai2).
xml_files('flmf7aj1ep.indent.xml', aj1).
xml_files('flmf7ak1ep.indent.xml', ak1).
xml_files('flmf7ak2ep.xd.cat.xml', ak2).
xml_files('flmf7al1ep.cat.xml', al1).
xml_files('flmf7am1ep.xd.cat.xml', am1).
xml_files('flmf7am2ep.xd.cat.xml', am2).
xml_files('flmf7an1ep.xml', an1).
xml_files('flmf7an2co.af.cat.xml', an2).
xml_files('flmf7ao1ep.xml', ao1).
xml_files('flmf7ao2ep.xml', ao2).
xml_files('flmf7ap1ep.af.cat.xml', ap1).
xml_files('flmf7aq2ep.xd.cat.xml', aq2).
xml_files('flmf7as2ep.af.cat.xml', as2).
xml_files('flmf7atep.cat.xml', at).

xml_files('flmf300_13000ep.cat.xml', '300').
xml_files('flmf3_01000_01499ep.aa.xml','1000').
%xml_files('flmf3_03500_03999ep.aa.xml','3500').
xml_files('flmf3_08000_08499ep.xd.cat.xml', '8000').
%xml_files('flmf3_08500_08999ep.aa.xml','8500').
%xml_files('flmf3_09000_09499ep.aa.xml','9000').
%xml_files('flmf3_10000_10499ep.aa.xml','10000').
%xml_files('flmf3_11000_11499ep.aa.xml','11000').
%xml_files('flmf3_12000_12499ep.aa.xml','12000').
%xml_files('flmf3_12500_12999co.aa.xml','12500').

xml_files('annodis.er.xml', annodis).
xml_files('frwiki1.xml', frwiki1).
xml_files('frwiki2.xml', frwiki2).
xml_files('Europar.550.xml', europar).
xml_files('emea-fr-dev.xml', emea_d).
% xml_files('emea-fr-test.xml', emea_t).

% NOTE: all exported predicates are declared as being in module "user", since otherwise predicates like listing/1
% will prefix the module "m2const" leading to incorrect beheviour.

:- dynamic user:word/4, user:lemma/4, user:constituent/4, user:crosses/4, sent/2, current_file/1.

% create word/4 and constituent/4 declarations for the XML files declared by xml_file/1 (above).

start :-
	user:abolish(word/4),
	user:retractall(lemma(_,_,_,_)),
	user:retractall(word(_,_,_,_)),
	user:retractall(constituent(_,_,_,_)),
	findall(F, xml_files(F), Files),
	start(Files, 0, _).

start([], N, N).
start([F|Fs], N0, N) :-
	format(user_error, '~NXML File: ~w~n', [F]),
	load_structure(F, L0, [dialect(xml), space(default)]),
	delete_all_spaces(L0, L),
	xml_to_const(L, N0, N1, 0, _),
	nl(user_error),
	start(Fs, N1, N).

start(XMLFile) :-
	abolish(word/4),
	user:retractall(lemma(_,_,_,_)),
	user:retractall(word(_,_,_,_)),
	user:retractall(constituent(_,_,_,_)),
	retractall(current_file(_)),
	assert(current_file(XMLFile)),
	load_structure(XMLFile, L0, [dialect(xml), space(default)]),
	delete_all_spaces(L0, L),
	xml_to_const(L, 0, _, 0, _).

update_all :-
	user:abolish(word/4),
	user:retractall(lemma(_,_,_,_)),
	user:retractall(word(_,_,_,_)),
	user:retractall(constituent(_,_,_,_)),
	findall(F, xml_files(F), Files0),
	update_files(Files0, Files, Warn),
	nl(user_error),
	export_all(Files),
	format('~n= Update done!=~2n', []),
	format('Exported: ~@', [print_flat_list(Files)]),
	format('Warning : ~@', [print_flat_list(Warn)]).

% = update_files(+AllFiles, -UpdatedFiles, -Changed)
%
% 

update_files([], [], []).
update_files([XMLFile|Fs0], Fs, Ws) :-
	xml_files(XMLFile, FileRoot),
	atom_concat(FileRoot, 'head.pl', HeadFile),
	check_exists(HeadFile),
	atom_concat('head/', HeadFile, HHeadFile),
	check_exists(HHeadFile),
	atom_concat(FileRoot, '.pl', TargetFile),
	time_file(HeadFile, Time),
	time_file(HHeadFile, Time1),
        time_file(TargetFile, Time2),
	time_file(XMLFile, TimeX),
	atomic_list_concat(['diff ', HHeadFile, ' ', HeadFile, '> /dev/null'], Cmd),
  (
        /* check if corresponding file extracted from treebank is newer and different */
	/* warn if it is */
	Time1 > Time,
	\+ shell(Cmd)  
  ->
        format(user_error, '{Warning: file "~p" has been updated}~n', [HHeadFile]),
        Ws = [FileRoot|Ws1]
  ;
        Ws = Ws1
  ),	   
  (
	/* schedule file only if either the xml or the head file has changed */  
        Time2 > max(Time,TimeX)
  ->
        format(user_error, '(~p) ', [FileRoot]),
        Fs = Fs1
  ;
        format(user_error, '~p ', [FileRoot]),
        Fs = [FileRoot|Fs1]
  ),
        update_files(Fs0, Fs1, Ws1).

print_flat_list([]) :-
	format('none!~n', []).
print_flat_list([X|Xs]) :-
	print_flat_list(Xs, X).
print_flat_list([], X) :-
	format('~p~n', [X]).
print_flat_list([X|Xs], Y) :-
	format('~p, ', [Y]),
	print_flat_list(Xs, X).

export_all :-
	findall(FileRoot, xml_files(_, FileRoot), List),
	export_all(List).

export_all([]).
export_all([F|Fs]) :-
	export(F),
	export_all(Fs).

% = export(FileRoot)
%
% Given a file *head.pl and the currently loaded XML file, create a file *.pl (with intermediate
% files *const.pl, *word.pl and *crosses.pl, this last one will be deleted, because of its size)
% (* is replaced by FileRoot).

export(FileRoot) :-
	xml_files(XMLFile, FileRoot),
	atom_concat(FileRoot, 'head.pl', HeadFile),
	check_exists(HeadFile),
	atom_concat(FileRoot, '.pl', TargetFile),
	atom_concat(FileRoot, 'const.pl', ConstFile),
	delete_if_exists(ConstFile),
	atom_concat(FileRoot, 'word.pl', WordFile),
	delete_if_exists(WordFile),
	atom_concat(FileRoot, 'crosses.pl', CrossFile),
	delete_if_exists(CrossFile),
	format(user_error, '~NXML File: ~w~n', [XMLFile]),
	start(XMLFile),
	nl(user_error),
	tell(WordFile),
	user:listing(word(_,_,_,_)),
	told,
	abolish(sent/2),
	[HeadFile],
	rebracket_constituents,
	tell(ConstFile),
	user:listing(constituent(_,_,_,_)),
	told,
	compute_penalties,
	format(user_error, '~2nSaving penalties...', []),
	flush_output(user_error),
	tell(CrossFile),
	user:listing(crosses(_,_,_,_)),
	told,
	format(user_error, 'done!~n', []),
	format(user_error, 'Exporting...', []),
	flush_output(user_error),
	atomic_list_concat([cat,HeadFile,WordFile,CrossFile,'>',TargetFile], ' ', Cmd),
	format('~N~w~n', [Cmd]),
	process_create(path(sh), ['-c',Cmd], []),
	delete_file(CrossFile),
	format(user_error, 'done!~n', []).

% = export_text(+FileRoot)
%
% export the XML annotation file denoted by FileRoot as a standard text file (with .txt extension),
% using the same mechanisms as the extraction procedure.

export_text(FileRoot) :-
	xml_files(XMLFile, FileRoot),
	atom_concat(FileRoot, '.txt', TextFile),
	delete_if_exists(TextFile),
	user:abolish(word/4),
	user:retractall(lemma(_,_,_,_)),
	user:retractall(word(_,_,_,_)),
	user:retractall(constituent(_,_,_,_)),
	start([XMLFile], 0, N),
	nl(user_error),
	tell(TextFile),
	export_text_words(0, N),
	told.

export_text_words(N, N) :-
	!.
export_text_words(N0, N) :-
	N0 < N,
	N1 is N0 + 1,
	findall(W, word(N1, W, _, _), WL),
	print_word_list(WL),
	!,
	export_text_words(N1, N).

print_word_list([]) :-
	nl.
print_word_list([W|Ws]) :-
	format(' ~W', [W,[quoted(false)]]),
	print_word_list(Ws).

check_exists(File) :-
   (
	exists_file(File)
   ->	
        true
   ;
        format(user_error, '~NFile ~w not found, aborting!~n', [File]),
        fail
   ).
   
delete_if_exists(File) :-
   (
	exists_file(File)
   ->
        delete_file(File)
   ;
	true
   ).
   
delete_all_spaces(Es0, Es) :-
	filter_list(Es0, Es1),
	delete_all_spaces1(Es1, Es).

delete_all_spaces1([], []).
delete_all_spaces1([E0|Es0], [E|Es]) :-
	delete_element_spaces(E0, E),
	delete_all_spaces1(Es0, Es).

delete_element_spaces(element(A,B,Cs0), element(A,B,Cs)) :-
	!,
	filter_list(Cs0, Cs1),
	delete_all_spaces1(Cs1, Cs).
delete_element_spaces(E, E).


handle_word_list([], _, N, N).
handle_word_list([W|Ws], S, N0, N) :-
	handle_word(W, S, N0, N1),
	handle_word_list(Ws, S, N1, N).


handle_word('C.L.', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = 'C.',
	Word2 = 'L.',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).
handle_word('H.D.', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = 'H.',
	Word2 = 'D.',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).
handle_word('B.F.', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = 'B.',
	Word2 = 'F.',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).
handle_word('l\'isloise', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = 'l\'',
	Word2 = isloise,
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).
handle_word('jusqu\'au', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = 'jusqu\'',
	Word2 = au,
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).
handle_word('l\'on', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = 'l\'',
	Word2 = on,
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).

handle_word('T.bond', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = 'T',
	Word2 = bond,
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).
handle_word('Pan.Am', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = 'Pan',
	Word2 = 'Am',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).
handle_word('de la', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = de,
	Word2 = la,
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).
handle_word('jusque-là', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = jusque,
	Word2 = '-là',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).
handle_word('de l\'', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = de,
	Word2 = 'l\'',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).

handle_word('R.R.Donnelley', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N2 is N1 + 1,
	N is N2 + 1,
	Word1 = 'R.',
	Word2 = 'R.',
	Word3 = 'Donnelley',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N2]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word3, N2, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N2)),
	user:assert(word(S, Word3, N2, N)).

handle_word('L.C.Waïkiki', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N2 is N1 + 1,
	N is N2 + 1,
	Word1 = 'L.',
	Word2 = 'C.',
	Word3 = 'Waïkiki',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N2]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word3, N2, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N2)),
	user:assert(word(S, Word3, N2, N)).


handle_word(W, S, N0, N) :-
	current_file('flmf7aa1ep.cat.xml'),
	atomic_list_concat([Word1,ci], '-', W),
	Word1 \= '',
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word2 = '-ci',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).

handle_word(W, S, N0, N) :-
	current_file('flmf7am1ep.xd.cat.xml'),
	atomic_list_concat([Word1,là], '-', W),
	Word1 \= '',
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word2 = '-là',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).

handle_word(W, S, N0, N) :-
	current_file('flmf3_08000_08499ep.xd.cat.xml'),
	atomic_list_concat([Word1,là], '-', W),
	Word1 \= '',
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word2 = '-là',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).

handle_word(W, S, N0, N) :-
	current_file('flmf7atep.cat.xml'),
	atomic_list_concat([Word1,là], '-', W),
	Word1 \= '',
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word2 = '-là',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).

handle_word(W, S, N0, N) :-
	current_file('flmf7ak2ep.xd.cat.xml'),
	atomic_list_concat([Word1,là], '-', W),
	Word1 \= '',
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word2 = '-là',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).

handle_word(W, S, N0, N) :-
	current_file('flmf7ai2ep.aa.cat.xml'),
	atomic_list_concat([Word1,là], '-', W),
	Word1 \= '',
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word2 = '-là',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).


handle_word(W, S, N0, N) :-
	current_file('flmf7ag1exp.cat.xml'),
	atomic_list_concat([Word1,là], '-', W),
	Word1 \= '',
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word2 = '-là',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).

handle_word(W, S, N0, N) :-
	W \= '&',
	atomic_list_concat([Word1,Word3], '&', W),
	!,
	N1 is N0 + 1,
	N2 is N1 + 1,
	N is N2 + 1,
	Word2 = '&',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N2]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word3, N2, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N2)),
	user:assert(word(S, Word3, N2, N)).

handle_word(W, S, N0, N) :-
	atom_chars(W, List),
	append(Prefix, ['°','C'], List),
	Prefix \= [],
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	atom_chars(Word1, Prefix),
	atom_chars(Word2, ['°','C']),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	user:assert(word(S, Word1, N0, N1)),
	user:assert(word(S, Word2, N1, N)).
	
handle_word(W, S, N0, N) :-
	N is N0 + 1,
	format1('word(~w, ~k, ~w, ~w).~n', [S, W, N0, N]),
	user:assert(word(S, W, N0, N)).

% = xml_to_const(+ElementList, +ParentElement)
%
%

xml_to_const([], N, N, M, M).
xml_to_const([E|Es], N0, N, M0, M) :-
	element_to_const(E, N0, N1, M0, M1),
	xml_to_const(Es, N1, N, M1, M).

get_number(Int, _, S) :-
	integer(Int),
	!,
	S = Int.
get_number(_-Int, _, S) :-
	integer(Int),
	!,
	S = Int.
get_number(Atom, _, S) :-
	atomic_list_concat(List, -, Atom),
	last(List, Last),
	integer(Last),
	!,
	S = Last.
get_number(_, S0, S) :-
	S is S0 + 1.

element_to_const(element(Nm,As,Cs), N0, N, M0, M) :-
	!,
	element_to_const(Nm, As, Cs, N0, N, M0, M).

element_to_const('SENT', As, Cs, S0, S, _, M) :-
	!,
	write(user_error, '.'),
    (
        member(nb=Num,As)
    ->
        get_number(Num, S0, S1)
    ;
	S1 is S0 +1
    ),
	xml_to_const(Cs, S1, S, 0, M).

element_to_const(w, As, Cs, S, S, M0, M) :-
	!,
	get_lemma(As, Lemma),
	collect_words(Cs, Ws0, []),
	simplify_words(Ws0, Ws),
    (   Ws \= []
    ->
	smart_concat_atoms(Ws, Word),
	atomic_list_concat(L, '_', Word),
	handle_word_list(L, S, M0, M),
    (
	M > M0 + 1
    ->
        format1('constituent(~w, w, ~w, ~w).~n', [S, M0, M]),
        user:assert(constituent(S, w, M0, M))
    ;
        user:assert(lemma(S, Lemma, M0, M))
    )
    ;
	M is M0
    ).
element_to_const(Cat, _As, Cs, S0, S, N0, N) :-
	!,
	xml_to_const(Cs, S0, S, N0, N),
    (
	N > N0 + 1
    ->
        format1('constituent(~w, ~k, ~w, ~w).~n', [S0, Cat, N0, N]),
        user:assert(constituent(S0, Cat, N0, N))
     ;
        true
    ).

% simplify_words(+ListOfWords, -SimplifiedListOfWords)
%
% specify various rewrites of the different multi-words/complex words, ie. these rules
% are applied to a list of "subwords" of a "w" constituent.

simplify_words(List0, List) :-
	append(Prefix, ['Gmb.H'], List0),
	append(Prefix, ['GmbH'], List),
	!.
simplify_words(List0, List) :-
	append(Prefix, ['Gmb.','H'], List0),
	append(Prefix, ['GmbH'], List),
	!.

simplify_words(['Puma','A.G.Rudolf', 'Dassler', 'Sport'],
	       ['Puma','A.', 'G.', 'Rudolf', 'Dassler', 'Sport']) :-
	!.
simplify_words(['R','19'], ['R','19']) :-
	current_file('flmf7aj1ep.indent.xml'),
	!.
simplify_words(['R','19'], ['R.19']) :-
	!.
simplify_words(['presqu\'','île'], ['presqu\'île']) :-
	!.
simplify_words([X,Y,'aujourd\'',hui], [X,Y,'aujourd\'hui']) :-
	!.
simplify_words([X,'aujourd\'',hui], [X,'aujourd\'hui']) :-
	!.
simplify_words([X,Y,'d\'',oeuvre], [X,Y,'d\'oeuvre']) :-
	!.
simplify_words(['Côte',-,'d\'',ivoire], ['Côte-d\'ivoire']) :-
	!.
simplify_words(['Côtes',-,'d\'','Armor'], ['Côtes-d\'Armor']) :-
	!.
simplify_words(['côte',-,'d\'',ivoire], ['côte-d\'ivoire']) :-
	!.
simplify_words(['Monde',-,'l\'','économie'], ['Monde-l\'économie']) :-
	!.
simplify_words([X,s], [Y]) :-
	atom_chars(X, List),
	last(List, '\''),
	!,
	atom_concat(X, s, Y).
simplify_words([no,'man\'',s,land], [no,'man\'s',land]) :-
	!.
simplify_words(['Rubik\'',s,cube], ['Rubik\'s',cube]) :-
	!.
simplify_words(['Who\'',s,next], ['Who\'s',next]) :-
	!.
simplify_words([X,-,ci], [Atom]) :-
	!,
	atomic_list_concat([X,-,ci], Atom).
simplify_words([pub,-,info], ['pub-info']) :-
	!.
% simplify_words([ville,-,campagne], ['ville-campagne']) :-
% 	!.
simplify_words([demi,-,X], ['demi-', X]) :-
	!.
simplify_words([mi,-,X], ['mi-', X]) :-
	!.
simplify_words(['FR', '3'], ['FR3']) :-
	!.
simplify_words(['Demak\'', up], ['Demak\'up']) :-
	!.
simplify_words(['FR', '3', X], ['FR3', X]) :-
	!.
simplify_words(['Antenne', 2, -, 'FR', 3], ['Antenne2-FR3']) :-
	!.
simplify_words(['Antenne', '2', -, 'FR', '3'], ['Antenne2-FR3']) :-
	!.
simplify_words([X], [Y,h,Z]) :-
	atomic_list_concat([Y0,Z0], h, X),
	atom_number(Y0, Y),
	atom_number(Z0, Z),
	!.
% simplify_words(['TF', 1], ['TF1']) :-
% 	!.
% simplify_words(['TF', '1'], ['TF1']) :-
% 	!.
simplify_words(Ws, Ws).


collect_words([]) -->
	[].
collect_words([W|Ws]) -->
	!,
	collect_word(W),
	collect_words(Ws).

collect_word(element(w,_,Ws)) -->
	!,
	collect_words(Ws).
collect_word(W) -->
	[W].

smart_concat_atoms(['arrière-', 'petits-enfants'], 'arrière-petits-enfants') :-
	!.
smart_concat_atoms(['aujourd\'',hui], 'aujourd\'hui') :-
	!.
smart_concat_atoms(['quelqu\'',un], 'quelqu\'un') :-
	!.
smart_concat_atoms(['Aujourd\'',hui], 'Aujourd\'hui') :-
	!.
smart_concat_atoms(Cs, Atom) :-
    (
        append(As, [-|Bs], Cs)
     ->
        smart_concat_atoms(As, AAtom),
        smart_concat_atoms(Bs, BAtom),
        concat_atom([AAtom,-,BAtom], Atom)
    ;
	append(As, [','|Bs], Cs)
    ->
        smart_concat_atoms(As, AAtom),
        smart_concat_atoms(Bs, BAtom),
        concat_atom([AAtom,',',BAtom], Atom)
     ;
        all_digits(Cs)
    ->
        concat_atom(Cs, '.', Atom)
    ;
	concat_atom(Cs, '_', Atom)
     ).
     

get_lemma(List, Lemma) :-
	member(lemma=Lemma, List),
	!.
get_lemma(_, '???').

all_digits([]).
all_digits([D|Ds]) :-
	integer(D),
	!,
	all_digits(Ds).
all_digits([D|Ds]) :-
	/* atom_number/2 fails silently if D is not a number */
	atom_number(D, _),
	all_digits(Ds).

% = filter_list(+ListA, ?ListB)
%
% true if ListA and ListB have the same elements except for the ' ' which
% are removed from ListB.

filter_list([], []).
filter_list([X|Xs], Ys0) :-
    (
	X == ' '
    ->
	filter_list(Xs, Ys0)
    ;
	Ys0 = [X|Ys],
	filter_list(Xs, Ys)
    ).

% = member1(?Element, +List)
%
% true if List contains Element. Like the library predicate member/2, but
% succeeds at most once. Unlike member_chk/2 unification is used instead of
% strict identity.

member1(X, [X|_]) :-
	!.
member1(X, [_|Ys]) :-
	member1(X, Ys).

cp(N0, N) :-
        compute_penalties1(N0),
    (
        N0 >= N
    ->
        true
    ;
        N1 is N0 + 1,
        cp(N1, N)
    ).

cp(List) :-
	user:abolish(crosses/4),
	user:retractall(crosses(_,_,_,_)),
	compute_penalties(List),
	user:retractall(crosses(_,_,_,0)).

% = compute_penalties
%
% compute penalties (as crosses/4 declarations) for all sentences

compute_penalties :-
	user:abolish(crosses/4),
	user:retractall(crosses(_,_,_,_)),
	/* recover list of sentence numbers */
	user:setof(X, A^B^C^constituent(X,A,B,C), Sentences),
	compute_penalties(Sentences),
	/* erase zero entries */
	user:retractall(crosses(_,_,_,0)).

% = compute_let(+SentNo, -Let)
%
% true if Let is the list containing all right edge numbers of words assigned the formula "let"

compute_let(SentNo, Let) :-
	clause(sent(SentNo,Sem),prob_parse(List0,Sem)),
	!,
	compute_let(List0, SentNo, 0, Let).
% do nothing when no clauses have been found
compute_let(_, []).

get_word(Sent, Word, L, R) :-
	word(Sent, Word, L, R),
	!.
get_word(_, 'NULL', _, _).


compute_let([], _, _, []).
compute_let([si(Word, _, _, [Formula-_])|Rest], SentNo, N0, Let0) :-
	N is N0 + 1,
	get_word(SentNo, Word0, N0, N),
   (
	/* do not distinguish number atoms '9' from integers */
	atom_codes(Word0, Codes0),
	atom_codes(Word, Codes),
	Codes = Codes0
   ->
	true
   ;
        format(user_error, '[~d] ~d-~d "~w" "~w"~n', [SentNo,N0,N,Word0,Word])
   ),
   (
        Formula = let
   ->
        Let0 = [N|Let1]
   ;
        Let1 = Let0
   ),
        compute_let(Rest, SentNo, N, Let1).

% =
% TODO: add some additional rules, for example:
% - when a "VPinf" constituent has "de" as first word, add a new "VPinf" constituent without "de"
% Recommended when release is stable, followed by verification

rebracket_constituents :-
	user:setof(X, A^B^C^constituent(X,A,B,C), Sentences),
	rebracket_constituents_list(Sentences).

rebracket_constituents_list([]).
rebracket_constituents_list([S|Ss]) :-
	rebracket_constituents(S),
	rebracket_constituents_list(Ss).

rebracket_constituents(S) :-
	compute_let(S, Let),
	reverse(Let, LetR),
	rebracket_constituents_left(LetR, S),
	rebracket_constituents_right(Let, S).

rebracket_constituents_right([], _).
rebracket_constituents_right([LR|Ls], S) :-
	LR1 is LR - 1,
	user:findall(t(Cat,R), constituent(S, Cat, LR, R), ListTR), % touching interpunction symbol on right edge 
	user:retractall(constituent(S, Cat, LR, _)),
	assert_all_right(ListTR, S, LR1),
	rebracket_constituents_right(Ls, S).

	
rebracket_constituents_left([], _).
rebracket_constituents_left([LR|Ls], S) :-
	LR1 is LR - 1,
	user:findall(t(Cat,L), constituent(S, Cat, L, LR), ListTL), % touching interpunction symbol on left edge
	user:retractall(constituent(S, Cat, _, LR)),
	assert_all_left(ListTL, S, LR1),
	rebracket_constituents_left(Ls, S).

assert_all_left([], _, _).
assert_all_left([t(Cat,L)|Rest], S, LR) :-
	user:assert(constituent(S, Cat, L, LR)),
	assert_all_left(Rest, S, LR).

assert_all_right([], _, _).
assert_all_right([t(Cat,R)|Rest], S, LR) :-
	user:assert(constituent(S, Cat, LR, R)),
	assert_all_right(Rest, S, LR).




compute_penalties([]).
compute_penalties([S|Ss]) :-
	format(user_error, '~n~w', [S]),
	compute_penalties1(S),
	compute_penalties(Ss).

compute_penalties1(S) :-
	compute_length(S, 0, Max),
	compute_penalties1(S, Max).

compute_penalties1(S, Max) :-
	initialize(0, Max, S),
	write(user_error, ':'),
	cross_comp(2, Max, S).
initialize(N0, N, S) :-
	NR is N0 + 2,
    (
        NR > N
    ->
        true
    ;
        findall(L,(user:constituent(S,_,L,R),aux_crosses(L,R,N0,NR)), Cs),
        length(Cs, Cr),
        user:assert(crosses(S, N0, NR, Cr)),
        N1 is N0 + 1,
        initialize(N1, N, S)
    ).


assert_crosses(S, NL, NR) :-
        findall(L-R,(user:constituent(S,_,L,R),aux_crosses(L,R,NL,NR)), Cs),
        length(Cs, Cr),
        user:assert(crosses(S, NL, NR, Cr)).
	
% = aux_crosses(+L, +R, +NL, +NR)
% 
%
% L -- N0 -- R -- NR
%
% xxxxxxxxxxxx
%      ooooooooooooo

aux_crosses(L, R, N0, NR) :-
	L < N0,
	N0 < R,
	R < NR.
% N0 -- L -- NR -- R
%
% ooooooooooooo
%       xxxxxxxxxxxx

aux_crosses(L, R, N0, NR) :-
	N0 < L,
	L < NR,
	NR < R.

cross_comp(D0, Max, S) :-
    (
        D0 >= Max
    ->
        true
    ;
	write(user_error, '.'),
        D is D0 + 1,
        compute_crosses(D, 0, Max, S),
        cross_comp(D, Max, S)
    ).

compute_crosses(D, N0, N, S) :-
	NR is N0 + D,
    (
        NR > N
    ->
        true
    ;
        assert_crosses(S, N0, NR),
        N1 is N0 +1,
        compute_crosses(D, N1, N, S)
     ). 

crosses1(S, L, R, C) :-
	user:crosses(S, L, R, C),
	!.
crosses1(_, _, _, 0).

compute_length(S, 0, Max) :-
	user:findall(R, word(S,_,_,R), Rights),
	compute_lengths1(Rights, 0, Max).

compute_lengths1([], M, M).
compute_lengths1([R|Rs], M0, M) :-
    (
        R > M0
    ->
        M1 = R
    ;
        M1 = M0
    ),
    compute_lengths1(Rs, M1, M).

% TODO: needs to be updated to handle implicit zeros correctly!

% = update_crosses(+Sentence, +Left, +Right)
%
% update the asserted crosses/4 predicate for Sentence, adding a
% constituent from position Left to Right.

update_crosses(Sent, X, Y) :-
	update_crosses(Sent, X, Y, 1).

% = update_crosses(+Sentence, +Left, +Right, +Plus)
%
% update the asserted crosses/4 predicate for Sentence, adding a
% constituent from position Left to Right, increasing the value
% of all constituent crossing this positions by Plus (eg. we
% can "erase" a pair of brackets with Plus = -1 or make "strong"
% brackets with Plus >= 2.

update_crosses(Sent, X, Y, Plus) :-
%	compute_length(Sent, 0, Length),
	user:findall(crosses(Sent,V,W,Z), crosses(Sent,V,W,Z), List0),
	update_crosses(List0, X, Y, Plus, List),
	user:retractall(crosses(Sent,_,_,_)),
	user:assert_crosses(List),
	user:listing(crosses(Sent,_,_,_)).

update_crosses([], _, _, _, []).
update_crosses([crosses(Sent, V, W, Cross0)|Rest0], X, Y, Plus, [crosses(Sent, V, W, Cross)|Rest]) :-
   ((	
	/* X < V < Y < W */
	X < V,
	V < Y,
	Y < W
   ;
	/* V < X < Y < W */
	V < W,
	X < W,
	W < Y
   )
   ->	
        Cross is Cross0 + Plus
   ;
       Cross = Cross0
   ),
       update_crosses(Rest0, X, Y, Plus, Rest).

assert_crosses([]).
assert_crosses([crosses(A,B,C,D)|Cs]) :-
   (
	D =:= 0
   ->
	true
   ;
	user:assert(crosses(A,B,C,D))
   ),
	assert_crosses(Cs).


assert_list([]).
assert_list([C|Cs]) :-
	user:assert(C),
	assert_list(Cs).


delete_position(Sent, Pos) :-
	user:findall(crosses(Sent,V,W,Z), crosses(Sent,V,W,Z), List0),
	delete_position(List0, Pos, List),
	user:retractall(crosses(Sent,_,_,_)),
	assert_list(List),
	user:listing(crosses(Sent,_,_,_)).


verify_sentences :-
	verify_sentences(log).
verify_sentences(Log) :-
    ( exists_file(Log) -> delete_file(Log) ; true),
	tell(log),
	findall(Num, clause(sent(Num,_),_), List),
	verify_sentences_list(List),
	told,
	format(user_error, '~NDone!~nLog output to file ~w~n', [Log]).

verify_sentences_list([]).
verify_sentences_list([N|Ns]) :-
	verify_sentence(N),
	verify_sentences_list(Ns).

verify_sentence(Num) :-
	clause(sent(Num,_), prob_parse(List,_)),
	verify_sentence(List, 0, Num, '.').

verify_sentence([], N0, Num, Status) :-
   (	
	word(Num, Word2, N0, N)
     ->
	format('~N!!!~d: Untreated word ~w~n', [Num,Word2]),
	verify_sentence([], N, Num, '*')
   ;
	write(user_error, Status)
   ).
verify_sentence([si(Word1,_,_,_)|Rest], N0, Num, Status0) :-
   (	
	word(Num, Word2, N0, N)
     ->
   (
        Word1 = Word2
   ->
        Status = Status0
   ;
        atom_number(Word2, Word1)	 
   ->
	Status = Status0
   ;
        Status = '*',
	format('~N~d: Word mismatch ~w-~w~n', [Num,Word1,Word2])
   );	Status = '*',
	format('~N~d: Unmatched word ~w~n', [Num,Word1]),
	N is N0 + 1),
        verify_sentence(Rest, N, Num, Status).


verify_lemmas :-
	verify_lemmas(log).
verify_lemmas(Log) :-
    ( exists_file(Log) -> delete_file(Log) ; true),
	tell(log),
	findall(Num, clause(sent(Num,_),_), List),
	verify_lemmas_list(List),
	told,
	format(user_error, '~NDone!~nLog output to file ~w~n', [Log]).

verify_lemmas_list([]).
verify_lemmas_list([N|Ns]) :-
	verify_lemma(N),
	verify_lemmas_list(Ns).

verify_lemma(Num) :-
	clause(sent(Num,_), prob_parse(List,_)),
	verify_lemma(List, 0, Num, '.').

verify_lemma([], _, _, Status) :-
	write(user_error, Status).
verify_lemma([si(_,_,Lemma1,_)|Rest], N0, Num, Status0) :-
   (	
	lemma(Num, Lemma2, N0, N)
     ->
   (
        same_lemma(Lemma1, Lemma2)
   ->
        Status = Status0
   ;
        atom_number(Lemma2, Lemma1)	 
   ->
	Status = Status0
   ;
        Status = '*',
	format('~N~d: Lemma mismatch ~w-~w~n', [Num,Lemma1,Lemma2])
   );	Status = '.',
	format('~N~d: Unmatched lemma ~w~n', [Num,Lemma1]),
	N is N0 + 1),
        verify_lemma(Rest, N, Num, Status).

same_lemma(Lemma, Lemma) :-
	!.
same_lemma('l\'', le) :-
	!.
same_lemma(la, le) :-
	!.
same_lemma(les, le) :-
	!.
same_lemma(des, de) :-
	!.
same_lemma(du, de) :-
	!.
same_lemma(notre, mon) :-
	!.
same_lemma(leur, son) :-
	!.
same_lemma('France', 'FRANCE') :-
	!.


format1(X, Y) :-
	verbose(true),
	!,
	format(X, Y).
format1(_, _).