Content - e8c6e9234c3798a62989b2d0614b95bc833cd009 - b503ff3/m2const.pl

visit type:
Tip revision: 6f0eae2220fa7ef961b0c91b9393eaaeda538e36 authored by Richard Moot on 26 November 2015, 16:19:07 UTC
Minor corrections
Tip revision: 6f0eae2
m2const.pl

% constituent calculations

% start/0   computes word/4 and constituent/4 declarations from all files declared by xml_files/1
%
% export(File) all-in-one predicate combining *head.pl and the current XML files into a single *.pl file
%
% compute_penalties/0   computes crosses/4 declarations from constituent/4 declarations for all sentences
%
% cp(List)            as compute_penalties for all sentences in List
%
% update_crosses(Sent, Left, Right, Diff)    update given crosses/4 information by adding Diff to all
%                                            constituents crossing Left-Right
%
% update_crosses(Sent, Left, Right)          equivalent to update_crosses(Sent, Left, Right, 1).
%
% verify_sentences/0   verify if the words used in sent/2 declarations correspond to the given word/4
%                      declarations
%
% verify_lemmas/0

verbose(false).

% xml_files('flmf7aa1ep.cat.xml').
xml_files('flmf7aa2ep.cat.xml').
% xml_files('flmf7ab2ep.xml').
% xml_files('flmf7ae1ep.cat.xml').
% xml_files('flmf7af2ep.cat.xml').
% xml_files('flmf7ag1exp.cat.xml').
% xml_files('flmf7ag2ep.cat.xml').
% xml_files('flmf7ah1ep.aa.xml').
% xml_files('flmf7ah2ep.aa.xml').
% xml_files('flmf7ai1exp.cat.xml').
% xml_files('flmf7ai2ep.aa.cat.xml').
% xml_files('flmf7aj1ep.indent.xml').
% xml_files('flmf7ak1ep.indent.xml').
% xml_files('flmf7ak2ep.xd.cat.xml').
% xml_files('flmf7al1ep.cat.xml').
% xml_files('flmf7am1ep.xd.cat.xml').
% xml_files('flmf7am2ep.xd.cat.xml').
% xml_files('flmf7an1ep.xml').
% xml_files('flmf7an2co.af.cat.xml').
% xml_files('flmf7ao1ep.xml').
% xml_files('flmf7ao2ep.xml').
% xml_files('flmf7ap1ep.af.cat.xml').
% xml_files('flmf7aq2ep.xd.cat.xml').
% xml_files('flmf7as2ep.af.cat.xml').
% xml_files('flmf7atep.cat.xml').
%
% xml_files('flmf300_13000ep.cat.xml').
% xml_files('flmf3_08000_08499ep.xd.cat.xml').
%
% xml_files('annodis.er.xml').

:- dynamic word/4, lemma/4, constituent/4.

% create word/4 and constituent/4 declarations for the XML files declared by xml_file/1 (above).

start :-
	abolish(word/4),
	retractall(lemma(_,_,_,_)),
	retractall(word(_,_,_,_)),
	retractall(constituent(_,_,_,_)),
	findall(F, xml_files(F), Files),
	start(Files, 0, _).

start([], N, N).
start([F|Fs], N0, N) :-
	format(user_error, '~NXML File: ~w~n', [F]),
	load_structure(F, L0, [dialect(xml), space(default)]),
	delete_all_spaces(L0, L),
	xml_to_const(L, N0, N1, 0, _),
	nl(user_error),
	start(Fs, N1, N).

% = export(FileRoot)
%
% Given a file *head.pl and the currently loaded XML file, create a file *.pl (with intermediate
% files *const.pl, *word.pl and *crosses.pl, this last one will be deleted, because of it size)
% (* is replaced by FileRoot).

export(FileRoot) :-
	atom_concat(FileRoot, 'head.pl', HeadFile),
	check_exists(HeadFile),
	atom_concat(FileRoot, '.pl', TargetFile),
	atom_concat(FileRoot, 'const.pl', ConstFile),
	delete_if_exists(ConstFile),
	atom_concat(FileRoot, 'word.pl', WordFile),
	delete_if_exists(WordFile),
	atom_concat(FileRoot, 'crosses.pl', CrossFile),
	delete_if_exists(CrossFile),
	start,
	tell(WordFile),
	listing(word(_,_,_,_)),
	told,
	tell(ConstFile),
	listing(constituent(_,_,_,_)),
	told,
	compute_penalties,
	format(user_error, '~2nSaving penalties...', []),
	flush_output(user_error),
	tell(CrossFile),
	listing(crosses(_,_,_,_)),
	told,
	format(user_error, 'done!~n', []),
	format(user_error, 'Exporting...', []),
	flush_output(user_error),
	atomic_list_concat([cat,HeadFile,WordFile,CrossFile,'>',TargetFile], ' ', Cmd),
	format('~N~w~n', [Cmd]),
	process_create(path(sh), ['-c',Cmd], []),
	delete_file(CrossFile),
	format(user_error, 'done!~n', []).

check_exists(File) :-
   (
	exists_file(File)
   ->	
        true
   ;
        format(user_error, '~NFile ~w not found, aborting!~n', [File]),
        fail
   ).
   
delete_if_exists(File) :-
   (
	exists_file(File)
   ->
        delete_file(File)
   ;
	true
   ).
   
delete_all_spaces(Es0, Es) :-
	filter_list(Es0, Es1),
	delete_all_spaces1(Es1, Es).

delete_all_spaces1([], []).
delete_all_spaces1([E0|Es0], [E|Es]) :-
	delete_element_spaces(E0, E),
	delete_all_spaces1(Es0, Es).

delete_element_spaces(element(A,B,Cs0), element(A,B,Cs)) :-
	!,
	filter_list(Cs0, Cs1),
	delete_all_spaces1(Cs1, Cs).
delete_element_spaces(E, E).


handle_word_list([], _, N, N).
handle_word_list([W|Ws], S, N0, N) :-
	handle_word(W, S, N0, N1),
	handle_word_list(Ws, S, N1, N).


handle_word('C.L.', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = 'C.',
	Word2 = 'L.',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).
handle_word('H.D.', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = 'H.',
	Word2 = 'D.',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).
handle_word('B.F.', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = 'B.',
	Word2 = 'F.',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).
handle_word('l\'isloise', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = 'l\'',
	Word2 = isloise,
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).
handle_word('jusqu\'au', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = 'jusqu\'',
	Word2 = au,
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).
handle_word('l\'on', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = 'l\'',
	Word2 = on,
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).

handle_word('T.bond', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = 'T',
	Word2 = bond,
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).
handle_word('Pan.Am', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = 'Pan',
	Word2 = 'Am',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).
handle_word('de la', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = de,
	Word2 = la,
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).
handle_word('jusque-là', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = jusque,
	Word2 = '-là',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).
handle_word('de l\'', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word1 = de,
	Word2 = 'l\'',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).

handle_word('R.R.Donnelley', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N2 is N1 + 1,
	N is N2 + 1,
	Word1 = 'R.',
	Word2 = 'R.',
	Word3 = 'Donnelley',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N2]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word3, N2, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N2)),
	assert(word(S, Word3, N2, N)).

handle_word('L.C.Waïkiki', S, N0, N) :-
	!,
	N1 is N0 + 1,
	N2 is N1 + 1,
	N is N2 + 1,
	Word1 = 'L.',
	Word2 = 'C.',
	Word3 = 'Waïkiki',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N2]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word3, N2, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N2)),
	assert(word(S, Word3, N2, N)).


handle_word(W, S, N0, N) :-
	xml_files('flmf7aa1ep.cat.xml'),
	atomic_list_concat([Word1,ci], '-', W),
	Word1 \= '',
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word2 = '-ci',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).

handle_word(W, S, N0, N) :-
	xml_files('flmf7am1ep.xd.cat.xml'),
	atomic_list_concat([Word1,là], '-', W),
	Word1 \= '',
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word2 = '-là',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).

handle_word(W, S, N0, N) :-
	xml_files('flmf3_08000_08499ep.xd.cat.xml'),
	atomic_list_concat([Word1,là], '-', W),
	Word1 \= '',
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word2 = '-là',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).

handle_word(W, S, N0, N) :-
	xml_files('flmf7atep.cat.xml'),
	atomic_list_concat([Word1,là], '-', W),
	Word1 \= '',
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word2 = '-là',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).

handle_word(W, S, N0, N) :-
	xml_files('flmf7ak2ep.xd.cat.xml'),
	atomic_list_concat([Word1,là], '-', W),
	Word1 \= '',
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word2 = '-là',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).

handle_word(W, S, N0, N) :-
	xml_files('flmf7ai2ep.aa.cat.xml'),
	atomic_list_concat([Word1,là], '-', W),
	Word1 \= '',
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word2 = '-là',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).


handle_word(W, S, N0, N) :-
	xml_files('flmf7ag1exp.cat.xml'),
	atomic_list_concat([Word1,là], '-', W),
	Word1 \= '',
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	Word2 = '-là',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).

handle_word(W, S, N0, N) :-
	W \= '&',
	atomic_list_concat([Word1,Word3], '&', W),
	!,
	N1 is N0 + 1,
	N2 is N1 + 1,
	N is N2 + 1,
	Word2 = '&',
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N2]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word3, N2, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N2)),
	assert(word(S, Word3, N2, N)).

handle_word(W, S, N0, N) :-
	atom_chars(W, List),
	append(Prefix, ['°','C'], List),
	Prefix \= [],
	!,
	N1 is N0 + 1,
	N is N1 + 1,
	atom_chars(Word1, Prefix),
	atom_chars(Word2, ['°','C']),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word1, N0, N1]),
	format1('word(~w, ~k, ~w, ~w).~n', [S, Word2, N1, N]),
	assert(word(S, Word1, N0, N1)),
	assert(word(S, Word2, N1, N)).
	
handle_word(W, S, N0, N) :-
	N is N0 + 1,
	format1('word(~w, ~k, ~w, ~w).~n', [S, W, N0, N]),
	assert(word(S, W, N0, N)).

% = xml_to_const(+ElementList, +ParentElement)
%
%

xml_to_const([], N, N, M, M).
xml_to_const([E|Es], N0, N, M0, M) :-
	element_to_const(E, N0, N1, M0, M1),
	xml_to_const(Es, N1, N, M1, M).

get_number(Int, _, S) :-
	integer(Int),
	!,
	S = Int.
get_number(_-Int, _, S) :-
	integer(Int),
	!,
	S = Int.
get_number(Atom, _, S) :-
	atomic_list_concat(List, -, Atom),
	last(List, Last),
	integer(Last),
	!,
	S = Last.
get_number(_, S0, S) :-
	S is S0 + 1.

element_to_const(element(Nm,As,Cs), N0, N, M0, M) :-
	!,
	element_to_const(Nm, As, Cs, N0, N, M0, M).

element_to_const('SENT', As, Cs, S0, S, _, M) :-
	!,
	write(user_error, '.'),
    (
        member(nb=Num,As)
    ->
        get_number(Num, S0, S1)
    ;
	S1 is S0 +1
    ),
	xml_to_const(Cs, S1, S, 0, M).

element_to_const(w, As, Cs, S, S, M0, M) :-
	!,
	get_lemma(As, Lemma),
	collect_words(Cs, Ws0, []),
	simplify_words(Ws0, Ws),
    (   Ws \= []
    ->
	smart_concat_atoms(Ws, Word),
	atomic_list_concat(L, '_', Word),
	handle_word_list(L, S, M0, M),
    (
	M > M0 + 1
    ->
        format1('constituent(~w, w, ~w, ~w).~n', [S, M0, M]),
        assert(constituent(S, w, M0, M))
    ;
        assert(lemma(S, Lemma, M0, M))
    )
    ;
	M is M0
    ).
element_to_const(Cat, _As, Cs, S0, S, N0, N) :-
	!,
	xml_to_const(Cs, S0, S, N0, N),
    (
	N > N0 + 1
    ->
        format1('constituent(~w, ~k, ~w, ~w).~n', [S0, Cat, N0, N]),
        assert(constituent(S0, Cat, N0, N))
     ;
        true
    ).

% simplify_words(+ListOfWords, -SimplifiedListOfWords)
%
% specify various rewrites of the different multi-words/complex words, ie. these rules
% are applied to a list of "subwords" of a "w" constituent.

simplify_words(List0, List) :-
	append(Prefix, ['Gmb.H'], List0),
	append(Prefix, ['GmbH'], List),
	!.
simplify_words(List0, List) :-
	append(Prefix, ['Gmb.','H'], List0),
	append(Prefix, ['GmbH'], List),
	!.

simplify_words(['Puma','A.G.Rudolf', 'Dassler', 'Sport'],
	       ['Puma','A.', 'G.', 'Rudolf', 'Dassler', 'Sport']) :-
	!.
simplify_words(['R','19'], ['R','19']) :-
	xml_files('flmf7aj1ep.indent.xml'),
	!.
simplify_words(['R','19'], ['R.19']) :-
	!.
simplify_words(['presqu\'','île'], ['presqu\'île']) :-
	!.
simplify_words([X,Y,'aujourd\'',hui], [X,Y,'aujourd\'hui']) :-
	!.
simplify_words([X,'aujourd\'',hui], [X,'aujourd\'hui']) :-
	!.
simplify_words([X,Y,'d\'',oeuvre], [X,Y,'d\'oeuvre']) :-
	!.
simplify_words(['Côte',-,'d\'',ivoire], ['Côte-d\'ivoire']) :-
	!.
simplify_words(['côte',-,'d\'',ivoire], ['côte-d\'ivoire']) :-
	!.
simplify_words(['Monde',-,'l\'','économie'], ['Monde-l\'économie']) :-
	!.
simplify_words([X,s], [Y]) :-
	atom_chars(X, List),
	last(List, '\''),
	!,
	atom_concat(X, s, Y).
simplify_words([no,'man\'',s,land], [no,'man\'s',land]) :-
	!.
simplify_words(['Rubik\'',s,cube], ['Rubik\'s',cube]) :-
	!.
simplify_words(['Who\'',s,next], ['Who\'s',next]) :-
	!.
simplify_words([X,-,ci], [Atom]) :-
	!,
	atomic_list_concat([X,-,ci], Atom).
simplify_words([pub,-,info], ['pub-info']) :-
	!.
% simplify_words([ville,-,campagne], ['ville-campagne']) :-
% 	!.
simplify_words([demi,-,X], ['demi-', X]) :-
	!.
simplify_words([mi,-,X], ['mi-', X]) :-
	!.
simplify_words(['FR', '3'], ['FR3']) :-
	!.
simplify_words(['Demak\'', up], ['Demak\'up']) :-
	!.
simplify_words(['FR', '3', X], ['FR3', X]) :-
	!.
simplify_words(['Antenne', 2, -, 'FR', 3], ['Antenne2-FR3']) :-
	!.
simplify_words(['Antenne', '2', -, 'FR', '3'], ['Antenne2-FR3']) :-
	!.
simplify_words([X], [Y,h,Z]) :-
	atomic_list_concat([Y0,Z0], h, X),
	atom_number(Y0, Y),
	atom_number(Z0, Z),
	!.
% simplify_words(['TF', 1], ['TF1']) :-
% 	!.
% simplify_words(['TF', '1'], ['TF1']) :-
% 	!.
simplify_words(Ws, Ws).


collect_words([]) -->
	[].
collect_words([W|Ws]) -->
	!,
	collect_word(W),
	collect_words(Ws).

collect_word(element(w,_,Ws)) -->
	!,
	collect_words(Ws).
collect_word(W) -->
	[W].

smart_concat_atoms(['arrière-', 'petits-enfants'], 'arrière-petits-enfants') :-
	!.
smart_concat_atoms(['aujourd\'',hui], 'aujourd\'hui') :-
	!.
smart_concat_atoms(['quelqu\'',un], 'quelqu\'un') :-
	!.
smart_concat_atoms(['Aujourd\'',hui], 'Aujourd\'hui') :-
	!.
smart_concat_atoms(Cs, Atom) :-
    (
        append(As, [-|Bs], Cs)
     ->
        smart_concat_atoms(As, AAtom),
        smart_concat_atoms(Bs, BAtom),
        concat_atom([AAtom,-,BAtom], Atom)
    ;
	append(As, [','|Bs], Cs)
    ->
        smart_concat_atoms(As, AAtom),
        smart_concat_atoms(Bs, BAtom),
        concat_atom([AAtom,',',BAtom], Atom)
     ;
        all_digits(Cs)
    ->
        concat_atom(Cs, '.', Atom)
    ;
	concat_atom(Cs, '_', Atom)
     ).
     

get_lemma(List, Lemma) :-
	member(lemma=Lemma, List),
	!.
get_lemma(_, '???').

all_digits([]).
all_digits([D|Ds]) :-
	integer(D),
	!,
	all_digits(Ds).
all_digits([D|Ds]) :-
	/* atom_number/2 fails silently if D is not a number */
	atom_number(D, _),
	all_digits(Ds).

% = filter_list(+ListA, ?ListB)
%
% true if ListA and ListB have the same elements except for the ' ' which
% are removed from ListB.

filter_list([], []).
filter_list([X|Xs], Ys0) :-
    (
	X == ' '
    ->
	filter_list(Xs, Ys0)
    ;
	Ys0 = [X|Ys],
	filter_list(Xs, Ys)
    ).

% = member1(?Element, +List)
%
% true if List contains Element. Like the library predicate member/2, but
% succeeds at most once. Unlike member_chk/2 unification is used instead of
% strict identity.

member1(X, [X|_]) :-
	!.
member1(X, [_|Ys]) :-
	member1(X, Ys).

cp(N0, N) :-
        compute_penalties1(N0),
    (
        N0 >= N
    ->
        true
    ;
        N1 is N0 + 1,
        cp(N1, N)
    ).

cp(List) :-
	abolish(crosses/4),
	retractall(crosses(_,_,_,_)),
	compute_penalties(List),
	retractall(crosses(_,_,_,0)).

% = compute_penalties
%
% compute penalties (as crosses/4 declarations) for all sentences

compute_penalties :-
	abolish(crosses/4),
	retractall(crosses(_,_,_,_)),
	/* recover list of sentence numbers */
	setof(X, A^B^C^constituent(X,A,B,C), Sentences),
	compute_penalties(Sentences),
	/* erase zero entries */
	retractall(crosses(_,_,_,0)).

add_constituents :-
	setof(X, A^B^C^constituent(X,A,B,C), Sentences),
	add_constituents(Sentences),

add_constituents([]).
add_constituents([S|Ss]) :-
	add_constituents1(S),
	add_constituents(Ss).

add_constituents1(S) :-
	findall(t(Cat,L,R), constituent(S, Cat, L, R), Triples),
	add_constituents2(Triples, S).

add_constituents2([], _).
add_constituents2([t(Cat, L, R)|Rest], S) :-
	add_constituents2(S, Cat, L, R),
	add_constituents2(Rest, S).

% check if there is an interpunction symbol to the left of the current constituent,
% if so add a new constituent which includes this interpunction symbol (and do so
% recursively, adding multiple interpunction symbols if necessary).

add_constituents2(S, Cat, L, R) :-
   (
	word(S, W, L0, L),
        is_interpunction(W)
   ->
        format1('constituent(~w, ~w, ~w, ~w).~n', [S,Cat,L0,R]),
        retractall(constituent(S, Cat, L, R)),
        assert(constituent(S, Cat, L0, R)),
        add_constituents2(S, Cat, L0, R)
   ;
        true
   ).

is_interpunction('...').
is_interpunction(',').
is_interpunction(':').
is_interpunction(';').
is_interpunction('(').
is_interpunction(')').
is_interpunction('"').
is_interpunction('[').
is_interpunction(']').

compute_penalties([]).
compute_penalties([S|Ss]) :-
	format(user_error, '~n~w', [S]),
	compute_penalties1(S),
	compute_penalties(Ss).

compute_penalties1(S) :-
	compute_length(S, 0, Max),
	compute_penalties1(S, Max).

compute_penalties1(S, Max) :-
	initialize(0, Max, S),
	write(user_error, ':'),
	cross_comp(2, Max, S).

initialize(N0, N, S) :-
	NR is N0 + 2,
    (
        NR > N
    ->
        true
    ;
        findall(L,(constituent(S,_,L,R),aux_crosses(L,R,N0,NR)), Cs),
        length(Cs, Cr),
        assert(crosses(S, N0, NR, Cr)),
        N1 is N0 +1,
        initialize(N1, N, S)
    ).

% L -- N0 -- R -- NR

aux_crosses(L, R, N0, NR) :-
	L < N0,
	N0 < R,
	R < NR.
% N0 -- L -- NR -- R
aux_crosses(L, R, N0, NR) :-
	N0 < L,
	L < NR,
	NR < R.

cross_comp(D0, Max, S) :-
    (
        D0 >= Max
    ->
        true
    ;
	write(user_error, '.'),
        D is D0 + 1,
        compute_crosses(D, 0, Max, S),
        cross_comp(D, Max, S)
    ).

compute_crosses(D, N0, N, S) :-
	NR is N0 + D,
    (
        NR > N
    ->
        true
    ;
        NR0 is NR - 1,
        crosses(S, N0, NR0, Cr0),
        /* crossed at length D-1, but no longer crosses at length D */
        findall(., (constituent(S,_,L,NR),L > N0), List0),
        /* didn't cross at length D-1, but crosses at length D */
        findall(., constituent(S,_,NR0,_), List1),
        length(List0, C0),
        length(List1, C1),
        Cr is (Cr0 + C1) - C0,
        assert(crosses(S, N0, NR, Cr)),
        N1 is N0 +1,
        compute_crosses(D, N1, N, S)
     ). 

crosses1(S, L, R, C) :-
	crosses(S, L, R, C),
	!.
crosses1(_, _, _, 0).

compute_length(S, 0, Max) :-
	findall(R, constituent(S,_,_,R), Rights),
	compute_lengths1(Rights, 0, Max).

compute_lengths1([], M, M).
compute_lengths1([R|Rs], M0, M) :-
    (
        R > M0
    ->
        M1 = R
    ;
        M1 = M0
    ),
    compute_lengths1(Rs, M1, M).

% TODO: needs to be updated to handle implicit zeros correctly!

% = update_crosses(+Sentence, +Left, +Right)
%
% update the asserted crosses/4 predicate for Sentence, adding a
% constituent from position Left to Right.

update_crosses(Sent, X, Y) :-
	update_crosses(Sent, X, Y, 1).

% = update_crosses(+Sentence, +Left, +Right, +Plus)
%
% update the asserted crosses/4 predicate for Sentence, adding a
% constituent from position Left to Right, increasing the value
% of all constituent crossing this positions by Plus (eg. we
% can "erase" a pair of brackets with Plus = -1 or make "strong"
% brackets with Plus >= 2.

update_crosses(Sent, X, Y, Plus) :-
%	compute_length(Sent, 0, Length),
	findall(crosses(Sent,V,W,Z), crosses(Sent,V,W,Z), List0),
	update_crosses(List0, X, Y, Plus, List),
	retractall(crosses(Sent,_,_,_)),
	assert_crosses(List),
	listing(crosses(Sent,_,_,_)).

update_crosses([], _, _, _, []).
update_crosses([crosses(Sent, V, W, Cross0)|Rest0], X, Y, Plus, [crosses(Sent, V, W, Cross)|Rest]) :-
   ((	
	/* X < V < Y < W */
	X < V,
	V < Y,
	Y < W
   ;
	/* V < X < Y < W */
	V < W,
	X < W,
	W < Y
   )
   ->	
        Cross is Cross0 + Plus
   ;
       Cross = Cross0
   ),
       update_crosses(Rest0, X, Y, Plus, Rest).

assert_crosses([]).
assert_crosses([crosses(A,B,C,D)|Cs]) :-
   (
	D =:= 0
   ->
	true
   ;
	assert(crosses(A,B,C,D))
   ),
	assert_crosses(Cs).


assert_list([]).
assert_list([C|Cs]) :-
	assert(C),
	assert_list(Cs).


delete_position(Sent, Pos) :-
	findall(crosses(Sent,V,W,Z), crosses(Sent,V,W,Z), List0),
	delete_position(List0, Pos, List),
	retractall(crosses(Sent,_,_,_)),
	assert_list(List),
	listing(crosses(Sent,_,_,_)).


verify_sentences :-
	verify_sentences(log).
verify_sentences(Log) :-
    ( exists_file(Log) -> delete_file(Log) ; true),
	tell(log),
	findall(Num, clause(sent(Num,_),_), List),
	verify_sentences_list(List),
	told,
	format(user_error, '~NDone!~nLog output to file ~w~n', [Log]).

verify_sentences_list([]).
verify_sentences_list([N|Ns]) :-
	verify_sentence(N),
	verify_sentences_list(Ns).

verify_sentence(Num) :-
	clause(sent(Num,_), prob_parse(List,_)),
	verify_sentence(List, 0, Num, '.').

verify_sentence([], N0, Num, Status) :-
   (	
	word(Num, Word2, N0, N)
     ->
	format('~N!!!~d: Untreated word ~w~n', [Num,Word2]),
	verify_sentence([], N, Num, '*')
   ;
	write(user_error, Status)
   ).
verify_sentence([si(Word1,_,_,_)|Rest], N0, Num, Status0) :-
   (	
	word(Num, Word2, N0, N)
     ->
   (
        Word1 = Word2
   ->
        Status = Status0
   ;
        atom_number(Word2, Word1)	 
   ->
	Status = Status0
   ;
        Status = '*',
	format('~N~d: Word mismatch ~w-~w~n', [Num,Word1,Word2])
   );	Status = '*',
	format('~N~d: Unmatched word ~w~n', [Num,Word1]),
	N is N0 + 1),
        verify_sentence(Rest, N, Num, Status).


verify_lemmas :-
	verify_lemmas(log).
verify_lemmas(Log) :-
    ( exists_file(Log) -> delete_file(Log) ; true),
	tell(log),
	findall(Num, clause(sent(Num,_),_), List),
	verify_lemmas_list(List),
	told,
	format(user_error, '~NDone!~nLog output to file ~w~n', [Log]).

verify_lemmas_list([]).
verify_lemmas_list([N|Ns]) :-
	verify_lemma(N),
	verify_lemmas_list(Ns).

verify_lemma(Num) :-
	clause(sent(Num,_), prob_parse(List,_)),
	verify_lemma(List, 0, Num, '.').

verify_lemma([], _, _, Status) :-
	write(user_error, Status).
verify_lemma([si(_,_,Lemma1,_)|Rest], N0, Num, Status0) :-
   (	
	lemma(Num, Lemma2, N0, N)
     ->
   (
        same_lemma(Lemma1, Lemma2)
   ->
        Status = Status0
   ;
        atom_number(Lemma2, Lemma1)	 
   ->
	Status = Status0
   ;
        Status = '*',
	format('~N~d: Lemma mismatch ~w-~w~n', [Num,Lemma1,Lemma2])
   );	Status = '.',
	format('~N~d: Unmatched lemma ~w~n', [Num,Lemma1]),
	N is N0 + 1),
        verify_lemma(Rest, N, Num, Status).

same_lemma(Lemma, Lemma) :-
	!.
same_lemma('l\'', le) :-
	!.
same_lemma(la, le) :-
	!.
same_lemma(les, le) :-
	!.
same_lemma(des, de) :-
	!.
same_lemma(du, de) :-
	!.
same_lemma(notre, mon) :-
	!.
same_lemma(leur, son) :-
	!.
same_lemma('France', 'FRANCE') :-
	!.


format1(X, Y) :-
	verbose(true),
	!,
	format(X, Y).
format1(_, _).
Browse the archive

https://github.com/RichardMoot/GrailLight