# https://gitlab.com/hatiers/lstclean.git
# Tip revision: a74862fb741564a88fa4ad4651965c0362c50e30 authored by arnaudbey on 01 March 2023, 15:58:44 UTC
# WIP
# WIP
# Tip revision: a74862f
# xml-annote-ids.pl
use Encode;
use IO::Handle;
STDOUT->autoflush();
use strict;
use utf8;
use Data::Dumper;
use Text::Levenshtein qw(distance);
use Data::Dumper;
use experimental 'smartmatch';
use locale;
######################################### Parameters #########################################
my $repIn  = "in";     # directory scanned for *.xml input files
my $repOut = "out";    # directory that receives the annotated copies

# The file header only enables strict; turn warnings on for the code below.
use warnings;

# Opening <LST ...> tag whose first attribute is CAT and which carries a
# LEMMA attribute later in the same tag.  Tags that already start with
# another attribute (e.g. acceptionUniq) do not match.
my $lstTagRe = qr{
    (?<empan>
        <LST
        (?<inOpenTag>
            \s+ CAT="(?<cat>[^"]+)"
            [^<>]+?
            LEMMA="(?<lemma>[^"]+?)"
            [^<>]*?
            >
        )
    )
}xs;

# Main loop: for every *.xml file found in $repIn, insert an acceptionUniq
# attribute right after "<LST" in each matching opening tag and write the
# result under the same file name in $repOut.  Progress is printed on STDOUT.
opendir( my $dirIn, $repIn )
    or die "Cannot open input directory '$repIn': $!\n";

while ( defined( my $source = readdir($dirIn) ) ) {
    next unless $source =~ /\.xml$/;

    my $pathIn  = $repIn . "/" . $source;
    my $pathOut = $repOut . "/" . $source;

    # Slurp the whole document: the tags are rewritten on the full text.
    open( my $in, "<:encoding(utf8)", $pathIn )
        or die "Cannot read '$pathIn': $!\n";
    my $allXML = do { local $/; <$in> };
    $allXML = "" unless defined $allXML;
    close($in);

    my $cpt    = 0;    # number of <LST> tags found
    my $cptSub = 0;    # number of <LST> tags rewritten

    # One left-to-right pass.  The previous implementation edited the string
    # inside a while(m//g) loop, which resets the match position and rescans
    # the document from the start for every tag.
    $allXML =~ s{$lstTagRe}{
        my ( $empan, $inOpenTag, $cat, $lemma )
            = ( $+{empan}, $+{inOpenTag}, $+{cat}, $+{lemma} );
        print "EMPAN $empan\n";
        print "SUB $inOpenTag\n";
        # Every matched tag is rewritten, so both counters advance together;
        # both are kept so the printed report keeps its two lines.
        $cptSub++;
        $cpt++;
        '<LST acceptionUniq="' . acception_uniq_for( $lemma, $cat ) . '" ' . $inOpenTag;
    }ge;

    print "Lemma LST : $cpt \n";
    print "SUB LST : $cptSub \n";

    open( my $out, ">:encoding(utf8)", $pathOut )
        or die "Cannot write '$pathOut': $!\n";
    print {$out} $allXML
        or die "Write to '$pathOut' failed: $!\n";
    # Buffered write errors only surface at close time.
    close($out)
        or die "Cannot close '$pathOut': $!\n";
}
closedir($dirIn);

# Build the acceptionUniq value for one tag.
#   $lemma - value of the LEMMA attribute
#   $cat   - value of the CAT attribute
# When the lemma ends with a run of underscores, digits or capital letters
# (after at least one leading character), the category is inserted between
# the stem and that suffix: "stem_2" + "N" gives "stem_N_2".  Otherwise the
# category is simply appended: "stem" + "N" gives "stem_N".
sub acception_uniq_for {
    my ( $lemma, $cat ) = @_;
    if ( $lemma =~ /^(.+?)([_A-Z\d]+)$/ ) {
        return $1 . "_" . $cat . $2;
    }
    return $lemma . "_" . $cat;
}
