https://gitlab.com/hatiers/lstclean.git
Tip revision: c4fe82768078eb5a65a80880c9d1de4c70ca54d7 authored by arnaudbey on 28 May 2020, 08:37:31 UTC
fix header + apply templates in admin pages
fix header + apply templates in admin pages
Tip revision: c4fe827
xml-annote-ids.pl
use Encode;
use IO::Handle;
STDOUT->autoflush();
use strict;
use utf8;
use Data::Dumper;
use Text::Levenshtein qw(distance);
use Data::Dumper;
use experimental 'smartmatch';
use locale;
######################################### Parameters #########################################
my $repIn  = "xml/in";     # directory scanned for *.xml input files
my $repOut = "xml/out";    # directory receiving the annotated copies

# Build the acceptionUniq identifier for one <LST> element.
#
#   $lemma - value of the LEMMA attribute
#   $cat   - value of the CAT attribute
#
# When the lemma ends in a run of underscores, uppercase letters or digits
# (and has at least one character before that run), the category is inserted
# between the lemma stem and that suffix: stem . "_" . cat . suffix.
# Otherwise the identifier is simply lemma . "_" . cat.
sub build_acception_uniq {
    my ($lemma, $cat) = @_;

    if ($lemma =~ /^(.+?)([_A-Z\d]+)$/) {
        return $1 . "_" . $cat . $2;
    }
    return $lemma . "_" . $cat;
}

# Add an acceptionUniq attribute to every opening tag of the form
# <LST CAT="..." ... LEMMA="..." ...> found in $xml.
#
# Prints one "EMPAN" line (the whole tag) and one "SUB" line (the part of
# the tag following "<LST") on STDOUT for each rewritten tag.
#
# Returns a two-element list: the rewritten text and the number of tags
# that were rewritten.
sub annotate_lst_tags {
    my ($xml) = @_;
    my $count = 0;

    # A single s///ge pass is used on purpose: substituting into the string
    # from inside a "while (m//g)" loop resets the search position after
    # every change, which forces a rescan from the start for each tag.
    $xml =~ s{(<LST(\s+CAT="([^"]+)"[^<>]+?LEMMA="([^"]+?)"[^<>]*?>))}{
        # Copy the captures first: the helper below runs its own match.
        my ($empan, $inOpenTag, $cat, $lemma) = ($1, $2, $3, $4);
        my $acceptionUniq = build_acception_uniq($lemma, $cat);
        print "EMPAN $empan\n";
        print "SUB $inOpenTag\n";
        $count++;
        '<LST acceptionUniq="' . $acceptionUniq . '" ' . $inOpenTag;
    }sge;

    return ($xml, $count);
}

# Process every *.xml file of $repIn and write its annotated copy, under the
# same file name, into $repOut.
opendir(my $dirIn, $repIn)
    or die "Cannot open input directory '$repIn': $!\n";

while (defined(my $source = readdir($dirIn))) {
    next unless $source =~ /\.xml$/;

    my $pathIn  = $repIn . "/" . $source;
    my $pathOut = $repOut . "/" . $source;

    # Read the input before touching the output so that an unreadable
    # source never leaves an empty or truncated target behind.
    my $in;
    unless (open($in, "<:encoding(utf8)", $pathIn)) {
        warn "Cannot read '$pathIn': $!\n";
        next;
    }
    my $allXML = join("", <$in>);
    close($in);

    my ($annotated, $cpt) = annotate_lst_tags($allXML);

    # Both counters are reported for output compatibility; every matched
    # tag is also substituted, so the two figures are always equal.
    print "Lemma LST : $cpt \n";
    print "SUB LST : $cpt \n";

    my $out;
    unless (open($out, ">:encoding(utf8)", $pathOut)) {
        warn "Cannot write '$pathOut': $!\n";
        next;
    }
    print {$out} $annotated;
    # Buffered write errors only surface when the handle is closed.
    close($out) or warn "Error while closing '$pathOut': $!\n";
}
closedir($dirIn);
