use Encode; use IO::Handle; STDOUT->autoflush(); use strict; use utf8; use Data::Dumper; use Text::Levenshtein qw(distance); use Data::Dumper; use experimental 'smartmatch'; use locale; ######################################### Parameters ######################################### my $repIn = "xml/in"; my $repOut = "xml/out";################################## my $source; opendir(DIRIN,$repIn); while ($source = readdir(DIRIN)) { if($source=~/\.xml$/){ open( SOURCE, "<:encoding(utf8)", $repIn."/".$source ); open( CIBLE, ">:encoding(utf8)", $repOut."/".$source ); my $line; my $lemma; my $cat; my $acceptionUniq; my $allXML=join("",); # while ($line = ){ # # # } # if($allXML=~/]+LEMMA="([^"]+)"[^<>]*>)/sg){ my $cpt=0; my $cptSub=0; while($allXML=~/(]+?LEMMA="([^"]+?)"[^<>]*?>))/sg){ my $empan = $1; my $inOpenTag=$2; my $lemma=$4; my $cat=$3; # print "TOKEN $5 \n"; my $acceptionUniq=""; if($lemma =~/^(.+?)([_A-Z\d]+)$/){ $acceptionUniq = $1."_".$cat.$2; # print "LemmaComp $acceptionUniq\n"; } else{ $acceptionUniq = $lemma."_".$cat; # print "LemmaSimple $acceptionUniq\n"; } # print "Passage $inOpenTag\n"; if($allXML=~/\Q$empan/s){ print "EMPAN $empan\n"; print "SUB $inOpenTag\n"; $cptSub++; $allXML=~s/\Q$empan/