# Source: https://hal.archives-ouvertes.fr/hal-02963802
# Raw File
# Tip revision: 6b9bf3964385d0c16d262796d9e4a3a30a52dafd authored by Software Heritage on 12 October 2020, 00:00:00 UTC
# hal: Deposit 1045 in collection hal
# Tip revision: 6b9bf39
# langFromDictCreator.sh

#| \
# Require exactly three positional arguments: the JSGF grammar, the source
# dictionary directory and the output directory.
if [ "$#" -ne 3 ]; then
    # The script name is derived with a parameter expansion instead of an
    # unquoted `basename $0`, so a script path containing spaces is handled.
    # The usage text is a diagnostic, hence it goes to stderr.
    echo "Usage: ${0##*/} <jsgf> <dict> <outdir>" >&2
    exit 1
fi

# sphinx_jsgf2fsg is needed further down to convert the grammar; stop early
# with an installation hint when it cannot be found on PATH.
if ! command -v sphinx_jsgf2fsg >/dev/null 2>&1; then
    echo "Missing sphinx_jsgf2fsg, run: apt-get install sphinxbase-utils" >&2
    exit 1
fi

# Positional arguments.
jsgf_orig=$1   # JSGF grammar given by the caller; only a copy of it is edited
dict_dir=$2    # source dictionary directory; its lexicon.txt is read later on
outdir=$3      # directory that receives the generated files

# Refuse unusable inputs before any scratch state is created. Checking the
# dictionary directory also prevents the glob below from turning into "/*"
# when <dict> is an empty string.
if [ ! -r "$jsgf_orig" ]; then
    echo "JSGF file is not readable: $jsgf_orig" >&2
    exit 1
fi
if [ ! -d "$dict_dir" ]; then
    echo "Dictionary directory not found: $dict_dir" >&2
    exit 1
fi

# Scratch locations: an editable copy of the grammar, a working copy of the
# dictionary, and the two directories later handed to utils/prepare_lang.sh.
jsgf=$(mktemp) || exit 1
tmp_dict=$(mktemp -d) || exit 1
tmp_lang=$(mktemp -d) || exit 1
lang_dir=$(mktemp -d) || exit 1

cp -r -- "$dict_dir"/* "$tmp_dict"
cp -- "$jsgf_orig" "$jsgf"

# lexicon.txt is regenerated further down, so the copied one is discarded.
# lexiconp.txt is removed as well, presumably so that it cannot take the place
# of the regenerated lexicon (TODO confirm). -f keeps a dictionary that lacks
# one of these files from producing an error.
rm -f -- "$tmp_dict/lexiconp.txt" "$tmp_dict/lexicon.txt"

# Make JSGF acceptable for sphinx_jsgf2fsg: strip every leading "public " and
# then mark only the <MAIN> rule as public. Both substitutions are applied in
# this order to each line, in a single in-place pass over the working copy.
sed -i -e 's/^public //' -e 's/^<MAIN>/public <MAIN>/' "$jsgf"

# Base name (extension stripped) shared by the .fsm/.sym/.err output files.
# NOTE(review): the name is taken from the mktemp copy, not from the grammar
# passed by the caller; mktemp names usually look like "tmp.XXXXXXXXXX", so
# this presumably always yields "tmp". Confirm whether "$jsgf_orig" was
# intended before changing it, since other tooling may rely on these names.
filename=$(basename "$jsgf")
filename="${filename%.*}"

mkdir -p "$outdir" || exit 1

# Convert to FSG. The converter's stderr is kept next to the outputs, and a
# failure stops the script instead of letting later steps run on missing or
# truncated files.
if ! sphinx_jsgf2fsg -jsgf "$jsgf" -fsm "$outdir/${filename}.fsm" -symtab "$outdir/${filename}.sym" 2> "$outdir/${filename}.err"; then
    echo "sphinx_jsgf2fsg failed, see $outdir/${filename}.err" >&2
    exit 1
fi


# Build the working lexicon.
# ${tmp_dict:?} aborts instead of writing to "/lexicon.txt" should the scratch
# directory ever be unset.
lexicon_file="${tmp_dict:?tmp_dict is not set}/lexicon.txt"

# Seed the lexicon from the grammar's symbol table and the source lexicon.
# compareFile.py is not part of this file; with the "0 0 1" flags it presumably
# emits the source-lexicon entries of the words used by the grammar (confirm).
python compareFile.py "$outdir/${filename}.sym" "$dict_dir/lexicon.txt" 0 0 1 > "$lexicon_file"

# Additional lexicons: hand-written pronunciations appended after the seeded
# entries. The group is redirected once instead of reopening the file for
# every entry. Entry text is kept exactly as written, including irregular
# spacing.
# The original list contained the line "%  p u R s an" twice; it is emitted
# once here because a byte-identical repeated line adds no pronunciation.
# NOTE(review): Kaldi's validate_dict_dir.pl reportedly rejects repeated
# lexicon lines - confirm for the version in use.
{
    echo "</s> sil"
    echo "<UNK> xx"
    echo "<s> sil"
    echo "vesta v e s t a"
    echo "degrés d g R e"
    echo "degrés d g R e z"
    echo "degrés d swa g R e"
    echo "degrés d swa g R e z"
    echo "treize t R e z"
    echo "treize t R e z swa"
    echo "°c d g R e"
    echo "°c d swa g R e"
    echo "the z e"
    echo "6ter  s i t e R"
    echo "6ter  s i s t e R"
    echo "6ter  s i s swa t e R"
    echo "d17   d e d i s e t"
    echo "d8     d e H i t"
    echo "d8     d e H i t swa"
    echo "hd1   a S d e in"
    echo "l'     l "
    echo "l'équipe l e k i p"
    echo "l'équipe l e k i p swa"
    echo "équipe  e k i p"
    echo "équipe  e k i p swa"
    echo "nrj12  e n e R Z i d u z"
    echo "as-tu   a t y"
    echo "aurais-tu  o R e t y"
    echo "cinéma  s i n e m a"
    echo "j'    Z"
    echo "aimerais  e m R e"
    echo "aimerais  e m R e z"
    echo "aimerais  e m swa R e"
    echo "aimerais  e m swa R e z"
    echo "j'aimerais Z e m R e"
    echo "j'aimerais Z e m R e z"
    echo "j'aimerais Z e m swa R e"
    echo "j'aimerais Z e m swa R e z"
    echo "l'   l"
    echo "épisode  e p i z o d"
    echo "épisode  e p i z o d swa"
    echo "l'épisode  l e p i z o d"
    echo "l'épisode  l e p i z o d swa"
    echo "cinema s i n e m a"
    echo "m'  m"
    echo "enregistres  an R swa Z i s t R"
    echo "enregistres  an R swa Z i s t R swa"
    echo "enregistres  an R Z i s t R"
    echo "enregistres  an R Z i s t R swa"
    echo "m'enregistres  m an R swa Z i s t R"
    echo "m'enregistres  m an R swa Z i s t R swa"
    echo "m'enregistres  m an R Z i s t R"
    echo "m'enregistres  m an R Z i s t R swa"
    echo "peux-tu    p swa t e y"
    echo "qu'  k"
    echo "acteur  a k t swa R  "
    echo "qu'acteur  k a k t swa R  "
    echo "qu'est-ce   k e s"
    echo "trouve-moi  t R u v m w a"
    echo "y'a  j a"
    echo "est-ce  e s"
    echo "d'    d"
    echo "incendie  in s an d i"
    echo "d'incendie  d in s an d i"
    echo "transmetteurs  t R an s m e t swa R"
    echo "transmetteurs  t R an s m e t swa R z"
    echo "intrusions  in t R y z j on"
    echo "intrusions  in t R y z j on z"
    echo "d'intrusions d in t R y z j on"
    echo "d'intrusions d in t R y z j on z"
    echo "transmetteur  t R an s m e t swa R"
    echo "transmetteur  t R an s m e t swa R R"
    echo "mois-ci  m w a s i"
    echo "année  a n e"
    echo "l'année  l a n e"
    echo "j'ai   Z e "
    echo "ai-je    e Z"
    echo "ai-je    e Z swa"
    echo "eau   o"
    echo "d'eau   d o"
    echo "température  t an p e R a t y R"
    echo "température  t an p e R a t y R swa"
    echo "temperature  t an p e R a t y R"
    echo "temperature  t an p e R a t y R swa"
    echo "80  k a t R swa v in"
    echo "50  s in k an t"
    echo "20  v in"
    echo "20  v in t"
    echo "20  v in t swa"
    echo "%   p u R s"
    echo "%  p u R s an"
    echo "%  p u R s an t"
    echo "%  p u R s swa"
    echo "%  p u R s swa t"
    echo "%  p u R s t"
    echo "%  p u R s an z"
    echo "allen a l swa n"
    echo "allumés a l y m e"
    echo "allumés a l y m e z"
    echo "arte a R t"
    echo "arte a R t e"
    echo "arte a R t swa"
    echo "c'est s e"
    echo "c'est s e t"
    echo "chauffe-eau S o f o"
    echo "desperate d e s p e R a j t"
    echo "desperate d e s p e R a j t swa"
    echo "désactive d e z a k t i v"
    echo "désactive d e z a k t i v swa"
    echo "désactiver d e z a k t i v e"
    echo "désactiver d e z a k t i v e R"
    echo "détecteur d e t e k t swa R"
    echo "détecteurs d e t e k t swa R"
    echo "détecteurs d e t e k t swa R z"
    echo "enregistres an R Z i s t R"
    echo "enregistres an R Z i s t R swa"
    echo "enregistres an R swa Z i s t R"
    echo "enregistres an R swa Z i s t R swa"
    echo "gulli g y l i"
    echo "housewives a u s w a j f"
    echo "itele i t e l e"
    echo "jackson Z a k s o n"
    echo "jackson d Z a k s o n"
    echo "jt Z i t e"
    echo "lcp e l s e p e"
    echo "lee l i"
    echo "luminaire l y m i n e R"
    echo "luminaire l y m i n e R swa"
    echo "luminaires l y m i n e R"
    echo "luminaires l y m i n e R swa"
    echo "luminaires l y m i n e R swa z"
    echo "luminaires l y m i n e R z"
    echo "m6 e m s i s"
    echo "mute m y t"
    echo "mute m y t swa"
    echo "nt1 e n t e in"
    echo "qu'il k i l"
    echo "quentin k an t in"
    echo "recommandes R k o m an d"
    echo "recommandes R k o m an d swa"
    echo "recommandes R swa k o m an d"
    echo "recommandes R swa k o m an d swa"
    echo "regarderai R g a R d R e"
    echo "regarderai R g a R d swa R e"
    echo "regarderai R swa g a R d R e"
    echo "regarderai R swa g a R d swa R e"
    echo "rmc e R e m s e"
    echo "samuel s a m H e l"
    echo "samuel s a m y e l"
    echo "store s t o R"
    echo "store s t o R swa"
    echo "suggères s y Z e R"
    echo "suggères s y Z e R swa"
    echo "suggères s y g Z e R"
    echo "suggères s y g Z e R swa"
    echo "tarantino t a R an t i n o"
    echo "tmc t e e m s e"
    echo "tv t e v e"
    echo "télécommandes t e l e k o m an d"
    echo "télécommandes t e l e k o m an d swa"
    echo "télécommandes t e l e k o m an d swa z"
    echo "télécommandes t e l e k o m an d z"
    echo "voice v o j s"
    echo "voice v o j s swa"
    echo "voice v w a s"
    echo "volatiles v o l a t i l"
    echo "volatiles v o l a t i l swa"
    echo "volatiles v o l a t i l swa z"
    echo "volatiles v o l a t i l z"
    echo "w9 d u b l swa v e n swa f"
    echo "woody w u d i"
    echo "éteins e t in"
    echo "éteins e t in z"
    echo "éteintes e t in t"
    echo "éteintes e t in t swa"
    echo "éteintes e t in t swa z"
    echo "éteintes e t in t z"
    echo "éteints e t in"
    echo "éteints e t in z"
} >> "$lexicon_file"

# Second comparison of the grammar's symbol table, this time against the
# completed working lexicon. Going by the output file name, the "0 0 0" flags
# presumably list grammar words that still lack a pronunciation
# (compareFile.py is not shown here - confirm).
python compareFile.py "$outdir/${filename}.sym" "$tmp_dict/lexicon.txt" 0 0 0 > "$tmp_dict/missing_words.txt"

# Generate the lang directory from the working dictionary, with "<UNK>" as the
# out-of-vocabulary word. Stop on failure: the following steps read words.txt,
# L.fst and other files from "$lang_dir".
if ! utils/prepare_lang.sh --position-dependent-phones false "$tmp_dict" "<UNK>" "$tmp_lang" "$lang_dir"; then
    echo "utils/prepare_lang.sh failed" >&2
    exit 1
fi
# utils/prepare_lang.sh $tmp_dict "<UNK>" $tmp_lang $lang_dir

echo "Lang directory is:$lang_dir"
# Word symbol table used when compiling the grammar FST.
word_sym_file="$lang_dir/words.txt"


# Optimize the FSG
# Disabled earlier variant of the pipeline, kept for reference:
#fstcompile --arc_type=log --acceptor --isymbols=$outdir/${filename}.sym --keep_isymbols $outdir/${filename}.fsm | fstdeterminize | fstminimize | fstrmepsilon | fstprint | \
#   utils/remove_oovs.pl /dev/null | \
#   utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$word_sym_file \
#     --osymbols=$word_sym_file --keep_isymbols=false --keep_osymbols=false | \
#    fstrmepsilon > $outdir/G.fst

# Compile the textual FSM as an acceptor over the lang directory's word
# symbols, then determinize, minimize and remove epsilons; the result is
# written to G.fst in the output directory.
fstcompile --acceptor \
    --isymbols="$word_sym_file" --osymbols="$word_sym_file" \
    --keep_isymbols=false --keep_osymbols=false \
    "$outdir/${filename}.fsm" \
  | fstdeterminize | fstminimize | fstrmepsilon > "$outdir/G.fst"
echo  "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic "$outdir/G.fst"

# Export the lexicon FSTs and symbol tables produced in the lang directory.
cp -- "$lang_dir/L.fst" "$lang_dir/L_disambig.fst" "$lang_dir/phones.txt" "$lang_dir/words.txt" "$outdir/"

# -p so that re-running the script into an existing output directory does not
# fail on the already present phones/ subdirectory.
mkdir -p "$outdir/phones"
cp -- "$lang_dir/phones/silence.csl" "$lang_dir/phones/disambig.int" "$outdir/phones/"

# Keep the caller's unmodified grammar next to the generated files.
cp -- "$jsgf_orig" "$outdir"

## Checking that G.fst is determinizable.
#fstdeterminize $outdir/G.fst /dev/null || echo Error determinizing G.
#
## Checking that L_disambig.fst is determinizable.
#fstdeterminize $lang_dir/L_disambig.fst /dev/null || echo Error determinizing L.
#
#fsttablecompose $lang_dir/L_disambig.fst $outdir/G.fst | \
#   fstdeterminizestar >/dev/null || { echo Error determinizing LG; exit 1; }
# back to top