#!/bin/bash
# Adapt model
set -e
SCRIPT_FOLDER="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
cd "$SCRIPT_FOLDER"
KALDI_DIR="$(realpath kaldi)"
KENLM_DIR="$(realpath kenlm)"
export PATH="$KALDI_DIR:$KENLM_DIR:$PATH"
echo "---------------------------LM_ADAPT-------------------------------"
MODEL_LANG=
MODEL_FORMAT=
NGRAMS=
SKIP_PRUNING=0
USE_DISCOUNT_FALLBACK=0
IGNORE_OOV_WORDS=0
OPTIMIZE_LM=1
OPTIMIZE_LEXICON=0
GENERATE_WORDS=0
SKIP_CONFIRM=0
G2P_TOOL=
function usage {
echo "Usage: $0 -l [LANG] -f [FORMAT] [OPTIONS]:"
echo "-l LANG language code (required), e.g. '-l en'"
echo "-f FORMAT language model format ('ngram' or 'jsgf')"
echo "-n NGRAM N-gram count for non-JSGF model"
echo "-p G2P_TOOL 'sequitur' or 'phonetisaurus' for G2P"
echo "-i ignore OOV words (will disable -g)"
echo "-g generate OOV words with G2P"
echo "-u N-gram setting: skip pruning"
echo "-d N-gram setting: use 'discount_fallback'"
echo "-r use raw sentences (don't process)"
echo "-o optimize lexicon and remove unused words"
echo "-y skip confirmation"
echo "-h show this help"
echo ""
}
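# Illustrative call (option values are examples; the referenced corpus/dictionary files must exist):
#   ./3-adapt.sh -l en -f ngram -n 4 -g -p phonetisaurus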
while getopts igorl:f:n:p:udyh? opt; do
case $opt in
i) IGNORE_OOV_WORDS=1;;
g) GENERATE_WORDS=1;;
r) OPTIMIZE_LM=0;;
o) OPTIMIZE_LEXICON=1;;
l) MODEL_LANG="$OPTARG";;
f) MODEL_FORMAT="$OPTARG";;
n) NGRAMS="$OPTARG";;
u) SKIP_PRUNING=1;;
d) USE_DISCOUNT_FALLBACK=1;;
p) G2P_TOOL="$OPTARG";;
y) SKIP_CONFIRM=1;;
?|h) usage; exit 2;;
esac
done
if [ -n "$MODEL_LANG" ]; then
echo "Model language code used: $MODEL_LANG"
else
echo "Missing language argument '-l'. Use same code as for your model."
echo ""
usage
exit 1
fi
if [ -n "$MODEL_FORMAT" ]; then
echo "Model format used: $MODEL_FORMAT"
else
echo "Missing format argument '-f'. Using default: 'ngram'"
MODEL_FORMAT="ngram"
fi
if [ "$MODEL_FORMAT" = "ngram" ]; then
if [ -n "$NGRAMS" ]; then
echo "N-gram count: $NGRAMS"
else
echo "Missing N-gram argument '-n'. Using default: 4"
NGRAMS=4
fi
if [ $SKIP_PRUNING -eq 1 ]; then
echo "N-gram model will not be pruned"
fi
if [ $USE_DISCOUNT_FALLBACK -eq 1 ]; then
echo "N-gram model will use 'discount_fallback'"
fi
fi
if [ -n "$G2P_TOOL" ]; then
echo "G2P tool: $G2P_TOOL"
else
if [ $GENERATE_WORDS -eq 1 ]; then
echo "Missing G2P argument '-p'. Using default: 'phonetisaurus'"
G2P_TOOL="phonetisaurus"
fi
fi
MODEL="$(realpath model)"
MODEL_LEXICON="model/data/local/dict/lexicon.txt"
if [ "$MODEL_FORMAT" = "jsgf" ]; then
SENTENCES="lm_corpus/grammar_${MODEL_LANG}.jsgf"
else
SENTENCES="lm_corpus/sentences_${MODEL_LANG}.txt"
fi
MY_DICT="lm_dictionary/my_dict_${MODEL_LANG}.txt"
AUTO_DICT="lm_dictionary/auto_dict_${MODEL_LANG}.txt"
if [ "$G2P_TOOL" = "sequitur" ]; then
G2P_MODEL="sequitur/zamia_${MODEL_LANG}_4gram.g2p"
else
G2P_MODEL="phonetisaurus/zamia_g2p_${MODEL_LANG}.fst"
fi
echo "Working directory: $SCRIPT_FOLDER"
echo ""
if [ ! -f "$MODEL_LEXICON" ]; then
echo "Lexicon NOT found at: $MODEL_LEXICON"
exit 1
else
echo "Lexicon: $MODEL_LEXICON"
fi
if [ ! -f "$SENTENCES" ]; then
echo "LM sentences NOT found at: $SENTENCES"
exit 1
else
echo "LM sentences: $SENTENCES"
fi
if [ ! -f "$MY_DICT" ]; then
echo "Custom dictionary NOT found at: $MY_DICT"
echo "-> custom dict. ignored"
else
echo "Custom dictionary: $MY_DICT"
fi
if [ $IGNORE_OOV_WORDS -eq 1 ]; then
echo "Note: Missing vocabulary will be ignored during training of LM!"
OPTIMIZE_LEXICON=0
echo "Note: Disabled flag '-o' (required OOV check)."
else
if [ ! -f "$G2P_MODEL" ]; then
echo "G2P model NOT found at: $G2P_MODEL"
if [ $GENERATE_WORDS -eq 1 ]; then
exit 1
else
echo "-> new words will be stored instead"
fi
else
echo "G2P model for new words: $G2P_MODEL"
G2P_MODEL="$(realpath "$G2P_MODEL")"
fi
fi
if [ $OPTIMIZE_LM -eq 1 ]; then
echo "Note: Language model will be optimized before training (lower-case, sorted)."
fi
if [ $OPTIMIZE_LEXICON -eq 1 ]; then
echo "Note: Lexicon will be reduced to used (LM) vocab at the end (-o flag)."
fi
if [ $SKIP_CONFIRM -eq 0 ]; then
echo ""
read -p "Press any key to continue (CTRL+C to abort)."
fi
echo ""
# Functions
#optimize sentences
function optimize_sentences {
# TODO: need more optimizations ? (extend and use replace.py)
echo "Optimizing LM file (convert to lower-case sorted list) ..."
echo "LM file: $1"
awk '{ print tolower($0) }' "$1" | sort > "${1}.tmp"
rm "$1"
mv "${1}.tmp" "$1"
echo "Optimization done."
}
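# Illustrative effect: a line "TURN ON the Light" becomes "turn on the light",
# and the whole file is rewritten as a sorted list of lower-case lines.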
#merge 2 lists into target list by adding new and overwriting existing words
function merge_new_words_with_lex {
echo "Merging new words with original lexicon ..."
echo "New words: $1"
echo "Original: $2"
sort -u -t' ' -k1,1 "$1" "$2" | grep -a " " > "$3"
# NOTE: 'grep -a " "' has the effect of removing words that have no phonemes.
# This can also be used to deliberately clean up the lexicon.
echo "Result: $3"
}
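# Illustrative merge (made-up entries in "word phoneme phoneme ..." format):
#   a new entry "lightswitch l ay t s w ih ch" is added alongside the original entries,
#   and a word present in both files keeps the pronunciation from the new-words file.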
#remove phonemes from lexicon to get only words
function create_vocab_file {
echo "Creating 'vocab' file from lexicon ..."
cut -f 1 -d ' ' "$1" > "$2"
echo "Result: $2"
}
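# Illustrative effect: a lexicon line like "hello hh ah l ow" contributes just "hello" to the vocab file.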
#remove tag for pronunciation variants (1-9)
function remove_pronunciation_tag {
sed -i 's/^\(\S*\)_\([1-9]\)\s/\1 /' "$1"
}
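# Illustrative effect: "hello_2 hh ah l ow" becomes "hello hh ah l ow" (the variant keeps its phonemes).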
# Start
# Make sure paths are all absolute
MY_DICT="$(realpath "$MY_DICT")"
AUTO_DICT="$(realpath "$AUTO_DICT")"
MODEL_LEXICON="$(realpath "$MODEL_LEXICON")"
SENTENCES="$(realpath "$SENTENCES")"
# Make sure the auto-dict file exists ('touch -a' creates it if missing without changing existing content)
touch -a "$AUTO_DICT"
# Clean up old
if [ -d "./adapt" ]; then
echo "Cleaning up old 'adapt' folder ..."
rm -r "adapt"
echo ""
fi
mkdir -p adapt
# Copy language model and optimize
if [ "$MODEL_FORMAT" = "jsgf" ]; then
echo "Copying JSGF grammar to 'adapt/G.jsgf' ..."
cp "$SENTENCES" adapt/G.jsgf
echo "Extracting words from JSGF grammar to 'adapt/lm.txt' ..."
sed '/^#/ d' "$SENTENCES" > adapt/lm_tmp.txt
python3 replace.py -l -f adapt/lm_tmp.txt -o adapt/lm_tmp.txt
#sed -i "s/[^a-zA-Z0-9\xC0-\xFF.<>'_-]/ /g" adapt/lm_tmp.txt # replaced because of unicode mess!!
sed -i "s/<[^>]*>/ /g" adapt/lm_tmp.txt
grep -oE "\S+" adapt/lm_tmp.txt | sort | uniq > adapt/lm.txt
rm adapt/lm_tmp.txt
else
echo "Copying LM sentences to 'adapt/lm.txt' ..."
cp "$SENTENCES" adapt/lm.txt
fi
echo ""
#convert lm.txt to sorted lower-case list etc
if [ $OPTIMIZE_LM -eq 1 ]; then
optimize_sentences "$(realpath adapt/lm.txt)"
echo ""
fi
# OOV loop
max_loops=2
cd adapt
while true; do
if ((max_loops <= 0)); then
break
else
((max_loops--))
fi
# Optionally merge automatically created dictionary (if not empty)
if [ -s "$AUTO_DICT" ]; then
merge_new_words_with_lex "$AUTO_DICT" "$MODEL_LEXICON" "$(realpath lexicon.txt)"
# ... and replace old one
rm "$MODEL_LEXICON"
cp "lexicon.txt" "$MODEL_LEXICON"
echo "Replaced original lexicon"
echo ""
fi
# Optionally merge custom dictionary (if not empty and not already merged)
if [ -s "$MY_DICT" ] && ((max_loops == 1)); then
merge_new_words_with_lex "$MY_DICT" "$MODEL_LEXICON" "$(realpath lexicon.txt)"
# ... and replace old one
rm "$MODEL_LEXICON"
cp "lexicon.txt" "$MODEL_LEXICON"
echo "Replaced original lexicon"
echo ""
fi
# Create vocab.txt from lexicon
create_vocab_file "$MODEL_LEXICON" "$(realpath vocab.txt)"
echo ""
# test out-of-vocabulary words:
if [ $IGNORE_OOV_WORDS -eq 0 ]; then
echo "Searching for out-of-vocab (OOV) words in custom LM ..."
# get unique LM vocab as file (NOTE: split at non-whitespace to avoid unicode mess):
grep -oE "\S+" lm.txt | sort | uniq > lm_vocab.txt
# compare to vocab.txt (requires sorted lists!):
comm -23 lm_vocab.txt vocab.txt > lm_oov_words.txt
# same for unsorted (but slower)? grep -Fxv -f first-file.txt second-file.txt
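# Illustrative outcome: a word that occurs in lm.txt but is missing from the lexicon
# (e.g. a made-up "lightswitch") ends up in lm_oov_words.txt.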
if [ -s lm_oov_words.txt ]; then
if [ $GENERATE_WORDS -eq 1 ]; then
if ((max_loops >= 1)); then
echo "Creating phonemes for new words ..."
#NOTE: in theory the lex should support variants (word, word_1, word_2), use it?
if [ "$G2P_TOOL" = "sequitur" ]; then
g2p.py --model "$G2P_MODEL" --encoding="utf-8" --apply="lm_oov_words.txt" > "$AUTO_DICT"
# replace tab with space
sed -i $'s/\t/ /' "$AUTO_DICT"
else
cat "lm_oov_words.txt" | xargs phonetisaurus predict --model "$G2P_MODEL" > "$AUTO_DICT"
fi
echo "Done - Please consider manually checking the new words to optimize pronunciations!"
echo "New words file: $AUTO_DICT"
echo ""
echo "Restarting OOV checks ..."
echo ""
else
echo "--- Still found out-of-vocab words (will be ignored):"
cat lm_oov_words.txt
echo ""
break
fi
else
echo "--- Found oov words:"
cat lm_oov_words.txt
echo ""
echo "--- OOV check failed! Check output for out-of-vocab words and try again."
echo "See: $(realpath lm_oov_words.txt)"
echo "Note: You can skip this check to ignore OOV words using the '-i' argument"
echo "or use the G2P tool via '-g' to generate pronunciations automatically."
max_loops=0
exit 1
fi
else
echo "No (more) out-of-vocab words found :-)."
break
fi
else
break
fi
done
echo ""
# Clean up and optionally reduce lexicon
rm "vocab.txt"
#fix pronunciation tags
remove_pronunciation_tag "lexicon.txt"
#reduce size?
if [ $OPTIMIZE_LEXICON -eq 1 ]; then
if [ -f "lm_vocab.txt" ]; then
echo "Reducing lexicon to (LM) vocabulary ..."
if [ -f "lexicon_full.txt" ]; then
rm "lexicon_full.txt"
fi
mv "lexicon.txt" "lexicon_full.txt"
python3 ../filter_lex.py --lex "lexicon_full.txt" --vocab "lm_vocab.txt" --include-variants --out "lexicon.txt"
else
echo "ERROR: Missing 'lm_vocab.txt' for lexicon optimization!"
echo "Please don't use '-i' flag together with '-o'."
exit 1
fi
fi
#replace lexicon in model folder (one last time ^^)
rm "$MODEL_LEXICON"
cp "lexicon.txt" "$MODEL_LEXICON"
# Create LM (JSGF or ARPA)
if [ "$MODEL_FORMAT" = "jsgf" ]; then
# We already have the file, just define the name
LM_NAME="G.jsgf"
else
# Generate ARPA LM
# Ken's info page: https://kheafield.com/code/kenlm/estimation/
LM_NAME="lm.arpa"
N_GRAM=$NGRAMS
if (( $N_GRAM < 3 )); then
PRUNE="--prune 0"
elif (( $N_GRAM == 3 )); then
PRUNE="--prune 0 0 1"
elif (( $N_GRAM >= 4 )); then
PRUNE="--prune 0 0 1 2"
fi
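# NOTE (per the KenLM docs linked above): '--prune' takes one threshold per n-gram order,
# e.g. '--prune 0 0 1' keeps all unigrams/bigrams and drops trigrams that appear only once.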
if [ $SKIP_PRUNING -eq 1 ]; then
PRUNE="--prune 0"
fi
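# NOTE: with very small corpora lmplz may fail to estimate Kneser-Ney discounts and abort;
# '--discount_fallback' (the -d flag) tells it to fall back to default discounts instead.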
if [ $USE_DISCOUNT_FALLBACK -eq 1 ]; then
echo "Creating ARPA ${N_GRAM}-gram LM ($PRUNE) with 'discount_fallback' ..."
lmplz -S 50% --text lm.txt --arpa $LM_NAME --order $N_GRAM $PRUNE --discount_fallback
#TODO: remove whenever possible (when LM is large enough): --discount_fallback
else
echo "Creating ARPA ${N_GRAM}-gram LM ($PRUNE) ..."
lmplz -S 50% --text lm.txt --arpa $LM_NAME --order $N_GRAM $PRUNE
fi
#echo "PLEASE NOTE: '--limit_vocab_file' has been replaced by pre-training vocabulary check!"
fi
echo "LM READY"
echo ""
# Run adaptation of existing model
cd ..
LM_PATH="$(realpath adapt/$LM_NAME)"
MODEL_OUT="new-$MODEL_LANG"
echo "Starting model adaptation ..."
python3 -m adapt -f -k "${KALDI_DIR}" "${MODEL}" "${LM_PATH}" "${MODEL_OUT}"
# TODO: improve error handling
echo ""
if [ ! -f "work/exp/adapt/graph/HCLG.fst" ]; then
echo "Missing essential file: $(realpath work/exp/adapt/graph/HCLG.fst)"
echo "Please check log above for errors"
exit 1
else
echo "New LM: $LM_PATH"
if [ $GENERATE_WORDS -eq 1 ]; then
echo "Auto-generated OOV words (plz check): $AUTO_DICT"
fi
echo "New ASR model stored at: $(realpath work)"
echo ""
echo "You can use '4a-build-vosk-model.sh [optional_path]' to convert it to Vosk format."
fi
echo ""