From ac97a14b8880debc7a1b2383a59b6597dc65bbcb Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Sat, 3 Feb 2024 00:16:45 +0100 Subject: [PATCH 01/45] start suggesting pre-commit to people :-) --- autogen.sh | 9 ++++++++- configure.ac | 7 +++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/autogen.sh b/autogen.sh index 385d4aa5..8d31b8ba 100755 --- a/autogen.sh +++ b/autogen.sh @@ -1,6 +1,13 @@ #!/bin/sh -echo "Initial automake setup of $(basename $(pwd))" +echo "Initial automake setup of $(basename "$(pwd)")" + # autoreconf should work for most platforms autoreconf -i -v + +if ! type pre-commit > /dev/null 2>&1 ; then + echo "we recommend use of pre-commit for automatic checks and fixes:" + echo " on mac: sudo brew install pre-commit" + echo " on many others: sudo python3 -m pip install pre-commit" +fi diff --git a/configure.ac b/configure.ac index 2d22aafa..7a330dc5 100644 --- a/configure.ac +++ b/configure.ac @@ -59,6 +59,8 @@ AS_IF([test x$UCONV = xfalse], on macbrew: brew install icu4c (and follow instructions: i.e. set your PATH if necessary) ])]) +AC_PATH_PROG([PRECOMMIT], [pre-commit], [false]) + AC_CONFIG_FILES([Makefile \ $PACKAGE.pc \ @@ -77,4 +79,9 @@ cat< Date: Sat, 3 Feb 2024 19:25:50 +0100 Subject: [PATCH 02/45] typo --- am-shared/src_alt_orth-include.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/am-shared/src_alt_orth-include.am b/am-shared/src_alt_orth-include.am index dfec027d..4efbef27 100644 --- a/am-shared/src_alt_orth-include.am +++ b/am-shared/src_alt_orth-include.am @@ -459,7 +459,7 @@ $(foreach alt_orth,$(ALT_ORTHS),$(eval $(call alt_orth_norm_generators,$(alt_ort # morpher is a morph segmenting variant: taloautoissani -> talo#auto>i>ssa>ni define alt_orth_morphers -.generated/morpher-gt-desc.$(1).tmp.%: a.generated/nalyser-raw-gt-desc.% \ +.generated/morpher-gt-desc.$(1).tmp.%: .generated/analyser-raw-gt-desc.% \ orthography/inituppercase.compose.% \ orthography/spellrelax.compose.% \ filters/remove-hyphenation-marks.% \ From e426fc18af3e6209595d1f7b28db34e5b8858eed Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Mon, 12 Feb 2024 21:49:00 +0100 Subject: [PATCH 03/45] more tags from real data --- scripts/unimorph/convert.py | 29 ++++++++++++++++++++++++++++- scripts/unimorph/excluded.tags | 34 +++++++++++++++++++++++----------- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/scripts/unimorph/convert.py b/scripts/unimorph/convert.py index 9f518001..06001fa0 100755 --- a/scripts/unimorph/convert.py +++ b/scripts/unimorph/convert.py @@ -371,6 +371,8 @@ def giella2unimorph(tags): continue elif giella == 'Largo': continue + elif giella == 'ABBR-': + continue elif giella == 'ABBR': continue elif giella == 'ACR': @@ -403,6 +405,8 @@ def giella2unimorph(tags): continue elif giella == 'S': continue + elif giella == 'Quote': + continue elif giella == 'CLB': continue elif giella == 'Prop': @@ -412,10 +416,16 @@ def giella2unimorph(tags): continue # ? 
elif giella == 'Use/Rus': unimorphtags += ['DIAL'] + elif giella == 'Use/Circ': + unimorphtags += ['XXXCIRC'] elif giella == 'Use/Dial': unimorphtags += ['DIAL'] + elif giella.startswith('Use/'): + unimorphtags += ['XXX' + giella[4:]] elif giella.startswith('Usage/'): unimorphtags += ['XXX' + giella[6:]] + elif giella == 'Guess': + unimorphtags += ['XXX'] elif giella == '??': unimorphtags += ['XXX'] elif giella == 'TODO': @@ -497,11 +507,15 @@ def giella2unimorph(tags): elif giella == 'Foc': unimorphtags += ['LGSPEC1/UnnamedFoc'] elif giella.startswith('Foc/'): + unimorphtags += ['LGSPEC1/' + giella[4:]] + elif giella == 'Clit': + unimorphtags += ['LGSPEC1/UnnamedClit'] + elif giella.startswith('Clit/'): unimorphtags += ['LGSPEC1/' + giella[5:]] elif giella == 'Clt': unimorphtags += ['LGSPEC2'] elif giella.startswith('Clt/'): - unimorphtags += ['LGSPEC2/' + giella[5:]] + unimorphtags += ['LGSPEC2/' + giella[4:]] elif giella.startswith('OLang/'): continue elif giella.startswith('Gram/'): @@ -571,6 +585,9 @@ def main(): help='Do not try to recase input and output when matching') a.add_argument('-t', '--threshold', metavar='THOLD', default=99, help='if coverage is less than THOLD exit with error') + a.add_argument('-I', '--include-specs', metavar='INCSPEC', + help='include INCSPEC in generated data', + action='append', choices=['lgspec', 'typo', 'xxx', 'dial']) options = a.parse_args() if not options.infile: options.infile = stdin @@ -585,6 +602,16 @@ def main(): skip_typo = True skip_xxx = True skip_dial = True + if options.include_specs: + for inclusive in options.include_specs: + if inclusive == 'lgspec': + skip_lgspec = False + elif inclusive == 'typo': + skip_typo = False + elif inclusive == 'xxx': + skip_xxx = False + elif inclusive == 'dial': + skip_dial = False for line in options.infile: fields = line.strip().split(':') if line.strip() == '': diff --git a/scripts/unimorph/excluded.tags b/scripts/unimorph/excluded.tags index 26f11bbf..2b114726 100644 --- a/scripts/unimorph/excluded.tags +++ b/scripts/unimorph/excluded.tags @@ -10,8 +10,8 @@ +Der +Der/AAdv +Der/adda -+Der/Adv +Der/adte ++Der/Adv +Der/ahtje +Der/alla +Der/allash @@ -20,9 +20,7 @@ +Der/Caus +Der/d +Der/dahtte -+Der/stahtte +Der/dalla -+Der/stalla +Der/Dimin +Der/eamoš +Der/easti @@ -33,51 +31,65 @@ +Der/ht +Der/huhtti +Der/huvva ++Der/im +Der/InchL ++Der/ist +Der/ja ++Der/kas +Der/keahtta +Der/l -+Der/lg -+Der/r +Der/laagasj +Der/laakan ++Der/lane +Der/las +Der/lasj -+Der/mas -+Der/oottyd -+Der/oollyd -+Der/stoovvyd +Der/lazh ++Der/lg ++Der/lik ++Der/line ++Der/m ++Der/mas +Der/mata +Der/meahttun +Der/muš +Der/musj +Der/MWN +Der/N2A ++Der/ne ++Der/nna +Der/NomAct +Der/NomAg +Der/OkshnOms ++Der/oollyd ++Der/oottyd +Der/OvOms +Der/PassL +Der/PassS +Der/Poss ++Der/r +Der/sazh +Der/st ++Der/stahtte ++Der/stalla +Der/stoollyd ++Der/stoovvyd +Der/stuvva +Der/t -+Der/tOin -+Der/tt +Der/tamatu ++Der/ti ++Der/tOin +Der/toovvyd ++Der/tt +Der/tu +Der/tud ++Der/us +Der/v +Der/viđá +Der/viđi +Der/vuota +Dim/ke +Dyn ++Guess +Err/DerSub +Err/MissingSpace +Err/Orth From eddfdd31a6e1ec31bf619fde12f2edd0bc655882 Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Mon, 12 Feb 2024 21:49:14 +0100 Subject: [PATCH 04/45] don't dep on *.lexc --- am-shared/docs-dir-include.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/am-shared/docs-dir-include.am b/am-shared/docs-dir-include.am index 05ccd802..6a029cf4 100644 --- a/am-shared/docs-dir-include.am +++ b/am-shared/docs-dir-include.am @@ -49,7 +49,7 @@ endif # Generate 
endpoint json file for shield.io lemma count badge. # Only to be stored in the gh-pages branch, ignored in main. -$(srcdir)/lemmacount.json: $(top_srcdir)/src/fst/morphology/stems/*.lexc +$(srcdir)/lemmacount.json: $(AM_V_GEN)$(GTCORE)/scripts/make-lemmacount.json.sh $(abs_top_srcdir) > $@ # Generate a maturity.json file as endpoint for the maturity badge. From 2238ec589c0d69a12805242857a2658885056835 Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Mon, 19 Feb 2024 17:29:13 +0100 Subject: [PATCH 05/45] methods to dump compound hit parade --- scripts/corpus-stats.bash | 4 ++-- scripts/freq-evals.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) mode change 100644 => 100755 scripts/corpus-stats.bash diff --git a/scripts/corpus-stats.bash b/scripts/corpus-stats.bash old mode 100644 new mode 100755 index 87883a11..f64e8cac --- a/scripts/corpus-stats.bash +++ b/scripts/corpus-stats.bash @@ -25,7 +25,7 @@ for ll in "$@" ; do printf "paragraphs tokens characters\n" wc "$ll$copyright.text" python "$GTHOME/scripts/freq-evals.py" -a "$ANALYSER" -i "$ll$copyright.freqs" \ - -m "$ll$copyright.missinglist" + -m "$ll$copyright.missinglist" -n "$ll$copyright.prodlist" done for gecs in goldstandard correct-no-gs ; do CORPUS="$GTLANGS/corpus-$ll/$gecs/converted/" @@ -44,7 +44,7 @@ for ll in "$@" ; do printf "paragraphs tokens characters\n" wc "$ll$gecs.text" python "$GTHOME/scripts/freq-evals.py" -a "$ANALYSER" -i "$ll$gecs.freqs" \ - -m "$ll$gecs.missinglist" + -m "$ll$copyright.missinglist" -n "$ll$copyright.prodlist" done done diff --git a/scripts/freq-evals.py b/scripts/freq-evals.py index 447e6dcf..18dd75f9 100755 --- a/scripts/freq-evals.py +++ b/scripts/freq-evals.py @@ -40,6 +40,9 @@ def main(): a.add_argument("-m", "--missing", metavar="MISSFILE", type=FileType("w"), dest="missfile", help="write missing list to MISSFILE") + a.add_argument("-n", "--near-misses", metavar="NMFILE", + type=FileType("w"), + dest="nearmissfile", help="write deriv comp only to NMFILE") a.add_argument("-o", "--output", metavar="OUTFILE", type=FileType("w"), dest="outfile", help="write output to OUTFILE") @@ -121,6 +124,22 @@ def main(): if analyses: covered += freq types_covered += 1 + all_comps = True + all_derivs = True + all_comps_or_derivs = True + for analysis in analyses: + if "+Der" not in analysis[0]: + all_derivs = False + if "+Cmp" not in analysis[0]: + all_comps = False + if "+Cmp" not in analysis[0] and "+Der" not in analysis[0]: + all_comps_or_derivs = False + if all_comps: + print("C", freq, surf, sep="\t", file=options.nearmissfile) + elif all_derivs: + print("D", freq, surf, sep="\t", file=options.nearmissfile) + elif all_comps_or_derivs: + print("CD", freq, surf, sep="\t", file=options.nearmissfile) else: no_results += freq types_no_results += 1 From d01ce30c63b7cb89b55b1e68505e8b9df1c4a571 Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Mon, 19 Feb 2024 17:29:41 +0100 Subject: [PATCH 06/45] hackarounds for low mem stuff --- scripts/unimorph/convert.py | 30 +++++++++++++++++++++--- scripts/unimorph/generate-alphabets.bash | 17 +++++++------- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/scripts/unimorph/convert.py b/scripts/unimorph/convert.py index 06001fa0..7f1f9861 100755 --- a/scripts/unimorph/convert.py +++ b/scripts/unimorph/convert.py @@ -23,6 +23,8 @@ def giella2unimorph(tags): unimorphtags += ['N'] elif giella == 'Det': unimorphtags += ['DET'] + elif giella == 'Part': + unimorphtags += ['PART'] elif giella == 'Pcle': unimorphtags += 
['PART'] elif giella == 'Adv': @@ -39,6 +41,8 @@ def giella2unimorph(tags): unimorphtags += ['ADJ'] elif giella == 'Adj': unimorphtags += ['ADJ'] + elif giella == 'Intj': + unimorphtags += ['INTJ'] elif giella == 'Interj': unimorphtags += ['INTJ'] elif giella == 'CC': @@ -141,6 +145,8 @@ def giella2unimorph(tags): unimorphtags += ['IND'] # XXX: can sometimes be indef? elif giella == 'Prs': unimorphtags += ['PRS'] + elif giella == 'Past': + unimorphtags += ['PST'] elif giella == 'Prt': unimorphtags += ['PST'] elif giella == 'Prt1': @@ -330,6 +336,8 @@ def giella2unimorph(tags): unimorphtags += ['OBGLIG'] elif giella == 'Interr': unimorphtags += ['INT'] # XXX: ABE? + elif giella in ['Der1', 'Der2']: + continue elif giella == 'Der/Comp': unimorphtags += ['CMPR'] elif giella == 'Comp': @@ -375,6 +383,8 @@ def giella2unimorph(tags): continue elif giella == 'ABBR': continue + elif giella == 'ACRO': + continue elif giella == 'ACR': continue elif giella == 'LEFT': @@ -397,12 +407,14 @@ def giella2unimorph(tags): continue elif giella == 'Ord': continue - elif giella in ['v1', 'v2', 'v3', 'v4', 'v5']: + elif giella in ['v1', 'v2', 'v3', 'v4', 'v5', 'v6']: continue elif giella in ['G3', 'G7']: continue elif giella.startswith('Sem'): continue + elif giella == 'Dummytag': + continue elif giella == 'S': continue elif giella == 'Quote': @@ -448,10 +460,14 @@ def giella2unimorph(tags): unimorphtags += ['TR'] elif giella == 'Impers': unimorphtags += ['IMPRS'] + elif giella == 'Reflex': + unimorphtags += ['REFL'] elif giella == 'Refl': unimorphtags += ['REFL'] elif giella == 'Recipr': unimorphtags += ['RECP'] + elif giella == 'Distr': + unimorphtags += ['REM'] elif giella == 'Dist': unimorphtags += ['REM'] elif giella == 'Prox': @@ -462,6 +478,8 @@ def giella2unimorph(tags): elif giella == 'AssocColl': # myv continue + elif giella in ['0,0', '0,1']: + continue elif giella in ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F00', 'F01', 'F02', 'F03', 'F04', 'F05', 'F06', 'F07', 'F10', 'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F08', @@ -480,7 +498,7 @@ def giella2unimorph(tags): 'B', 'C', 'E', 'D', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'W', 'X', 'Y', 'Z', - 'Š', 'Ž', 'Ä', 'Õ', 'Ö', 'Ü']: + 'Š', 'Ž', 'Ä', 'Õ', 'Ö', 'Ü', '0']: # est continue elif giella == 'Adn': @@ -540,13 +558,19 @@ def giella2unimorph(tags): elif giella.startswith('AErr/'): print('SOmething broken here½!', tags) unimorphtags += ['TYPO'] + elif '' in giella: + print('SOmething broken here½!', tags) + elif '' in giella: + print('SOmething broken here½!', tags) elif '' in giella: print('SOmething broken here½!', tags) + elif 'N224-1-9' in giella: + print('SOmething broken here½!', tags) elif '#222-5-19' in giella: print('SOmething broken here½!', tags) elif '/-' in giella: print('SOmething broken here½!', tags) - elif giella in ['a', 'b', 'i', 't', 'd', 's', 'n', 'ä']: + elif giella in ['a', 'b', 'i', 't', 'd', 's', 'n', 'ä', 'ö']: print('SOmething broken here½!', tags) elif giella in ['Ne', 'Ni', 'Nte', 'Ntee', 'Nt', 'Nti', 'Na', 'No', 'N-', 'c']: diff --git a/scripts/unimorph/generate-alphabets.bash b/scripts/unimorph/generate-alphabets.bash index 82780c0e..6e9f6557 100755 --- a/scripts/unimorph/generate-alphabets.bash +++ b/scripts/unimorph/generate-alphabets.bash @@ -20,13 +20,14 @@ if test ! -f "$generator" ; then echo "Could not find generator automaton $generator" exit 1 fi -echo "$cyclicRE +UglyHack | [? 
- [ $cyclicRE ] ]* ;" | - sed -e 's/+/%+/g' -e 's:/:%/:g' -e 's/#/%#/g' -e 's/\^/%^/g' > generative.regex -hfst-regexp2fst -i generative.regex -o generative.hfst -f foma -hfst-compose -F -1 generative.hfst -2 "$generator" |\ - hfst-fst2fst -f olw -o generator.hfst for c in a b c d e f g h i j k l m n o p q r s t u v x y z å ä ö š ž ; do - hfst-fst2strings -c 0 generator.hfst -p $c -done > generated.alpha -uniq < generated.alpha | "$(dirname "$0")"/convert.py + echo "$cyclicRE +UglyHack | $c [? - [ $cyclicRE ] ]* ;" | + sed -e 's/+/%+/g' -e 's:/:%/:g' -e 's/#/%#/g' -e 's/\^/%^/g' > generative.$c.regex + hfst-regexp2fst -i generative.$c.regex -o generative.$c.hfst -f foma + hfst-compose -F -1 generative.$c.hfst -2 "$generator" |\ + hfst-fst2fst -f olw -o generator.$c.hfst + hfst-fst2strings -c 0 generator.$c.hfst > generated.$c + echo $c + uniq < generated.$c | "$(dirname "$0")"/convert.py +done From 2056b6c6939efd2bbb30293dcf81baf5c3d5b2c3 Mon Sep 17 00:00:00 2001 From: Anders Lorentsen Date: Fri, 23 Feb 2024 16:25:46 +0100 Subject: [PATCH 07/45] add correct_typos.py as a replacement for preprocess --corr=... --- scripts/correct_typos.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 scripts/correct_typos.py diff --git a/scripts/correct_typos.py b/scripts/correct_typos.py new file mode 100644 index 00000000..3c1d9a5a --- /dev/null +++ b/scripts/correct_typos.py @@ -0,0 +1,37 @@ +"""Read a corrections file (path as first argument to the script) +and then read line by line from standard input, and substitute each +word with the correct one from the corrections file.""" +from sys import stdin, argv + +if len(argv) <= 1: + exit("usage: python {argv[0]} ") + + +def read_corrections_file(path): + lookups = {} + with open(path, "r") as f: + lines = f.readlines() + for line in lines: + line = line.strip() + try: + wrong, right = line.split("\t") + except ValueError: + pass + else: + lookups[wrong] = right + return lookups + + +def main(): + correction_file = argv[1] + corrections = read_corrections_file(correction_file) + + for line in stdin.readlines(): + line = line.strip() + if not line: + continue + print(corrections.get(line, line)) + + +if __name__ == "__main__": + raise SystemExit(main()) From bbd14b415973b85bbaaa0f77eb7e28f212ac0295 Mon Sep 17 00:00:00 2001 From: Anders Lorentsen Date: Fri, 23 Feb 2024 16:28:14 +0100 Subject: [PATCH 08/45] add shebang to correct_typos.py --- scripts/correct_typos.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/correct_typos.py b/scripts/correct_typos.py index 3c1d9a5a..64040bed 100644 --- a/scripts/correct_typos.py +++ b/scripts/correct_typos.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """Read a corrections file (path as first argument to the script) and then read line by line from standard input, and substitute each word with the correct one from the corrections file.""" From 994a107093d3d44869bb249ec6bdbb199ec7524b Mon Sep 17 00:00:00 2001 From: Anders Lorentsen Date: Fri, 23 Feb 2024 16:30:57 +0100 Subject: [PATCH 09/45] flush each output line --- scripts/correct_typos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/correct_typos.py b/scripts/correct_typos.py index 64040bed..e2414613 100644 --- a/scripts/correct_typos.py +++ b/scripts/correct_typos.py @@ -31,7 +31,7 @@ def main(): line = line.strip() if not line: continue - print(corrections.get(line, line)) + print(corrections.get(line, line), flush=True) if __name__ == "__main__": From 
359b5670db69558d515adf102f4ddc934746ef8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=B8rre=20Gaup?= Date: Wed, 21 Feb 2024 11:31:30 +0100 Subject: [PATCH 10/45] Use scripts from GiellaLTGramTools --- Makefile.am | 3 --- am-shared/tools-grammarcheckers-dir-include.am | 2 +- am-shared/tools-grammarcheckers-tests-dir-include.am | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/Makefile.am b/Makefile.am index b7105550..7118715c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -130,14 +130,11 @@ nobase_dist_pkgdata_SCRIPTS = \ scripts/iso639-to-name.sh \ scripts/iso-639-3.txt \ scripts/lookup2cg \ - scripts/make_grammarchecker_zip.py \ scripts/make-hfstspeller-version-easter-egg.sh \ scripts/make-lemmacount.json.sh \ scripts/make-maturity.json.sh \ scripts/merge-templates.sh \ scripts/morph-test.py \ - scripts/gramcheck-test.py \ - scripts/gramcheck_comparator.py \ scripts/new-language.sh \ scripts/patgen.exp \ scripts/predict.py \ diff --git a/am-shared/tools-grammarcheckers-dir-include.am b/am-shared/tools-grammarcheckers-dir-include.am index fb429ab5..cda2c1cb 100644 --- a/am-shared/tools-grammarcheckers-dir-include.am +++ b/am-shared/tools-grammarcheckers-dir-include.am @@ -149,7 +149,7 @@ else !CAN_PIP echo lxml and pip is missing so this may fail...: endif endif - $(AM_V_GEN)$(GTCORE)/scripts/make_grammarchecker_zip.py pipespec.xml $@ + make_grammarchecker_zip pipespec.xml $@ # Additional developer tools: dev: modes/$(GTLANG)gram.mode schemas.xml diff --git a/am-shared/tools-grammarcheckers-tests-dir-include.am b/am-shared/tools-grammarcheckers-tests-dir-include.am index 90184004..c693b7d0 100644 --- a/am-shared/tools-grammarcheckers-tests-dir-include.am +++ b/am-shared/tools-grammarcheckers-tests-dir-include.am @@ -7,6 +7,6 @@ if WANT_GRAMCHECK # Default variable for all languages -TESTS_ENVIRONMENT = "$(GIELLA_CORE)/scripts/gramcheck-test.py" -q -s ../$(GTLANG2).zcheck 2>/dev/null +TESTS_ENVIRONMENT = gramcheck-yaml -q -s ../$(GTLANG2).zcheck 2>/dev/null endif # WANT_GRAMCHECK From 3ccf7df501fc09b7d72ae5898211938a5a6771ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=B8rre=20Gaup?= Date: Mon, 26 Feb 2024 19:09:08 +0100 Subject: [PATCH 11/45] Err out if duplicate tests are found --- scripts/gramcheck-test.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/gramcheck-test.py b/scripts/gramcheck-test.py index 9e41d903..24fd0eb9 100755 --- a/scripts/gramcheck-test.py +++ b/scripts/gramcheck-test.py @@ -111,6 +111,16 @@ def load_config(self, args): ) config["tests"] = yaml_settings.get("Tests", []) + dupes = "\n".join( + {f"\t{test}" for test in config["tests"] if config["tests"].count(test) > 1} + ) + if dupes: # check for duplicates + print( + f"ERROR: Remove the following dupes in {config['test_file']}\n{dupes}", + file=sys.stderr, + ) + sys.exit(99) # exit code 99 signals hard exit to Make + if args.total and len(args.test_files) == 1: notfixed = ( config["test_file"].parent / f"{config['test_file'].stem}.notfixed.yaml" From 811a5120acddb350c41cb46f9ad84e6816c2cacc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=B8rre=20Gaup?= Date: Mon, 26 Feb 2024 19:11:54 +0100 Subject: [PATCH 12/45] Revert "Use scripts from GiellaLTGramTools" This commit was way premature --- Makefile.am | 3 +++ am-shared/tools-grammarcheckers-dir-include.am | 2 +- am-shared/tools-grammarcheckers-tests-dir-include.am | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Makefile.am b/Makefile.am index 7118715c..b7105550 100644 --- a/Makefile.am +++ b/Makefile.am @@ 
-130,11 +130,14 @@ nobase_dist_pkgdata_SCRIPTS = \ scripts/iso639-to-name.sh \ scripts/iso-639-3.txt \ scripts/lookup2cg \ + scripts/make_grammarchecker_zip.py \ scripts/make-hfstspeller-version-easter-egg.sh \ scripts/make-lemmacount.json.sh \ scripts/make-maturity.json.sh \ scripts/merge-templates.sh \ scripts/morph-test.py \ + scripts/gramcheck-test.py \ + scripts/gramcheck_comparator.py \ scripts/new-language.sh \ scripts/patgen.exp \ scripts/predict.py \ diff --git a/am-shared/tools-grammarcheckers-dir-include.am b/am-shared/tools-grammarcheckers-dir-include.am index cda2c1cb..fb429ab5 100644 --- a/am-shared/tools-grammarcheckers-dir-include.am +++ b/am-shared/tools-grammarcheckers-dir-include.am @@ -149,7 +149,7 @@ else !CAN_PIP echo lxml and pip is missing so this may fail...: endif endif - make_grammarchecker_zip pipespec.xml $@ + $(AM_V_GEN)$(GTCORE)/scripts/make_grammarchecker_zip.py pipespec.xml $@ # Additional developer tools: dev: modes/$(GTLANG)gram.mode schemas.xml diff --git a/am-shared/tools-grammarcheckers-tests-dir-include.am b/am-shared/tools-grammarcheckers-tests-dir-include.am index c693b7d0..90184004 100644 --- a/am-shared/tools-grammarcheckers-tests-dir-include.am +++ b/am-shared/tools-grammarcheckers-tests-dir-include.am @@ -7,6 +7,6 @@ if WANT_GRAMCHECK # Default variable for all languages -TESTS_ENVIRONMENT = gramcheck-yaml -q -s ../$(GTLANG2).zcheck 2>/dev/null +TESTS_ENVIRONMENT = "$(GIELLA_CORE)/scripts/gramcheck-test.py" -q -s ../$(GTLANG2).zcheck 2>/dev/null endif # WANT_GRAMCHECK From 2f46c67324165299d02a60964e0981d3183c98db Mon Sep 17 00:00:00 2001 From: Anders Lorentsen Date: Wed, 28 Feb 2024 13:30:19 +0100 Subject: [PATCH 13/45] check for xml parsing errors in merge_giella_dicts --- dicts/scripts/merge_giella_dicts.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dicts/scripts/merge_giella_dicts.py b/dicts/scripts/merge_giella_dicts.py index a90bcf14..2c0dfa85 100644 --- a/dicts/scripts/merge_giella_dicts.py +++ b/dicts/scripts/merge_giella_dicts.py @@ -55,7 +55,14 @@ def merge_giella_dicts(directory, out_file): for file in xml_files: with open(file) as f: text = f.read() - tree = ET.fromstring(text) + try: + tree = ET.fromstring(text) + except ET.ParseError as e: + print(f"Warning: XML error in file {file}", file=sys.stderr) + print("The process continues, but the contents of this file will " + "not be included in the merged output") + print(e, file=sys.stderr) + continue if tree.tag != "r": # root node not , not a giella xml dictionary # (this can be the meta.xml file, for example) From 2c78961f2090bfbb33e50bda01a73fd818fc3552 Mon Sep 17 00:00:00 2001 From: Sjur N Moshagen Date: Thu, 29 Feb 2024 20:07:12 +0200 Subject: [PATCH 14/45] One space more indent --- scripts/extract-lemmas.sh | 62 +++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/scripts/extract-lemmas.sh b/scripts/extract-lemmas.sh index 7bb71daa..32b38524 100755 --- a/scripts/extract-lemmas.sh +++ b/scripts/extract-lemmas.sh @@ -91,34 +91,34 @@ keep_hom_tags () { # The main lemma extraction thing: grep ";" $inputfile | # grep only lines containing ; - egrep -v "^[[:space:]]*(\!|\@|<|\+)" | # do NOT grep lines beginning with (space +) !, @ or < - keep_hom_tags | # treat homonyms special - egrep -v "^[[:space:]]*[[:alnum:]_-]+[[:space:]]*;" | # do NOT grep lines containing ONLY a continuation lexicon ref - egrep -v "(LEXICON| K |ENDLEX|\+Err\/Lex|DerSub)" | # do NOT grep lines containing a number of generally known 
wrong stuff - exclgrep "$excludepattern" | # do NOT grep things specified in each test script - egrep "$includepattern" | # DO grep things specified in each test script if specified - sed 's/^[ ]*//' | # Remove initial whitespace - sed 's/% /€/g' | # escape lexc escapes - sed 's/%:/¢/g' | # escape lexc escapes - sed 's/%#/¥/g' | # escape lexc escapes - sed 's/%@/£/g' | # escape lexc escapes - perl -pe 's/\+(?![A-Z])(?!v[0-9])/xxplussxx/g' | # escape + when not being the first letter in a tag - sed 's/%\(.\)/\1/g' | # simplify lexc escapes - tr '\t' ' ' | # replate tabs with spaces - tr -s ' ' | # squash spaces to one - sed 's/:/XXXXX/' | # escape upper-lower mark before next step - cut_fields | # extract lemma, possibly contlex if specified - sed 's/@.* / /' | # remove lemma final flag diacritics - sed 's/XXXXX.* / /' | # remove lower part - sed 's/XXXXX.*//' | # remove lower part - tr -d "#" | # remove word boundaries in lemmas (should not exist, but just to be safe) - tr " " "\t" | # change space to tabs - why?? - sed 's/€/ /g' | # restore lexc escapes to their lexical form - sed 's/¢/:/g' | # restore lexc escapes to their lexical form - sed 's/£/@/g' | # restore lexc escapes to their lexical form - sed 's/¥/#/g' | # restore lexc escapes to their lexical form - egrep -v "(^$|^;|^[0-9]$|^\!)" | # remove useless lines - perl -pe 's/__(Hom[0-9]+)__/\+\1/' | # restore homonym tags if kept - perl -pe 's/__(G[37]+)__/\+\1/' | # restore homonym tags if kept - sed 's/xxplussxx/\+/g' | # restore literal, escaped + sign - sort -u + egrep -v "^[[:space:]]*(\!|\@|<|\+)" | # do NOT grep lines beginning with (space +) !, @ or < + keep_hom_tags | # treat homonyms special + egrep -v "^[[:space:]]*[[:alnum:]_-]+[[:space:]]*;" | # do NOT grep lines containing ONLY a continuation lexicon ref + egrep -v "(LEXICON| K |ENDLEX|\+Err\/Lex|DerSub)" | # do NOT grep lines containing a number of generally known wrong stuff + exclgrep "$excludepattern" | # do NOT grep things specified in each test script + egrep "$includepattern" | # DO grep things specified in each test script if specified + sed 's/^[ ]*//' | # Remove initial whitespace + sed 's/% /€/g' | # escape lexc escapes + sed 's/%:/¢/g' | # escape lexc escapes + sed 's/%#/¥/g' | # escape lexc escapes + sed 's/%@/£/g' | # escape lexc escapes + perl -pe 's/\+(?![A-Z])(?!v[0-9])/xxplussxx/g' | # escape + when not being the first letter in a tag + sed 's/%\(.\)/\1/g' | # simplify lexc escapes + tr '\t' ' ' | # replate tabs with spaces + tr -s ' ' | # squash spaces to one + sed 's/:/XXXXX/' | # escape upper-lower mark before next step + cut_fields | # extract lemma, possibly contlex if specified + sed 's/@.* / /' | # remove lemma final flag diacritics + sed 's/XXXXX.* / /' | # remove lower part + sed 's/XXXXX.*//' | # remove lower part + tr -d "#" | # remove word boundaries in lemmas (should not exist, but just to be safe) + tr " " "\t" | # change space to tabs - why?? 
+ sed 's/€/ /g' | # restore lexc escapes to their lexical form + sed 's/¢/:/g' | # restore lexc escapes to their lexical form + sed 's/£/@/g' | # restore lexc escapes to their lexical form + sed 's/¥/#/g' | # restore lexc escapes to their lexical form + egrep -v "(^$|^;|^[0-9]$|^\!)" | # remove useless lines + perl -pe 's/__(Hom[0-9]+)__/\+\1/' | # restore homonym tags if kept + perl -pe 's/__(G[37]+)__/\+\1/' | # restore homonym tags if kept + sed 's/xxplussxx/\+/g' | # restore literal, escaped + sign + sort -u From 70c3f464c15fabd502faf31fc35e60b7c8c9a2bb Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Tue, 5 Mar 2024 16:05:24 +0100 Subject: [PATCH 15/45] handle all files that have been input? --- scripts/extract-lemmas.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/extract-lemmas.sh b/scripts/extract-lemmas.sh index 32b38524..d310357f 100755 --- a/scripts/extract-lemmas.sh +++ b/scripts/extract-lemmas.sh @@ -47,7 +47,7 @@ while test $# -ge 1 ; do shift fi elif test -f "$1"; then - inputfile="$1" + inputfiles="$inputfiles $1" shift else echo "$0: unknown option $1" @@ -90,7 +90,7 @@ keep_hom_tags () { } # The main lemma extraction thing: -grep ";" $inputfile | # grep only lines containing ; +cat $inputfiles | grep ";" | # grep only lines containing ; egrep -v "^[[:space:]]*(\!|\@|<|\+)" | # do NOT grep lines beginning with (space +) !, @ or < keep_hom_tags | # treat homonyms special egrep -v "^[[:space:]]*[[:alnum:]_-]+[[:space:]]*;" | # do NOT grep lines containing ONLY a continuation lexicon ref From e57827354e24fb91c595a989b6e85b19090ad653 Mon Sep 17 00:00:00 2001 From: Sjur N Moshagen Date: Wed, 6 Mar 2024 08:36:58 +0200 Subject: [PATCH 16/45] Do not shift file argument list, the global shift at the end of the loop does what is needed The extra shift caused files to be skipped --- scripts/extract-lemmas.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/extract-lemmas.sh b/scripts/extract-lemmas.sh index d310357f..f7b3a2ed 100755 --- a/scripts/extract-lemmas.sh +++ b/scripts/extract-lemmas.sh @@ -48,7 +48,6 @@ while test $# -ge 1 ; do fi elif test -f "$1"; then inputfiles="$inputfiles $1" - shift else echo "$0: unknown option $1" print_usage From d83b5907507dcbea7f96e7e350167378edcfb6e8 Mon Sep 17 00:00:00 2001 From: Sjur N Moshagen Date: Wed, 6 Mar 2024 08:37:07 +0200 Subject: [PATCH 17/45] whitespace --- scripts/extract-lemmas.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/extract-lemmas.sh b/scripts/extract-lemmas.sh index f7b3a2ed..68d0a72d 100755 --- a/scripts/extract-lemmas.sh +++ b/scripts/extract-lemmas.sh @@ -89,7 +89,7 @@ keep_hom_tags () { } # The main lemma extraction thing: -cat $inputfiles | grep ";" | # grep only lines containing ; +cat $inputfiles | grep ";" | # grep only lines containing ; egrep -v "^[[:space:]]*(\!|\@|<|\+)" | # do NOT grep lines beginning with (space +) !, @ or < keep_hom_tags | # treat homonyms special egrep -v "^[[:space:]]*[[:alnum:]_-]+[[:space:]]*;" | # do NOT grep lines containing ONLY a continuation lexicon ref From 9053d72fceabc574cd5bcc6385f0d8cfb0af5d2e Mon Sep 17 00:00:00 2001 From: Sjur N Moshagen Date: Wed, 6 Mar 2024 08:38:09 +0200 Subject: [PATCH 18/45] Bump bugfix --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 7a330dc5..89e0f91b 100644 --- a/configure.ac +++ b/configure.ac @@ -14,7 +14,7 @@ ## You should have received a copy of the GNU General Public License ## along with this program. 
If not, see . -AC_INIT([giella-core], [0.22.1], [feedback@divvun.no], [giella-core], [https://github.com/giellalt/giella-core]) +AC_INIT([giella-core], [0.22.2], [feedback@divvun.no], [giella-core], [https://github.com/giellalt/giella-core]) AC_REVISION([$Revision$]) AC_CONFIG_AUX_DIR([build-aux]) AM_INIT_AUTOMAKE([1.9 tar-pax -Wall -Werror foreign]) From bebe7f47d60eee643db321dbdfd553b4f18033f9 Mon Sep 17 00:00:00 2001 From: Sjur N Moshagen Date: Wed, 6 Mar 2024 08:46:14 +0200 Subject: [PATCH 19/45] Update help text --- scripts/extract-lemmas.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/extract-lemmas.sh b/scripts/extract-lemmas.sh index 68d0a72d..ae292239 100755 --- a/scripts/extract-lemmas.sh +++ b/scripts/extract-lemmas.sh @@ -4,8 +4,8 @@ # set -x function print_usage() { - echo "Usage: $0 [OPTIONS...] INPUTFILE" - echo "Extract lemmas from INPUTFILE (lexc)" + echo "Usage: $0 [OPTIONS...] INPUTFILE [INPUTFILE ...]" + echo "Extract lemmas from INPUTFILE(S) (lexc). Avoid affix files in the input." echo echo " -h, --help Print this usage info" echo " --exclude '(pattern)' Exclude (egrep) patterns from the lemma list" From 246d30749222a4c48c15fd32ecaf10969588de3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=B8rre=20Gaup?= Date: Tue, 27 Feb 2024 11:06:27 +0100 Subject: [PATCH 20/45] Stop if there are no tests --- scripts/gramcheck-test.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/gramcheck-test.py b/scripts/gramcheck-test.py index 24fd0eb9..a247a76e 100755 --- a/scripts/gramcheck-test.py +++ b/scripts/gramcheck-test.py @@ -111,6 +111,12 @@ def load_config(self, args): ) config["tests"] = yaml_settings.get("Tests", []) + if not config["tests"]: + print( + f"ERROR: No tests in {config['test_file']}", + file=sys.stderr, + ) + sys.exit(99) # exit code 99 signals hard exit to Make dupes = "\n".join( {f"\t{test}" for test in config["tests"] if config["tests"].count(test) > 1} ) From a5dab869090349da76ace91fdefc9d50e6b8480f Mon Sep 17 00:00:00 2001 From: Trond Trosterud Date: Wed, 7 Feb 2024 10:33:10 +0100 Subject: [PATCH 21/45] GTLANGS not in svn, main in svn --- devtools/init.d/init.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devtools/init.d/init.sh b/devtools/init.d/init.sh index a4110886..c2917843 100755 --- a/devtools/init.d/init.sh +++ b/devtools/init.d/init.sh @@ -49,7 +49,7 @@ fi . 
"$GIELLA_CORE"/devtools/init.d/lookup-init.sh # Alias for svn update -alias svnup="svn up \$GTLANGS/* \$GTBIG \$GTFREE \$GTPRIV \$GTHOME/art" +alias svnup="svn up \$GTBIG \$GTFREE \$GTPRIV \$GTHOME/ \$GTHOME/art" # Sorting Cyrillic lists From 1a593e8d0551dacc193cbbfb7ddc0b3dedf6e00e Mon Sep 17 00:00:00 2001 From: Trond Trosterud Date: Wed, 28 Feb 2024 16:35:41 +0100 Subject: [PATCH 22/45] correct to git --- dicts/scripts/gt_dictionary.css | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dicts/scripts/gt_dictionary.css b/dicts/scripts/gt_dictionary.css index df930671..e1c5d4c7 100644 --- a/dicts/scripts/gt_dictionary.css +++ b/dicts/scripts/gt_dictionary.css @@ -1,5 +1,6 @@ @charset "utf-8"; -@import "../../../tools/xxe/gtdict-config/gt_dictionary_XXE.css"; +#@import "../../../tools/xxe/gtdict-config/gt_dictionary_XXE.css"; +@import "../../giella-core/dicts/scripts/gt_dictionary_XXE.css"; @import "gt_dictionary_shared.css"; From 66b2a32e81d6355b3838990ab1ad335d43afc0b5 Mon Sep 17 00:00:00 2001 From: Trond Trosterud Date: Wed, 28 Feb 2024 16:36:08 +0100 Subject: [PATCH 23/45] book --- dicts/scripts/gt_dictionary.dtd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dicts/scripts/gt_dictionary.dtd b/dicts/scripts/gt_dictionary.dtd index cccdf286..dbb5744e 100644 --- a/dicts/scripts/gt_dictionary.dtd +++ b/dicts/scripts/gt_dictionary.dtd @@ -160,7 +160,7 @@ > - + From 0096fc3ce86d43553facfd478f2674c6adc47409 Mon Sep 17 00:00:00 2001 From: Trond Trosterud Date: Wed, 28 Feb 2024 16:37:19 +0100 Subject: [PATCH 24/45] XXE --- dicts/scripts/gt_dictionary_XXE.css | 67 ++++ dicts/scripts/gt_dictionary_XXE_alt.css | 171 ++++++++++ dicts/scripts/gt_dictionary_XXE_shared.css | 362 +++++++++++++++++++++ 3 files changed, 600 insertions(+) create mode 100644 dicts/scripts/gt_dictionary_XXE.css create mode 100644 dicts/scripts/gt_dictionary_XXE_alt.css create mode 100644 dicts/scripts/gt_dictionary_XXE_shared.css diff --git a/dicts/scripts/gt_dictionary_XXE.css b/dicts/scripts/gt_dictionary_XXE.css new file mode 100644 index 00000000..06e92746 --- /dev/null +++ b/dicts/scripts/gt_dictionary_XXE.css @@ -0,0 +1,67 @@ +@charset "utf-8"; +@import "gt_dictionary_XXE_shared.css"; + +r, rootdict { + background-color: #fff; +} + +/* tg styling: */ +tg + tg:before { + content: "; "; +} +tg { + display: inline; +} + +re { + display: inline; +} + +/* t styling: */ +syn + syn:before, +t + t:before, +te + t:before, +tf + t:before, +t + tf:before, +te + tf:before, +tf + tf:before { + content: ", "; +} +t, tf, te { + display: inline; +} +l + t { + margin-left: +.35em; +} + +/* xg, x, xt styling: */ +te + xg:before, tf + xg:before { + margin-left: 0.5ex; + content: " "; +} + +xg + xg:before { + display: inline; + content: "; "; +} + +xg x { + font-style: italic; +} + +/* syng, syn styling: */ +syng { + display: inline; +} + +syn { + display: inline; +} + + +/* Her kjem tillegga for sme-oahpa: */ + +tr + tr:before { + content: "; "; +} + diff --git a/dicts/scripts/gt_dictionary_XXE_alt.css b/dicts/scripts/gt_dictionary_XXE_alt.css new file mode 100644 index 00000000..91e3d654 --- /dev/null +++ b/dicts/scripts/gt_dictionary_XXE_alt.css @@ -0,0 +1,171 @@ +@charset "utf-8"; +@import "gt_dictionary_XXE_shared.css"; + +/* This is the css file for the dictionary tree view */ + +r, rootdict, lexicon { + background-color: #bcd; +} + +/* mg styling: */ +e > mg { + counter-reset: translationgroup; +} + +apps:before { + display: block; + content: "Apps: "; +} +apps { + display: block; + margin-left: 
3ex; +} + +app { + display: inline; +} +app + app:before { + content: ", "; +} +app:before { + content: attr(name) ": "; +} +/* source styling: */ + +sources:before { + display: inline; +} +sources { + display: block; +} +book { + display: inline; + content: "Books: "; + +} +book + book:before { + content: ", "; +} +book:after { + content: attr(name); +} + +book + frequency:before { + display: inline; + content: " Frequency: "; +} + +frequency:before { + display: inline; + content: "Frequency: "; +} +frequency { + display: inline; +} +frequency:after { + content: attr(class); +} + +geography:before { + display: inline; + content: "; Geography: "; +} +geography { + display: inline; +} +geography + geography:before { + content: ", "; +} +geography:after { + content: attr(class); +} + + +/* tg styling: */ +tg:before { + display: inline; + counter-increment: translationgroup; + content: counter(translationgroup, lower-alpha) ". "; +} +tg { + display: block; + margin-left: 1.33ex; +} + +semantics:before { + display: inline; + content: "Sem ➩ "; +} +semantics { + display: block; +} +sem { + display: inline; +} +sem + sem:before { + content: ", "; +} +sem:after { + content: attr(class); +} + + +re { + display: block; + margin-left: 2ex; +} + +/* t styling: */ +t, tf { + display: list-item; + margin-left: 2.33ex; +} +tf:after { + display: inline; + font-style: italic; + color: gray; + font-size: small; + vertical-align: super; + margin-left: 0.5ex; + content: "Phrase" ; +} +te { + display: block; + margin-left: 2.33ex; + color: #2f4f4f; +} + +/* xg, x, xt styling: */ +xg { + display: block; + margin-left: 2.33ex; +} + +xg x { + display: inline; + font-style: italic; +} + +xt { + display: block; + margin-left: 1.33ex; +} + +/* syng, syn styling: */ +syng { + display: block; + margin-left: 3ex; +} + +syn { + display: list-item; + margin-left: 3ex; +} + + +/* Her kjem tillegga for sme-oahpa: */ + +tr { + display: list-item; + margin-left: 2.33ex; +} diff --git a/dicts/scripts/gt_dictionary_XXE_shared.css b/dicts/scripts/gt_dictionary_XXE_shared.css new file mode 100644 index 00000000..510b9dd4 --- /dev/null +++ b/dicts/scripts/gt_dictionary_XXE_shared.css @@ -0,0 +1,362 @@ +@charset "utf-8"; +@namespace xml url('http://www.w3.org/XML/1998/namespace'); + +r, rootdict, lexicon { + display: block; + margin: 1ex; +} + +lics { + display: block; +} + +lics:before { + content: "Copyright Notes"; + display: block; + font-weight: bold; + margin-bottom: 21px; + margin-left: 0px; + margin-right: 0px; + margin-top: 21px; +} + +lic { + display: block; + margin: 18px; +} + +ref:before { + display: inline; + white-space: pre; /* required to save \A in the next rule*/ + content: 'Please refer to this source code with the following attribution text:\A"'; + font-weight: bold; + font-size: smaller; +} +ref { + display: block; + margin: 18px; +} +ref:after { + display: inline; + content: '"'; +} + +i { + display: inline; + font-style: italic; +} + +sourcenote { + display: block; + font-weight: bold; + margin: 18px; + padding-bottom: 18px; +} + +a { + display: inline; +} + +/* e styling: */ +e, entry { + display: block; + margin-bottom: 0.3ex; + counter-reset: meaninggroups; + padding: 0.3ex; + border: 1px ridge #eee; +} +e:after, entry:after { + display: block ; + color: gray ; + margin-left: 2ex ; +} +e[usage]:after { + content: "Usage: " attr(usage) ; +} + +/* +e[src]:after { + content: "Source: " attr(src) ; +} +e[usage][src]:after { + content: "Usage: " attr(usage) ", source: " attr(src) ; +} +*/ + +/* l styling: */ 
+l, lemma { + font-weight: bold; +} +l:after, lemma:after { + display: inline; + margin-left: 0.5ex; + content: attr(pos); + font-style: italic; + font-size: small; + vertical-align: super; + color: gray; +} +l[pos][type]:after { + content: attr(pos) ', ' attr(type); +} +l[pos][type][nr]:after { + content: attr(pos) ', ' attr(type) ', ' attr(nr); +} +l[pos][nr]:after { + content: attr(pos) ', ' attr(nr); +} +l[pos][illpl]:after { + content: attr(pos) ', illpl=' attr(illpl); +} +/* +l[pos][context]:after { + content: attr(pos) ', ' attr(context); +} +*/ +/* lc styling: */ + +lc { + display: none; + /* font-style: italic; + font-size: small; */ +} + +lc + lc:before { + display: none; + /* content: ", "; */ +} + +/* lsub styling: */ +lsub { + display: none; + /* font-style: normal; */ +} + +lsub + lsub:before { + display: none; + /* content: ", "; */ +} + + +stem { + font-style: italic; + font-size: small; +} + +stem + stem:before { + content: ", "; +} +*/ +lg > analysis:before { + content: " Analysis: " ; + font-size: small; + color: gray; +} +lg > analysis { + display: inline; +} +/* +lemma_ref:before { + content: "▸ " ; +} +*/ +lemma_ref { + display: none; + /* display: inline; */ +} +lemma_ref:after { + display: none; +/* + display: inline; + content: " (" attr(lemmaID) ") "; + font-size: small; + color: gray; +*/ +} +/* mini paradigm styling: */ +mini_paradigm:before { + display: block ; + content: " Miniparadigm:" ; + font-size: small; + color: gray; +} +mini_paradigm { + display: block ; + margin-left: 5ex; +} +mini_paradigm > analysis:before { + display: inline ; + content: attr(ms) ' ' ; + color: gray; +} +mini_paradigm > analysis { + display: list-item ; +} +wordform { + display: inline ; +} +wordform + wordform:before { + display: inline ; + content: ', ' ; +} + +/* mg styling: */ +e > mg:before { + display: inline; + counter-increment: meaninggroups; + content: counter(meaninggroups, decimal) ". 
"; +} +e > mg { + display: block; + margin-bottom: 0.2ex; + margin-left: 1.33ex; +} + +re:before, te:before { + content: " ("; + display: inline; +} +re { + color: gray; + font-style: italic; + font-size: small; +} +re:after, te:after { + content: ") "; + display: inline; +} + +t:after { + display: inline; + font-style: italic; + color: gray; + font-size: small; + vertical-align: super; + margin-left: 0.5ex; +} +t[pos]:after { + content: attr(pos); +} +t[pos][type]:after { + content: attr(pos) ", " attr(type) ; +} +t[pos][decl]:after { + content: attr(pos) " - " attr(decl) ; +} +t[pos][decl][type]:after { + content: attr(pos) " - " attr(decl) ", " attr(type) ; +} +/* +t:lang(nob):after { + content: " norsk"; +} +t:lang(swe):after { + content: " svensk"; +} +t:lang(sme):after { + content: " nordsamisk"; +} +t:lang(fin):after { + content: " finsk"; +} +*/ + +/* +l_ref:before { + display: inline; + color: red; + content: "[⇒ "; +} +*/ + +l_ref { + display: none; +/* + color: red; + font-style: italic; +*/ +} + +/* +l_ref:after { + display: inline; + color: red; + content: '] '; +} +*/ + + +xt:before { + display: inline; + color: gray; + content: "⇒ "; +} + +syng:before { + display: inline; + color: gray; + content: " Synonyms ▶ "; +} + + +/* Her kjem tillegga for sme-oahpa: */ + +translations { + display: block; + margin-left: 1.33ex; +} + + +tr:after { + display: inline; + content: attr(xml|lang); + font-style: italic; + color: gray; + vertical-align: super; +} + +val { + font-style: italic; +} + +valency:before { + content: " Val:"; + font-size: smaller; + font-weight: bold; + } + +val[class]:after { + display: inline; + content: " " attr(class) ', ' ; +} + +semantics { + font-style: italic; +} + +semantics:before { + content: " Sem:"; + font-weight: bold; + font-size: smaller; + } + +sem[class]:after { + display: inline; + content: " " attr(class) ', ' ; +} + +stem + { + font-style: italic; +} + +stem:before { + content: " Stem:"; + font-size: smaller; + font-weight: bold; + } + +stem[class]:after { + display: inline; + content: " " attr(class) ', ' ; +} From c1cfe89e60ced0fabe355e314cf84cda7462060e Mon Sep 17 00:00:00 2001 From: Trond Trosterud Date: Wed, 6 Mar 2024 13:48:32 +0100 Subject: [PATCH 25/45] for reference hmm --- dicts/fitswe-all-fst-script | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 dicts/fitswe-all-fst-script diff --git a/dicts/fitswe-all-fst-script b/dicts/fitswe-all-fst-script new file mode 100644 index 00000000..593c4a02 --- /dev/null +++ b/dicts/fitswe-all-fst-script @@ -0,0 +1,4 @@ +read lexc ../../dict-fit-swe/bin/fitswe-all.lexc +invert net +save stack ../../dict-fit-swe/bin/fitswe-all.fst +quit From 2688e39dd998cfa841271901c0f3f5d96dcd64a6 Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Thu, 14 Mar 2024 15:45:53 +0100 Subject: [PATCH 26/45] move more stuff to gendir --- ...-spellcheckers-fstbased-desktop-hfst-dir-include.am | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am b/am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am index 4f639832..a91daaac 100644 --- a/am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am +++ b/am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am @@ -278,7 +278,7 @@ easteregg.%.desktop.suggtxt: easteregg.%.desktop.txt ####### Error model: ####### # Error model building - edit distance based on transducer alphabet: -editdist.%.regex: editdist.%.txt $(initial_letter_deps) 
+.generated/editdist.%.regex: editdist.%.txt $(initial_letter_deps) $(GENDIR) $(AM_V_GEN)$(GTCORE)/scripts/editdist.py \ --verbose \ $(swaps) \ @@ -289,7 +289,7 @@ editdist.%.regex: editdist.%.txt $(initial_letter_deps) --output-file=$@ \ $(initial_letter_error_model_option) -.generated/editdist.%.hfst: editdist.%.regex $(GENDIR) +.generated/editdist.%.hfst: .generated/editdist.%.regex $(GENDIR) $(AM_V_RGX2FST)$(HFST_REGEXP2FST) -S $(HFST_FLAGS) -i $<\ --format=openfst-tropical \ -o $@ @@ -354,7 +354,7 @@ strings.all.%.hfst: $(strings_all_deps) # Combine edit distance with string pattern edits, then multiply according to # the specified editing distance. The strings part is included depending on # variable setting in Makefile.am. -editdist.all.%.hfst: $(strings_deps) editdist.%.hfst +.generated/editdist.all.%.hfst: $(strings_deps) .generated/editdist.%.hfst $(strings_fst_include) \ | $(HFST_REPEAT) -f 1 -t $(EDIT_DISTANCE) \ -o $@ @@ -369,14 +369,14 @@ words.%.hfst: $(words_deps) easteregg.%.desktop.suggtxt # The final error model is assembled here: errmodel.%.hfst: words.%.hfst \ $(initial_letter_deps) \ - editdist.all.%.hfst \ + .generated/editdist.all.%.hfst \ $(final_strings_deps) $(AM_V_RGX2FST)printf "\ [ @\"words.$*.hfst\" \ | \ [ \ $(initial_letter_fst_include) \ - @\"editdist.all.$*.hfst\" \ + @\".generated/editdist.all.$*.hfst\" \ $(final_strings_fst_include) \ ] \ ];" \ From 8e596839f034074496054024e2d1c7061691519e Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Tue, 19 Mar 2024 16:44:24 +0100 Subject: [PATCH 27/45] maybe recurse one deeper? --- am-shared/docs-dir-include.am | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/am-shared/docs-dir-include.am b/am-shared/docs-dir-include.am index 6a029cf4..d5a87d15 100644 --- a/am-shared/docs-dir-include.am +++ b/am-shared/docs-dir-include.am @@ -182,6 +182,7 @@ $(LINKS): d2=`echo "$$d" | cut -d '/' -f 2` ;\ d3=`echo "$$d" | cut -d '/' -f 3` ;\ d4=`echo "$$d" | cut -d '/' -f 4` ;\ + d5=`echo "$$d" | cut -d '/' -f 5` ;\ if test "x$$d1" != "x$$oldd1" ; then \ echo "* \`$$d1/\`" ;\ oldd1=$$d1 ;\ @@ -204,6 +205,12 @@ $(LINKS): echo " * \`$$d4/\`" ;\ oldd4=$$d4 ;\ fi ; \ + if test "x$$d5" = x -a "x$$d4" != x ; then \ + echo " * [$$docname]($$html) ([src]($(REPOURL)/$$doc))" ;\ + elif test "x$$d5" != "x$$oldd5" ; then \ + echo " * \`$$d5/\`" ;\ + oldd4=$$d4 ;\ + fi ; \ done > $@ empty.md: From 54daec670de2c31c23047fcf2bcb8c7a3d0b6bd4 Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Tue, 19 Mar 2024 23:16:49 +0100 Subject: [PATCH 28/45] more gendirs --- ...llcheckers-fstbased-desktop-dir-include.am | 6 +-- ...ckers-fstbased-desktop-hfst-dir-include.am | 50 +++++++++---------- ...based-desktop-hfst_alt_orth-dir-include.am | 14 +++--- ...stbased-desktop-hfst_alt_ws-dir-include.am | 12 ++--- ...-fstbased-desktop-hfst_area-dir-include.am | 16 +++--- ...rs-fstbased-desktop_weights-dir-include.am | 22 ++++---- 6 files changed, 60 insertions(+), 60 deletions(-) diff --git a/am-shared/tools-spellcheckers-fstbased-desktop-dir-include.am b/am-shared/tools-spellcheckers-fstbased-desktop-dir-include.am index 3ee25256..39c7ef38 100644 --- a/am-shared/tools-spellcheckers-fstbased-desktop-dir-include.am +++ b/am-shared/tools-spellcheckers-fstbased-desktop-dir-include.am @@ -18,7 +18,7 @@ # Use this as the source lexical fst for unit weighting, it contains correct # surface forms except for the word boundary #, which is still present, and # used in the weighting: -UW_SPELLER_SRC=generator-desktopspeller-gt-norm-base.hfst 
+UW_SPELLER_SRC=.generated/generator-desktopspeller-gt-norm-base.hfst ## Conditional string variables; # Set file name to the empty string or initial_letters_all depending on variable: @@ -34,8 +34,8 @@ tag_weighted_dep=$(shell \ if WANT_SPELLERS if CAN_HFST -GT_COMMON_SPELLER_HFST+=generator-desktopspeller-gt-norm.hfst -GT_COMMON_SPELLER_HFST+=analyser-desktopspeller-gt-norm.hfst +GT_COMMON_SPELLER_HFST+=.generated/generator-desktopspeller-gt-norm.hfst +GT_COMMON_SPELLER_HFST+=.generated/analyser-desktopspeller-gt-norm.hfst endif # CAN_HFST endif # WANT_SPELLERS diff --git a/am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am b/am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am index a91daaac..376e2efc 100644 --- a/am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am +++ b/am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am @@ -47,7 +47,7 @@ swaps=$(shell \ # Set file name to the empty string or initial_letters.all depending on variable: initial_letter_deps=$(shell \ if [[ $(INITIAL_EDITS) != 'no' ]] ; then \ - echo "initial_letters.all.%.hfst"; \ + echo ".generated/initial_letters.all.%.hfst"; \ else \ echo ""; \ fi) @@ -55,7 +55,7 @@ initial_letter_deps=$(shell \ # Set file name to the empty string or initial_letters.all depending on variable: initial_letter_fst_include=$(shell \ if [[ $(INITIAL_EDITS) != 'no' ]] ; then \ - echo "( @\\\"initial_letters.all.$*.hfst\\\" )"; \ + echo "( @\\\".generated/initial_letters.all.$*.hfst\\\" )"; \ else \ echo ""; \ fi) @@ -71,11 +71,11 @@ initial_letter_error_model_option=$(shell \ # Set dependency file name(s) depending on variable value: initial_letter_all_deps=$(shell \ if [[ $(INITIAL_EDITS) == 'regex' ]] ; then \ - echo "initial_letters.regex.%.hfst"; \ + echo ".generated/initial_letters.regex.%.hfst"; \ elif [[ $(INITIAL_EDITS) == 'txt' ]] ; then \ - echo "initial_letters.txt.%.hfst"; \ + echo ".generated/initial_letters.txt.%.hfst"; \ elif [[ $(INITIAL_EDITS) == 'both' ]] ; then \ - echo "initial_letters.regex.%.hfst initial_letters.txt.%.hfst"; \ + echo ".generated/initial_letters.regex.%.hfst .generated/initial_letters.txt.%.hfst"; \ else \ echo ""; \ fi) @@ -96,7 +96,7 @@ initial_letter_all_build=$(shell \ # Set file name to the empty string or strings.all depending on variable: strings_deps=$(shell \ if [[ $(STRING_EDITS) != 'no' ]] ; then \ - echo "strings.all.%.hfst"; \ + echo ".generated/strings.all.%.hfst"; \ else \ echo ""; \ fi) @@ -112,11 +112,11 @@ strings_fst_include=$(shell \ # Set dependency file name(s) depending on variable value: strings_all_deps=$(shell \ if [[ $(STRING_EDITS) == 'regex' ]] ; then \ - echo "strings.regex.%.hfst"; \ + echo ".generated/strings.regex.%.hfst"; \ elif [[ $(STRING_EDITS) == 'txt' ]] ; then \ - echo "strings.txt.%.hfst"; \ + echo ".generated/strings.txt.%.hfst"; \ elif [[ $(STRING_EDITS) == 'both' ]] ; then \ - echo "strings.regex.%.hfst strings.txt.%.hfst"; \ + echo ".generated/strings.regex.%.hfst .generated/strings.txt.%.hfst"; \ else \ echo ""; \ fi) @@ -137,7 +137,7 @@ strings_all_build=$(shell \ # Set file name to the empty string or final_strings.all depending on variable: final_strings_deps=$(shell \ if [[ $(FINAL_STRING_EDITS) != 'no' ]] ; then \ - echo "final_strings.all.%.hfst"; \ + echo ".generated/final_strings.all.%.hfst"; \ else \ echo ""; \ fi) @@ -145,7 +145,7 @@ final_strings_deps=$(shell \ # Set build command for strings.all depending on variable value: final_strings_fst_include=$(shell \ if [[ $(FINAL_STRING_EDITS) 
!= 'no' ]] ; then \ - echo "( @\\\"final_strings.all.$*.hfst\\\" )"; \ + echo "( @\\\".generated/final_strings.all.$*.hfst\\\" )"; \ else \ echo ""; \ fi) @@ -153,11 +153,11 @@ final_strings_fst_include=$(shell \ # Set dependency file name(s) depending on variable value: final_strings_all_deps=$(shell \ if [[ $(FINAL_STRING_EDITS) == 'regex' ]] ; then \ - echo "final_strings.regex.%.hfst"; \ + echo ".generated/final_strings.regex.%.hfst"; \ elif [[ $(FINAL_STRING_EDITS) == 'txt' ]] ; then \ - echo "final_strings.txt.%.hfst"; \ + echo ".generated/final_strings.txt.%.hfst"; \ elif [[ $(FINAL_STRING_EDITS) == 'both' ]] ; then \ - echo "final_strings.regex.%.hfst final_strings.txt.%.hfst"; \ + echo ".generated/final_strings.regex.%.hfst final_strings.txt.%.hfst"; \ else \ echo ""; \ fi) @@ -341,14 +341,14 @@ easteregg.%.desktop.suggtxt: easteregg.%.desktop.txt # larger as the edit distance, since the file is multiplied again as part of # the editStrings build target. The idea is that the regex should contain a # highly targeted set of frequent spelling errors. -strings.regex.%.hfst: strings.%.regex anystar.hfst +.generated/strings.regex.%.hfst: strings.%.regex .generated/anystar.hfst $(AM_V_RGX2FST)$(HFST_REGEXP2FST) -S $(HFST_FLAGS) -i $<\ - | $(HFST_CONCATENATE) anystar.hfst - \ - | $(HFST_CONCATENATE) - anystar.hfst \ + | $(HFST_CONCATENATE) .generated/anystar.hfst - \ + | $(HFST_CONCATENATE) - .generated/anystar.hfst \ | $(HFST_REPEAT) -f 1 -t $(STRING_REGEX_EDIT_DISTANCE) \ -o $@ -strings.all.%.hfst: $(strings_all_deps) +.generated/strings.all.%.hfst: $(strings_all_deps) $(strings_all_build) # Combine edit distance with string pattern edits, then multiply according to @@ -360,19 +360,19 @@ strings.all.%.hfst: $(strings_all_deps) -o $@ # Error model building - list of words known to be misspelled: -words.%.hfst: $(words_deps) easteregg.%.desktop.suggtxt +.generated/words.%.hfst: $(words_deps) easteregg.%.desktop.suggtxt $(GENDIR) $(AM_V_STR2FST)grep -h -v '^#' $^ | grep -v '^$$' \ | $(HFST_STRINGS2FST) $(HFST_FLAGS) -j \ --format=openfst-tropical \ -o $@ # The final error model is assembled here: -errmodel.%.hfst: words.%.hfst \ +errmodel.%.hfst: .generated/words.%.hfst \ $(initial_letter_deps) \ .generated/editdist.all.%.hfst \ $(final_strings_deps) $(AM_V_RGX2FST)printf "\ - [ @\"words.$*.hfst\" \ + [ @\".generated/words.$*.hfst\" \ | \ [ \ $(initial_letter_fst_include) \ @@ -388,17 +388,17 @@ errmodel.%.hfst: words.%.hfst \ ####### Alternate error model: ####### # Alternatively, the error model can be constructed as a long list of regular # expressions, semicolon separated: -errmodel.%.hfst: errmodel.%.regex easteregg.%.hfst +errmodel.%.hfst: errmodel.%.regex .generated/easteregg.%.hfst $(AM_V_GEN)$(HFST_REGEXP2FST) $(HFSTFLAGS) -S -i $< \ - | $(HFST_DISJUNCT) - easteregg.$*.hfst \ + | $(HFST_DISJUNCT) - .generated/easteregg.$*.hfst \ | $(HFST_PUSH_WEIGHTS) --push=initial \ | $(HFST_FST2FST) $(HFST_FLAGS) -f olw \ -o $@ # ... 
or as an xfscript file: -errmodel.%.hfst: errmodel.%.xfscript easteregg.%.hfst +errmodel.%.hfst: errmodel.%.xfscript .generated/easteregg.%.hfst $(AM_V_GEN)$(HFST_REGEXP2FST) $(HFSTFLAGS) -S -i $< \ - | $(HFST_DISJUNCT) - easteregg.$*.hfst \ + | $(HFST_DISJUNCT) - .generated/easteregg.$*.hfst \ | $(HFST_PUSH_WEIGHTS) --push=initial \ | $(HFST_FST2FST) $(HFST_FLAGS) -f olw \ -o $@ diff --git a/am-shared/tools-spellcheckers-fstbased-desktop-hfst_alt_orth-dir-include.am b/am-shared/tools-spellcheckers-fstbased-desktop-hfst_alt_orth-dir-include.am index 8096e675..96ca3ea7 100644 --- a/am-shared/tools-spellcheckers-fstbased-desktop-hfst_alt_orth-dir-include.am +++ b/am-shared/tools-spellcheckers-fstbased-desktop-hfst_alt_orth-dir-include.am @@ -22,11 +22,11 @@ if WANT_SPELLERS if WANT_ALT_ORTH_PROOFTOOLS GT_ALT_ORTH_ERRMODELS=$(shell for ld in $(ALT_ORTHS); do\ - echo "errmodel.$$ld.hfst" ; \ + echo ".generated/errmodel.$$ld.hfst" ; \ done) GT_ALT_ORTH_SPELLER_ACCEPTORS=$(shell for ld in $(ALT_ORTHS); do\ - echo "acceptor.$$ld.hfst" ; \ + echo ".generated/acceptor.$$ld.hfst" ; \ done) ALT_ORTH_ZHFST_FILES=$(shell for ld in $(ALT_ORTHS); do\ @@ -42,10 +42,10 @@ endif # HAVE_ALT_ORTHS #### Build rules: #### # Alternative based on the raw fst instead of the standard orthography: -acceptor.%.hfst: \ +.generated/acceptor.%.hfst: \ $(GT_SPELLER_HFST) \ - easteregg.%.desktop.hfst \ - $(top_builddir)/src/fst/orthography/raw-to-%.compose.hfst + .generated/easteregg.%.desktop.hfst \ + $(top_builddir)/src/fst/orthography/raw-to-%.compose.hfst $(GENDIR) $(AM_V_GEN)cat $< \ | $(HFST_COMPOSE) $(HFST_FLAGS) -F \ -2 $(top_builddir)/src/fst/orthography/raw-to-$*.compose.hfst \ @@ -56,9 +56,9 @@ acceptor.%.hfst: \ -o $@ # Build rule for acceptors for alternate writing systems: -acceptor.%.hfst: \ +.generated/acceptor.%.hfst: \ $(GT_SPELLER_HFST) \ - easteregg.%.desktop.hfst \ + .generated/easteregg.%.desktop.hfst \ $(top_builddir)/src/fst/orthography/$(DEFAULT_ORTH)-to-%.compose.hfst $(AM_V_GEN)cat $< \ | $(HFST_COMPOSE) $(HFST_FLAGS) -F \ diff --git a/am-shared/tools-spellcheckers-fstbased-desktop-hfst_alt_ws-dir-include.am b/am-shared/tools-spellcheckers-fstbased-desktop-hfst_alt_ws-dir-include.am index b86faa3b..8e3829d6 100644 --- a/am-shared/tools-spellcheckers-fstbased-desktop-hfst_alt_ws-dir-include.am +++ b/am-shared/tools-spellcheckers-fstbased-desktop-hfst_alt_ws-dir-include.am @@ -21,11 +21,11 @@ if CAN_HFST if WANT_SPELLERS GT_ALT_WS_ERRMODELS=$(shell for ld in $(ALT_WSS); do\ - echo "errmodel.$$ld.hfst" ; \ + echo ".generated/errmodel.$$ld.hfst" ; \ done) GT_ALT_WS_SPELLER_ACCEPTORS=$(shell for ld in $(ALT_WSS); do\ - echo "acceptor.$$ld.hfst" ; \ + echo ".generated/acceptor.$$ld.hfst" ; \ done) ALT_WS_ZHFST_FILES=$(shell for ld in $(ALT_WSS); do\ @@ -41,8 +41,8 @@ endif # HAVE_ALT_WSS # Build rule for acceptors for alternate writing systems: $(GT_ALT_WS_SPELLER_ACCEPTORS): \ - acceptor.%.hfst: \ - $(GT_SPELLER_HFST) easteregg.%.desktop.hfst \ + .generated/acceptor.%.hfst: \ + $(GT_SPELLER_HFST) .generated/easteregg.%.desktop.hfst \ $(top_builddir)/src/fst/orthography/$(DEFAULT_WS)-to-%.compose.hfst $(AM_V_GEN)cat $< \ | $(HFST_COMPOSE) $(HFST_FLAGS) -F \ @@ -57,8 +57,8 @@ $(GT_ALT_WS_SPELLER_ACCEPTORS): \ $(ALT_WS_ZHFST_FILES): \ $(GTLANG2)-%.zhfst: \ index.%.xml \ - acceptor.%.hfst \ - errmodel.%.hfst + .generated/acceptor.%.hfst \ + .generated/errmodel.%.hfst $(AM_V_at)rm -f $@ $(AM_V_at)$(MKDIR_P) build/$@ $(AM_V_at)rm -f build/$@/* diff --git 
a/am-shared/tools-spellcheckers-fstbased-desktop-hfst_area-dir-include.am b/am-shared/tools-spellcheckers-fstbased-desktop-hfst_area-dir-include.am index 724e4003..58c2f819 100644 --- a/am-shared/tools-spellcheckers-fstbased-desktop-hfst_area-dir-include.am +++ b/am-shared/tools-spellcheckers-fstbased-desktop-hfst_area-dir-include.am @@ -21,11 +21,11 @@ if CAN_HFST if WANT_SPELLERS GT_AREA_ERRMODELS=$(shell for ld in $(AREAS); do\ - echo "errmodel.$$ld.hfst" ; \ + echo ".generated/errmodel.$$ld.hfst" ; \ done) GT_AREA_SPELLER_ACCEPTORS=$(shell for ld in $(AREAS); do\ - echo "acceptor.$$ld.hfst" ; \ + echo ".generated/acceptor.$$ld.hfst" ; \ done) AREA_ZHFST_FILES=$(shell for ld in $(AREAS); do\ @@ -40,8 +40,8 @@ endif # HAVE_AREAS #### Build rules: #### # Build rule for acceptors for specific areas/countries: -acceptor.%.hfst: \ - $(GT_SPELLER_HFST) easteregg.%.desktop.hfst \ +.generated/acceptor.%.hfst: \ + $(GT_SPELLER_HFST) .generated/easteregg.%.desktop.hfst \ $(top_builddir)/src/fst/filters/remove-all_areas_but_%-strings.hfst $(AM_V_GEN)cat $< \ | $(HFST_COMPOSE) $(HFST_FLAGS) -F \ @@ -56,14 +56,14 @@ acceptor.%.hfst: \ $(AREA_ZHFST_FILES): \ $(GTLANG2)_%.zhfst: \ index.%.xml \ - acceptor.%.hfst \ - errmodel.%.hfst + .generated/acceptor.%.hfst \ + .generated/errmodel.%.hfst $(AM_V_at)rm -f $@ $(AM_V_at)$(MKDIR_P) build/$@ $(AM_V_at)rm -f build/$@/* $(AM_V_at)cp index.$*.xml build/$@/index.xml - $(AM_V_at)cp acceptor.$*.hfst build/$@/acceptor.default.hfst - $(AM_V_at)cp errmodel.$*.hfst build/$@/errmodel.default.hfst + $(AM_V_at)cp .generated/acceptor.$*.hfst build/$@/acceptor.default.hfst + $(AM_V_at)cp .generated/errmodel.$*.hfst build/$@/errmodel.default.hfst $(AM_V_ZIP)cd build/$@/ && $(ZIP) $(ZIPFLAGS) ../../$@ * $(AM_V_at)$(MKDIR_P) 3 $(AM_V_at)cp -f $@ 3/ diff --git a/am-shared/tools-spellcheckers-fstbased-desktop_weights-dir-include.am b/am-shared/tools-spellcheckers-fstbased-desktop_weights-dir-include.am index 22dedbea..3105365c 100644 --- a/am-shared/tools-spellcheckers-fstbased-desktop_weights-dir-include.am +++ b/am-shared/tools-spellcheckers-fstbased-desktop_weights-dir-include.am @@ -36,20 +36,20 @@ corpus_size_limit_command=$(shell \ $(AM_V_GEN)$(HFST_TXT2FST) $(HFST_FLAGS) -f openfst-tropical $< -o $@ # sort the clean corpus: -%.sort.txt: weights/%.clean.txt +.generated/%.sort.txt: weights/%.clean.txt $(AM_V_GEN)LC_ALL=C.utf8 sort < $< > $@ # token count: -%.wordcount.txt: %.sort.txt +.generated/%.wordcount.txt: .generated/%.sort.txt $(AM_V_GEN)wc -l < $< > $@ # Unique the sorted, clean corpus: -%.uniq.txt: %.sort.txt +.generated/%.uniq.txt: .generated/%.sort.txt $(AM_V_GEN)LC_ALL=C.utf8 uniq -c < $< |\ LC_ALL=C.utf8 sort -nr $(corpus_size_limit_command) > $@ # type count: -%.typecount.txt: %.uniq.txt +.generated/%.typecount.txt: .generated/%.uniq.txt $(AM_V_GEN)wc -l < $< > $@ # calculate unit weight, smoothed using ALPHA: @@ -60,21 +60,21 @@ corpus_size_limit_command=$(shell \ # | $(BC) -l > $@ # Alternative unit weight: highest tropical weight + ALPHA: -%.unitweight.txt: %.tropical.txt +.generated/%.unitweight.txt: .generated/%.tropical.txt $(AM_V_GEN)echo "$$(cut -f2 < $^ | sort -nru | head -n1) + $(ALPHA)" \ | $(BC) -l > $@ # add tropical weights to the corpus: -%.tropical.txt: %.uniq.txt %.wordcount.txt %.typecount.txt +.generated/%.tropical.txt: .generated/%.uniq.txt .generated/%.wordcount.txt .generated/%.typecount.txt $(AM_V_GEN)cat $< |\ - $(GAWK) -v CS="$$(cat $*.wordcount.txt)" \ - -v DS="$$(cat $*.typecount.txt)" \ + $(GAWK) -v CS="$$(cat 
.generated/$*.wordcount.txt)" \ + -v DS="$$(cat .generated/$*.typecount.txt)" \ -v ALPHA=$(ALPHA) \ -f $(GTCORE)/scripts/uniq_count2tropical_weight.awk \ > $@ # build an fst of surface forms with tropical weights for each word form: -.generated/%.surfs.hfst: %.tropical.txt $(GENDIR) +.generated/%.surfs.hfst: .generated/%.tropical.txt $(AM_V_STR2FST)cat $< |\ $(HFST_STRINGS2FST) -j $(HFST_FLAGS) -f openfst-tropical -o $@ @@ -104,10 +104,10 @@ corpus_size_limit_command=$(shell \ # Add the unit weight to each unit in compounds, both dynamic and lexical: #unitweighted.hfst: unitweighted_limited.hfst .generated/unitweighted.hfst: $(UW_SPELLER_SRC) \ - $(UNITWEIGHT) $(GENDIR) + .generated/$(UNITWEIGHT) $(AM_V_REWEIGHT)$(HFST_FST2FST) --format=openfst-tropical -i $< \ | $(HFST_REWEIGHT) $(HFST_FLAGS) \ - -e -a $$(cat $(UNITWEIGHT)) \ + -e -a $$(cat .generated/$(UNITWEIGHT)) \ -o $@ # Keep these intermediate targets when building using --debug: From ea0a06c819dae6688beea718303edad900309053 Mon Sep 17 00:00:00 2001 From: Sjur N Moshagen Date: Fri, 22 Mar 2024 13:30:23 +0200 Subject: [PATCH 29/45] BUGFIX: Use proper indent and correct variables --- am-shared/docs-dir-include.am | 6 +++--- configure.ac | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/am-shared/docs-dir-include.am b/am-shared/docs-dir-include.am index d5a87d15..34d969ab 100644 --- a/am-shared/docs-dir-include.am +++ b/am-shared/docs-dir-include.am @@ -206,10 +206,10 @@ $(LINKS): oldd4=$$d4 ;\ fi ; \ if test "x$$d5" = x -a "x$$d4" != x ; then \ - echo " * [$$docname]($$html) ([src]($(REPOURL)/$$doc))" ;\ + echo " * [$$docname]($$html) ([src]($(REPOURL)/$$doc))" ;\ elif test "x$$d5" != "x$$oldd5" ; then \ - echo " * \`$$d5/\`" ;\ - oldd4=$$d4 ;\ + echo " * \`$$d5/\`" ;\ + oldd5=$$d5 ;\ fi ; \ done > $@ diff --git a/configure.ac b/configure.ac index 89e0f91b..c4c74207 100644 --- a/configure.ac +++ b/configure.ac @@ -14,7 +14,7 @@ ## You should have received a copy of the GNU General Public License ## along with this program. If not, see . 
-AC_INIT([giella-core], [0.22.2], [feedback@divvun.no], [giella-core], [https://github.com/giellalt/giella-core]) +AC_INIT([giella-core], [0.22.3], [feedback@divvun.no], [giella-core], [https://github.com/giellalt/giella-core]) AC_REVISION([$Revision$]) AC_CONFIG_AUX_DIR([build-aux]) AM_INIT_AUTOMAKE([1.9 tar-pax -Wall -Werror foreign]) From 51fbc463602a431218b94db9514a21697e978c1e Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Wed, 27 Mar 2024 16:15:37 +0100 Subject: [PATCH 30/45] gendir fixes --- ...eckers-fstbased-mobile-hfst-dir-include.am | 95 ++++++++++--------- 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/am-shared/tools-spellcheckers-fstbased-mobile-hfst-dir-include.am b/am-shared/tools-spellcheckers-fstbased-mobile-hfst-dir-include.am index de12b37a..791eec61 100644 --- a/am-shared/tools-spellcheckers-fstbased-mobile-hfst-dir-include.am +++ b/am-shared/tools-spellcheckers-fstbased-mobile-hfst-dir-include.am @@ -144,7 +144,7 @@ mob_strings_all_build=$(shell \ # Set file name to the empty string or final_strings.all depending on variable: mob_final_strings_deps=$(shell \ if [[ $(MOB_FINAL_STRING_EDITS) != 'no' ]] ; then \ - echo "final_strings.all.%.hfst"; \ + echo ".generated/final_strings.all.%.hfst"; \ else \ echo ""; \ fi) @@ -152,7 +152,7 @@ mob_final_strings_deps=$(shell \ # Set build command for strings.all depending on variable value: mob_final_strings_fst_include=$(shell \ if [[ $(MOB_FINAL_STRING_EDITS) != 'no' ]] ; then \ - echo "( @\\\"final_strings.all.$*.hfst\\\" )"; \ + echo "( @\\\".generated/final_strings.all.$*.hfst\\\" )"; \ else \ echo ""; \ fi) @@ -160,11 +160,11 @@ mob_final_strings_fst_include=$(shell \ # Set dependency file name(s) depending on variable value: mob_final_strings_all_deps=$(shell \ if [[ $(MOB_FINAL_STRING_EDITS) == 'regex' ]] ; then \ - echo "final_strings.regex.%.hfst"; \ + echo ".generated/final_strings.regex.%.hfst"; \ elif [[ $(MOB_FINAL_STRING_EDITS) == 'txt' ]] ; then \ - echo "final_strings.txt.%.hfst"; \ + echo ".generated/final_strings.txt.%.hfst"; \ elif [[ $(MOB_FINAL_STRING_EDITS) == 'both' ]] ; then \ - echo "final_strings.regex.%.hfst final_strings.txt.%.hfst"; \ + echo ".generated/final_strings.regex.%.hfst .generated/final_strings.txt.%.hfst"; \ else \ echo ""; \ fi) @@ -247,13 +247,13 @@ easteregg.%.mobile.suggtxt: easteregg.%.mobile.txt > $@ # Easter egg string acceptor: -easteregg.%.mobile.hfst: easteregg.%.mobile.txt +.generated/easteregg.%.mobile.hfst: easteregg.%.mobile.txt $(GENDIR) $(AM_V_GEN)$(HFST_STRINGS2FST) $(HFST_FLAGS) -j < $< \ | $(HFST_PROJECT) $(HFST_FLAGS) --project=lower > $@ ####### Error model: ####### # Error model building - edit distance based on transducer alphabet: -editdist.%.mobile.regex: editdist.%.txt $(mob_initial_letter_deps) +.generated/editdist.%.mobile.regex: editdist.%.txt $(mob_initial_letter_deps) $(GENDIR) $(AM_V_GEN)$(GTCORE)/scripts/editdist.py \ --verbose \ $(mob_swaps) \ @@ -264,7 +264,7 @@ editdist.%.mobile.regex: editdist.%.txt $(mob_initial_letter_deps) --output-file=$@ \ $(mob_initial_letter_error_model_option) -predict.%.regex: editdist.%.txt +.generated/predict.%.regex: editdist.%.txt $(GENDIR) $(AM_V_GEN)$(GTCORE)/scripts/predict.py \ --verbose \ --epsilon='@0@' \ @@ -274,12 +274,12 @@ predict.%.regex: editdist.%.txt --output=$@ # Initial string edits, if enabled: -initial_letters.txt.%.mobile.hfst: initial_letters.%.txt +.generated/initial_letters.txt.%.mobile.hfst: initial_letters.%.txt $(GENDIR) $(AM_V_STR2FST)grep -v '^#' $< | grep -v '^$$' | cut 
-f1-2 \ | $(HFST_STRINGS2FST) $(HFST_FLAGS) -j -p \ -o $@ -initial_letters.regex.%.mobile.hfst: initial_letters.%.regex +.generated/initial_letters.regex.%.mobile.hfst: initial_letters.%.regex $(GENDIR) $(AM_V_RGX2FST)$(HFST_REGEXP2FST) -S $(HFST_FLAGS) -i $<\ --format=openfst-tropical \ -o $@ @@ -288,29 +288,29 @@ initial_letters.regex.%.mobile.hfst: initial_letters.%.regex # $(mob_initial_letter_all_build) # Final string edits, if enabled: -final_strings.txt.%.mobile.hfst: final_strings.%.txt +.generated/final_strings.txt.%.mobile.hfst: final_strings.%.txt $(GENDIR) $(AM_V_STR2FST)grep -v '^#' $< | grep -v '^$$' | cut -f1-2 \ | $(HFST_STRINGS2FST) $(HFST_FLAGS) -j \ --format=openfst-tropical \ -o $@ -final_strings.regex.%.mobile.hfst: final_strings.%.regex +.generated/final_strings.regex.%.mobile.hfst: final_strings.%.regex $(GENDIR) $(AM_V_RGX2FST)$(HFST_REGEXP2FST) -S $(HFST_FLAGS) -i $<\ --format=openfst-tropical \ -o $@ -final_strings.all.%.hfst: $(mob_final_strings_all_deps) +.generated/final_strings.all.%.hfst: $(mob_final_strings_all_deps) $(GENDIR) $(mob_final_strings_all_build) # In-word list of strings known to be misspelled, fall back for mobile fst's: # if there is no mobile-specific txt file, it will fall back to the default error # model file. If there IS a mobile specific txt file, the default rule will apply # due to pattern matching rules. -strings.txt.%.mobile.hfst: strings.%.txt anystar.hfst +.generated/strings.txt.%.mobile.hfst: strings.%.txt .generated/anystar.hfst $(AM_V_STR2FST)grep -v '^#' $< | grep -v '^$$' | cut -f1-2 \ | $(HFST_STRINGS2FST) $(HFST_FLAGS) -j \ - | $(HFST_CONCATENATE) anystar.hfst - \ - | $(HFST_CONCATENATE) - anystar.hfst \ + | $(HFST_CONCATENATE) .generated/anystar.hfst - \ + | $(HFST_CONCATENATE) - .generated/anystar.hfst \ -o $@ # strings regex file: @@ -319,31 +319,33 @@ strings.txt.%.mobile.hfst: strings.%.txt anystar.hfst # larger as the edit distance, since the file is multiplied again as part of # the editStrings build target. The idea is that the regex should contain a # highly targeted set of frequent spelling errors. -strings.regex.%.mobile.hfst: strings.%.regex anystar.hfst +.generated/strings.regex.%.mobile.hfst: strings.%.regex .generated/anystar.hfst $(AM_V_RGX2FST)$(HFST_REGEXP2FST) -S $(HFST_FLAGS) -i $<\ - | $(HFST_CONCATENATE) anystar.hfst - \ - | $(HFST_CONCATENATE) - anystar.hfst \ + | $(HFST_CONCATENATE) .generated/anystar.hfst - \ + | $(HFST_CONCATENATE) - .generated/anystar.hfst \ | $(HFST_REPEAT) -f 1 -t $(STRING_REGEX_EDIT_DISTANCE) \ -o $@ -strings.all.%.mobile.hfst: $(mob_strings_all_deps) +.generated/strings.all.%.mobile.hfst: $(mob_strings_all_deps) $(mob_strings_all_build) # Combine edit distance with string pattern edits, then multiply according to # the specified editing distance. The strings part is included depending on # variable setting in Makefile.am. 
# Then combine it with keyboard layout error model: -editdist.all.%.hfst.tmp: $(strings_deps) editdist.%.hfst +.generated/editdist.all.%.hfst.tmp: $(strings_deps) .generated/editdist.%.hfst $(strings_fst_include) > $@ -editdist.all.%.hfst: editdist.all.%.hfst.tmp keyboardlayout.hfst - $(AM_V_UNION)$(HFST_DISJUNCT) $^ \ - | $(HFST_REPEAT) -f 1 -t $(EDIT_DISTANCE) \ - -o $@ +.generated/editdist.all.%.hfst: .generated/editdist.all.%.hfst.tmp + $(AM_V_CP)cp -f $< $@ +#.generated/editdist.all.%.hfst: .generated/editdist.all.%.hfst.tmp .generated/keyboardlayout.hfst +# $(AM_V_UNION)$(HFST_DISJUNCT) $^ \ +# | $(HFST_REPEAT) -f 1 -t $(EDIT_DISTANCE) \ +# -o $@ # Error model building - list of words known to be misspelled, mobile version: -words.%.mobile.hfst: $(mob_words_deps) easteregg.%.mobile.suggtxt +.generated/words.%.mobile.hfst: $(mob_words_deps) easteregg.%.mobile.suggtxt $(GENDIR) $(AM_V_STR2FST)grep -h -v '^#' $^ | grep -v '^$$' \ | $(HFST_STRINGS2FST) $(HFST_FLAGS) -j \ --format=openfst-tropical \ @@ -351,37 +353,38 @@ words.%.mobile.hfst: $(mob_words_deps) easteregg.%.mobile.suggtxt # Error model building - list of words known to be misspelled, mobile # fallback version, using the default words.txt file as input instead: -words.%.mobile.hfst: $(words_deps) easteregg.%.mobile.suggtxt +.generated/words.%.mobile.hfst: $(words_deps) easteregg.%.mobile.suggtxt $(GENDIR) $(AM_V_STR2FST)grep -h -v '^#' $^ | grep -v '^$$' \ | $(HFST_STRINGS2FST) $(HFST_FLAGS) -j \ -o $@ -.PHONY: att -att: $(srcdir)/keyboardlayout.att -$(srcdir)/keyboardlayout.att: - $(AM_V_GEN)$(KBDGEN) -t errormodel -l $(KEYBOARD_LAYOUT_ID) \ - $(KEYBOARD_LAYOUT_DIR)/project.yaml \ - > $@ - $(AM_V_at)echo "ATT error model for keyboard layout has been made!" - $(AM_V_at)echo "Remember to add $@ to svn!" - -keyboardlayout.hfst: keyboardlayout.att anystar.hfst +## XXX: temporarily disabled? +##.PHONY: att +##att: $(srcdir)/keyboardlayout.att +##$(srcdir)/keyboardlayout.att: +## $(AM_V_GEN)$(KBDGEN) -t errormodel -l $(KEYBOARD_LAYOUT_ID) \ +## $(KEYBOARD_LAYOUT_DIR)/project.yaml \ +## > $@ +## $(AM_V_at)echo "ATT error model for keyboard layout has been made!" +## $(AM_V_at)echo "Remember to add $@ to svn!" 
+ +.generated/keyboardlayout.hfst: keyboardlayout.att .generated/anystar.hfst $(AM_V_TXT2FST)$(HFST_TXT2FST) $< \ - | $(HFST_CONCATENATE) anystar.hfst - \ - | $(HFST_CONCATENATE) - anystar.hfst \ + | $(HFST_CONCATENATE) .generated/anystar.hfst - \ + | $(HFST_CONCATENATE) - .generated/anystar.hfst \ > $@ # The final error model is assembled here: -errmodel.%.hfst: words.%.hfst \ +.generated/errmodel.%.hfst: .generated/words.%.hfst \ $(mob_initial_letter_deps) \ - editdist.all.%.hfst \ + .generated/editdist.all.%.hfst \ $(mob_final_strings_deps) $(AM_V_RGX2FST)printf "\ - [ @\"words.$*.hfst\" \ + [ @\".generated/words.$*.hfst\" \ | \ [ \ $(mob_initial_letter_fst_include) \ - @\"editdist.all.$*.hfst\" \ + @\".generated/editdist.all.$*.hfst\" \ $(mob_final_strings_fst_include) \ ] \ ];" \ @@ -391,7 +394,7 @@ errmodel.%.hfst: words.%.hfst \ -o $@ # with wordform prediction -errmodel.%.predict.hfst: predict.%.hfst +.generated/errmodel.%.predict.hfst: .generated/predict.%.hfst hfst-fst2fst -f olw -v -i $< -o $@ ######## Alternate error model: ####### @@ -415,11 +418,11 @@ errmodel.%.predict.hfst: predict.%.hfst ####### Speller acceptor: ####### # Build the automaton used for the speller $(MOB_GT_SPELLER_ACCEPTOR): \ -acceptor.%.hfst: $(MOB_GT_SPELLER_HFST) easteregg.%.hfst +acceptor.%.hfst: $(MOB_GT_SPELLER_HFST) .generated/easteregg.%.hfst $(AM_V_PROJECT)$(HFST_PROJECT) $(HFST_FLAGS) \ $(MORE_VERBOSITY) --project=lower < $< \ | $(HFST_MINIMIZE_SPELLER) \ - | $(HFST_DISJUNCT) $(MORE_VERBOSITY) - easteregg.$*.hfst \ + | $(HFST_DISJUNCT) $(MORE_VERBOSITY) - .generated/easteregg.$*.hfst \ | $(HFST_PUSH_WEIGHTS) $(MORE_VERBOSITY) --push=initial \ | $(HFST_FST2FST) $(MORE_VERBOSITY) $(HFST_FLAGS) -f olw \ -o $@ From 677d82761b6ba33fe9bdbdb9a9770ac948b03fa1 Mon Sep 17 00:00:00 2001 From: Trond Trosterud Date: Thu, 28 Mar 2024 08:38:08 +0200 Subject: [PATCH 31/45] esu, eus --- devtools/init.d/lookup-init.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/devtools/init.d/lookup-init.sh b/devtools/init.d/lookup-init.sh index c256fa7d..17962a28 100755 --- a/devtools/init.d/lookup-init.sh +++ b/devtools/init.d/lookup-init.sh @@ -74,6 +74,7 @@ alias deng="$LOOKUP $GTLANGS/lang-eng/src/fst/generator-gt-desc.xfst" alias dest2="$LOOKUP $GTLANGS/lang-est/src/fst/generator-gt-desc.xfst" alias dest="$LOOKUP $GTLANGS/lang-est/src/fst/generator-gt-desc.xfst" alias destNorm="$LOOKUP $GTLANGS/lang-est/src/fst/generator-gt-norm.xfst" +alias desu="$LOOKUP $GTLANGS/lang-esu/src/fst/generator-gt-desc.xfst" alias deus="$LOOKUP $GTLANGS/lang-eus/src/fst/generator-gt-desc.xfst" alias devn="$LOOKUP $GTLANGS/lang-evn/src/fst/generator-gt-desc.xfst" alias devnNorm="$LOOKUP $GTLANGS/lang-evn/src/fst/generator-gt-norm.xfst" @@ -266,6 +267,10 @@ alias hddeuNorm="$HLOOKUP $GTLANGS/lang-deu/src/fst/generator-gt-norm.hfstol" alias hdeng="$HLOOKUP $GTLANGS/lang-eng/src/fst/generator-gt-desc.hfstol" alias hdest="$HLOOKUP $GTLANGS/lang-est/src/fst/generator-gt-desc.hfstol" alias hdestNorm="$HLOOKUP $GTLANGS/lang-est/src/fst/generator-gt-norm.hfstol" +alias hdesu="$HLOOKUP $GTLANGS/lang-esu/src/fst/generator-gt-desc.hfstol" +alias hdesuNorm="$HLOOKUP $GTLANGS/lang-esu/src/fst/generator-gt-norm.hfstol" +alias hdeus="$HLOOKUP $GTLANGS/lang-eus/src/fst/generator-gt-desc.hfstol" +alias hdeusNorm="$HLOOKUP $GTLANGS/lang-eus/src/fst/generator-gt-norm.hfstol" alias hdevn="$HLOOKUP $GTLANGS/lang-evn/src/fst/generator-gt-desc.hfstol" alias hdevnNorm="$HLOOKUP $GTLANGS/lang-evn/src/fst/generator-gt-norm.hfstol" alias hdfao="$HLOOKUP 
$GTLANGS/lang-fao/src/fst/generator-gt-desc.hfstol" @@ -433,6 +438,10 @@ alias hudeu="$HLOOKUP $GTLANGS/lang-deu/src/fst/analyser-gt-desc.hfstol" alias hueng="$HLOOKUP $GTLANGS/lang-eng/src/fst/analyser-gt-desc.hfstol" alias huest="$HLOOKUP $GTLANGS/lang-est/src/fst/analyser-gt-desc.hfstol" alias huestNorm="$HLOOKUP $GTLANGS/lang-est/src/fst/analyser-gt-norm.hfstol" +alias huesu="$HLOOKUP $GTLANGS/lang-esu/src/fst/analyser-gt-desc.hfstol" +alias huesuNorm="$HLOOKUP $GTLANGS/lang-esu/src/fst/analyser-gt-norm.hfstol" +alias hueus="$HLOOKUP $GTLANGS/lang-eus/src/fst/analyser-gt-desc.hfstol" +alias hueusNorm="$HLOOKUP $GTLANGS/lang-eus/src/fst/analyser-gt-norm.hfstol" alias huevn="$HLOOKUP $GTLANGS/lang-evn/src/fst/analyser-gt-desc.hfstol" alias huevnNorm="$HLOOKUP $GTLANGS/lang-evn/src/fst/analyser-gt-norm.hfstol" alias hufao="$HLOOKUP $GTLANGS/lang-fao/src/fst/analyser-gt-desc.hfstol" @@ -802,6 +811,7 @@ alias ueng="$LOOKUP $GTLANGS/lang-eng/src/fst/analyser-gt-desc.xtst" alias uest2="$LOOKUP $GTLANGS/lang-est/src/fst/analyser-gt-desc.xfst" alias uest="$LOOKUP $GTLANGS/lang-est/src/fst/analyser-gt-desc.xfst" alias uestNorm="$LOOKUP $GTLANGS/lang-est/src/fst/analyser-gt-norm.xfst" +alias uesu="$LOOKUP $GTLANGS/lang-esu/src/fst/analyser-gt-desc.xfst" alias ueus="$LOOKUP $GTLANGS/lang-eus/src/fst/analyser-gt-desc.xfst" alias uevn="$LOOKUP $GTLANGS/lang-evn/src/fst/analyser-gt-desc.xfst" alias uevnNorm="$LOOKUP $GTLANGS/lang-evn/src/fst/analyser-gt-norm.xfst" From bfc468490e75d5cf06932624e7a9c1f39ca4e03f Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Thu, 28 Mar 2024 15:16:10 +0100 Subject: [PATCH 32/45] fiks --- ...ools-spellcheckers-fstbased-desktop-hfst_area-dir-include.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/am-shared/tools-spellcheckers-fstbased-desktop-hfst_area-dir-include.am b/am-shared/tools-spellcheckers-fstbased-desktop-hfst_area-dir-include.am index 58c2f819..ae9c5c2c 100644 --- a/am-shared/tools-spellcheckers-fstbased-desktop-hfst_area-dir-include.am +++ b/am-shared/tools-spellcheckers-fstbased-desktop-hfst_area-dir-include.am @@ -48,7 +48,7 @@ endif # HAVE_AREAS -1 $(top_builddir)/src/fst/filters/remove-all_areas_but_$*-strings.hfst \ | $(HFST_PROJECT) $(HFST_FLAGS) --project=lower \ | $(HFST_MINIMIZE_SPELLER) $(HFST_FLAGS) \ - | $(HFST_DISJUNCT) - easteregg.$*.desktop.hfst \ + | $(HFST_DISJUNCT) - .generated/easteregg.$*.desktop.hfst \ | $(HFST_FST2FST) $(HFST_FLAGS) -f olw \ -o $@ From f4c3dfbc49429265821ab0d495655f81e2e1e4ea Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Sun, 31 Mar 2024 00:23:55 +0100 Subject: [PATCH 33/45] generate lexc from unimorph --- scripts/unimorph/generate-lexcies.py | 254 +++++++++++++++++++++++++++ 1 file changed, 254 insertions(+) create mode 100755 scripts/unimorph/generate-lexcies.py diff --git a/scripts/unimorph/generate-lexcies.py b/scripts/unimorph/generate-lexcies.py new file mode 100755 index 00000000..a8453a76 --- /dev/null +++ b/scripts/unimorph/generate-lexcies.py @@ -0,0 +1,254 @@ +#!/usr/bin/env python3 +'''CLI program to turn unimorph data to GiellaLT morph tester yaml.''' +import sys + + +def main(): + print('Multichar_Symbols') + print('+N +A +V') + print('+Sg +Pl +Du') + print('+Nom +Acc +Dat +Gen +Loc +Ine +Ill +Abl +Lat +Ela') + print('+Com +Abe +Tra +Ins +Ess') + print('+Prs +Prt +Ind +Pot +Cond +Imprt') + print('+Sg1 +Sg2 +Sg3 +Du1 +Du2 +Du3 +Pl1 +Pl2 +Pl3') + lemmas = 0 + tokens = 0 + suspicious = 0 + prevlemma = None + for line in sys.stdin: + if not line or line.strip() == '': + 
print() + continue + # gravitáció gravitáción N;ON+ESS;SG + fields = line.strip().split('\t') + tokens += 1 + if len(fields) != 3: + print('Datoissa virhe!', fields) + sys.exit(1) + elif fields[0] == '----' and fields[1] == '----' and\ + fields[2] == '----': + # this is the kind bs that unimorph is full of + suspicious += 1 + continue + lemma = fields[0] + surf = fields[1] + unimorphs = fields[2] + if lemma != prevlemma: + prevlemma = lemma + print() + lemmas += 1 + if 'intransitive verb' in surf: + suspicious += 1 + elif 'subjunctive forms' in surf: + suspicious += 1 + elif '|' in surf: + suspicious += 1 + giellatags = list() + for unimorph in unimorphs.split(';'): + if unimorph == 'N': + giellatags += ['+N'] + elif unimorph == 'V': + giellatags += ['+V'] + elif unimorph == 'ADJ': + giellatags += ['+A'] + elif unimorph == 'NEUT': + giellatags += ['+Neu'] + elif unimorph == 'MASC': + giellatags += ['+Msc'] + elif unimorph == 'FEM': + giellatags += ['+Fem'] + elif unimorph == 'MASC+FEM': + giellatags += ['+Common'] + elif unimorph == 'GEN': + giellatags += ['+Gen'] + elif unimorph == 'COM': + giellatags += ['+Com'] + elif unimorph == 'ON+ESS': + giellatags += ['+Ses'] + elif unimorph == 'FRML': + giellatags += ['+Ess'] + elif unimorph == 'ESS': + giellatags += ['+Ess'] + elif unimorph == 'INAN': + giellatags += ['+Inan'] + elif unimorph == 'ANIM': + giellatags += ['+Anim'] + elif unimorph == 'PRIV': + giellatags += ['+Abe'] + elif unimorph == 'PRT': + giellatags += ['+Par'] + elif unimorph == 'INS': + giellatags += ['+Ins'] + elif unimorph == 'IN+ESS': + giellatags += ['+Ine'] + elif unimorph == 'NOM': + giellatags += ['+Nom'] + elif unimorph == 'ON+ALL': + giellatags += ['+Sub'] + elif unimorph == 'AT+ALL': + giellatags += ['+All'] + elif unimorph == 'PRP': + giellatags += ['+Loc'] + elif unimorph == 'INST': + giellatags += ['+Inst'] + elif unimorph == 'TRANS': + giellatags += ['+Tra'] + elif unimorph == 'TERM': + giellatags += ['+Term'] + elif unimorph == 'ON+ABL': + giellatags += ['+Del'] + elif unimorph == 'IN+ABL': + giellatags += ['+Ela'] + elif unimorph == 'IN+ALL': + giellatags += ['+Ill'] + elif unimorph == 'DAT': + giellatags += ['+Dat'] + elif unimorph == 'ACC': + giellatags += ['+Acc'] + elif unimorph == 'AT+ESS': + giellatags += ['+Ade'] + elif unimorph == 'AT+ABL': + giellatags += ['+Abl'] + elif unimorph == 'SG': + giellatags += ['+Sg'] + elif unimorph == 'DU': + giellatags += ['+Du'] + elif unimorph == 'PL': + giellatags += ['+Pl'] + elif unimorph == 'SG+PL': + # giellatags += ['+Sg/Pl'] + pass + elif unimorph == 'IND': + giellatags += ['+Ind'] + elif unimorph == 'PRS': + giellatags += ['+Prs'] + elif unimorph == 'PST': + giellatags += ['+Prt'] + elif unimorph == 'PRF': + giellatags += ['+Perf'] + elif unimorph == 'FUT': + giellatags += ['+Fut'] + elif unimorph == '1': + giellatags += ['+1'] + elif unimorph == '2': + giellatags += ['+2'] + elif unimorph == '3': + giellatags += ['+3'] + elif unimorph == 'INDF': + pass # unmarked in giellatags + elif unimorph == 'GEADJ': + giellatags += ['+Gen'] + suspicious += 1 + elif unimorph == 'DEF': + giellatags += ['+Def'] + elif unimorph == 'NDEF': + giellatags += ['+Ind'] + elif unimorph == 'V.PTCP': + giellatags += ['+V'] + if 'PRS' in unimorphs: + giellatags += ['+PrsPrc'] + elif 'PST' in unimorphs: + giellatags += ['+PrtPrc'] + elif 'FUT' in unimorphs: + giellatags += ['+Fut'] + else: + giellatags += ['+Drv/Ptcp'] + elif unimorph == 'NFIN': + giellatags += '+Ger' + elif unimorph == 'ACT': + giellatags += ['+Actv'] + elif unimorph 
== 'PASS': + giellatags += ['+Pasv'] + elif unimorph == 'COND': + giellatags += ['+Cond'] + elif unimorph == 'POT': + giellatags += ['+Pot'] + elif unimorph == 'IMP': + giellatags += ['+Imprt'] + elif unimorph == 'SBJV': + giellatags += ['+Subj'] + elif unimorph == 'V.CVB': + giellatags += ['+V'] + giellatags += ['+Der/Adv'] + elif unimorph == 'CMPR': + giellatags += ['+Comp'] + elif unimorph == 'SPRL': + giellatags += ['+Sup'] + elif unimorph == 'NEG': + giellatags += ['+Neg'] + elif unimorph == 'POS': + # giellatags += ['+Pos'] + pass + elif unimorph == 'LGSPEC': + pass + elif unimorph == 'LGSPEC1': + pass + else: + print('missing unimorph mapping for:', unimorph) + sys.exit(2) + reorg = list() + for ape in giellatags: + if ape in ['+N', '+V', '+A']: + reorg += [ape] + break + if reorg == ['+N']: + for ape in giellatags: + if ape in ['+Sg', '+Pl', '+Du']: + reorg += [ape] + for ape in giellatags: + if ape not in reorg: + reorg += [ape] + elif reorg == ['+V']: + for ape in giellatags: + if ape not in reorg: + reorg += [ape] + if '+1' in reorg and '+Sg' in reorg: + reorg += ['+Sg1'] + reorg.remove('+1') + reorg.remove('+Sg') + elif '+2' in reorg and '+Sg' in reorg: + reorg += ['+Sg2'] + reorg.remove('+2') + reorg.remove('+Sg') + elif '+3' in reorg and '+Sg' in reorg: + reorg += ['+Sg3'] + reorg.remove('+3') + reorg.remove('+Sg') + elif '+1' in reorg and '+Du' in reorg: + reorg += ['+Du1'] + reorg.remove('+1') + reorg.remove('+Du') + elif '+2' in reorg and '+Du' in reorg: + reorg += ['+Du2'] + reorg.remove('+2') + reorg.remove('+Du') + elif '+3' in reorg and '+Du' in reorg: + reorg += ['+Du3'] + reorg.remove('+3') + reorg.remove('+Du') + elif '+1' in reorg and '+Pl' in reorg: + reorg += ['+Pl1'] + reorg.remove('+1') + reorg.remove('+Pl') + elif '+2' in reorg and '+Pl' in reorg: + reorg += ['+Pl2'] + reorg.remove('+2') + reorg.remove('+Pl') + elif '+3' in reorg and '+Pl' in reorg: + reorg += ['+Pl3'] + reorg.remove('+3') + reorg.remove('+Pl') + elif reorg == ['+A']: + for ape in giellatags: + if ape not in reorg: + reorg += [ape] + else: + print('REORG FAIL', reorg) + sys.exit(1) + giellatags = reorg + print(lemma, ''.join(giellatags), ':', surf, sep='') + + + +if __name__ == '__main__': + main() From 59a1c47cf1c71f19b8673fb69c14c9cd9af33da1 Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Tue, 2 Apr 2024 17:10:18 +0200 Subject: [PATCH 34/45] more gendir fixes --- ...llcheckers-fstbased-desktop-hfst-dir-include.am | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am b/am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am index 376e2efc..1cda6e0c 100644 --- a/am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am +++ b/am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am @@ -230,7 +230,7 @@ noinst_DATA+=$(GT_ERRMODELS) \ # Easter egg content - depends also on the fst, to # make sure the easter egg is rebuilt every time the fst is rebuilt: $(GIELLA_DESKTOP_EASTEREGGS): \ -easteregg.%.desktop.txt: +.generated/easteregg.%.desktop.txt: $(GENDIR) $(AM_V_GEN)$(GTCORE)/scripts/make-hfstspeller-version-easter-egg.sh \ $(GTLANG2) \ $(top_srcdir) \ @@ -240,7 +240,7 @@ easteregg.%.desktop.txt: > $@ # Easter egg suggestions: -easteregg.%.desktop.suggtxt: easteregg.%.desktop.txt +.generated/easteregg.%.desktop.suggtxt: .generated/easteregg.%.desktop.txt $(AM_V_GEN)sed -e 's/^/nuvviDspeller:/' < $< \ | sed = \ | sed 'N;s/\n/ /' \ @@ -261,7 +261,7 @@ 
easteregg.%.desktop.suggtxt: easteregg.%.desktop.txt # Easter egg string acceptor: # easteregg.%.desktop.temp.hfst: easteregg.%.desktop.txt -.generated/easteregg.%.desktop.hfst: easteregg.%.desktop.txt $(GENDIR) +.generated/easteregg.%.desktop.hfst: .generated/easteregg.%.desktop.txt $(AM_V_GEN)$(HFST_STRINGS2FST) $(HFST_FLAGS) -j < $< \ > $@ @@ -360,7 +360,7 @@ easteregg.%.desktop.suggtxt: easteregg.%.desktop.txt -o $@ # Error model building - list of words known to be misspelled: -.generated/words.%.hfst: $(words_deps) easteregg.%.desktop.suggtxt $(GENDIR) +.generated/words.%.hfst: $(words_deps) .generated/easteregg.%.desktop.suggtxt $(GENDIR) $(AM_V_STR2FST)grep -h -v '^#' $^ | grep -v '^$$' \ | $(HFST_STRINGS2FST) $(HFST_FLAGS) -j \ --format=openfst-tropical \ @@ -388,7 +388,7 @@ errmodel.%.hfst: .generated/words.%.hfst \ ####### Alternate error model: ####### # Alternatively, the error model can be constructed as a long list of regular # expressions, semicolon separated: -errmodel.%.hfst: errmodel.%.regex .generated/easteregg.%.hfst +.generated/errmodel.%.hfst: errmodel.%.regex .generated/asteregg.%.hfst $(AM_V_GEN)$(HFST_REGEXP2FST) $(HFSTFLAGS) -S -i $< \ | $(HFST_DISJUNCT) - .generated/easteregg.$*.hfst \ | $(HFST_PUSH_WEIGHTS) --push=initial \ @@ -396,7 +396,7 @@ errmodel.%.hfst: errmodel.%.regex .generated/easteregg.%.hfst -o $@ # ... or as an xfscript file: -errmodel.%.hfst: errmodel.%.xfscript .generated/easteregg.%.hfst +.generated/errmodel.%.hfst: errmodel.%.xfscript .generated/easteregg.%.hfst $(AM_V_GEN)$(HFST_REGEXP2FST) $(HFSTFLAGS) -S -i $< \ | $(HFST_DISJUNCT) - .generated/easteregg.$*.hfst \ | $(HFST_PUSH_WEIGHTS) --push=initial \ @@ -406,7 +406,7 @@ errmodel.%.hfst: errmodel.%.xfscript .generated/easteregg.%.hfst ####### Speller acceptor: ####### # Build the automaton used for the speller $(GT_SPELLER_ACCEPTOR): \ -acceptor.%.hfst: $(GT_SPELLER_HFST) filters/remove-error-strings.hfst \ +.generated/acceptor.%.hfst: $(GT_SPELLER_HFST) filters/remove-error-strings.hfst \ .generated/easteregg.%.desktop.hfst $(AM_V_PROJECT)$(HFST_COMPOSE) -1 filters/remove-error-strings.hfst -2 $< -F \ | $(HFST_PROJECT) $(HFST_FLAGS) \ From 95b6790e63eefe4cc97fe3d661110e330a49353f Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Tue, 2 Apr 2024 19:46:24 +0200 Subject: [PATCH 35/45] maybe fix something or other --- ...tools-spellcheckers-fstbased-desktop-hfst-dir-include.am | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am b/am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am index 1cda6e0c..a3d81aa1 100644 --- a/am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am +++ b/am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am @@ -16,7 +16,7 @@ ## along with this program. If not, see . 
GT_SPELLER_HFST=generator-desktopspeller-gt-norm.hfst -GT_SPELLER_ACCEPTOR=acceptor.default.hfst +GT_SPELLER_ACCEPTOR=.generated/acceptor.default.hfst # Max compression for zipped files: ZIPFLAGS=-9 $(VERBOSITY) @@ -229,7 +229,6 @@ noinst_DATA+=$(GT_ERRMODELS) \ ####### Easter egg version info: ####### # Easter egg content - depends also on the fst, to # make sure the easter egg is rebuilt every time the fst is rebuilt: -$(GIELLA_DESKTOP_EASTEREGGS): \ .generated/easteregg.%.desktop.txt: $(GENDIR) $(AM_V_GEN)$(GTCORE)/scripts/make-hfstspeller-version-easter-egg.sh \ $(GTLANG2) \ @@ -405,7 +404,6 @@ errmodel.%.hfst: .generated/words.%.hfst \ ####### Speller acceptor: ####### # Build the automaton used for the speller -$(GT_SPELLER_ACCEPTOR): \ .generated/acceptor.%.hfst: $(GT_SPELLER_HFST) filters/remove-error-strings.hfst \ .generated/easteregg.%.desktop.hfst $(AM_V_PROJECT)$(HFST_COMPOSE) -1 filters/remove-error-strings.hfst -2 $< -F \ @@ -427,7 +425,7 @@ $(GT_SPELLING_HFST): index.xml \ $(AM_V_at)$(MKDIR_P) build/$@ $(AM_V_at)rm -f build/$@/* $(AM_V_at)cp index.xml build/$@/index.xml - $(AM_V_at)cp $(GT_SPELLER_ACCEPTOR) build/$@/$(GT_SPELLER_ACCEPTOR) + $(AM_V_at)cp $(GT_SPELLER_ACCEPTOR) build/$@/ $(AM_V_at)cp $(GT_ERRMODELS) build/$@/$(GT_ERRMODELS) $(AM_V_ZIP)cd build/$@/ && $(ZIP) $(ZIPFLAGS) ../../$@ * $(AM_V_at)$(MKDIR_P) 3 From b4cd4dbd4f9dc66fc8b1ae2d9582bf20fc5c789f Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Tue, 2 Apr 2024 22:29:48 +0200 Subject: [PATCH 36/45] 3 months should be enough for a warning --- configure.ac | 1 - 1 file changed, 1 deletion(-) diff --git a/configure.ac b/configure.ac index c4c74207..c42e3f93 100644 --- a/configure.ac +++ b/configure.ac @@ -84,4 +84,3 @@ AS_IF([test x$PRECOMMIT = xfalse], on mac: brew install pre-commit others: python3 -m pip install pre-commit])]) -AC_MSG_WARN([January 2024: this version involves whole move of src/fst to src/fst/morphology]) From dd78ab56c64e768e5c07898980396b9ebbd88c34 Mon Sep 17 00:00:00 2001 From: Sjur N Moshagen Date: Wed, 3 Apr 2024 12:13:00 +0300 Subject: [PATCH 37/45] All these targets are now in the .generated dir --- ...rs-fstbased-desktop_weights-dir-include.am | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/am-shared/tools-spellcheckers-fstbased-desktop_weights-dir-include.am b/am-shared/tools-spellcheckers-fstbased-desktop_weights-dir-include.am index 3105365c..c2ff6572 100644 --- a/am-shared/tools-spellcheckers-fstbased-desktop_weights-dir-include.am +++ b/am-shared/tools-spellcheckers-fstbased-desktop_weights-dir-include.am @@ -111,16 +111,16 @@ corpus_size_limit_command=$(shell \ -o $@ # Keep these intermediate targets when building using --debug: -.SECONDARY: spellercorpus.sort.txt \ - spellercorpus.uniq.txt \ - spellercorpus.surfs.hfst \ - spellercorpus.tropical.txt \ - spellercorpus.typecount.txt \ - pellercorpus.wordcount.txt \ - word-boundary.hfst \ - generator-*-gt-norm-freq_weighted.hfst \ - generator-*-gt-norm-unit_weighted.hfst \ - generator-*-gt-norm-norm_weighted.hfst \ - generator-*-gt-norm-tag_weighted.hfst \ - $(SURFWEIGHTS) \ - $(UNITWEIGHT) +.SECONDARY: .generated/spellercorpus.sort.txt \ + .generated/spellercorpus.uniq.txt \ + .generated/spellercorpus.surfs.hfst \ + .generated/spellercorpus.tropical.txt \ + .generated/spellercorpus.typecount.txt \ + .generated/pellercorpus.wordcount.txt \ + .generated/word-boundary.hfst \ + .generated/generator-*-gt-norm-freq_weighted.hfst \ + .generated/generator-*-gt-norm-unit_weighted.hfst \ + 
.generated/generator-*-gt-norm-norm_weighted.hfst \
+	.generated/generator-*-gt-norm-tag_weighted.hfst \
+	.generated/$(SURFWEIGHTS) \
+	.generated/$(UNITWEIGHT)

From 36c6bb463e10799011ec5afd9ed78deb8e3b3f49 Mon Sep 17 00:00:00 2001
From: Sjur N Moshagen
Date: Wed, 3 Apr 2024 12:15:23 +0300
Subject: [PATCH 38/45] Remove .generated dir for the final analyser/generator
 before making the acceptor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These fst’s are used for testing as well as a starting point for the
grammar checker speller, which needs the analysis part for generating
suggestions.
---
 .../tools-spellcheckers-fstbased-desktop-dir-include.am | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/am-shared/tools-spellcheckers-fstbased-desktop-dir-include.am b/am-shared/tools-spellcheckers-fstbased-desktop-dir-include.am
index 39c7ef38..4d357256 100644
--- a/am-shared/tools-spellcheckers-fstbased-desktop-dir-include.am
+++ b/am-shared/tools-spellcheckers-fstbased-desktop-dir-include.am
@@ -34,8 +34,8 @@ tag_weighted_dep=$(shell \

 if WANT_SPELLERS
 if CAN_HFST
-GT_COMMON_SPELLER_HFST+=.generated/generator-desktopspeller-gt-norm.hfst
-GT_COMMON_SPELLER_HFST+=.generated/analyser-desktopspeller-gt-norm.hfst
+GT_COMMON_SPELLER_HFST+=generator-desktopspeller-gt-norm.hfst
+GT_COMMON_SPELLER_HFST+=analyser-desktopspeller-gt-norm.hfst
 endif # CAN_HFST
 endif # WANT_SPELLERS

@@ -99,11 +99,11 @@ endif # WANT_SPELLERS
 	quit\n" | $(HFST_XFST) -p $(MORE_VERBOSITY)

 # Copy the tmp transducer to the final one. This allows local overrides.
-.generated/%.hfst: .generated/%.tmp.hfst
+%.hfst: .generated/%.tmp.hfst
 	$(AM_V_CP)cp -f $< $@

 # Invert the final fst, to enable symmetric yaml tests and easy manual testing:
-.generated/analyser-desktopspeller-gt-norm.hfst: .generated/generator-desktopspeller-gt-norm.hfst
+analyser-desktopspeller-gt-norm.hfst: generator-desktopspeller-gt-norm.hfst
 	$(AM_V_INVERT)$(HFST_INVERT) $(MORE_VERBOSITY) $(HFST_FLAGS) -i $< \
 	| $(HFST_PRUNE_ALPHABET) $(MORE_VERBOSITY) \
 	| $(HFST_REMOVE_EPSILONS) $(MORE_VERBOSITY) -o $@

From 551c096b629cfa9ee3e956a60c8a3830a57e5bf4 Mon Sep 17 00:00:00 2001
From: Sjur N Moshagen
Date: Wed, 3 Apr 2024 12:15:46 +0300
Subject: [PATCH 39/45] The surfweight fst should be in the .generated dir
---
 am-shared/tools-spellcheckers-fstbased-desktop-dir-include.am | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/am-shared/tools-spellcheckers-fstbased-desktop-dir-include.am b/am-shared/tools-spellcheckers-fstbased-desktop-dir-include.am
index 4d357256..91734049 100644
--- a/am-shared/tools-spellcheckers-fstbased-desktop-dir-include.am
+++ b/am-shared/tools-spellcheckers-fstbased-desktop-dir-include.am
@@ -47,9 +47,9 @@ endif # WANT_SPELLERS
 #### the fst):
 .generated/generator-desktopspeller-gt-norm-freq_weighted.hfst: \
 	.generated/generator-desktopspeller-gt-norm-base.hfst \
-	$(SURFWEIGHTS)
+	.generated/$(SURFWEIGHTS)
 	$(AM_V_COMPOSE)$(HFST_COMPOSE) $(HFST_FLAGS) -F \
-		$< $(SURFWEIGHTS) \
+		$< .generated/$(SURFWEIGHTS) \
 		-o $@

 #### 3. 
Add a default unit weight to anything not covered by the corpus From f1a2b11b4a09515646ebe7699938ed1060129d08 Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Wed, 3 Apr 2024 14:44:31 +0200 Subject: [PATCH 40/45] changing upper level dir resets the subtree fixes #47 --- am-shared/docs-dir-include.am | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/am-shared/docs-dir-include.am b/am-shared/docs-dir-include.am index 34d969ab..f246ef41 100644 --- a/am-shared/docs-dir-include.am +++ b/am-shared/docs-dir-include.am @@ -186,24 +186,32 @@ $(LINKS): if test "x$$d1" != "x$$oldd1" ; then \ echo "* \`$$d1/\`" ;\ oldd1=$$d1 ;\ + oldd2="";\ + oldd3="";\ + oldd4="";\ fi ; \ if test "x$$d2" = x ; then \ echo " * [$$docname]($$html) ([src]($(REPOURL)/$$doc))" ;\ elif test "x$$d2" != "x$$oldd2" ; then \ echo " * \`$$d2/\`" ;\ oldd2=$$d2 ;\ + oldd3="";\ + oldd4="";\ + oldd5="";\ fi ; \ if test "x$$d3" = x -a "x$$d2" != x; then \ echo " * [$$docname]($$html) ([src]($(REPOURL)/$$doc))" ;\ elif test "x$$d3" != "x$$oldd3" ; then \ echo " * \`$$d3/\`" ;\ oldd3=$$d3 ;\ + oldd4="";\ fi ; \ if test "x$$d4" = x -a "x$$d3" != x ; then \ echo " * [$$docname]($$html) ([src]($(REPOURL)/$$doc))" ;\ elif test "x$$d4" != "x$$oldd4" ; then \ echo " * \`$$d4/\`" ;\ oldd4=$$d4 ;\ + oldd5="";\ fi ; \ if test "x$$d5" = x -a "x$$d4" != x ; then \ echo " * [$$docname]($$html) ([src]($(REPOURL)/$$doc))" ;\ From 817861ab634d1d537f7b00e3bddaef7a215c8a28 Mon Sep 17 00:00:00 2001 From: Sjur N Moshagen Date: Fri, 5 Apr 2024 16:12:16 +0300 Subject: [PATCH 41/45] Make each filename a level 1 header Makes it easier to identify the following content independent of what the authors have written. --- am-shared/docs-dir-include.am | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/am-shared/docs-dir-include.am b/am-shared/docs-dir-include.am index f246ef41..b1ef1ac7 100644 --- a/am-shared/docs-dir-include.am +++ b/am-shared/docs-dir-include.am @@ -166,9 +166,14 @@ REPOURL=$(shell if test "x$(GH_REPO)" != x ; then \ fi) # Collect all target files into one big MD file: +# Remove the VPATH prefix to create the header for each file/chapter: $(ALLINONE_MD_PAGE): $(VPATH_MDFILES) - $(AM_V_GEN)printf "# $(GLANGUAGE) description \n\nAll documents in one file\n\n" \ - | cat - $(VPATH_MDFILES) > $@ + $(AM_V_GEN)printf "# $(GLANGUAGE) language model documentation\n\nAll doc-comment documentation in one large file.\n" > $@ + for f in $(VPATH_MDFILES); do \ + header=$${f#"$(top_srcdir)/docs/"};\ + printf "\n---\n\n# $$header \n\n" >> $@ ;\ + cat $$f >> $@ ;\ + done $(LINKS): $(AM_V_GEN)for doc2md in $(DOCSRC_MDFILES) ; do \ From 81dccfd57ff0db526e9d46ac591242432d699a66 Mon Sep 17 00:00:00 2001 From: Sjur N Moshagen Date: Fri, 5 Apr 2024 16:12:56 +0300 Subject: [PATCH 42/45] Consistent use of separators and newlines --- scripts/doccomments2ghpages-vislcg.awk | 4 ++-- scripts/doccomments2ghpages.awk | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/doccomments2ghpages-vislcg.awk b/scripts/doccomments2ghpages-vislcg.awk index 99348b85..142dd882 100644 --- a/scripts/doccomments2ghpages-vislcg.awk +++ b/scripts/doccomments2ghpages-vislcg.awk @@ -94,7 +94,7 @@ function docupath(s) { return gensub("\\.\\./", "", "g", s); } END { - printf("\n* * *\nThis (part of) documentation was generated from " \ + printf("\n* * *\n\nThis (part of) documentation was generated from " \ "[%s](%s/%s)" \ - "", docupath(FILENAME), REPOURL, docupath(FILENAME)); + "\n", docupath(FILENAME), REPOURL, docupath(FILENAME)); 
} diff --git a/scripts/doccomments2ghpages.awk b/scripts/doccomments2ghpages.awk index a97d2aa9..cff94019 100644 --- a/scripts/doccomments2ghpages.awk +++ b/scripts/doccomments2ghpages.awk @@ -122,5 +122,5 @@ function docupath(s) { END { printf("\n* * *\n\nThis (part of) documentation was generated from " \ "[%s](%s/%s)" \ - "\n\n---\n\n", docupath(FILENAME), REPOURL, docupath(FILENAME)); + "\n", docupath(FILENAME), REPOURL, docupath(FILENAME)); } From d34af3a3194f75f35b119c258b7bdff5faa737fa Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Wed, 10 Apr 2024 12:12:05 +0200 Subject: [PATCH 43/45] cleanups --- am-shared/src-morphology-dir-include.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/am-shared/src-morphology-dir-include.am b/am-shared/src-morphology-dir-include.am index 2faf1dc3..d0379b79 100644 --- a/am-shared/src-morphology-dir-include.am +++ b/am-shared/src-morphology-dir-include.am @@ -290,7 +290,7 @@ lexicon.hfst: .generated/lexicon.hfst $(AM_CP)cp -v $< $@ clean-local: - -rm -f lexicon.hfst .generated/lexicon.hfst + -rm -f lexicon.hfst .generated/lexicon.hfst $(GIELLA_LOCAL_TARGETS) ####### Other targets: ########### maintainer-clean-local: From 43594b0ff43290701f55702cd4fe1ec339ea1591 Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Wed, 10 Apr 2024 13:47:59 +0200 Subject: [PATCH 44/45] gendir fixes fixes #48 --- ...ers-fstbased-mobile_weights-dir-include.am | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/am-shared/tools-spellcheckers-fstbased-mobile_weights-dir-include.am b/am-shared/tools-spellcheckers-fstbased-mobile_weights-dir-include.am index 79fd3100..f1ded5b5 100644 --- a/am-shared/tools-spellcheckers-fstbased-mobile_weights-dir-include.am +++ b/am-shared/tools-spellcheckers-fstbased-mobile_weights-dir-include.am @@ -38,19 +38,19 @@ mob_corpus_size_limit_command=$(shell \ $(AM_V_GEN)$(HFST_TXT2FST) $(HFST_FLAGS) -f openfst-tropical $< -o $@ # sort the clean corpus: -%.sort.txt: weights/%.clean.txt - $(AM_V_GEN)sort < $< > $@ +.generated/%.sort.txt: weights/%.clean.txt $(GENDIR) + $(AM_V_GEN)LC_ALL=C.utf8 sort < $< > $@ # token count: -%.wordcount.txt: %.sort.txt +.generated/%.wordcount.txt: .generated/%.sort.txt $(AM_V_GEN)wc -l < $< > $@ # Unique the sorted, clean corpus: -mob_%.uniq.txt: %.sort.txt +.generated/mob_%.uniq.txt: .generated/%.sort.txt $(AM_V_GEN)uniq -c < $< | sort -nr $(mob_corpus_size_limit_command) > $@ # type count: -mob_%.typecount.txt: mob_%.uniq.txt +.generated/mob_%.typecount.txt: .generated/mob_%.uniq.txt $(AM_V_GEN)wc -l < $< > $@ # calculate unit weight, smoothed using ALPHA: @@ -66,16 +66,16 @@ mob_%.typecount.txt: mob_%.uniq.txt # | $(BC) -l > $@ # # add tropical weights to the corpus: -mob_%.tropical.txt: mob_%.uniq.txt %.wordcount.txt mob_%.typecount.txt +.generated/mob_%.tropical.txt: .generated/mob_%.uniq.txt .generated/%.wordcount.txt .generated/mob_%.typecount.txt $(AM_V_GEN)cat $< |\ - $(GAWK) -v CS="$$(cat $*.wordcount.txt)" \ - -v DS="$$(cat mob_$*.typecount.txt)" \ + $(GAWK) -v CS="$$(cat .generated/$*.wordcount.txt)" \ + -v DS="$$(cat .generated/mob_$*.typecount.txt)" \ -v ALPHA=$(ALPHA) \ -f $(GTCORE)/scripts/uniq_count2tropical_weight.awk \ > $@ # build an fst of surface forms with tropical weights for each word form: -.generated/mob_%.surfs.hfst: mob_%.tropical.txt $(GENDIR) +.generated/mob_%.surfs.hfst: .generated/mob_%.tropical.txt $(AM_V_STR2FST)cat $< |\ $(HFST_STRINGS2FST) -j $(HFST_FLAGS) -f openfst-tropical -o $@ @@ -89,10 +89,10 @@ 
mob_%.tropical.txt: mob_%.uniq.txt %.wordcount.txt mob_%.typecount.txt # Add the unit weight to each unit in compounds, both dynamic and lexical: .generated/mob_unitweighted.hfst: $(UW_SPELLER_SRC) \ - $(MOB_UNITWEIGHT) \ + .generated/$(MOB_UNITWEIGHT) \ $(srcdir)/weights/word-boundary.txt $(AM_V_REWEIGHT)$(HFST_REWEIGHT) $(HFST_FLAGS) \ - -e -a $$(cat $(MOB_UNITWEIGHT)) $< \ + -e -a $$(cat .generated/$(MOB_UNITWEIGHT)) $< \ -o $@ # Keep these intermediate targets when building using --debug: From 3b31f19101c94b4bb77b9edb1eabbc86142cfe11 Mon Sep 17 00:00:00 2001 From: trondtynnol Date: Wed, 10 Apr 2024 21:40:53 +0200 Subject: [PATCH 45/45] Import sys --- dicts/scripts/merge_giella_dicts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dicts/scripts/merge_giella_dicts.py b/dicts/scripts/merge_giella_dicts.py index 2c0dfa85..adab0637 100644 --- a/dicts/scripts/merge_giella_dicts.py +++ b/dicts/scripts/merge_giella_dicts.py @@ -7,6 +7,7 @@ This script can also be imported from python code, see the comment in the file. """ +import sys import xml.etree.ElementTree as ET from pathlib import Path