Skip to content

Commit

Permalink
Merge branch 'main' into use_gramtools
Browse files Browse the repository at this point in the history
  • Loading branch information
flammie committed Apr 10, 2024
2 parents 8e854e9 + 3b31f19 commit 4e8a557
Show file tree
Hide file tree
Showing 32 changed files with 1,253 additions and 222 deletions.
26 changes: 23 additions & 3 deletions am-shared/docs-dir-include.am
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ endif

# Generate endpoint json file for shield.io lemma count badge.
# Only to be stored in the gh-pages branch, ignored in main.
$(srcdir)/lemmacount.json: $(top_srcdir)/src/fst/morphology/stems/*.lexc
$(srcdir)/lemmacount.json:
$(AM_V_GEN)$(GTCORE)/scripts/make-lemmacount.json.sh $(abs_top_srcdir) > $@

# Generate a maturity.json file as endpoint for the maturity badge.
Expand Down Expand Up @@ -166,9 +166,14 @@ REPOURL=$(shell if test "x$(GH_REPO)" != x ; then \
fi)

# Collect all target files into one big MD file.
# The page starts with a title built from $(GLANGUAGE); every file in
# $(VPATH_MDFILES) is then appended as its own chapter, separated by a
# horizontal rule. The chapter header is the file path with the
# $(top_srcdir)/docs/ (VPATH) prefix removed.
# File names and the language name are passed to printf as %s arguments, not
# as part of the format string, so '%' or '\' in them is printed literally.
# A failing printf or cat aborts the loop instead of leaving a silently
# incomplete page.
$(ALLINONE_MD_PAGE): $(VPATH_MDFILES)
	$(AM_V_GEN)printf '# %s language model documentation\n\nAll doc-comment documentation in one large file.\n' \
		"$(GLANGUAGE)" > $@
	for f in $(VPATH_MDFILES); do \
		header=$${f#"$(top_srcdir)/docs/"} ;\
		printf '\n---\n\n# %s \n\n' "$$header" >> $@ || exit 1 ;\
		cat "$$f" >> $@ || exit 1 ;\
	done

$(LINKS):
$(AM_V_GEN)for doc2md in $(DOCSRC_MDFILES) ; do \
Expand All @@ -182,27 +187,42 @@ $(LINKS):
d2=`echo "$$d" | cut -d '/' -f 2` ;\
d3=`echo "$$d" | cut -d '/' -f 3` ;\
d4=`echo "$$d" | cut -d '/' -f 4` ;\
d5=`echo "$$d" | cut -d '/' -f 5` ;\
if test "x$$d1" != "x$$oldd1" ; then \
echo "* \`$$d1/\`" ;\
oldd1=$$d1 ;\
oldd2="";\
oldd3="";\
oldd4="";\
fi ; \
if test "x$$d2" = x ; then \
echo " * [$$docname]($$html) ([src]($(REPOURL)/$$doc))" ;\
elif test "x$$d2" != "x$$oldd2" ; then \
echo " * \`$$d2/\`" ;\
oldd2=$$d2 ;\
oldd3="";\
oldd4="";\
oldd5="";\
fi ; \
if test "x$$d3" = x -a "x$$d2" != x; then \
echo " * [$$docname]($$html) ([src]($(REPOURL)/$$doc))" ;\
elif test "x$$d3" != "x$$oldd3" ; then \
echo " * \`$$d3/\`" ;\
oldd3=$$d3 ;\
oldd4="";\
fi ; \
if test "x$$d4" = x -a "x$$d3" != x ; then \
echo " * [$$docname]($$html) ([src]($(REPOURL)/$$doc))" ;\
elif test "x$$d4" != "x$$oldd4" ; then \
echo " * \`$$d4/\`" ;\
oldd4=$$d4 ;\
oldd5="";\
fi ; \
if test "x$$d5" = x -a "x$$d4" != x ; then \
echo " * [$$docname]($$html) ([src]($(REPOURL)/$$doc))" ;\
elif test "x$$d5" != "x$$oldd5" ; then \
echo " * \`$$d5/\`" ;\
oldd5=$$d5 ;\
fi ; \
done > $@

Expand Down
2 changes: 1 addition & 1 deletion am-shared/src-morphology-dir-include.am
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ lexicon.hfst: .generated/lexicon.hfst
$(AM_CP)cp -v $< $@

clean-local:
-rm -f lexicon.hfst .generated/lexicon.hfst
-rm -f lexicon.hfst .generated/lexicon.hfst $(GIELLA_LOCAL_TARGETS)

####### Other targets: ###########
maintainer-clean-local:
Expand Down
2 changes: 1 addition & 1 deletion am-shared/src_alt_orth-include.am
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ $(foreach alt_orth,$(ALT_ORTHS),$(eval $(call alt_orth_norm_generators,$(alt_ort

# morpher is a morph segmenting variant: taloautoissani -> talo#auto>i>ssa>ni
define alt_orth_morphers
.generated/morpher-gt-desc.$(1).tmp.%: a.generated/nalyser-raw-gt-desc.% \
.generated/morpher-gt-desc.$(1).tmp.%: .generated/analyser-raw-gt-desc.% \
orthography/inituppercase.compose.% \
orthography/spellrelax.compose.% \
filters/remove-hyphenation-marks.% \
Expand Down
10 changes: 5 additions & 5 deletions am-shared/tools-spellcheckers-fstbased-desktop-dir-include.am
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# Use this as the source lexical fst for unit weighting, it contains correct
# surface forms except for the word boundary #, which is still present, and
# used in the weighting:
UW_SPELLER_SRC=generator-desktopspeller-gt-norm-base.hfst
UW_SPELLER_SRC=.generated/generator-desktopspeller-gt-norm-base.hfst

## Conditional string variables;
# Set file name to the empty string or initial_letters_all depending on variable:
Expand Down Expand Up @@ -47,9 +47,9 @@ endif # WANT_SPELLERS
#### the fst):
.generated/generator-desktopspeller-gt-norm-freq_weighted.hfst: \
.generated/generator-desktopspeller-gt-norm-base.hfst \
$(SURFWEIGHTS)
.generated/$(SURFWEIGHTS)
$(AM_V_COMPOSE)$(HFST_COMPOSE) $(HFST_FLAGS) -F \
$< $(SURFWEIGHTS) \
$< .generated/$(SURFWEIGHTS) \
-o $@

#### 3. Add a default unit weight to anything not covered by the corpus
Expand Down Expand Up @@ -99,11 +99,11 @@ endif # WANT_SPELLERS
quit\n" | $(HFST_XFST) -p $(MORE_VERBOSITY)

# Copy the tmp transducer to the final one. This allows local overrides.
.generated/%.hfst: .generated/%.tmp.hfst
%.hfst: .generated/%.tmp.hfst
$(AM_V_CP)cp -f $< $@

# Invert the final fst, to enable symmetric yaml tests and easy manual testing:
.generated/analyser-desktopspeller-gt-norm.hfst: .generated/generator-desktopspeller-gt-norm.hfst
analyser-desktopspeller-gt-norm.hfst: generator-desktopspeller-gt-norm.hfst
$(AM_V_INVERT)$(HFST_INVERT) $(MORE_VERBOSITY) $(HFST_FLAGS) -i $< \
| $(HFST_PRUNE_ALPHABET) $(MORE_VERBOSITY) \
| $(HFST_REMOVE_EPSILONS) $(MORE_VERBOSITY) -o $@
Expand Down
74 changes: 36 additions & 38 deletions am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
## along with this program. If not, see <http://www.gnu.org/licenses/>.

GT_SPELLER_HFST=generator-desktopspeller-gt-norm.hfst
GT_SPELLER_ACCEPTOR=acceptor.default.hfst
GT_SPELLER_ACCEPTOR=.generated/acceptor.default.hfst

# Max compression for zipped files:
ZIPFLAGS=-9 $(VERBOSITY)
Expand Down Expand Up @@ -47,15 +47,15 @@ swaps=$(shell \
# Set file name to the empty string or initial_letters.all depending on variable:
initial_letter_deps=$(shell \
if [[ $(INITIAL_EDITS) != 'no' ]] ; then \
echo "initial_letters.all.%.hfst"; \
echo ".generated/initial_letters.all.%.hfst"; \
else \
echo ""; \
fi)

# Set the regex include expression for initial_letters.all, or the empty string, depending on variable:
initial_letter_fst_include=$(shell \
if [[ $(INITIAL_EDITS) != 'no' ]] ; then \
echo "( @\\\"initial_letters.all.$*.hfst\\\" )"; \
echo "( @\\\".generated/initial_letters.all.$*.hfst\\\" )"; \
else \
echo ""; \
fi)
Expand All @@ -71,11 +71,11 @@ initial_letter_error_model_option=$(shell \
# Set dependency file name(s) depending on variable value:
initial_letter_all_deps=$(shell \
if [[ $(INITIAL_EDITS) == 'regex' ]] ; then \
echo "initial_letters.regex.%.hfst"; \
echo ".generated/initial_letters.regex.%.hfst"; \
elif [[ $(INITIAL_EDITS) == 'txt' ]] ; then \
echo "initial_letters.txt.%.hfst"; \
echo ".generated/initial_letters.txt.%.hfst"; \
elif [[ $(INITIAL_EDITS) == 'both' ]] ; then \
echo "initial_letters.regex.%.hfst initial_letters.txt.%.hfst"; \
echo ".generated/initial_letters.regex.%.hfst .generated/initial_letters.txt.%.hfst"; \
else \
echo ""; \
fi)
Expand All @@ -96,7 +96,7 @@ initial_letter_all_build=$(shell \
# Set file name to the empty string or strings.all depending on variable:
strings_deps=$(shell \
if [[ $(STRING_EDITS) != 'no' ]] ; then \
echo "strings.all.%.hfst"; \
echo ".generated/strings.all.%.hfst"; \
else \
echo ""; \
fi)
Expand All @@ -112,11 +112,11 @@ strings_fst_include=$(shell \
# Set dependency file name(s) depending on variable value:
strings_all_deps=$(shell \
if [[ $(STRING_EDITS) == 'regex' ]] ; then \
echo "strings.regex.%.hfst"; \
echo ".generated/strings.regex.%.hfst"; \
elif [[ $(STRING_EDITS) == 'txt' ]] ; then \
echo "strings.txt.%.hfst"; \
echo ".generated/strings.txt.%.hfst"; \
elif [[ $(STRING_EDITS) == 'both' ]] ; then \
echo "strings.regex.%.hfst strings.txt.%.hfst"; \
echo ".generated/strings.regex.%.hfst .generated/strings.txt.%.hfst"; \
else \
echo ""; \
fi)
Expand All @@ -137,27 +137,27 @@ strings_all_build=$(shell \
# Set file name to the empty string or final_strings.all depending on variable:
final_strings_deps=$(shell \
if [[ $(FINAL_STRING_EDITS) != 'no' ]] ; then \
echo "final_strings.all.%.hfst"; \
echo ".generated/final_strings.all.%.hfst"; \
else \
echo ""; \
fi)

# Set the regex include expression for final_strings.all, or the empty string, depending on variable value:
final_strings_fst_include=$(shell \
if [[ $(FINAL_STRING_EDITS) != 'no' ]] ; then \
echo "( @\\\"final_strings.all.$*.hfst\\\" )"; \
echo "( @\\\".generated/final_strings.all.$*.hfst\\\" )"; \
else \
echo ""; \
fi)

# Set dependency file name(s) depending on variable value:
#   FINAL_STRING_EDITS == regex -> the regex-based final_strings fst
#   FINAL_STRING_EDITS == txt   -> the txt-based final_strings fst
#   FINAL_STRING_EDITS == both  -> both of the above
#   anything else               -> empty (no dependencies)
# Every name carries the .generated/ prefix, so that the 'both' case names
# exactly the same files as the single-source cases.
final_strings_all_deps=$(shell \
  if [[ $(FINAL_STRING_EDITS) == 'regex' ]] ; then \
    echo ".generated/final_strings.regex.%.hfst"; \
  elif [[ $(FINAL_STRING_EDITS) == 'txt' ]] ; then \
    echo ".generated/final_strings.txt.%.hfst"; \
  elif [[ $(FINAL_STRING_EDITS) == 'both' ]] ; then \
    echo ".generated/final_strings.regex.%.hfst .generated/final_strings.txt.%.hfst"; \
  else \
    echo ""; \
  fi)
Expand Down Expand Up @@ -229,8 +229,7 @@ noinst_DATA+=$(GT_ERRMODELS) \
####### Easter egg version info: #######
# Easter egg content - depends also on the fst, to
# make sure the easter egg is rebuilt every time the fst is rebuilt:
$(GIELLA_DESKTOP_EASTEREGGS): \
easteregg.%.desktop.txt:
.generated/easteregg.%.desktop.txt: $(GENDIR)
$(AM_V_GEN)$(GTCORE)/scripts/make-hfstspeller-version-easter-egg.sh \
$(GTLANG2) \
$(top_srcdir) \
Expand All @@ -240,7 +239,7 @@ easteregg.%.desktop.txt:
> $@

# Easter egg suggestions:
easteregg.%.desktop.suggtxt: easteregg.%.desktop.txt
.generated/easteregg.%.desktop.suggtxt: .generated/easteregg.%.desktop.txt
$(AM_V_GEN)sed -e 's/^/nuvviDspeller:/' < $< \
| sed = \
| sed 'N;s/\n/ /' \
Expand All @@ -261,7 +260,7 @@ easteregg.%.desktop.suggtxt: easteregg.%.desktop.txt

# Easter egg string acceptor:
# easteregg.%.desktop.temp.hfst: easteregg.%.desktop.txt
.generated/easteregg.%.desktop.hfst: easteregg.%.desktop.txt $(GENDIR)
.generated/easteregg.%.desktop.hfst: .generated/easteregg.%.desktop.txt
$(AM_V_GEN)$(HFST_STRINGS2FST) $(HFST_FLAGS) -j < $< \
> $@

Expand All @@ -278,7 +277,7 @@ easteregg.%.desktop.suggtxt: easteregg.%.desktop.txt

####### Error model: #######
# Error model building - edit distance based on transducer alphabet:
editdist.%.regex: editdist.%.txt $(initial_letter_deps)
.generated/editdist.%.regex: editdist.%.txt $(initial_letter_deps) $(GENDIR)
$(AM_V_GEN)$(GTCORE)/scripts/editdist.py \
--verbose \
$(swaps) \
Expand All @@ -289,7 +288,7 @@ editdist.%.regex: editdist.%.txt $(initial_letter_deps)
--output-file=$@ \
$(initial_letter_error_model_option)

.generated/editdist.%.hfst: editdist.%.regex $(GENDIR)
.generated/editdist.%.hfst: .generated/editdist.%.regex $(GENDIR)
$(AM_V_RGX2FST)$(HFST_REGEXP2FST) -S $(HFST_FLAGS) -i $<\
--format=openfst-tropical \
-o $@
Expand Down Expand Up @@ -341,42 +340,42 @@ editdist.%.regex: editdist.%.txt $(initial_letter_deps)
# larger as the edit distance, since the file is multiplied again as part of
# the editStrings build target. The idea is that the regex should contain a
# highly targeted set of frequent spelling errors.
strings.regex.%.hfst: strings.%.regex anystar.hfst
.generated/strings.regex.%.hfst: strings.%.regex .generated/anystar.hfst
$(AM_V_RGX2FST)$(HFST_REGEXP2FST) -S $(HFST_FLAGS) -i $<\
| $(HFST_CONCATENATE) anystar.hfst - \
| $(HFST_CONCATENATE) - anystar.hfst \
| $(HFST_CONCATENATE) .generated/anystar.hfst - \
| $(HFST_CONCATENATE) - .generated/anystar.hfst \
| $(HFST_REPEAT) -f 1 -t $(STRING_REGEX_EDIT_DISTANCE) \
-o $@

strings.all.%.hfst: $(strings_all_deps)
.generated/strings.all.%.hfst: $(strings_all_deps)
$(strings_all_build)

# Combine edit distance with string pattern edits, then multiply according to
# the specified editing distance. The strings part is included depending on
# variable setting in Makefile.am.
editdist.all.%.hfst: $(strings_deps) editdist.%.hfst
.generated/editdist.all.%.hfst: $(strings_deps) .generated/editdist.%.hfst
$(strings_fst_include) \
| $(HFST_REPEAT) -f 1 -t $(EDIT_DISTANCE) \
-o $@

# Error model building - list of words known to be misspelled:
# All normal prerequisites ($^) are read by grep as word lists; comment lines
# (starting with '#') and empty lines are dropped before the strings are
# compiled into an fst.
# $(GENDIR) is listed as an order-only prerequisite (after '|') so that it is
# NOT part of $^ and thus never handed to grep as an input file.
# NOTE(review): $(GENDIR) is defined outside this view - presumably the
# .generated output directory; confirm.
.generated/words.%.hfst: $(words_deps) .generated/easteregg.%.desktop.suggtxt | $(GENDIR)
	$(AM_V_STR2FST)grep -h -v '^#' $^ | grep -v '^$$' \
		| $(HFST_STRINGS2FST) $(HFST_FLAGS) -j \
		--format=openfst-tropical \
		-o $@

# The final error model is assembled here:
errmodel.%.hfst: words.%.hfst \
errmodel.%.hfst: .generated/words.%.hfst \
$(initial_letter_deps) \
editdist.all.%.hfst \
.generated/editdist.all.%.hfst \
$(final_strings_deps)
$(AM_V_RGX2FST)printf "\
[ @\"words.$*.hfst\" \
[ @\".generated/words.$*.hfst\" \
| \
[ \
$(initial_letter_fst_include) \
@\"editdist.all.$*.hfst\" \
@\".generated/editdist.all.$*.hfst\" \
$(final_strings_fst_include) \
] \
];" \
Expand All @@ -388,25 +387,24 @@ errmodel.%.hfst: words.%.hfst \
####### Alternate error model: #######
# Alternatively, the error model can be constructed as a long list of regular
# expressions, semicolon separated:
# The compiled regex is disjuncted with the easter egg fst, weights are pushed
# towards the initial state, and the result is converted to olw format.
# The easter egg prerequisite must name the same file the recipe reads
# (.generated/easteregg.<stem>.hfst), otherwise make cannot order the build.
# NOTE(review): the first command uses $(HFSTFLAGS) while the last one uses
# $(HFST_FLAGS) - confirm that both variables are defined as intended.
.generated/errmodel.%.hfst: errmodel.%.regex .generated/easteregg.%.hfst
	$(AM_V_GEN)$(HFST_REGEXP2FST) $(HFSTFLAGS) -S -i $< \
		| $(HFST_DISJUNCT) - .generated/easteregg.$*.hfst \
		| $(HFST_PUSH_WEIGHTS) --push=initial \
		| $(HFST_FST2FST) $(HFST_FLAGS) -f olw \
		-o $@

# ... or as an xfscript file:
errmodel.%.hfst: errmodel.%.xfscript easteregg.%.hfst
.generated/errmodel.%.hfst: errmodel.%.xfscript .generated/easteregg.%.hfst
$(AM_V_GEN)$(HFST_REGEXP2FST) $(HFSTFLAGS) -S -i $< \
| $(HFST_DISJUNCT) - easteregg.$*.hfst \
| $(HFST_DISJUNCT) - .generated/easteregg.$*.hfst \
| $(HFST_PUSH_WEIGHTS) --push=initial \
| $(HFST_FST2FST) $(HFST_FLAGS) -f olw \
-o $@

####### Speller acceptor: #######
# Build the automaton used for the speller
$(GT_SPELLER_ACCEPTOR): \
acceptor.%.hfst: $(GT_SPELLER_HFST) filters/remove-error-strings.hfst \
.generated/acceptor.%.hfst: $(GT_SPELLER_HFST) filters/remove-error-strings.hfst \
.generated/easteregg.%.desktop.hfst
$(AM_V_PROJECT)$(HFST_COMPOSE) -1 filters/remove-error-strings.hfst -2 $< -F \
| $(HFST_PROJECT) $(HFST_FLAGS) \
Expand All @@ -427,7 +425,7 @@ $(GT_SPELLING_HFST): index.xml \
$(AM_V_at)$(MKDIR_P) build/$@
$(AM_V_at)rm -f build/$@/*
$(AM_V_at)cp index.xml build/$@/index.xml
$(AM_V_at)cp $(GT_SPELLER_ACCEPTOR) build/$@/$(GT_SPELLER_ACCEPTOR)
$(AM_V_at)cp $(GT_SPELLER_ACCEPTOR) build/$@/
$(AM_V_at)cp $(GT_ERRMODELS) build/$@/$(GT_ERRMODELS)
$(AM_V_ZIP)cd build/$@/ && $(ZIP) $(ZIPFLAGS) ../../$@ *
$(AM_V_at)$(MKDIR_P) 3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ if WANT_SPELLERS
if WANT_ALT_ORTH_PROOFTOOLS

GT_ALT_ORTH_ERRMODELS=$(shell for ld in $(ALT_ORTHS); do\
echo "errmodel.$$ld.hfst" ; \
echo ".generated/errmodel.$$ld.hfst" ; \
done)

GT_ALT_ORTH_SPELLER_ACCEPTORS=$(shell for ld in $(ALT_ORTHS); do\
echo "acceptor.$$ld.hfst" ; \
echo ".generated/acceptor.$$ld.hfst" ; \
done)

ALT_ORTH_ZHFST_FILES=$(shell for ld in $(ALT_ORTHS); do\
Expand All @@ -42,10 +42,10 @@ endif # HAVE_ALT_ORTHS
#### Build rules: ####

# Alternative based on the raw fst instead of the standard orthography:
acceptor.%.hfst: \
.generated/acceptor.%.hfst: \
$(GT_SPELLER_HFST) \
easteregg.%.desktop.hfst \
$(top_builddir)/src/fst/orthography/raw-to-%.compose.hfst
.generated/easteregg.%.desktop.hfst \
$(top_builddir)/src/fst/orthography/raw-to-%.compose.hfst $(GENDIR)
$(AM_V_GEN)cat $< \
| $(HFST_COMPOSE) $(HFST_FLAGS) -F \
-2 $(top_builddir)/src/fst/orthography/raw-to-$*.compose.hfst \
Expand All @@ -56,9 +56,9 @@ acceptor.%.hfst: \
-o $@

# Build rule for acceptors for alternate writing systems:
acceptor.%.hfst: \
.generated/acceptor.%.hfst: \
$(GT_SPELLER_HFST) \
easteregg.%.desktop.hfst \
.generated/easteregg.%.desktop.hfst \
$(top_builddir)/src/fst/orthography/$(DEFAULT_ORTH)-to-%.compose.hfst
$(AM_V_GEN)cat $< \
| $(HFST_COMPOSE) $(HFST_FLAGS) -F \
Expand Down
Loading

0 comments on commit 4e8a557

Please sign in to comment.