Merge pull request #913 from dchiller/i910-compound-neumes

Fix missing pitches in neume ngrams + refactorings
DDMAL · Oct 4, 2024 · 1be46da · 1be46da
2 parents c273192 + 92a25b0
commit 1be46da
Show file tree

Hide file tree

Showing 12 changed files with 194 additions and 106 deletions.
diff --git a/app/public/cantusdata/admin/admin.py b/app/public/cantusdata/admin/admin.py
@@ -1,23 +1,31 @@
 from django.contrib import admin
+from django.contrib.admin import ModelAdmin
+from django.db.models import Model
+from django.db.models.query import QuerySet
+from django.http import HttpRequest
+
+from django_celery_results.models import TaskResult  # type: ignore[import-untyped]
+from django_celery_results.admin import TaskResultAdmin  # type: ignore[import-untyped]
+
 from cantusdata.models.manuscript import Manuscript
 from cantusdata.models.chant import Chant
 from cantusdata.models.folio import Folio
 from cantusdata.models.plugin import Plugin
 from cantusdata.models.neume_exemplar import NeumeExemplar
 from cantusdata.tasks import chant_import_task
-from django_celery_results.models import TaskResult
-from django_celery_results.admin import TaskResultAdmin
 
 
-def reindex_in_solr(modeladmin, request, queryset):
+@admin.action(description="ReIndex in Solr")
+def reindex_in_solr(
+    modeladmin: ModelAdmin,  # type: ignore[type-arg]
+    request: HttpRequest,
+    queryset: QuerySet[Model],
+) -> None:
     for item in queryset:
         item.save()
 
 
-reindex_in_solr.short_description = "ReIndex in Solr"
-
-
-class ManuscriptAdmin(admin.ModelAdmin):
+class ManuscriptAdmin(ModelAdmin):  # type: ignore[type-arg]
     actions = [reindex_in_solr, "load_chants"]
     ordering = ["-public", "name"]
     list_per_page = 200
@@ -49,6 +57,13 @@ class ManuscriptAdmin(admin.ModelAdmin):
                     "chants_loaded",
                     "is_mapped",
                     "dbl_folio_img",
+                ]
+            },
+        ),
+        (
+            "Search",
+            {
+                "fields": [
                     "plugins",
                 ]
             },
@@ -68,39 +83,43 @@ class ManuscriptAdmin(admin.ModelAdmin):
         description="Imports the chants associated \
         with the selected manuscript(s)"
     )
-    def load_chants(self, request, queryset):
+    def load_chants(self, request: HttpRequest, queryset: QuerySet[Manuscript]) -> None:
         for ms in queryset:
             chant_import_task.apply_async(kwargs={"manuscript_ids": [ms.pk]})
         self.message_user(
             request,
-            "Importing chants for the selected manuscripts. This may take a few minutes. Check status on the Task Results page.",
+            (
+                "Importing chants for the selected manuscripts. "
+                "This may take a few minutes. "
+                "Check status on the Task Results page."
+            ),
         )
 
 
-class ChantAdmin(admin.ModelAdmin):
+class ChantAdmin(ModelAdmin):  # type: ignore[type-arg]
     actions = [reindex_in_solr]
 
 
-class FolioAdmin(admin.ModelAdmin):
+class FolioAdmin(ModelAdmin):  # type: ignore[type-arg]
     actions = [reindex_in_solr]
     readonly_fields = ("chant_count",)
 
 
-class PluginAdmin(admin.ModelAdmin):
+class PluginAdmin(ModelAdmin):  # type: ignore[type-arg]
     readonly_fields = ("slug",)
 
 
-class NeumeExemplarAdmin(admin.ModelAdmin):
-    list_display = ("admin_image", "__str__")
+class NeumeExemplarAdmin(ModelAdmin):  # type: ignore[type-arg]
+    list_display = ("name", "folio")
     readonly_fields = ("admin_image",)
 
 
-class NewTaskResultAdmin(TaskResultAdmin):
+class NewTaskResultAdmin(TaskResultAdmin):  # type: ignore[misc]
     list_display = ("task_name", "date_done", "status", "get_task_manuscript_ids")
     list_filter = ("status", "date_done", "task_name")
 
     @admin.display(description="Manuscript(s)")
-    def get_task_manuscript_ids(self, obj):
+    def get_task_manuscript_ids(self, obj: TaskResult) -> list[Manuscript]:
         if obj.status == "RECEIVED":
             obj_man_ids = eval(obj.task_kwargs)["manuscript_ids"]
         else:

diff --git a/app/public/cantusdata/helpers/mei_processing/mei_parser.py b/app/public/cantusdata/helpers/mei_processing/mei_parser.py
@@ -13,44 +13,24 @@
 Defines associated types for the data structures used by the parser.
 """
 
-from typing import Tuple, Dict, List, Iterator, Optional, Literal
+from typing import Tuple, Dict, List, Iterator, Optional
 from lxml import etree  # pylint: disable=no-name-in-module
+from cantusdata.helpers.neume_helpers import NEUME_GROUPS, NeumeName
 from .mei_parsing_types import (
     Zone,
     SyllableText,
     NeumeComponentElementData,
     NeumeComponent,
     ContourType,
-    NeumeName,
     Neume,
     Syllable,
 )
 from .bounding_box_utils import combine_bounding_boxes_single_system
 
+
 # Mapping from pitch names to integer pitch class where C = 0
 PITCH_CLASS = {"c": 0, "d": 2, "e": 4, "f": 5, "g": 7, "a": 9, "b": 11}
 
-# Mapping from neume contours to neume names
-NEUME_GROUPS: Dict[str, NeumeName] = {
-    "": "punctum",
-    "u": "pes",
-    "d": "clivis",
-    "uu": "scandicus",
-    "ud": "torculus",
-    "du": "porrectus",
-    "r": "distropha",
-    "rr": "tristopha",
-    "rd": "pressus",
-    "dd": "climacus",
-    "ddu": "climacus_resupinus",
-    "udu": "torculus_resupinus",
-    "dud": "porrectus_flexus",
-    "udd": "pes_subpunctis",
-    "uud": "scandicus_flexus",
-    "uudd": "scandicus_subpunctis",
-    "dudd": "porrectus_subpunctis",
-}
-
 
 class MEIParser:
     """
@@ -122,7 +102,7 @@ def _get_element_zone(self, element: etree._Element) -> Zone:
             return zone
         return {"coordinates": (-1, -1, -1, -1), "rotate": 0.0}
 
-    def _parse_syllable_text(self, syl_elem: Optional[etree.Element]) -> SyllableText:
+    def _parse_syllable_text(self, syl_elem: Optional[etree._Element]) -> SyllableText:
         """
         Get the text of a syllable and its associated bounding box from
         a 'syl' element.
@@ -132,8 +112,8 @@ def _parse_syllable_text(self, syl_elem: Optional[etree.Element]) -> SyllableTex
         """
         # Ignoring type of next two expressions because for some reason
         # mypy thinks they are unreachable, but we know they are not.
-        if syl_elem is not None and syl_elem.text:  # type: ignore
-            text_dict: SyllableText = {  # type: ignore
+        if syl_elem is not None and syl_elem.text:
+            text_dict: SyllableText = {
                 "text": syl_elem.text.strip(),
                 "bounding_box": self._get_element_zone(syl_elem),
             }
@@ -305,7 +285,7 @@ def _syllable_iterator(
             elem_iterator = first_syllable.itersiblings(
                 tag=[f"{self.MEINS}syllable", f"{self.MEINS}sb"]
             )
-            current_elem = first_syllable
+            current_elem: Optional[etree._Element] = first_syllable
             while current_elem is not None:
                 if current_elem.tag == f"{self.MEINS}syllable":
                     current_syl = current_elem.find(f"{self.MEINS}syl")

diff --git a/app/public/cantusdata/helpers/mei_processing/mei_parsing_types.py b/app/public/cantusdata/helpers/mei_processing/mei_parsing_types.py
@@ -5,6 +5,8 @@
 from typing import Tuple, TypedDict, Literal, List, Optional, NotRequired
 from typing_extensions import TypeAlias
 
+from cantusdata.helpers.neume_helpers import NeumeName
+
 # A type for coordinates of bounding boxes
 CoordinatesType: TypeAlias = Tuple[int, int, int, int]
 """
@@ -31,26 +33,6 @@ class Zone(TypedDict):
 
 
 ContourType = Literal["u", "d", "r"]
-NeumeName = Literal[
-    "punctum",
-    "pes",
-    "clivis",
-    "scandicus",
-    "torculus",
-    "porrectus",
-    "distropha",
-    "tristopha",
-    "pressus",
-    "climacus",
-    "climacus_resupinus",
-    "torculus_resupinus",
-    "porrectus_flexus",
-    "pes_subpunctis",
-    "scandicus_flexus",
-    "scandicus_subpunctis",
-    "porrectus_subpunctis",
-    "compound",
-]
 
 
 class NeumeComponentElementData(TypedDict):

diff --git a/app/public/cantusdata/helpers/mei_processing/mei_tokenizer.py b/app/public/cantusdata/helpers/mei_processing/mei_tokenizer.py
@@ -6,12 +6,12 @@
 
 import uuid
 from typing import List, Tuple, Optional
+from cantusdata.helpers.neume_helpers import NeumeName
 from .mei_parser import MEIParser
 from .mei_parsing_types import (
     Neume,
     NeumeComponent,
     ContourType,
-    NeumeName,
     NgramDocument,
     Zone,
 )
@@ -159,6 +159,8 @@ def create_ngram_documents(self) -> List[NgramDocument]:
         # At each pitch in the file, we'll generate all the necessary
         # ngrams that start with that pitch.
         for start_idx in range(num_pitches):
+            # Start by collecting ngrams of pitches of lengths min_ngram
+            # to max_ngram.
             largest_num_neumes = 0
             for ngram_length in range(self.min_ngram, self.max_ngram + 1):
                 # Collect the pitches for an ngram of ngram_length
@@ -167,8 +169,9 @@ def create_ngram_documents(self) -> List[NgramDocument]:
                 end_idx = start_idx + ngram_length
                 if end_idx > num_pitches:
                     break
-                nc_ngram = pitches[start_idx:end_idx]
-                doc = self._create_document_from_neume_components(nc_ngram)
+                doc = self._create_document_from_neume_components(
+                    pitches[start_idx:end_idx]
+                )
                 # If the pitch at start_idx is the beginning of a neume
                 # and the pitch following this ngram is also the beginning
                 # of a neume (or we've reached the end of the file),
@@ -204,11 +207,15 @@ def create_ngram_documents(self) -> List[NgramDocument]:
                     ):
                         if (
                             name_at_pitch := neume_names[start_idx + ngram_num_pitches]
-                        ) is not None and len(ngram_neume_names) < wanted_ngram_length:
+                        ) is not None:
+                            # If we've reached a new neume name, but we already
+                            # have the desired number of neumes in our ngram,
+                            # we've added all the required pitches for this ngram
+                            # to ngram_num_pitches and can break the while loop.
+                            if len(ngram_neume_names) == wanted_ngram_length:
+                                break
                             ngram_neume_names.append(name_at_pitch)
                         ngram_num_pitches += 1
-                        if len(ngram_neume_names) == wanted_ngram_length:
-                            break
                     # We'll only add this ngram if we've actually gotten to
                     # the desired number of neumes (if we didn't, it means
                     # we reached the end of the file)

diff --git a/app/public/cantusdata/helpers/neume_helpers.py b/app/public/cantusdata/helpers/neume_helpers.py
@@ -0,0 +1,77 @@
+"""
+Contains various neume-related constructs that are used throughout the backend,
+especially for MEI parsing and OMR search.
+"""
+
+from typing import Literal, Dict
+
+# NEUME_NAMES contains the currently-supported neumes. They are
+# included in the order used for UI (esp. as neume exemplars).
+# Ordering is by:
+#   1. The number of pitches in the neume (ascending)
+#   2. The direction of the first interval in the neume (first ascending,
+#      then pitch repetition, then descending)
+#   3+. The direction of following intervals in the neume (according to 2.)
+#   N. The all-purpose "compound" neume at the end
+NEUME_NAMES = [
+    "punctum",
+    "pes",
+    "distropha",
+    "clivis",
+    "scandicus",
+    "torculus",
+    "tristopha",
+    "pressus",
+    "porrectus",
+    "climacus",
+    "scandicus-flexus",
+    "torculus-resupinus",
+    "pes-subpunctis",
+    "porrectus-flexus",
+    "climacus-resupinus",
+    "scandicus-subpunctis",
+    "porrectus-subpunctis",
+    "compound",
+]
+
+NeumeName = Literal[
+    "punctum",
+    "pes",
+    "distropha",
+    "clivis",
+    "scandicus",
+    "torculus",
+    "tristopha",
+    "pressus",
+    "porrectus",
+    "climacus",
+    "scandicus-flexus",
+    "torculus-resupinus",
+    "pes-subpunctis",
+    "porrectus-flexus",
+    "climacus-resupinus",
+    "scandicus-subpunctis",
+    "porrectus-subpunctis",
+    "compound",
+]
+
+# Mapping from neume contours to neume names
+NEUME_GROUPS: Dict[str, NeumeName] = {
+    "": "punctum",
+    "u": "pes",
+    "r": "distropha",
+    "d": "clivis",
+    "uu": "scandicus",
+    "ud": "torculus",
+    "rr": "tristopha",
+    "rd": "pressus",
+    "du": "porrectus",
+    "dd": "climacus",
+    "uud": "scandicus-flexus",
+    "udu": "torculus-resupinus",
+    "udd": "pes-subpunctis",
+    "dud": "porrectus-flexus",
+    "ddu": "climacus-resupinus",
+    "uudd": "scandicus-subpunctis",
+    "dudd": "porrectus-subpunctis",
+}
diff --git a/app/public/cantusdata/helpers/search_utils.py b/app/public/cantusdata/helpers/search_utils.py
@@ -3,24 +3,7 @@
 queries.
 """
 
-# Contains the words that are allowed
-# in a neume_name query
-VALID_NEUME_NAME_WORDS = {
-    "punctum",
-    "pes",
-    "clivis",
-    "scandicus",
-    "torculus",
-    "porrectus",
-    "distropha",
-    "tristopha",
-    "pressus",
-    "climacus",
-    "resupinus",
-    "flexus",
-    "subpunctis",
-    "compound",
-}
+from cantusdata.helpers.neume_helpers import NEUME_NAMES
 
 
 def validate_intervals_query_word(word: str) -> bool:
@@ -45,7 +28,7 @@ def validate_query(q: list[str], q_type: str) -> bool:
     """
     match q_type:
         case "neume_names":
-            return all(neume in VALID_NEUME_NAME_WORDS for neume in q)
+            return all(neume in NEUME_NAMES for neume in q)
         case "pitch_names" | "pitch_names_transposed":
             return all(pitch in "abcdefg" for pitch in q)
         case "contour":

diff --git a/app/public/cantusdata/management/commands/index_manuscript_mei.py b/app/public/cantusdata/management/commands/index_manuscript_mei.py
@@ -74,7 +74,7 @@ def handle(self, *args: Any, **options: Any) -> None:
             self.flush_manuscript_ngrams_from_index(solr_conn, manuscript_id)
             return None
         folio_map: Dict[str, str] = dict(
-            Folio.objects.filter(manuscript_id=manuscript_id).values_list(
+            Folio.objects.filter(manuscript_id=manuscript_id).values_list(  # type: ignore[arg-type]
                 "number", "image_uri"
             )
         )