bug fixes for align

TalkBank · Jan 10, 2024 · 5551e89 · 5551e89
1 parent 7e5f90b
commit 5551e89
Show file tree

Hide file tree

Showing 9 changed files with 26 additions and 10 deletions.
diff --git a/batchalign/constants.py b/batchalign/constants.py
@@ -11,6 +11,10 @@
 UD__GENDERS = ["Masc", "Fem", "Neut", "ComNeut"]
 
 # audio extensions
+# anything in the first set (MEDIA_EXTENSIONS) but not the second set (PARSABLE_MEDIA)
+# should be added to the third set (FORCED_CONVERSION) for conversion
+# we set these separately in order to guarantee the ffmpeg test
 MEDIA_EXTENSIONS = ["*.mp3", "*.mp4", "*.wav"]
+PARSABLE_MEDIA = ["*.mp3", "*.wav"]
 FORCED_CONVERSION = ["mp4"] # force conversion using ffmpeg
 
diff --git a/batchalign/document.py b/batchalign/document.py
@@ -111,6 +111,7 @@ class Tier(BaseModel):
     id: str = Field(default="PAR") # PAR0
     name: str = Field(default="Participant") # Participant
     birthday: str = Field(default="") # birthday; defaults to an empty string
+    additional: List[str] = Field(default=["","","","",""]) # additional fields 
 
 def get_token_type(str):
     if str in ENDING_PUNCT or str in MOR_PUNCT:

diff --git a/batchalign/formats/chat/file.py b/batchalign/formats/chat/file.py
@@ -70,7 +70,7 @@ def __init__(self, path=None, doc=None, lines=None):
             if self.__doc.media != None:
                 name = self.__doc.media.name
                 dir = os.path.dirname(path)
-                globs = [os.path.join(dir, i) for i in MEDIA_EXTENSIONS]
+                globs = [os.path.join(dir, i) for i in PARSABLE_MEDIA]
 
                 # try to find the media file
                 media_files = sum([glob(i) for i in globs], [])

diff --git a/batchalign/formats/chat/generator.py b/batchalign/formats/chat/generator.py
@@ -103,7 +103,7 @@ def generate_chat_preamble(doc, birthdays=[]):
     header.append("@Languages:\t"+", ".join(doc.langs))
     header.append("@Participants:\t"+", ".join([f"{i.id} {i.name}" for i in doc.tiers]))
     header.append("@Options:\tmulti")
-    header.append("\n".join([f"@ID:\t{i.lang}|{i.corpus}|{i.id}|{i.birthday}||||{i.name}|||" for i in doc.tiers]))
+    header.append("\n".join([f"@ID:\t{i.lang}|{i.corpus}|{i.id}|{i.birthday}|{i.additional[0]}|{i.additional[1]}|{i.additional[2]}|{i.name}|{i.additional[3]}|{i.additional[4]}|" for i in doc.tiers]))
     for i in birthdays:
         header.append(f"@{i.id}:\t{i.content}")
     if doc.media:

diff --git a/batchalign/formats/chat/parser.py b/batchalign/formats/chat/parser.py
@@ -217,7 +217,8 @@ def chat_parse_doc(lines):
 
                 tier = Tier(lang=participant[0], corpus=participant[1], 
                             id=participant[2], name=participant[7],
-                            birthday=participant[3])
+                            birthday=participant[3], additional=[participant[i]
+                                                                 for i in [4,5,6,8,9]])
                 tiers[participant[2]] = tier
             # parse media type
             elif "@Media" in line.strip():

diff --git a/batchalign/pipelines/fa/whisper_fa.py b/batchalign/pipelines/fa/whisper_fa.py
@@ -123,8 +123,10 @@ def process(self, doc:Document):
                 elif indx != len(ut.content)-1:
                     # search forward for the next compatible time
                     tmp = indx+1
-                    while tmp < len(ut.content) and ut.content[tmp].time == None:
+                    while tmp < len(ut.content)-1 and ut.content[tmp].time == None:
                         tmp += 1
+                    if w.time == None:
+                        continue
                     if ut.content[tmp].time == None:
                         w.time = (w.time[0], w.time[0]+1000) # give a second because we don't know
                     else:

diff --git a/batchalign/pipelines/utr/utils.py b/batchalign/pipelines/utr/utils.py
@@ -47,6 +47,14 @@ def bulletize_doc(asr, doc):
             doc[a][b].time = (int(round(i.payload[0]*1000)),
                               int(round(i.payload[1]*1000)))
 
+    # set media
+    if doc.media:
+        if doc.media.type == MediaType.UNLINKED_AUDIO:
+            doc.media.type = MediaType.AUDIO
+        elif doc.media.type == MediaType.UNLINKED_VIDEO:
+            doc.media.type = MediaType.VIDEO
+
+
     return doc 
 
 
diff --git a/batchalign/tests/pipelines/test_pipeline.py b/batchalign/tests/pipelines/test_pipeline.py
@@ -5,11 +5,11 @@
 
 from batchalign.tests.pipelines.fixures import *
 
-PROCESSED_OUTPUT_GENERATION = {'content': [{'tier': {'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'name': 'Participant', 'birthday': ''}, 'content': [{'text': 'This', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'is', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'a', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'test', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'generation', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': '.', 'time': None, 'morphology': None, 'dependency': None, 'type': 5}], 'text': None, 'delim': '.', 'time': None, 'custom_dependencies': []}], 'media': {'type': 'audio', 'name': 'generator_wuz_here', 'url': None}, 'langs': ['eng'], 'tiers': [{'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'birthday': ''}]}
+PROCESSED_OUTPUT_GENERATION = {'content': [{'tier': {'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'name': 'Participant', 'birthday': '', 'additional': ['', '', '', '', '']}, 'content': [{'text': 'This', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'is', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'a', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'test', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'generation', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': '.', 'time': None, 'morphology': None, 'dependency': None, 'type': 5}], 'text': None, 'delim': '.', 'time': None, 'custom_dependencies': []}], 'media': {'type': 'audio', 'name': 'generator_wuz_here', 'url': None}, 'langs': ['eng'], 'tiers': [{'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'birthday': '', 'additional': ['', '', '', '', '']}]}
 
-PROCESSED_OUTPUT = {'content': [{'tier': {'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'name': 'Participant', 'birthday': ''}, 'content': [{'text': 'This', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'is', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'a', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'test', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'process', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': '.', 'time': None, 'morphology': None, 'dependency': None, 'type': 5}], 'text': None, 'delim': '.', 'time': None, 'custom_dependencies': []}], 'media': {'type': 'audio', 'name': 'generator_wuz_here', 'url': None}, 'langs': ['eng'], 'tiers': [{'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'birthday': ''}]}
+PROCESSED_OUTPUT = {'content': [{'tier': {'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'name': 'Participant', 'birthday': '', 'additional': ['', '', '', '', '']}, 'content': [{'text': 'This', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'is', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'a', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'test', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'process', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': '.', 'time': None, 'morphology': None, 'dependency': None, 'type': 5}], 'text': None, 'delim': '.', 'time': None, 'custom_dependencies': []}], 'media': {'type': 'audio', 'name': 'generator_wuz_here', 'url': None}, 'langs': ['eng'], 'tiers': [{'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'birthday': '', 'additional': ['', '', '', '', '']}]}
 
-MODEL_NO_MEDIA = {'content': [{'tier': {'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'name': 'Participant', 'birthday': ''}, 'content': [{'text': 'This', 'time': None, 'morphology': None, 'dependency': None}, {'text': 'is', 'time': None, 'morphology': None, 'dependency': None}, {'text': 'a', 'time': None, 'morphology': None, 'dependency': None}, {'text': 'test', 'time': None, 'morphology': None, 'dependency': None}, {'text': 'process', 'time': None, 'morphology': None, 'dependency': None}, {'text': '.', 'time': None, 'morphology': None, 'dependency': None, 'type': 5}], 'text': None, 'delim': '.', 'time': None, 'custom_dependencies': []}], 'langs': ['eng'], 'tiers': [{'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'birthday': ''}]}
+MODEL_NO_MEDIA = {'content': [{'tier': {'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'name': 'Participant', 'birthday': '', 'additional': ['', '', '', '', '']}, 'content': [{'text': 'This', 'time': None, 'morphology': None, 'dependency': None}, {'text': 'is', 'time': None, 'morphology': None, 'dependency': None}, {'text': 'a', 'time': None, 'morphology': None, 'dependency': None}, {'text': 'test', 'time': None, 'morphology': None, 'dependency': None}, {'text': 'process', 'time': None, 'morphology': None, 'dependency': None}, {'text': '.', 'time': None, 'morphology': None, 'dependency': None, 'type': 5}], 'text': None, 'delim': '.', 'time': None, 'custom_dependencies': []}], 'langs': ['eng'], 'tiers': [{'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'birthday': '', 'additional': ['', '', '', '', '']}]}
 
 
 def test_standard_pipeline(generator, processor, analyzer):

diff --git a/batchalign/version b/batchalign/version
@@ -1,3 +1,3 @@
-0.4.0-alpha.17.post1
-Jan 9th, 2024
-support for ffmpeg
+0.4.0-alpha.18
+Jan 10th, 2024
+bug fixes for align