Skip to content

Commit

Permalink
bug fixes for align
Browse files Browse the repository at this point in the history
  • Loading branch information
Jemoka committed Jan 10, 2024
1 parent 7e5f90b commit 5551e89
Show file tree
Hide file tree
Showing 9 changed files with 26 additions and 10 deletions.
4 changes: 4 additions & 0 deletions batchalign/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
UD__GENDERS = ["Masc", "Fem", "Neut", "ComNeut"]

# audio extensions
# anything in the first set but not the second set
# should be added to the third set for conversion
# we set these separately in order to guarantee ffmpeg test
MEDIA_EXTENSIONS = ["*.mp3", "*.mp4", "*.wav"]
PARSABLE_MEDIA = ["*.mp3", "*.wav"]
FORCED_CONVERSION = ["mp4"] # force conversion using ffmpeg

1 change: 1 addition & 0 deletions batchalign/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ class Tier(BaseModel):
id: str = Field(default="PAR") # PAR0
name: str = Field(default="Participant") # Participant
birthday: str = Field(default="") # Participant
additional: List[str] = Field(default=["","","","",""]) # additional fields

def get_token_type(str):
if str in ENDING_PUNCT or str in MOR_PUNCT:
Expand Down
2 changes: 1 addition & 1 deletion batchalign/formats/chat/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def __init__(self, path=None, doc=None, lines=None):
if self.__doc.media != None:
name = self.__doc.media.name
dir = os.path.dirname(path)
globs = [os.path.join(dir, i) for i in MEDIA_EXTENSIONS]
globs = [os.path.join(dir, i) for i in PARSABLE_MEDIA]

# try to find the media file
media_files = sum([glob(i) for i in globs], [])
Expand Down
2 changes: 1 addition & 1 deletion batchalign/formats/chat/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def generate_chat_preamble(doc, birthdays=[]):
header.append("@Languages:\t"+", ".join(doc.langs))
header.append("@Participants:\t"+", ".join([f"{i.id} {i.name}" for i in doc.tiers]))
header.append("@Options:\tmulti")
header.append("\n".join([f"@ID:\t{i.lang}|{i.corpus}|{i.id}|{i.birthday}||||{i.name}|||" for i in doc.tiers]))
header.append("\n".join([f"@ID:\t{i.lang}|{i.corpus}|{i.id}|{i.birthday}|{i.additional[0]}|{i.additional[1]}|{i.additional[2]}|{i.name}|{i.additional[3]}|{i.additional[4]}|" for i in doc.tiers]))
for i in birthdays:
header.append(f"@{i.id}:\t{i.content}")
if doc.media:
Expand Down
3 changes: 2 additions & 1 deletion batchalign/formats/chat/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,8 @@ def chat_parse_doc(lines):

tier = Tier(lang=participant[0], corpus=participant[1],
id=participant[2], name=participant[7],
birthday=participant[3])
birthday=participant[3], additional=[participant[i]
for i in [4,5,6,8,9]])
tiers[participant[2]] = tier
# parse media type
elif "@Media" in line.strip():
Expand Down
4 changes: 3 additions & 1 deletion batchalign/pipelines/fa/whisper_fa.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,10 @@ def process(self, doc:Document):
elif indx != len(ut.content)-1:
# search forward for the next compatible time
tmp = indx+1
while tmp < len(ut.content) and ut.content[tmp].time == None:
while tmp < len(ut.content)-1 and ut.content[tmp].time == None:
tmp += 1
if w.time == None:
continue
if ut.content[tmp].time == None:
w.time = (w.time[0], w.time[0]+1000) # give a second because we don't know
else:
Expand Down
8 changes: 8 additions & 0 deletions batchalign/pipelines/utr/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ def bulletize_doc(asr, doc):
doc[a][b].time = (int(round(i.payload[0]*1000)),
int(round(i.payload[1]*1000)))

# set media
if doc.media:
if doc.media.type == MediaType.UNLINKED_AUDIO:
doc.media.type = MediaType.AUDIO
elif doc.media.type == MediaType.UNLINKED_VIDEO:
doc.media.type = MediaType.VIDEO


return doc


6 changes: 3 additions & 3 deletions batchalign/tests/pipelines/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@

from batchalign.tests.pipelines.fixures import *

PROCESSED_OUTPUT_GENERATION = {'content': [{'tier': {'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'name': 'Participant', 'birthday': ''}, 'content': [{'text': 'This', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'is', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'a', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'test', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'generation', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': '.', 'time': None, 'morphology': None, 'dependency': None, 'type': 5}], 'text': None, 'delim': '.', 'time': None, 'custom_dependencies': []}], 'media': {'type': 'audio', 'name': 'generator_wuz_here', 'url': None}, 'langs': ['eng'], 'tiers': [{'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'birthday': ''}]}
PROCESSED_OUTPUT_GENERATION = {'content': [{'tier': {'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'name': 'Participant', 'birthday': '', 'additional': ['', '', '', '', '']}, 'content': [{'text': 'This', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'is', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'a', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'test', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'generation', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': '.', 'time': None, 'morphology': None, 'dependency': None, 'type': 5}], 'text': None, 'delim': '.', 'time': None, 'custom_dependencies': []}], 'media': {'type': 'audio', 'name': 'generator_wuz_here', 'url': None}, 'langs': ['eng'], 'tiers': [{'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'birthday': '', 'additional': ['', '', '', '', '']}]}

PROCESSED_OUTPUT = {'content': [{'tier': {'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'name': 'Participant', 'birthday': ''}, 'content': [{'text': 'This', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'is', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'a', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'test', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'process', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': '.', 'time': None, 'morphology': None, 'dependency': None, 'type': 5}], 'text': None, 'delim': '.', 'time': None, 'custom_dependencies': []}], 'media': {'type': 'audio', 'name': 'generator_wuz_here', 'url': None}, 'langs': ['eng'], 'tiers': [{'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'birthday': ''}]}
PROCESSED_OUTPUT = {'content': [{'tier': {'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'name': 'Participant', 'birthday': '', 'additional': ['', '', '', '', '']}, 'content': [{'text': 'This', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'is', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'a', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'test', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': 'process', 'time': None, 'morphology': None, 'dependency': None, 'type': 0}, {'text': '.', 'time': None, 'morphology': None, 'dependency': None, 'type': 5}], 'text': None, 'delim': '.', 'time': None, 'custom_dependencies': []}], 'media': {'type': 'audio', 'name': 'generator_wuz_here', 'url': None}, 'langs': ['eng'], 'tiers': [{'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'birthday': '', 'additional': ['', '', '', '', '']}]}

MODEL_NO_MEDIA = {'content': [{'tier': {'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'name': 'Participant', 'birthday': ''}, 'content': [{'text': 'This', 'time': None, 'morphology': None, 'dependency': None}, {'text': 'is', 'time': None, 'morphology': None, 'dependency': None}, {'text': 'a', 'time': None, 'morphology': None, 'dependency': None}, {'text': 'test', 'time': None, 'morphology': None, 'dependency': None}, {'text': 'process', 'time': None, 'morphology': None, 'dependency': None}, {'text': '.', 'time': None, 'morphology': None, 'dependency': None, 'type': 5}], 'text': None, 'delim': '.', 'time': None, 'custom_dependencies': []}], 'langs': ['eng'], 'tiers': [{'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'birthday': ''}]}
MODEL_NO_MEDIA = {'content': [{'tier': {'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'name': 'Participant', 'birthday': '', 'additional': ['', '', '', '', '']}, 'content': [{'text': 'This', 'time': None, 'morphology': None, 'dependency': None}, {'text': 'is', 'time': None, 'morphology': None, 'dependency': None}, {'text': 'a', 'time': None, 'morphology': None, 'dependency': None}, {'text': 'test', 'time': None, 'morphology': None, 'dependency': None}, {'text': 'process', 'time': None, 'morphology': None, 'dependency': None}, {'text': '.', 'time': None, 'morphology': None, 'dependency': None, 'type': 5}], 'text': None, 'delim': '.', 'time': None, 'custom_dependencies': []}], 'langs': ['eng'], 'tiers': [{'lang': 'eng', 'corpus': 'corpus_name', 'id': 'PAR', 'name': 'Participant', 'birthday': '', 'additional': ['', '', '', '', '']}]}


def test_standard_pipeline(generator, processor, analyzer):
Expand Down
6 changes: 3 additions & 3 deletions batchalign/version
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
0.4.0-alpha.17.post1
Jan 9th, 2024
support for ffmpeg
0.4.0-alpha.18
Jan 10th, 2024
bug fixes for align

0 comments on commit 5551e89

Please sign in to comment.