Merge pull request #75 from bklynhlth/1.6.x
1.6.x release
vjbytes102 authored Nov 14, 2023
2 parents 4d9b4b5 + 2b6553f commit a89e27a
Showing 16 changed files with 1,093 additions and 1,390 deletions.
11 changes: 6 additions & 5 deletions openwillis/__init__.py
@@ -8,12 +8,13 @@
emotional_expressivity,
eye_blink_rate,
vocal_acoustics,
speech_transcription,
speech_transcription_whisper,
speech_characteristics,
speaker_separation,
speaker_separation_cloud,
speech_transcription_cloud,
speaker_separation_nolabels,
speaker_separation_labels,
speech_transcription_aws,
speech_transcription_vosk,
to_audio
)

__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "to_audio"]
__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation_nolabels", "speaker_separation_labels", "speech_transcription_aws", "speech_transcription_vosk", "to_audio"]
9 changes: 5 additions & 4 deletions openwillis/measures/api.py
@@ -9,10 +9,11 @@
)
from openwillis.measures.audio import (
vocal_acoustics,
speech_transcription,
speaker_separation,
speaker_separation_cloud,
speech_transcription_cloud,
speech_transcription_whisper,
speaker_separation_nolabels,
speaker_separation_labels,
speech_transcription_aws,
speech_transcription_vosk
)
from openwillis.measures.text import (
speech_characteristics
20 changes: 12 additions & 8 deletions openwillis/measures/audio/__init__.py
@@ -2,20 +2,24 @@
vocal_acoustics,
)

from openwillis.measures.audio.speech_transcribe import (
speech_transcription,
from openwillis.measures.audio.speech_transcribe_whisper import (
speech_transcription_whisper,
)

from openwillis.measures.audio.speech_separation import (
speaker_separation,
from openwillis.measures.audio.speech_separation_nlabels import (
speaker_separation_nolabels,
)

from openwillis.measures.audio.speech_separation_cloud import (
speaker_separation_cloud,
from openwillis.measures.audio.speech_separation_labels import (
speaker_separation_labels,
)

from openwillis.measures.audio.speech_transcribe_cloud import (
speech_transcription_cloud,
speech_transcription_aws,
)

__all__ = ["vocal_acoustics", "speech_transcription", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud"]
from openwillis.measures.audio.speech_transcribe_vosk import (
speech_transcription_vosk,
)

__all__ = ["vocal_acoustics", "speech_transcription_whisper", "speaker_separation", "speaker_separation_cloud", "speech_transcription_aws", "speech_transcription_vosk"]
openwillis/measures/audio/speech_separation_labels.py
@@ -38,7 +38,23 @@ def get_config():
measures = json.load(file)
return measures

def speaker_separation_cloud(filepath, json_response):
def is_amazon_transcribe(json_conf):
"""
------------------------------------------------------------------------------------------------------
This function checks if the json response object is from Amazon Transcribe.
Parameters:
...........
json_conf: dict
JSON response object.
Returns:
...........
bool: True if the json response object is from Amazon Transcribe, False otherwise.
------------------------------------------------------------------------------------------------------
"""
return "jobName" in json_conf and "results" in json_conf

def speaker_separation_labels(filepath, transcript_json):
"""
------------------------------------------------------------------------------------------------------
@@ -48,7 +64,7 @@ def speaker_separation_cloud(filepath, json_response):
...........
filepath : str
Path to the input audio file.
json_response : json
transcript_json : json
Speech transcription json response.
Returns:
@@ -66,8 +82,12 @@ def speaker_separation_cloud(filepath, json_response):
return signal_label

audio_signal = AudioSegment.from_file(file = filepath, format = "wav")
speaker_df, speaker_count = sutil.transcribe_response_to_dataframe(json_response)
if not is_amazon_transcribe(transcript_json):
    speaker_df, speaker_count = sutil.whisperx_to_dataframe(transcript_json)
else:
    speaker_df, speaker_count = sutil.transcribe_response_to_dataframe(transcript_json)

if len(speaker_df)>0 and speaker_count>1:
signal_label = sutil.generate_audio_signal(speaker_df , audio_signal, '', measures)

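A minimal usage sketch for the renamed function, assuming a transcription response previously saved to disk ('sample.wav' and 'transcript.json' are placeholders):

# Minimal usage sketch; 'sample.wav' and 'transcript.json' are placeholders.
import json
from openwillis import speaker_separation_labels

with open('transcript.json') as f:
    transcript_json = json.load(f)   # AWS Transcribe or WhisperX response

# Per the guard above, signal_label stays as initialized unless the
# transcript yields rows for more than one speaker.
signal_label = speaker_separation_labels('sample.wav', transcript_json)
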
openwillis/measures/audio/speech_separation_nlabels.py
@@ -3,13 +3,11 @@

# import the required packages
from pyannote.audio import Pipeline
from openwillis.measures.audio.util import util as ut
from openwillis.measures.audio.util import separation_util as sutil
from pydub import AudioSegment

import os
import json
import shutil
import pandas as pd
import logging

@@ -89,11 +87,10 @@ def read_kwargs(kwargs):
------------------------------------------------------------------------------------------------------
"""
input_param = {}
input_param['model'] = kwargs.get('model', 'pyannote')

input_param['hf_token'] = kwargs.get('hf_token', '')
input_param['json_response'] = kwargs.get('json_response', json.loads("{}"))
input_param['c_scale'] = kwargs.get('c_scale', '')

input_param['transcript_json'] = kwargs.get('transcript_json', json.dumps({}))
input_param['context'] = kwargs.get('context', '')
return input_param

def get_pyannote(input_param, file_name, filepath):
@@ -122,12 +119,12 @@ def get_pyannote(input_param, file_name, filepath):
"""

diart_df = run_pyannote(filepath, input_param['hf_token'])
transcribe_df = pd.DataFrame(input_param['json_response'])
transcribe_df = pd.DataFrame(input_param['transcript_json'])

speaker_df, speaker_count = sutil.get_speaker_identification(diart_df, transcribe_df)
return speaker_df, speaker_count

def speaker_separation(filepath, **kwargs):
def speaker_separation_nolabels(filepath, **kwargs):
"""
------------------------------------------------------------------------------------------------------
@@ -137,14 +134,12 @@ def speaker_separation(filepath, **kwargs):
...........
filepath : str
Path to the input audio file.
transcript_json : json
Speech transcription json response.
hf_token : str
Access token for HuggingFace to access pre-trained models.
json_response : json
Speech transcription json response.
model : str, optional
Model to use for speech diarization, default is 'pyannote'.
c_scale : str, optional
Clinical scale to use for slicing the separated audio files, if any.
context : str, optional
Context (e.g., clinical scale) to use for slicing the separated audio files, if any.
Returns:
...........
@@ -160,18 +155,14 @@
measures = get_config()

try:
if not os.path.exists(filepath) or 'json_response' not in kwargs:
if not os.path.exists(filepath) or 'transcript_json' not in kwargs:
return signal_label

if input_param['model'] == 'whisperx':
input_param['c_scale'] = ''
speaker_df, speaker_count = sutil.whisperx_to_dataframe(input_param['json_response'])
else:
speaker_df, speaker_count = get_pyannote(input_param, file_name, filepath)

speaker_df, speaker_count = get_pyannote(input_param, file_name, filepath)
audio_signal = AudioSegment.from_file(file = filepath, format = "wav")

if len(speaker_df)>0 and speaker_count>1:
signal_label = sutil.generate_audio_signal(speaker_df , audio_signal, input_param['c_scale'], measures)
signal_label = sutil.generate_audio_signal(speaker_df , audio_signal, input_param['context'], measures)

except Exception as e:
logger.error(f'Error in diarization processing: {e} & File: {filepath}')
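A comparable sketch for the no-labels variant, assuming the same placeholder files and a valid HuggingFace token:

# Usage sketch; the paths and token are placeholders.
import json
from openwillis import speaker_separation_nolabels

with open('transcript.json') as f:
    transcript_json = json.load(f)

signal_label = speaker_separation_nolabels(
    'sample.wav',
    transcript_json=transcript_json,  # required: the function returns early without it
    hf_token='hf_xxx',                # HuggingFace token for the pyannote pipeline
)
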
40 changes: 18 additions & 22 deletions openwillis/measures/audio/speech_transcribe_cloud.py
@@ -1,5 +1,5 @@
# author: Vijay Yadav
# website: http://www.bklynhlth.com
# website: http://www.brooklyn.health

# import the required packages
import os
@@ -53,50 +53,46 @@ def read_kwargs(kwargs):
------------------------------------------------------------------------------------------------------
"""
input_param = {}
input_param['model'] = kwargs.get('model', 'pyannote')
input_param['language'] = kwargs.get('language', 'en-US')
input_param['region'] = kwargs.get('region', 'us-east-1')

input_param['job_name'] = kwargs.get('job_name', 'transcribe_job_01')
input_param['ShowSpeakerLabels'] = kwargs.get('ShowSpeakerLabels', True)
input_param['MaxSpeakerLabels'] = kwargs.get('MaxSpeakerLabels', 2)
input_param['speaker_labels'] = kwargs.get('speaker_labels', False)
input_param['max_speakers'] = kwargs.get('max_speakers', 2)

input_param['c_scale'] = kwargs.get('c_scale', '')
input_param['context'] = kwargs.get('context', '')
input_param['access_key'] = kwargs.get('access_key', '')
input_param['secret_key'] = kwargs.get('secret_key', '')
return input_param

def speech_transcription_cloud(filepath, **kwargs):
def speech_transcription_aws(s3_uri, **kwargs):
"""
------------------------------------------------------------------------------------------------------
Speech transcription function that transcribes an audio file using Amazon Transcribe.
Parameters:
...........
filepath : str
s3_uri : str
The S3 URI of the recording to be transcribed.
kwargs: Object
model : str, optional
The transcription model to use ('aws'). Default is 'aws'.
language : str, optional
The language of the audio file (e.g. 'en-US', 'en-IN'). Default is 'en-US'.
region : str, optional
The AWS region to use (e.g. 'us-east-1'). Only applicable if model is 'aws'. Default is 'us-east-1'.
job_name : str, optional
The name of the transcription job. Only applicable if model is 'aws'. Default is 'transcribe_job_01'.
ShowSpeakerLabels : boolean, optional
Show speaker labels
MaxSpeakerLabels : int, optional
Max number of speakers
c_scale : str, optional
Clinical scale to use for slicing the separated audio files, if any.
access_key : str, optional
AWS access key
secret_key : str, optional
AWS secret key
speaker_labels : boolean, optional
Show speaker labels
max_speakers : int, optional
Max number of speakers
context : str, optional
Context (e.g., clinical scale) to use for slicing the separated audio files, if any.
Returns:
...........
json_response : JSON Object
@@ -108,10 +104,10 @@ def speech_transcription_cloud(filepath, **kwargs):
"""
input_param = read_kwargs(kwargs)
measures = get_config()
json_response, transcript = tutil.transcribe_audio(filepath, input_param)

if input_param['ShowSpeakerLabels'] == True and input_param['c_scale']:
json_response, transcript = tutil.transcribe_audio(s3_uri, input_param)
if input_param['speaker_labels'] == True and input_param['context'].lower() in measures['scale'].split(','):
content_dict = tutil.extract_content(json_response)
json_response = tutil.get_clinical_labels(input_param['c_scale'], measures, content_dict, json_response)


json_response = tutil.get_clinical_labels(input_param['context'], measures, content_dict, json_response)
return json_response, transcript
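
A hedged usage sketch reflecting the renamed function and kwargs (the bucket, credentials, and job name are placeholders):

# Usage sketch; the S3 URI, credentials, and job name are placeholders.
from openwillis import speech_transcription_aws

json_response, transcript = speech_transcription_aws(
    's3://my-bucket/sample.wav',   # S3 URI of the recording, not a local path
    language='en-US',
    region='us-east-1',
    job_name='transcribe_job_01',
    speaker_labels=True,           # was: ShowSpeakerLabels
    max_speakers=2,                # was: MaxSpeakerLabels
    access_key='AKIA...',
    secret_key='...',
)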
