Merge pull request #75 from bklynhlth/1.6.x
1.6.x release
vjbytes102 authored Nov 14, 2023
2 parents 4d9b4b5 + 2b6553f commit a89e27a
Showing 16 changed files with 1,093 additions and 1,390 deletions.
11 changes: 6 additions & 5 deletions openwillis/__init__.py
@@ -8,12 +8,13 @@
emotional_expressivity,
eye_blink_rate,
vocal_acoustics,
speech_transcription,
speech_transcription_whisper,
speech_characteristics,
speaker_separation,
speaker_separation_cloud,
speech_transcription_cloud,
speaker_separation_nolabels,
speaker_separation_labels,
speech_transcription_aws,
speech_transcription_vosk,
to_audio
)

__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "to_audio"]
__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation_nolabels", "speaker_separation_labels", "speech_transcription_aws", "speech_transcription_vosk", "to_audio"]
9 changes: 5 additions & 4 deletions openwillis/measures/api.py
@@ -9,10 +9,11 @@
)
from openwillis.measures.audio import (
vocal_acoustics,
speech_transcription,
speaker_separation,
speaker_separation_cloud,
speech_transcription_cloud,
speech_transcription_whisper,
speaker_separation_nolabels,
speaker_separation_labels,
speech_transcription_aws,
speech_transcription_vosk
)
from openwillis.measures.text import (
speech_characteristics
20 changes: 12 additions & 8 deletions openwillis/measures/audio/__init__.py
@@ -2,20 +2,24 @@
vocal_acoustics,
)

from openwillis.measures.audio.speech_transcribe import (
speech_transcription,
from openwillis.measures.audio.speech_transcribe_whisper import (
speech_transcription_whisper,
)

from openwillis.measures.audio.speech_separation import (
speaker_separation,
from openwillis.measures.audio.speech_separation_nlabels import (
speaker_separation_nolabels,
)

from openwillis.measures.audio.speech_separation_cloud import (
speaker_separation_cloud,
from openwillis.measures.audio.speech_separation_labels import (
speaker_separation_labels,
)

from openwillis.measures.audio.speech_transcribe_cloud import (
speech_transcription_cloud,
speech_transcription_aws,
)

__all__ = ["vocal_acoustics", "speech_transcription", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud"]
from openwillis.measures.audio.speech_transcribe_vosk import (
speech_transcription_vosk,
)

__all__ = ["vocal_acoustics", "speech_transcription_whisper", "speaker_separation", "speaker_separation_cloud", "speech_transcription_aws", "speech_transcription_vosk"]
openwillis/measures/audio/speech_separation_labels.py
@@ -38,7 +38,23 @@ def get_config():
measures = json.load(file)
return measures

def speaker_separation_cloud(filepath, json_response):
def is_amazon_transcribe(json_conf):
"""
------------------------------------------------------------------------------------------------------
This function checks if the json response object is from Amazon Transcribe.
Parameters:
...........
json_conf: dict
JSON response object.
Returns:
...........
bool: True if the json response object is from Amazon Transcribe, False otherwise.
------------------------------------------------------------------------------------------------------
"""
return "jobName" in json_conf and "results" in json_conf

def speaker_separation_labels(filepath, transcript_json):
"""
------------------------------------------------------------------------------------------------------
@@ -48,7 +64,7 @@ def speaker_separation_cloud(filepath, json_response):
...........
filepath : str
Path to the input audio file.
json_response : json
transcript_json : json
Speech transcription json response.
Returns:
@@ -66,8 +82,12 @@ def speaker_separation_cloud(filepath, json_response):
return signal_label

audio_signal = AudioSegment.from_file(file = filepath, format = "wav")
speaker_df, speaker_count = sutil.transcribe_response_to_dataframe(json_response)
if not is_amazon_transcribe(transcript_json):
    speaker_df, speaker_count = sutil.whisperx_to_dataframe(transcript_json)
else:
    speaker_df, speaker_count = sutil.transcribe_response_to_dataframe(transcript_json)

if len(speaker_df)>0 and speaker_count>1:
signal_label = sutil.generate_audio_signal(speaker_df , audio_signal, '', measures)

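A minimal usage sketch for the renamed function, assuming a transcription response previously saved to disk ('sample.wav' and 'transcript.json' are placeholders):

# Minimal usage sketch; 'sample.wav' and 'transcript.json' are placeholders.
import json
from openwillis import speaker_separation_labels

with open('transcript.json') as f:
    transcript_json = json.load(f)   # AWS Transcribe or WhisperX response

# Per the guard above, signal_label stays as initialized unless the
# transcript yields rows for more than one speaker.
signal_label = speaker_separation_labels('sample.wav', transcript_json)
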
openwillis/measures/audio/speech_separation_nlabels.py
@@ -3,13 +3,11 @@

# import the required packages
from pyannote.audio import Pipeline
from openwillis.measures.audio.util import util as ut
from openwillis.measures.audio.util import separation_util as sutil
from pydub import AudioSegment

import os
import json
import shutil
import pandas as pd
import logging

@@ -89,11 +87,10 @@ def read_kwargs(kwargs):
------------------------------------------------------------------------------------------------------
"""
input_param = {}
input_param['model'] = kwargs.get('model', 'pyannote')

input_param['hf_token'] = kwargs.get('hf_token', '')
input_param['json_response'] = kwargs.get('json_response', json.loads("{}"))
input_param['c_scale'] = kwargs.get('c_scale', '')

input_param['transcript_json'] = kwargs.get('transcript_json', json.dumps({}))
input_param['context'] = kwargs.get('context', '')
return input_param

def get_pyannote(input_param, file_name, filepath):
@@ -122,12 +119,12 @@ def get_pyannote(input_param, file_name, filepath):
"""

diart_df = run_pyannote(filepath, input_param['hf_token'])
transcribe_df = pd.DataFrame(input_param['json_response'])
transcribe_df = pd.DataFrame(input_param['transcript_json'])

speaker_df, speaker_count = sutil.get_speaker_identification(diart_df, transcribe_df)
return speaker_df, speaker_count

def speaker_separation(filepath, **kwargs):
def speaker_separation_nolabels(filepath, **kwargs):
"""
------------------------------------------------------------------------------------------------------
@@ -137,14 +134,12 @@ def speaker_separation(filepath, **kwargs):
...........
filepath : str
Path to the input audio file.
transcript_json : json
Speech transcription json response.
hf_token : str
Access token for HuggingFace to access pre-trained models.
json_response : json
Speech transcription json response.
model : str, optional
Model to use for speech diarization, default is 'pyannote'.
c_scale : str, optional
Clinical scale to use for slicing the separated audio files, if any.
context : str, optional
Context (e.g., clinical scale) to use for slicing the separated audio files, if any.
Returns:
...........
@@ -160,18 +155,14 @@
measures = get_config()

try:
if not os.path.exists(filepath) or 'json_response' not in kwargs:
if not os.path.exists(filepath) or 'transcript_json' not in kwargs:
return signal_label

if input_param['model'] == 'whisperx':
input_param['c_scale'] = ''
speaker_df, speaker_count = sutil.whisperx_to_dataframe(input_param['json_response'])
else:
speaker_df, speaker_count = get_pyannote(input_param, file_name, filepath)

speaker_df, speaker_count = get_pyannote(input_param, file_name, filepath)
audio_signal = AudioSegment.from_file(file = filepath, format = "wav")

if len(speaker_df)>0 and speaker_count>1:
signal_label = sutil.generate_audio_signal(speaker_df , audio_signal, input_param['c_scale'], measures)
signal_label = sutil.generate_audio_signal(speaker_df , audio_signal, input_param['context'], measures)

except Exception as e:
logger.error(f'Error in diarization processing: {e} & File: {filepath}')
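A comparable sketch for the no-labels variant, assuming the same placeholder files and a valid HuggingFace token:

# Usage sketch; the paths and token are placeholders.
import json
from openwillis import speaker_separation_nolabels

with open('transcript.json') as f:
    transcript_json = json.load(f)

signal_label = speaker_separation_nolabels(
    'sample.wav',
    transcript_json=transcript_json,  # required: the function returns early without it
    hf_token='hf_xxx',                # HuggingFace token for the pyannote pipeline
)
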
40 changes: 18 additions & 22 deletions openwillis/measures/audio/speech_transcribe_cloud.py
@@ -1,5 +1,5 @@
# author: Vijay Yadav
# website: http://www.bklynhlth.com
# website: http://www.brooklyn.health

# import the required packages
import os
@@ -53,50 +53,46 @@ def read_kwargs(kwargs):
------------------------------------------------------------------------------------------------------
"""
input_param = {}
input_param['model'] = kwargs.get('model', 'pyannote')
input_param['language'] = kwargs.get('language', 'en-US')
input_param['region'] = kwargs.get('region', 'us-east-1')

input_param['job_name'] = kwargs.get('job_name', 'transcribe_job_01')
input_param['ShowSpeakerLabels'] = kwargs.get('ShowSpeakerLabels', True)
input_param['MaxSpeakerLabels'] = kwargs.get('MaxSpeakerLabels', 2)
input_param['speaker_labels'] = kwargs.get('speaker_labels', False)
input_param['max_speakers'] = kwargs.get('max_speakers', 2)

input_param['c_scale'] = kwargs.get('c_scale', '')
input_param['context'] = kwargs.get('context', '')
input_param['access_key'] = kwargs.get('access_key', '')
input_param['secret_key'] = kwargs.get('secret_key', '')
return input_param

def speech_transcription_cloud(filepath, **kwargs):
def speech_transcription_aws(s3_uri, **kwargs):
"""
------------------------------------------------------------------------------------------------------
Speech transcription function that transcribes an audio file using Amazon Transcribe.
Parameters:
...........
filepath : str
s3_uri : str
The S3 URI of the recording to be transcribed.
kwargs: Object
model : str, optional
The transcription model to use ('aws'). Default is 'aws'.
language : str, optional
The language of the audio file (e.g. 'en-US', 'en-IN'). Default is 'en-US'.
region : str, optional
The AWS region to use (e.g. 'us-east-1'). Only applicable if model is 'aws'. Default is 'us-east-1'.
job_name : str, optional
The name of the transcription job. Only applicable if model is 'aws'. Default is 'transcribe_job_01'.
ShowSpeakerLabels : boolean, optional
Show speaker labels
MaxSpeakerLabels : int, optional
Max number of speakers
c_scale : str, optional
Clinical scale to use for slicing the separated audio files, if any.
access_key : str, optional
AWS access key
secret_key : str, optional
AWS secret key
speaker_labels : boolean, optional
Show speaker labels
max_speakers : int, optional
Max number of speakers
context : str, optional
Context (e.g., clinical scale) to use for slicing the separated audio files, if any.
Returns:
...........
json_response : JSON Object
@@ -108,10 +104,10 @@ def speech_transcription_cloud(filepath, **kwargs):
"""
input_param = read_kwargs(kwargs)
measures = get_config()
json_response, transcript = tutil.transcribe_audio(filepath, input_param)

if input_param['ShowSpeakerLabels'] == True and input_param['c_scale']:
json_response, transcript = tutil.transcribe_audio(s3_uri, input_param)
if input_param['speaker_labels'] == True and input_param['context'].lower() in measures['scale'].split(','):
content_dict = tutil.extract_content(json_response)
json_response = tutil.get_clinical_labels(input_param['c_scale'], measures, content_dict, json_response)


json_response = tutil.get_clinical_labels(input_param['context'], measures, content_dict, json_response)
return json_response, transcript
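
A hedged usage sketch reflecting the renamed function and kwargs (the bucket, credentials, and job name are placeholders):

# Usage sketch; the S3 URI, credentials, and job name are placeholders.
from openwillis import speech_transcription_aws

json_response, transcript = speech_transcription_aws(
    's3://my-bucket/sample.wav',   # S3 URI of the recording, not a local path
    language='en-US',
    region='us-east-1',
    job_name='transcribe_job_01',
    speaker_labels=True,           # was: ShowSpeakerLabels
    max_speakers=2,                # was: MaxSpeakerLabels
    access_key='AKIA...',
    secret_key='...',
)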
