diff --git a/openwillis/__init__.py b/openwillis/__init__.py
index dd36a91..41d7d3b 100644
--- a/openwillis/__init__.py
+++ b/openwillis/__init__.py
@@ -8,12 +8,13 @@
     emotional_expressivity,
     eye_blink_rate,
     vocal_acoustics,
-    speech_transcription,
+    speech_transcription_whisper,
     speech_characteristics,
-    speaker_separation,
-    speaker_separation_cloud,
-    speech_transcription_cloud,
+    speaker_separation_nolabels,
+    speaker_separation_labels,
+    speech_transcription_aws,
+    speech_transcription_vosk,
     to_audio
 )
 
-__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "to_audio"]
+__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation_nolabels", "speaker_separation_labels", "speech_transcription_aws", "speech_transcription_vosk", "to_audio"]
diff --git a/openwillis/measures/api.py b/openwillis/measures/api.py
index ecc0897..38dad3e 100644
--- a/openwillis/measures/api.py
+++ b/openwillis/measures/api.py
@@ -9,10 +9,11 @@
 )
 from openwillis.measures.audio import (
     vocal_acoustics,
-    speech_transcription,
-    speaker_separation,
-    speaker_separation_cloud,
-    speech_transcription_cloud,
+    speech_transcription_whisper,
+    speaker_separation_nolabels,
+    speaker_separation_labels,
+    speech_transcription_aws,
+    speech_transcription_vosk
 )
 from openwillis.measures.text import (
     speech_characteristics
diff --git a/openwillis/measures/audio/__init__.py b/openwillis/measures/audio/__init__.py
index d53a3af..355a3bd 100644
--- a/openwillis/measures/audio/__init__.py
+++ b/openwillis/measures/audio/__init__.py
@@ -2,20 +2,24 @@
     vocal_acoustics,
 )
 
-from openwillis.measures.audio.speech_transcribe import (
-    speech_transcription,
+from openwillis.measures.audio.speech_transcribe_whisper import (
+    speech_transcription_whisper,
 )
 
-from openwillis.measures.audio.speech_separation import (
-    speaker_separation,
+from openwillis.measures.audio.speech_separation_nlabels import (
+    speaker_separation_nolabels,
 )
 
-from openwillis.measures.audio.speech_separation_cloud import (
-    speaker_separation_cloud,
+from openwillis.measures.audio.speech_separation_labels import (
+    speaker_separation_labels,
 )
 
 from openwillis.measures.audio.speech_transcribe_cloud import (
-    speech_transcription_cloud,
+    speech_transcription_aws,
 )
 
-__all__ = ["vocal_acoustics", "speech_transcription", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud"]
+from openwillis.measures.audio.speech_transcribe_vosk import (
+    speech_transcription_vosk,
+)
+
+__all__ = ["vocal_acoustics", "speech_transcription_whisper", "speaker_separation_nolabels", "speaker_separation_labels", "speech_transcription_aws", "speech_transcription_vosk"]
diff --git a/openwillis/measures/audio/speech_separation_cloud.py b/openwillis/measures/audio/speech_separation_labels.py
similarity index 69%
rename from openwillis/measures/audio/speech_separation_cloud.py
rename to openwillis/measures/audio/speech_separation_labels.py
index f314c4d..d251e59 100644
--- a/openwillis/measures/audio/speech_separation_cloud.py
+++ b/openwillis/measures/audio/speech_separation_labels.py
@@ -38,7 +38,23 @@ def get_config():
     measures = json.load(file)
     return measures
 
-def speaker_separation_cloud(filepath, json_response):
+def is_amazon_transcribe(json_conf):
+    """
+    ------------------------------------------------------------------------------------------------------
+    This function checks if the json response object is from Amazon Transcribe.
+    Parameters:
+    ...........
+    json_conf: dict
+        JSON response object.
+    Returns:
+    ...........
+    bool: True if the json response object
+        is from Amazon Transcribe, False otherwise.
+    ------------------------------------------------------------------------------------------------------
+    """
+    return "jobName" in json_conf and "results" in json_conf
+
+def speaker_separation_labels(filepath, transcript_json):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -48,7 +64,7 @@
     ...........
     filepath : str
         Path to the input audio file.
-    json_response : json
+    transcript_json : json
         Speech transcription json response.
 
     Returns:
@@ -66,8 +82,12 @@
         return signal_label
 
     audio_signal = AudioSegment.from_file(file = filepath, format = "wav")
-    speaker_df, speaker_count = sutil.transcribe_response_to_dataframe(json_response)
 
+    if not is_amazon_transcribe(transcript_json):
+        speaker_df, speaker_count = sutil.whisperx_to_dataframe(transcript_json)
+    else:
+        speaker_df, speaker_count = sutil.transcribe_response_to_dataframe(transcript_json)
+
     if len(speaker_df)>0 and speaker_count>1:
         signal_label = sutil.generate_audio_signal(speaker_df , audio_signal, '', measures)
 
diff --git a/openwillis/measures/audio/speech_separation.py b/openwillis/measures/audio/speech_separation_nlabels.py
similarity index 83%
rename from openwillis/measures/audio/speech_separation.py
rename to openwillis/measures/audio/speech_separation_nlabels.py
index 58d6777..62409eb 100644
--- a/openwillis/measures/audio/speech_separation.py
+++ b/openwillis/measures/audio/speech_separation_nlabels.py
@@ -3,13 +3,11 @@
 
 # import the required packages
 from pyannote.audio import Pipeline
-from openwillis.measures.audio.util import util as ut
 from openwillis.measures.audio.util import separation_util as sutil
 from pydub import AudioSegment
 
 import os
 import json
-import shutil
 import pandas as pd
 import logging
 
@@ -89,11 +87,11 @@
     ------------------------------------------------------------------------------------------------------
     """
     input_param = {}
-    input_param['model'] = kwargs.get('model', 'pyannote')
-    input_param['hf_token'] = kwargs.get('hf_token', '')
-    input_param['json_response'] = kwargs.get('json_response', json.loads("{}"))
-    input_param['c_scale'] = kwargs.get('c_scale', '')
+
+    input_param['hf_token'] = kwargs.get('hf_token', '')
+    input_param['transcript_json'] = kwargs.get('transcript_json', json.dumps({}))
+    input_param['context'] = kwargs.get('context', '')
     return input_param
 
 def get_pyannote(input_param, file_name, filepath):
@@ -122,12 +119,12 @@
     """
     diart_df = run_pyannote(filepath, input_param['hf_token'])
 
-    transcribe_df = pd.DataFrame(input_param['json_response'])
+    transcribe_df = pd.DataFrame(input_param['transcript_json'])
     speaker_df, speaker_count = sutil.get_speaker_identification(diart_df, transcribe_df)
     return speaker_df, speaker_count
 
-def speaker_separation(filepath, **kwargs):
+def speaker_separation_nolabels(filepath, **kwargs):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -137,14 +134,12 @@
     ...........
    filepath : str
         Path to the input audio file.
+    transcript_json : json
+        Speech transcription json response.
     hf_token : str
         Access token for HuggingFace to access pre-trained models.
-    json_response : json
-        Speech transcription json response.
-    model : str, optional
-        Model to use for speech diarization, default is 'pyannote'.
-    c_scale : str, optional
-        Clinical scale to use for slicing the separated audio files, if any.
+    context : str, optional
+        Scale to use for slicing the separated audio files, if any.
 
     Returns:
     ...........
@@ -160,18 +155,14 @@
     measures = get_config()
 
     try:
-        if not os.path.exists(filepath) or 'json_response' not in kwargs:
+        if not os.path.exists(filepath) or 'transcript_json' not in kwargs:
             return signal_label
 
-        if input_param['model'] == 'whisperx':
-            input_param['c_scale'] = ''
-            speaker_df, speaker_count = sutil.whisperx_to_dataframe(input_param['json_response'])
-        else:
-            speaker_df, speaker_count = get_pyannote(input_param, file_name, filepath)
-
+        speaker_df, speaker_count = get_pyannote(input_param, file_name, filepath)
         audio_signal = AudioSegment.from_file(file = filepath, format = "wav")
+
         if len(speaker_df)>0 and speaker_count>1:
-            signal_label = sutil.generate_audio_signal(speaker_df , audio_signal, input_param['c_scale'], measures)
+            signal_label = sutil.generate_audio_signal(speaker_df, audio_signal, input_param['context'], measures)
 
     except Exception as e:
         logger.error(f'Error in diarization processing: {e} & File: {filepath}')
 
diff --git a/openwillis/measures/audio/speech_transcribe_cloud.py b/openwillis/measures/audio/speech_transcribe_cloud.py
index 513e7a5..6156262 100644
--- a/openwillis/measures/audio/speech_transcribe_cloud.py
+++ b/openwillis/measures/audio/speech_transcribe_cloud.py
@@ -1,5 +1,5 @@
 # author: Vijay Yadav
-# website: http://www.bklynhlth.com
+# website: http://www.brooklyn.health
 
 # import the required packages
 import os
@@ -53,20 +53,19 @@
     ------------------------------------------------------------------------------------------------------
     """
     input_param = {}
-    input_param['model'] = kwargs.get('model', 'pyannote')
     input_param['language'] = kwargs.get('language', 'en-US')
     input_param['region'] = kwargs.get('region', 'us-east-1')
     input_param['job_name'] = kwargs.get('job_name', 'transcribe_job_01')
-    input_param['ShowSpeakerLabels'] = kwargs.get('ShowSpeakerLabels', True)
-    input_param['MaxSpeakerLabels'] = kwargs.get('MaxSpeakerLabels', 2)
+    input_param['speaker_labels'] = kwargs.get('speaker_labels', False)
+    input_param['max_speakers'] = kwargs.get('max_speakers', 2)
 
-    input_param['c_scale'] = kwargs.get('c_scale', '')
+    input_param['context'] = kwargs.get('context', '')
     input_param['access_key'] = kwargs.get('access_key', '')
     input_param['secret_key'] = kwargs.get('secret_key', '')
     return input_param
 
-def speech_transcription_cloud(filepath, **kwargs):
+def speech_transcription_aws(s3_uri, **kwargs):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -74,29 +73,26 @@
     Parameters:
     ...........
-    filepath : str
+    s3_uri : str
         The S3 uri for the recording to be transcribed.
     kwargs: Object
-    model : str, optional
-        The transcription model to use ('aws'). Default is 'aws'.
     language : str, optional
         The language of the audio file (e.g. 'en-US', 'en-IN'). Default is 'en-US'.
     region : str, optional
         The AWS region to use (e.g. 'us-east-1').
-            Only applicable if model is 'aws'. Default is 'us-east-1'.
+            Default is 'us-east-1'.
         job_name : str, optional
             The name of the transcription job.
-            Only applicable if model is 'aws'. Default is 'transcribe_job_01'.
+            Default is 'transcribe_job_01'.
-    ShowSpeakerLabels : boolean, optional
-        Show speaker labels
-    MaxSpeakerLabels : int, optional
-        Max number of speakers
-    c_scale : str, optional
-        Clinical scale to use for slicing the separated audio files, if any.
     access_key : str, optional
         AWS access key
     secret_key : str, optional
         AWS secret key
-
-
+    speaker_labels : boolean, optional
+        Show speaker labels
+    max_speakers : int, optional
+        Max number of speakers
+    context : str, optional
+        Scale to use for slicing the separated audio files, if any.
+
     Returns:
     ...........
     json_response : JSON Object
@@ -108,10 +104,10 @@
     """
     input_param = read_kwargs(kwargs)
     measures = get_config()
-    json_response, transcript = tutil.transcribe_audio(filepath, input_param)
-
-    if input_param['ShowSpeakerLabels'] == True and input_param['c_scale']:
+    json_response, transcript = tutil.transcribe_audio(s3_uri, input_param)
+
+    if input_param['speaker_labels'] == True and input_param['context'].lower() in measures['scale'].split(','):
         content_dict = tutil.extract_content(json_response)
-        json_response = tutil.get_clinical_labels(input_param['c_scale'], measures, content_dict, json_response)
-
+
+        json_response = tutil.get_clinical_labels(input_param['context'], measures, content_dict, json_response)
     return json_response, transcript
diff --git a/openwillis/measures/audio/speech_transcribe.py b/openwillis/measures/audio/speech_transcribe_vosk.py
similarity index 72%
rename from openwillis/measures/audio/speech_transcribe.py
rename to openwillis/measures/audio/speech_transcribe_vosk.py
index 6be8eb5..f18ae9f 100644
--- a/openwillis/measures/audio/speech_transcribe.py
+++ b/openwillis/measures/audio/speech_transcribe_vosk.py
@@ -1,69 +1,43 @@
 # author: Vijay Yadav
-# website: http://www.bklynhlth.com
+# website: http://www.brooklyn.health
 
 # import the required packages
-
-import numpy as np
-import pandas as pd
 import os
 import wave
 import json
 import logging
+from vosk import Model, KaldiRecognizer
 
 from pydub import AudioSegment
 from openwillis.measures.audio.util import util as ut
-from openwillis.measures.audio.util import transcribe_util as tutil
 
 logging.basicConfig(level=logging.INFO)
 logger=logging.getLogger()
 
-def run_vosk(filepath, language='en-us', transcribe_interval = []):
+def get_config():
     """
     ------------------------------------------------------------------------------------------------------
 
-    Transcribe speech in an audio file using the Vosk model.
+    Load the configuration settings for the speech transcription.
 
     Parameters:
-    ............
-    filepath : str
-        The path to the audio file to be transcribed.
-    language : str, optional
-        The language of the audio file (e.g. 'en-us', 'es', 'fr'). Default is 'en-us'.
-    transcribe_interval : list, optional
-        A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed.
-        Default is an empty list.
+    ...........
+    None
 
     Returns:
-    ............
-    json_response : str
-        The JSON response from the Vosk transcription service.
-    transcript : str
-        The transcript of the audio file.
+    ...........
+    measures : dict
+        A dictionary containing the configuration settings.
------------------------------------------------------------------------------------------------------ """ - json_response = '{}' - transcript = mono_filepath = '' - - try: - if os.path.exists(filepath): - - measures = get_config() - mono_filepath = stereo_to_mono(filepath, transcribe_interval) - results = get_vosk(mono_filepath, language) - - ut.remove_dir(os.path.dirname(mono_filepath)) #Clean temp directory - json_response, transcript = filter_speech(measures, results) - - else: - logger.info(f'Audio file not available. File: {filepath}') - - except Exception as e: - ut.remove_dir(os.path.dirname(mono_filepath))#Clean temp directory - logger.error(f'Error in speech Transcription: {e} & File: {filepath}') + #Loading json config + dir_name = os.path.dirname(os.path.abspath(__file__)) + measure_path = os.path.abspath(os.path.join(dir_name, 'config/speech.json')) - finally: - return json_response, transcript + file = open(measure_path) + measures = json.load(file) + return measures def filter_audio(filepath, t_interval): """ @@ -96,35 +70,42 @@ def filter_audio(filepath, t_interval): sound = sound.set_channels(1) return sound -def stereo_to_mono(filepath, t_interval): +def filter_speech(measures, results): """ ------------------------------------------------------------------------------------------------------ - Convert a stereo audio file to a mono audio file. + Filter the speech transcription results to extract the transcript. Parameters: - ............ - filepath : str - The path to the stereo audio file to be converted. - t_interval : list - A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed. + ........... + measures : dict + A dictionary containing the configuration settings for the speech transcription. + results : list of dict + The raw transcription results returned by the transcription service. Returns: - ............ - mono_filepath : str - The path to the mono audio file. + ........... + result_key : list + A list containing the framewise transcription of the audio file. + transcript : str + The transcript of the audio file. 
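+
+    Example:
+    ...........
+    An illustrative sketch of the expected Vosk result shape (values are made up;
+    the measures dict is unused by the filtering itself):
+    >>> results = [{'result': [{'word': 'hello', 'start': 0.0, 'end': 0.4, 'conf': 0.95}], 'text': 'hello'}]
+    >>> filter_speech({}, results)
+    ([{'word': 'hello', 'start': 0.0, 'end': 0.4, 'conf': 0.95}], 'hello')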
------------------------------------------------------------------------------------------------------ """ - sound = filter_audio(filepath, t_interval) + result_key = [] + text_key = [] + transcript_dict = {} - filename, _ = os.path.splitext(os.path.basename(filepath)) - dir_name = os.path.join(os.path.dirname(filepath), 'temp_mono_' + filename) + for res in results: + dict_keys = res.keys() - ut.make_dir(dir_name) - mono_filepath = os.path.join(dir_name, filename + '.wav') - sound.export(mono_filepath, format="wav") - return mono_filepath + if 'result' in dict_keys and 'text' in dict_keys: + result_key.extend(res['result']) + text_key.append(res['text']) + + transcript_dict['result'] = result_key + transcript_dict['text'] = ' '.join(text_key) + return result_key, ' '.join(text_key) def get_vosk(audio_path, lang): """ @@ -146,9 +127,6 @@ def get_vosk(audio_path, lang): ------------------------------------------------------------------------------------------------------ """ - #import in-case of model=vosk - from vosk import Model, KaldiRecognizer - model = Model(lang=lang) wf = wave.open(audio_path, "rb") @@ -170,126 +148,96 @@ def get_vosk(audio_path, lang): results.append(partial_result) return results -def filter_speech(measures, results): +def stereo_to_mono(filepath, t_interval): """ ------------------------------------------------------------------------------------------------------ - Filter the speech transcription results to extract the transcript. + Convert a stereo audio file to a mono audio file. Parameters: - ........... - measures : dict - A dictionary containing the configuration settings for the speech transcription. - results : list of dict - The raw transcription results returned by the transcription service. + ............ + filepath : str + The path to the stereo audio file to be converted. + t_interval : list + A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed. Returns: - ........... - result_key : list - A list containing the framewise transcription of the audio file. - transcript : str - The transcript of the audio file. + ............ + mono_filepath : str + The path to the mono audio file. ------------------------------------------------------------------------------------------------------ """ - result_key = [] - text_key = [] - transcript_dict = {} - - for res in results: - dict_keys = res.keys() - - if 'result' in dict_keys and 'text' in dict_keys: - result_key.extend(res['result']) - text_key.append(res['text']) + sound = filter_audio(filepath, t_interval) - transcript_dict['result'] = result_key - transcript_dict['text'] = ' '.join(text_key) - return result_key, ' '.join(text_key) + filename, _ = os.path.splitext(os.path.basename(filepath)) + dir_name = os.path.join(os.path.dirname(filepath), 'temp_mono_' + filename) + ut.make_dir(dir_name) + mono_filepath = os.path.join(dir_name, filename + '.wav') + sound.export(mono_filepath, format="wav") + return mono_filepath -def get_config(): +def run_vosk(filepath, language, transcribe_interval = []): """ ------------------------------------------------------------------------------------------------------ - Load the configuration settings for the speech transcription. + Transcribe speech in an audio file using the Vosk model. Parameters: - ........... - None + ............ + filepath : str + The path to the audio file to be transcribed. + language : str, optional + The language of the audio file (e.g. 'en-us', 'es', 'fr'). Default is 'en-us'. 
+ transcribe_interval : list, optional + A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed. + Default is an empty list. Returns: - ........... - measures : dict - A dictionary containing the configuration settings. + ............ + json_response : str + The JSON response from the Vosk transcription service. + transcript : str + The transcript of the audio file. ------------------------------------------------------------------------------------------------------ """ - #Loading json config - dir_name = os.path.dirname(os.path.abspath(__file__)) - measure_path = os.path.abspath(os.path.join(dir_name, 'config/speech.json')) + json_response = json.dumps({}) + transcript = mono_filepath = '' - file = open(measure_path) - measures = json.load(file) - return measures + try: + if os.path.exists(filepath): -def run_whisperx(filepath, hf_token, del_model, num_speakers, infra_model, language): - """ - ------------------------------------------------------------------------------------------------------ + measures = get_config() + mono_filepath = stereo_to_mono(filepath, transcribe_interval) + results = get_vosk(mono_filepath, language) - Transcribe audio data using the WhisperX model. + ut.remove_dir(os.path.dirname(mono_filepath)) #Clean temp directory + json_response, transcript = filter_speech(measures, results) - Parameters: - ........... - filepath : str - The path to the audio file to be transcribed. - hf_token : str - The Hugging Face token for model authentication. - del_model: boolean - Boolean indicator to delete model if low on GPU resources - num_speakers: int - Number of speaker - infra_model:list - whisper model artifacts (this is optional param: to optimize willisInfra) - language: str - language code + else: + logger.info(f'Audio file not available. File: {filepath}') - Returns: - ........... - json_response : JSON Object - A transcription response object in JSON format - transcript : str - The transcript of the recording. + except Exception as e: + ut.remove_dir(os.path.dirname(mono_filepath))#Clean temp directory + logger.error(f'Error in speech Transcription: {e} & File: {filepath}') - ------------------------------------------------------------------------------------------------------ - """ - json_response = '{}' - transcript = '' - - if os.path.exists(filepath)== False or hf_token == '': + finally: return json_response, transcript - - from openwillis.measures.audio.util import whisperx_util as wutil #import in-case of model=whisperx - json_response, transcript = wutil.get_whisperx_diariazation(filepath, hf_token, del_model, num_speakers, infra_model, language) - - if str(json_response) != '{}': - json_response = tutil.replace_whisperx_speaker_labels(json_response, ['SPEAKER_00', 'SPEAKER_01'], - ['speaker0', 'speaker1']) - return json_response, transcript + -def speech_transcription(filepath, **kwargs): +def speech_transcription_vosk(filepath, **kwargs): """ ------------------------------------------------------------------------------------------------------ - Speech transcription function that transcribes an audio file using vosk/whisperx. + Speech transcription function that transcribes an audio file using vosk. Parameters: ........... filepath : str The path to the audio file to be transcribed. - model : str, optional - The transcription model to use ('vosk'). Default is 'vosk'. language : str, optional The language of the audio file (e.g. 'en-us', 'es', 'fr'). Default is 'en-us'. 
    transcribe_interval : list, optional
@@ -307,24 +255,10 @@
     """
     measures = get_config()
-    model = kwargs.get('model', 'vosk')
-    language = kwargs.get('language', 'en-us')
-    scale = kwargs.get('c_scale', '')
-    num_speakers = kwargs.get('num_speakers', None)
-    transcribe_interval = kwargs.get('transcribe_interval', [])
-    hf_token = kwargs.get('hf_token', '')
-    del_model = kwargs.get('del_model', False)
-    infra_model = kwargs.get('infra_model', [True, None, None])
+    language = kwargs.get('language', 'en-us')
+    transcribe_interval = kwargs.get('transcribe_interval', [])
 
-    if model == 'whisperx':
-        json_response, transcript = run_whisperx(filepath, hf_token, del_model, num_speakers, infra_model, language)
-
-        if scale.lower() in measures['scale'].split(','):
-            content_dict = tutil.get_whisperx_content(json_response)
-            json_response = tutil.get_whisperx_clinical_labels(scale, measures, content_dict, json_response)
-
-    else:
-        json_response, transcript = run_vosk(filepath, language, transcribe_interval)
+    json_response, transcript = run_vosk(filepath, language, transcribe_interval)
 
     return json_response, transcript
diff --git a/openwillis/measures/audio/speech_transcribe_whisper.py b/openwillis/measures/audio/speech_transcribe_whisper.py
new file mode 100644
index 0000000..5e5371d
--- /dev/null
+++ b/openwillis/measures/audio/speech_transcribe_whisper.py
@@ -0,0 +1,148 @@
+# author: Vijay Yadav
+# website: http://www.brooklyn.health
+
+# import the required packages
+
+import numpy as np
+import pandas as pd
+import os
+import json
+import logging
+
+from pydub import AudioSegment
+from openwillis.measures.audio.util import util as ut
+from openwillis.measures.audio.util import transcribe_util as tutil
+
+logging.basicConfig(level=logging.INFO)
+logger=logging.getLogger()
+
+
+def get_config():
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Load the configuration settings for the speech transcription.
+
+    Parameters:
+    ...........
+    None
+
+    Returns:
+    ...........
+    measures : dict
+        A dictionary containing the configuration settings.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    #Loading json config
+    dir_name = os.path.dirname(os.path.abspath(__file__))
+    measure_path = os.path.abspath(os.path.join(dir_name, 'config/speech.json'))
+
+    file = open(measure_path)
+    measures = json.load(file)
+    return measures
+
+def read_kwargs(kwargs):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Reads keyword arguments and returns a dictionary containing input parameters.
+
+    Parameters:
+    ...........
+    kwargs : dict
+        Keyword arguments to be processed.
+
+    Returns:
+    ...........
+    input_param: dict
+        A dictionary containing input parameters with their corresponding values.
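+
+    Example:
+    ...........
+    Illustrative only; any key that is not supplied falls back to its default:
+    >>> read_kwargs({'model': 'large-v2', 'hf_token': 'hf_xxx'})['model']
+    'large-v2'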
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    input_param = {}
+    input_param['model'] = kwargs.get('model', 'tiny')
+    input_param['language'] = kwargs.get('language', 'en')
+
+    input_param['context'] = kwargs.get('context', '')
+    input_param['max_speakers'] = kwargs.get('max_speakers', None)
+    input_param['min_speakers'] = kwargs.get('min_speakers', None)
+
+    input_param['hf_token'] = kwargs.get('hf_token', '')
+    input_param['del_model'] = kwargs.get('del_model', False) #Temp filter
+    input_param['infra_model'] = kwargs.get('infra_model', [True, None, None]) #Temp filter
+
+    return input_param
+
+def run_whisperx(filepath, input_param):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Transcribe audio data using the WhisperX model.
+
+    Parameters:
+    ...........
+    filepath : str
+        The path to the audio file to be transcribed.
+    input_param : dict
+        A dictionary containing input parameters
+
+    Returns:
+    ...........
+    json_response : JSON Object
+        A transcription response object in JSON format
+    transcript : str
+        The transcript of the recording.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    json_response = json.dumps({})
+    transcript = ''
+
+    if not os.path.exists(filepath) or input_param['hf_token'] == '':
+        return json_response, transcript
+
+    from openwillis.measures.audio.util import whisperx_util as wutil #lazy import: whisperx is only loaded when needed
+    json_response, transcript = wutil.get_whisperx_diariazation(filepath, input_param)
+
+    if str(json_response) != '{}':
+        json_response = tutil.filter_labels_whisper(json_response)
+    return json_response, transcript
+
+
+def speech_transcription_whisper(filepath, **kwargs):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Speech transcription function that transcribes an audio file using whisperx.
+
+    Parameters:
+    ...........
+    filepath : str
+        The path to the audio file to be transcribed.
+    model : str, optional
+        The WhisperX model to use (e.g. 'tiny', 'large-v2'). Default is 'tiny'.
+    language : str, optional
+        The language of the audio file (e.g. 'en', 'es', 'fr'). Default is 'en'.
+    context : str, optional
+        Scale to use for slicing the separated audio files, if any.
+    min_speakers : int, optional
+        Minimum number of speakers. Default is None.
+    max_speakers : int, optional
+        Maximum number of speakers. Default is None.
+    hf_token : str, optional
+        Access token for HuggingFace to access pre-trained models;
+        transcription is skipped if it is empty.
+    del_model : boolean, optional
+        Boolean indicator to delete model if low on GPU resources. Default is False.
+    infra_model : list, optional
+        Whisper model artifacts (optional parameter to optimize willisInfra).
+
+    Returns:
+    ...........
+    json_response : JSON Object
+        A transcription response object in JSON format
+    transcript : str
+        The transcript of the recording.
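+
+    Example:
+    ...........
+    A minimal usage sketch; 'audio.wav' and the Hugging Face token value are
+    illustrative placeholders, not shipped defaults:
+    >>> json_response, transcript = speech_transcription_whisper(
+    ...     'audio.wav', model='tiny', language='en', hf_token='hf_xxx', max_speakers=2)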
+ + ------------------------------------------------------------------------------------------------------ + """ + measures = get_config() + input_param = read_kwargs(kwargs) + + json_response, transcript = run_whisperx(filepath, input_param) + if input_param['context'].lower() in measures['scale'].split(','): + + content_dict = tutil.get_whisperx_content(json_response) + json_response = tutil.get_whisperx_clinical_labels(input_param['context'], measures, content_dict, json_response) + return json_response, transcript diff --git a/openwillis/measures/audio/util/separation_util.py b/openwillis/measures/audio/util/separation_util.py index 0959a4c..fff47d4 100644 --- a/openwillis/measures/audio/util/separation_util.py +++ b/openwillis/measures/audio/util/separation_util.py @@ -322,7 +322,7 @@ def transcribe_response_to_dataframe(response): speakers = 0 df = pd.DataFrame() - if 'segments' in response: + if 'results' in response: if 'speaker_labels' in response['results']: if 'speakers' in response['results']['speaker_labels']: @@ -338,6 +338,7 @@ def transcribe_response_to_dataframe(response): df = df[df["confidence"] > 0].reset_index(drop=True) df = df[["start_time", "end_time", "confidence", "speaker_label", "content"]] + return df, speakers def extract_data(segment_info): @@ -358,12 +359,18 @@ def extract_data(segment_info): ------------------------------------------------------------------------------------------------------ """ - phrase = segment_info["text"] - start = segment_info["start"] - end = segment_info["end"] + phrase = segment_info.get("text", "") + start = segment_info.get("start", np.nan) - score = segment_info["words"][0]["score"] if segment_info["words"] and len(segment_info["words"]) > 0 else 0 - speaker = segment_info["speaker"] if "speaker" in segment_info else "no_speaker" + end = segment_info.get("end", np.nan) + words = segment_info.get("words", None) + + if words is not None and len(words) > 0: + score = words[0].get("score", 0) + else: + score = 0 + + speaker = segment_info.get("speaker", "no_speaker") return pd.Series([start, end, phrase, score, speaker], index=["start", "end", "phrase", "score", "speaker"]) def whisperx_to_dataframe(json_response): @@ -386,23 +393,17 @@ def whisperx_to_dataframe(json_response): ------------------------------------------------------------------------------------------------------ """ - # Initialize an empty DataFrame - df = pd.DataFrame(columns=["start", "end", "phrase", "score", "speaker"]) + df = pd.DataFrame(columns=["start_time", "end_time", "content", "confidence", "speaker_label"]) if 'segments' in json_response: - for segment_info in json_response["segments"]: - try: - - segment_df = extract_data(segment_info) - df = df.append(segment_df, ignore_index=True) - - except Exception as e: - logger.info("Some segments have no speaker labels.") - - df = df[df["score"] > 0].reset_index(drop=True) - df = df[df["speaker"] != "no_speaker"].reset_index(drop=True) - df = df.rename(columns={"start": "start_time", "end": "end_time", "score":"confidence", "speaker":"speaker_label", - "phrase":"content"}) - + segment_infos = json_response["segments"] + df = pd.DataFrame(segment_infos).apply(extract_data, axis=1) + + df = df[df["score"] > 0].reset_index(drop=True) + df = df.dropna(subset=["start", "end"]).reset_index(drop=True) + + df = df[df["speaker"] != "no_speaker"].reset_index(drop=True) + df = df.rename(columns={"start": "start_time", "end": "end_time", "score": "confidence", "speaker": "speaker_label", "phrase": "content"}) + speakers = 
df['speaker_label'].nunique() return df, speakers \ No newline at end of file diff --git a/openwillis/measures/audio/util/transcribe_util.py b/openwillis/measures/audio/util/transcribe_util.py index 5e4049c..9b99542 100644 --- a/openwillis/measures/audio/util/transcribe_util.py +++ b/openwillis/measures/audio/util/transcribe_util.py @@ -1,5 +1,5 @@ # author: Vijay Yadav -# website: http://www.bklynhlth.com +# website: http://www.brooklyn.health # import the required packages @@ -58,6 +58,89 @@ def replace_speaker_labels(data, check_labels, speaker_labels): return data +def filter_labels_aws(data): + """ + ------------------------------------------------------------------------------------------------------ + + replaces speaker labels in AWS JSON. + + Parameters: + ........... + data : JSON + The JSON containing speaker labels. + + Returns: + ........... + data : JSON + The modified JSON with replaced speaker labels. + + ------------------------------------------------------------------------------------------------------ + """ + if 'results' in data: + speaker_labels = data['results'].get('speaker_labels', {}) + segments = speaker_labels.get('segments', []) + + for segment in segments: + seg_speaker_label = segment.get('speaker_label', '') + + if 'spk_' in seg_speaker_label: + segment['speaker_label'] = seg_speaker_label.replace("spk_", "speaker") + + seg_items = segment.get('items', []) + for seg_item in seg_items: + + seg_item_speaker_label = seg_item.get('speaker_label', '') + if 'spk_' in seg_item_speaker_label: + + seg_item['speaker_label'] = seg_item_speaker_label.replace("spk_", "speaker") + items = data['results'].get('items', []) + + for item in items: + item_speaker_label = item.get('speaker_label', '') + + if 'spk_' in item_speaker_label: + item['speaker_label'] = item_speaker_label.replace("spk_", "speaker") + + return data + +def filter_labels_whisper(data): + """ + ------------------------------------------------------------------------------------------------------ + + replaces speaker labels in Whisper JSON. + + Parameters: + ........... + data : JSON + The JSON containing speaker labels. + + Returns: + ........... + data : JSON + The modified JSON with replaced speaker labels. 
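+
+    Example:
+    ...........
+    Illustrative input/output (structure assumed from WhisperX responses):
+    >>> data = {'segments': [{'speaker': 'SPEAKER_00', 'words': [{'speaker': 'SPEAKER_00'}]}]}
+    >>> filter_labels_whisper(data)['segments'][0]['speaker']
+    'speaker0'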
+ + ------------------------------------------------------------------------------------------------------ + """ + for segment in data.get('segments', []): + current_speaker = segment.get('speaker', '') + + if 'SPEAKER_0' in current_speaker: + segment["speaker"] = current_speaker.replace("SPEAKER_0", "speaker") + + for word in segment["words"]: + word_speaker = word.get('speaker', '') + + if 'SPEAKER_0' in word_speaker: + word["speaker"] = word_speaker.replace("SPEAKER_0", "speaker") + + for word_segment in data.get('word_segments', []): + word_seg_speaker = word_segment.get('speaker', '') + + if 'SPEAKER_0' in word_seg_speaker: + word_segment["speaker"] = word_seg_speaker.replace("SPEAKER_0", "speaker") + + return data + def extract_content(data): """ ------------------------------------------------------------------------------------------------------ @@ -117,20 +200,19 @@ def get_clinical_labels(scale, measures, content_dict, json_response): ------------------------------------------------------------------------------------------------------ """ #Check if content is available for all the speaker - if content_dict and content_dict['speaker0'] and content_dict['speaker1']: - if scale.lower() not in measures['scale'].split(","): - return json_response + if len(content_dict) <2: + return json_response - score_string = scale.lower()+'_string' - spk1_score = sutil.match_transcript(measures[score_string], content_dict['speaker0']) - spk2_score = sutil.match_transcript(measures[score_string], content_dict['speaker1']) - - if spk1_score > spk2_score: - json_response = replace_speaker_labels(json_response, ['speaker0', 'speaker1'], ['clinician', 'participant']) + score_string = scale.lower()+'_string' + spk1_score = sutil.match_transcript(measures[score_string], content_dict['speaker0']) + spk2_score = sutil.match_transcript(measures[score_string], content_dict['speaker1']) - else: - json_response = replace_speaker_labels(json_response, ['speaker0', 'speaker1'], ['participant', 'clinician']) + if spk1_score > spk2_score: + json_response = replace_speaker_labels(json_response, ['speaker0', 'speaker1'], ['clinician', 'participant']) + else: + json_response = replace_speaker_labels(json_response, ['speaker0', 'speaker1'], ['participant', 'clinician']) + return json_response def get_job_status(transcribe, input_param): @@ -193,9 +275,9 @@ def filter_transcript_response(status, input_param): response = json.loads(read_data.read().decode('utf-8')) transcript = response['results']['transcripts'][0]['transcript'] - if input_param['ShowSpeakerLabels'] == True:#replace speaker labels with standard names + if input_param['speaker_labels'] == True:#replace speaker labels with standard names - response = replace_speaker_labels(response, ['spk_0', 'spk_1'], ['speaker0', 'speaker1']) + response = filter_labels_aws(response) return response, transcript def transcribe_audio(s3uri, input_param): @@ -222,24 +304,26 @@ def transcribe_audio(s3uri, input_param): ------------------------------------------------------------------------------------------------------ """ - response = json.loads("{}") + response = json.dumps({}) + settings = {} transcript = "" try: if input_param['access_key'] and input_param['secret_key']: - transcribe = boto3.client('transcribe', region_name = input_param['region'], aws_access_key_id = input_param['access_key'], aws_secret_access_key = input_param['secret_key']) + transcribe = boto3.client('transcribe', region_name = input_param['region'], + aws_access_key_id = 
input_param['access_key'], + aws_secret_access_key = input_param['secret_key']) else: transcribe = boto3.client('transcribe', region_name = input_param['region']) - settings = {'ShowSpeakerLabels': input_param['ShowSpeakerLabels'], 'MaxSpeakerLabels': input_param['MaxSpeakerLabels']} + if input_param['speaker_labels'] == True and input_param['max_speakers']>=2: + settings = {'ShowSpeakerLabels': input_param['speaker_labels'], 'MaxSpeakerLabels': input_param['max_speakers']} + transcribe.start_transcription_job( TranscriptionJobName=input_param['job_name'], Media={'MediaFileUri': s3uri}, - - #IdentifyMultipleLanguages=True, LanguageCode=input_param['language'], - Settings=settings - ) + Settings=settings) status = get_job_status(transcribe, input_param) if status['TranscriptionJob']['TranscriptionJobStatus'] == 'COMPLETED': diff --git a/openwillis/measures/audio/util/whisperx_util.py b/openwillis/measures/audio/util/whisperx_util.py index bd71064..a7bae73 100644 --- a/openwillis/measures/audio/util/whisperx_util.py +++ b/openwillis/measures/audio/util/whisperx_util.py @@ -27,7 +27,7 @@ def delete_model(model): torch.cuda.empty_cache() del model -def get_diarization(audio, align_json, HF_TOKEN, device, num_speakers, infra_model): +def get_diarization(audio, align_json, device, input_param): """ ------------------------------------------------------------------------------------------------------ @@ -38,12 +38,10 @@ def get_diarization(audio, align_json, HF_TOKEN, device, num_speakers, infra_mod audio signal object align_json: json aligned whisper transcribed output - HF_TOKEN : str - The Hugging Face token for model authentication. device : str device type - num_speakers: int - Number of speaker + input_param : dict + A dictionary containing input parameters Returns: ........... 
@@ -53,17 +51,23 @@ def get_diarization(audio, align_json, HF_TOKEN, device, num_speakers, infra_mod ------------------------------------------------------------------------------------------------------ """ # Assign speaker labels - if infra_model[0]: - diarize_model = whisperx.DiarizationPipeline(use_auth_token=HF_TOKEN, device=device) - + if input_param['infra_model'][0]: + diarize_model = whisperx.DiarizationPipeline(use_auth_token=input_param['hf_token'], device=device) else: - diarize_model = infra_model[2] + diarize_model = input_param['infra_model'][2] - if num_speakers == None: + if input_param['min_speakers'] == None and input_param['max_speakers'] == None: diarize_segments = diarize_model(audio) + elif input_param['min_speakers'] == None and input_param['max_speakers'] != None: + diarize_segments = diarize_model(audio, max_speakers = input_param['max_speakers']) + + elif input_param['min_speakers'] != None and input_param['max_speakers'] == None: + diarize_segments = diarize_model(audio, min_speakers= input_param['min_speakers']) + else: - diarize_segments = diarize_model(audio, min_speakers=num_speakers, max_speakers=num_speakers) + diarize_segments = diarize_model(audio, min_speakers=input_param['min_speakers'], max_speakers=input_param['max_speakers']) + json_response = whisperx.assign_word_speakers(diarize_segments, align_json) return json_response @@ -126,7 +130,7 @@ def transcribe_whisper(filepath, model, device, compute_type, batch_size, infra_ transcribe_json = model_whisp.transcribe(audio, batch_size=batch_size, language=language) return transcribe_json, audio -def get_whisperx_diariazation(filepath, HF_TOKEN, del_model, num_speakers, infra_model, language): +def get_whisperx_diariazation(filepath, input_param): """ ------------------------------------------------------------------------------------------------------ @@ -136,16 +140,8 @@ def get_whisperx_diariazation(filepath, HF_TOKEN, del_model, num_speakers, infra ........... filepath : str The path to the audio file to be transcribed. - HF_TOKEN : str - The Hugging Face token for model authentication. - del_model: boolean - Boolean indicator to delete model if low on GPU resources - num_speakers: int - Number of speaker - infra_model: list - whisper model artifacts (this is optional param: to optimize willisInfra) - language: str - language code + input_param : dict + A dictionary containing input parameters Returns: ........... 
@@ -158,11 +154,9 @@ def get_whisperx_diariazation(filepath, HF_TOKEN, del_model, num_speakers, infra """ device = 'cpu' compute_type = "int16" - - model = 'large-v2' batch_size = 16 - json_response = '{}' + json_response = json.dumps({}) transcript = '' try: @@ -170,16 +164,16 @@ def get_whisperx_diariazation(filepath, HF_TOKEN, del_model, num_speakers, infra device = 'cuda' compute_type = "float16" - transcribe_json, audio = transcribe_whisper(filepath, model, device, compute_type, batch_size, infra_model, language) + transcribe_json, audio = transcribe_whisper(filepath, input_param['model'], device, compute_type, batch_size, input_param['infra_model'], input_param['language']) # Align whisper output - model_a, metadata = whisperx.load_align_model(language_code=language, device=device) + model_a, metadata = whisperx.load_align_model(language_code=input_param['language'], device=device) align_json = whisperx.align(transcribe_json["segments"], model_a, metadata, audio, device, return_char_alignments=False) - if del_model: + if input_param['del_model']: delete_model(model_a) - json_response = get_diarization(audio, align_json, HF_TOKEN, device, num_speakers, infra_model) + json_response = get_diarization(audio, align_json, device, input_param) transcript = get_transcribe_summary(json_response) except Exception as e: diff --git a/openwillis/measures/commons/common.py b/openwillis/measures/commons/common.py index 9199a2e..d0c4951 100644 --- a/openwillis/measures/commons/common.py +++ b/openwillis/measures/commons/common.py @@ -26,7 +26,7 @@ def make_dir(dir_name): if not os.path.exists(dir_name): os.makedirs(dir_name) -def to_audio(filepath, speaker_label, out_dir): +def to_audio(filepath, speaker_dict, output_dir): """ ------------------------------------------------------------------------------------------------------ @@ -36,22 +36,22 @@ def to_audio(filepath, speaker_label, out_dir): ---------- filepath : str The path to the input audio file. - speaker_label : dict + speaker_dict : dict A dictionary containing speaker labels as keys and corresponding segments (NumPy arrays) as values. - out_dir : str + output_dir : str The directory where the output audio files will be saved. 
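+
+    Example:
+    ----------
+    A minimal sketch; 'speaker_dict' is the label-to-signal mapping returned by the
+    speaker separation functions, and 'audio.wav' and 'out_dir/' are placeholders:
+    >>> to_audio('audio.wav', speaker_dict, 'out_dir/')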
------------------------------------------------------------------------------------------------------ """ - make_dir(out_dir) - for key, value in speaker_label.items(): + make_dir(output_dir) + for key, value in speaker_dict.items(): file_name, _ = os.path.splitext(os.path.basename(filepath)) audio_signal = AudioSegment.from_file(file = filepath, format = "wav") spk_signal = AudioSegment(value.tobytes(), frame_rate=audio_signal.frame_rate, sample_width=audio_signal.sample_width, channels=audio_signal.channels) - output_file = os.path.join(out_dir, file_name + '_' + key + '.wav') + output_file = os.path.join(output_dir, file_name + '_' + key + '.wav') spk_signal.export(output_file, format="wav") def get_config(filepath, json_file): diff --git a/openwillis/measures/text/config/text.json b/openwillis/measures/text/config/text.json index fe08030..897ace2 100644 --- a/openwillis/measures/text/config/text.json +++ b/openwillis/measures/text/config/text.json @@ -21,8 +21,8 @@ "word_pause": "pre_word_pause", "phrase_pause": "pre_phrase_pause", "turn_pause": "pre_turn_pause", - "word_pause_mean": "word_pause_length_mean", - "word_pause_var": "word_pause_variability", + "word_pause_mean": "mean_pre_word_pause", + "word_pause_var": "mean_pause_variability", "phrase_pause_mean": "phrase_pause_length_mean", "phrase_pause_var": "phrase_pause_variability", "num_syllables": "num_syllables", @@ -33,6 +33,8 @@ "speech_words": "speech_length_words", "turn_minutes": "turn_length_minutes", "turn_words": "turn_length_words", + "file_length": "file_length", + "speaker_percentage": "speaker_percentage", "word_rate": "words_per_min", "syllable_rate": "syllables_per_min", "pause_rate": "pauses_per_min", diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py index e5bd708..b030e41 100644 --- a/openwillis/measures/text/speech_attribute.py +++ b/openwillis/measures/text/speech_attribute.py @@ -78,23 +78,25 @@ def is_whisper_transcribe(json_conf): ------------------------------------------------------------------------------------------------------ """ if "segments" in json_conf: - if "words" in json_conf["segments"][0]: - return True - return False + if len(json_conf["segments"])>0: + if "words" in json_conf["segments"][0]: + return True + return False -def filter_transcribe(json_conf, measures, speaker_label=None): +def filter_transcribe(json_conf, measures, min_turn_length, speaker_label=None): """ ------------------------------------------------------------------------------------------------------ - This function extracts the text and filters the JSON data - for Amazon Transcribe json response objects. - Also, it filters the JSON data based on the speaker label if provided. + This function extracts the text and filters the JSON data for Amazon Transcribe json response objects. + Also, it filters the JSON data based on the speaker label if provided. Parameters: ........... json_conf: dict aws transcribe json response. measures: dict A dictionary containing the names of the columns in the output dataframes. + min_turn_length: int + minimum words required in each turn speaker_label: str Speaker label Returns: @@ -104,58 +106,34 @@ def filter_transcribe(json_conf, measures, speaker_label=None): only the relevant data for processing. text_list: list List of transcribed text. - split into words, phrases, turns, and full text. + split into words, turns, and full text. text_indices: list List of indices for text_list. - for phrases and turns. - Raises: - ........... 
- ValueError: If the speaker label is not found in the json response object. ------------------------------------------------------------------------------------------------------ """ item_data = json_conf["results"]["items"] - - # make a dictionary to map old indices to new indices - item_data = cutil.create_index_column(item_data, measures) + + for i, item in enumerate(item_data): # create_index_column + item[measures["old_index"]] = i # extract text - text = " ".join( - [ - item["alternatives"][0]["content"] - for item in item_data - if "alternatives" in item - ] - ) - - # phrase-split - phrases, phrases_idxs = cutil.phrase_split(text) - - # turn-split - turns = [] - turns_idxs = [] + text = " ".join([item["alternatives"][0]["content"] for item in item_data if "alternatives" in item]) if speaker_label is not None: + turns_idxs, turns = cutil.filter_speaker_aws(item_data, min_turn_length, speaker_label) + text = " ".join(turns) + + else: + turns_idxs, turns = [], [] - turns_idxs, turns, phrases_idxs, phrases = cutil.filter_speaker( - item_data, speaker_label, turns_idxs, turns, phrases_idxs, phrases - ) - - # entire transcript - by joining all the phrases - text = " ".join(phrases) - - # filter json to only include items with start_time and end_time - filter_json = cutil.filter_json_transcribe(item_data, speaker_label, measures) - - # extract words + filter_json = cutil.filter_json_transcribe_aws(item_data, speaker_label, measures) words = [word["alternatives"][0]["content"] for word in filter_json] - text_list = [words, phrases, turns, text] - text_indices = [phrases_idxs, turns_idxs] - - return filter_json, text_list, text_indices + text_list = [words, turns, text] + return filter_json, text_list, turns_idxs -def filter_whisper(json_conf, measures, speaker_label=None): +def filter_whisper(json_conf, measures, min_turn_length, speaker_label=None): """ ------------------------------------------------------------------------------------------------------ @@ -171,6 +149,8 @@ def filter_whisper(json_conf, measures, speaker_label=None): A dictionary containing the names of the columns in the output dataframes. speaker_label: str Speaker label + min_turn_length: int + minimum words required in each turn Returns: ........... 
@@ -190,43 +170,25 @@ def filter_whisper(json_conf, measures, speaker_label=None): ------------------------------------------------------------------------------------------------------ """ item_data = json_conf["segments"] + text = " ".join(item.get("text", "") for item in item_data) if speaker_label is not None: - # filter out segments that do not have speaker labels - item_data = [ - segment for segment in item_data if "speaker" in segment - ] - - # make a dictionary to map old indices to new indices + item_data = [segment for segment in item_data if "speaker" in segment] + item_data = cutil.create_index_column(item_data, measures) - - # phrase-split - phrases_idxs, phrases = cutil.filter_phrases( - item_data, speaker_label, measures - ) - - # turn-split - if speaker_label is not None: - turns_idxs, turns = cutil.filter_turns( - item_data, speaker_label, measures - ) + if speaker_label is not None: + turns_idxs, turns = cutil.filter_turns(item_data, speaker_label, measures, min_turn_length) + + text = " ".join(turns) else: turns_idxs, turns = [], [] - - + # filter json to only include items with start_time and end_time filter_json = cutil.filter_json_transcribe(item_data, speaker_label, measures) - - # extract words - words = [w["word"] for w in filter_json] - - # entire transcript - by joining all the phrases - text = " ".join(phrases) - - text_list = [words, phrases, turns, text] - text_indices = [phrases_idxs, turns_idxs] - - return filter_json, text_list, text_indices + words = [value["word"] for value in filter_json] + + text_list = [words, turns, text] + return filter_json, text_list, turns_idxs def filter_vosk(json_conf, measures): @@ -258,12 +220,125 @@ def filter_vosk(json_conf, measures): # make a dictionary to map old indices to new indices for i, item in enumerate(json_conf): item[measures["old_index"]] = i + + return words, text +def common_summary_feature(df_summ, json_data, model, speaker_label): + """ + ------------------------------------------------------------------------------------------------------ - return words, text + Calculate file features based on JSON data. + Parameters: + ........... + json_conf: list + JSON response object. + summ_df: pandas dataframe + A dataframe containing summary information on the speech + model: str + model name + speaker_label: str + Speaker label -def speech_characteristics(json_conf, language="en", speaker_label=None): + Returns: + ........... 
+ summ_df: pandas dataframe + A dataframe containing summary information on the speech + + ------------------------------------------------------------------------------------------------------ + """ + try: + if model == 'vosk': + if len(json_data) > 0 and 'end' in json_data[-1]: + + last_dict = json_data[-1] + df_summ['file_length'] = [last_dict['end']] + + else: + if model == 'aws': + json_data = json_data["results"] + fl_length, spk_pct = cutil.calculate_file_feature(json_data, model, speaker_label) + + else: + fl_length, spk_pct = cutil.calculate_file_feature(json_data, model, speaker_label) + + df_summ['file_length'] = [fl_length] + df_summ['speaker_percentage'] = [spk_pct]# if speaker_label is not None else df_summ['speaker_percentage'] + + except Exception as e: + logger.error("Error in file length calculation") + return df_summ + +def process_transcript(df_list, json_conf, measures, min_turn_length, speaker_label, source, language): + """ + ------------------------------------------------------------------------------------------------------ + + Process transcript + + Parameters: + ........... + df_list: list, : + contains pandas dataframe + json_conf: dict + Transcribed json file + measures: dict + A dictionary containing the names of the columns in the output dataframes. + min_turn_length: int + minimum words required in each turn + speaker_label: str + Speaker label + source: str + model name + language: str + Language type + + Returns: + ........... + df_list: list + contains pandas dataframe + + ------------------------------------------------------------------------------------------------------ + """ + common_summary_feature(df_list[2], json_conf, source, speaker_label) + + if source == 'whisper': + info = filter_whisper(json_conf, measures, min_turn_length, speaker_label) + + elif source == 'aws': + info = filter_transcribe(json_conf, measures, min_turn_length, speaker_label) + + else: + words, text = filter_vosk(json_conf, measures) + info = (json_conf, [words, [], text], []) + + if len(info[0]) > 0 and len(info[1][-1]) > 0: + df_list = cutil.process_language_feature(df_list, info, language, get_time_columns(source), measures) + return df_list + +def get_time_columns(source): + """ + ------------------------------------------------------------------------------------------------------ + + get time columns + + Parameters: + ........... + source: str + model name + + Returns: + ........... + object: list + time index name + + ------------------------------------------------------------------------------------------------------ + """ + if source == 'aws': + return ["start_time", "end_time"] + else: + return ["start", "end"] + +def speech_characteristics(json_conf, language="en", speaker_label=None, min_turn_length=1): """ ------------------------------------------------------------------------------------------------------ @@ -277,14 +352,14 @@ def speech_characteristics(json_conf, language="en", speaker_label=None): Language type speaker_label: str Speaker label + min_turn_length: int + minimum words required in each turn Returns: ........... 
    df_list: list, contains:
             word_df: pandas dataframe
                 A dataframe containing word summary information
-            phrase_df: pandas dataframe
-                A dataframe containing phrase summary information
             turn_df: pandas dataframe
                 A dataframe containing turn summary information
             summ_df: pandas dataframe
@@ -292,56 +367,33 @@
 
     ------------------------------------------------------------------------------------------------------
     """
-
-    measures = get_config(os.path.abspath(__file__), "text.json")
-    df_list = cutil.create_empty_dataframes(measures)
-
     try:
-        if not isinstance(language, str):
-            raise ValueError("Language should be a string")
-
-        if len(language) < 2:
-            # if language is not specified, then set it to "xx"
-            # run speech characteristics as not english
-            language = "xx"
-        else:
-            language = language[:2].lower()
+        # Load configuration measures
+        measures = get_config(os.path.abspath(__file__), "text.json")
+        df_list = cutil.create_empty_dataframes(measures)
 
         if bool(json_conf):
-            cutil.download_nltk_resources()
+            language = language[:2].lower() if (language and len(language) >= 2) else "na"
+
+            if language == 'en':
+                cutil.download_nltk_resources()
 
             if is_whisper_transcribe(json_conf):
-                filter_json, text_list, text_indices = filter_whisper(
-                    json_conf, measures, speaker_label
-                )
-
-                if len(filter_json) > 0 and len(text_list[-1]) > 0:
-                    df_list = cutil.process_language_feature(
-                        filter_json, df_list, text_list,
-                        text_indices, language, measures,
-                    )
+                df_list = process_transcript(df_list, json_conf, measures, min_turn_length, speaker_label, 'whisper', language)
+
+            elif is_amazon_transcribe(json_conf):
+                df_list = process_transcript(df_list, json_conf, measures, min_turn_length, speaker_label, 'aws', language)
+
             else:
-                words, text = filter_vosk(json_conf, measures)
-                if len(text) > 0:
-                    df_list = cutil.process_language_feature(
-                        json_conf, df_list, [words, [], [], text],
-                        [[], []], language, measures,
-                    )
-
+                df_list = process_transcript(df_list, json_conf, measures, min_turn_length, speaker_label, 'vosk', language)
+
     except Exception as e:
         logger.error(f"Error in Speech Characteristics {e}")
 
     finally:
-        # if word_df is empty, then add a row of NaNs
-        if df_list[0].empty:
-            df_list[0].loc[0] = np.nan
-        # if phrase_df is empty, then add a row of NaNs
-        if df_list[1].empty:
-            df_list[1].loc[0] = np.nan
-        # if turn_df is empty, then add a row of NaNs
-        if df_list[2].empty:
-            df_list[2].loc[0] = np.nan
-        # if summ_df is empty, then add a row of NaNs
-        if df_list[3].empty:
-            df_list[3].loc[0] = np.nan
-
-        return df_list
+        # add a row of NaNs to any dataframe that is still empty
+        for df in df_list:
+            if df.empty:
+                df.loc[0] = np.nan
+
+        return df_list
diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py
index dcefbe6..d09d8c1 100644
--- a/openwillis/measures/text/util/characteristics_util.py
+++ b/openwillis/measures/text/util/characteristics_util.py
@@ -15,23 +15,8 @@
 logger = logging.getLogger()
 
 # NLTK Tag list
-TAG_DICT = {
-    "PRP": "Pronoun",
-    "PRP$": "Pronoun",
-    "VB": "Verb",
-    "VBD": "Verb",
-    "VBG": "Verb",
-    "VBN": "Verb",
-    "VBP": "Verb",
-    "VBZ": "Verb",
-    "JJ": "Adjective",
-    "JJR": "Adjective",
-    "JJS": "Adjective",
-    "NN": "Noun",
-    "NNP": "Noun",
-    "NNS": "Noun",
-}
-
+TAG_DICT = {"PRP": "Pronoun", "PRP$": "Pronoun", "VB": "Verb", "VBD": "Verb", "VBG": "Verb", "VBN": "Verb", "VBP": "Verb",
+            "VBZ": "Verb", "JJ": "Adjective", "JJR": "Adjective", "JJS": "Adjective", "NN": "Noun", "NNP": "Noun",
"NNS": "Noun"} def create_empty_dataframes(measures): """ @@ -46,167 +31,87 @@ def create_empty_dataframes(measures): Returns: ........... - word_df: pandas dataframe - A dataframe containing word summary information - phrase_df: pandas dataframe - A dataframe containing phrase summary information - turn_df: pandas dataframe - A dataframe containing turn summary information - summ_df: pandas dataframe - A dataframe containing summary information on the speech + tuple: pandas dataframe + An empty dataframe for word, turn and summary measures ------------------------------------------------------------------------------------------------------ """ - word_df = pd.DataFrame( - columns=[ - measures["word_pause"], - measures["num_syllables"], - measures["part_of_speech"], - measures["pos"], - measures["neg"], - measures["neu"], - measures["compound"], - ] - ) - - phrase_df = pd.DataFrame( - columns=[ - measures["phrase_pause"], - measures["phrase_minutes"], - measures["phrase_words"], - measures["word_rate"], - measures["syllable_rate"], - measures["pause_rate"], - measures["pause_var"], - measures["pause_meandur"], - measures["speech_percentage"], - measures["speech_noun"], - measures["speech_verb"], - measures["speech_adj"], - measures["speech_pronoun"], - measures["pos"], - measures["neg"], - measures["neu"], - measures["compound"], - measures["speech_mattr"], - ] - ) - - turn_df = pd.DataFrame( - columns=[ - measures["turn_pause"], - measures["turn_minutes"], - measures["turn_words"], - measures["word_rate"], - measures["syllable_rate"], - measures["pause_rate"], - measures["pause_var"], - measures["pause_meandur"], - measures["speech_percentage"], - measures["speech_noun"], - measures["speech_verb"], - measures["speech_adj"], - measures["speech_pronoun"], - measures["pos"], - measures["neg"], - measures["neu"], - measures["compound"], - measures["speech_mattr"], - measures["interrupt_flag"], - ] - ) + word_df = pd.DataFrame(columns=[measures["word_pause"], measures["num_syllables"], measures["part_of_speech"]]) + turn_df = pd.DataFrame(columns=[measures["turn_pause"], measures["turn_minutes"], measures["turn_words"], + measures["word_rate"], measures["syllable_rate"], measures["speech_percentage"], + measures["pause_meandur"], measures["pause_var"], measures["pos"], measures["neg"], + measures["neu"], measures["compound"], measures["speech_mattr"], + measures["interrupt_flag"]]) summ_df = pd.DataFrame( - columns=[ - measures["speech_minutes"], - measures["speech_words"], - measures["word_rate"], - measures["syllable_rate"], - measures["pause_rate"], - measures["word_pause_mean"], - measures["word_pause_var"], - measures["phrase_pause_mean"], - measures["phrase_pause_var"], - measures["speech_percentage"], - measures["speech_noun"], - measures["speech_verb"], - measures["speech_adj"], - measures["speech_pronoun"], - measures["pos"], - measures["neg"], - measures["neu"], - measures["compound"], - measures["speech_mattr"], - measures["num_turns"], - measures["turn_minutes_mean"], - measures["turn_words_mean"], - measures["turn_pause_mean"], - measures["num_one_word_turns"], - measures["num_interrupts"], - ] - ) - - return word_df, phrase_df, turn_df, summ_df - - -def filter_speaker_phrase(item_data, speaker_label, phrases_idxs, phrases): + columns=[measures["file_length"], measures["speech_minutes"], measures["speech_words"], measures["word_rate"], + measures["syllable_rate"], measures["word_pause_mean"], measures["word_pause_var"], + measures["speech_percentage"], measures["pos"], 
measures["neg"], measures["neu"], measures["compound"], + measures["speech_mattr"], measures["num_turns"], measures["num_one_word_turns"], measures["turn_minutes_mean"], + measures["turn_words_mean"], measures["turn_pause_mean"], measures["speaker_percentage"], + measures["num_interrupts"]]) + + return word_df, turn_df, summ_df + +def create_index_column(item_data, measures): """ - ------------------------------------------------------------------------------------------------------ - This function updates the phrases list - to only include the speaker label provided. + This function creates an index column in the JSON response object. + Parameters: - ........... item_data: dict JSON response object. - speaker_label: str - Speaker label - phrases_idxs: list - A list of tuples containing - the start and end indices of the phrases in the JSON object. - phrases: list - A list of phrases extracted from the JSON object. + measures: dict + A dictionary containing the names of the columns in the output dataframes. + Returns: - ........... - phrases_idxs: list - A list of tuples containing - the start and end indices of the phrases in the JSON object. - phrases: list - A list of phrases extracted from the JSON object. - ------------------------------------------------------------------------------------------------------ + item_data: dict + The updated JSON response object. """ - phrases_idxs2 = [] - phrases2 = [] - for i, phrase in enumerate(phrases_idxs): - try: - start_idx = phrase[0] - if item_data[start_idx].get("speaker_label", "") == speaker_label: - phrases_idxs2.append(phrase) - phrases2.append(phrases[i]) - except Exception as e: - logger.error(f"Error in phrase-split for speaker {speaker_label}: {e}") - continue + index = 0 + for item in item_data: + + for word in item.get("words", []): + word[measures["old_index"]] = index + index += 1 + + return item_data - return phrases_idxs2, phrases2 +def download_nltk_resources(): + """ + ------------------------------------------------------------------------------------------------------ + + This function downloads the + required NLTK resources for processing text data. + + ------------------------------------------------------------------------------------------------------ + """ + try: + nltk.data.find("tokenizers/punkt") + except LookupError: + nltk.download("punkt") + try: + nltk.data.find("averaged_perceptron_tagger") + except LookupError: + nltk.download("averaged_perceptron_tagger") -def filter_speaker_turn(item_data, speaker_label, turns_idxs, turns): +def filter_turn_aws(item_data, min_turn_length, speaker_label): """ ------------------------------------------------------------------------------------------------------ This function updates the turns list to only include the speaker label provided. + Parameters: ........... item_data: dict JSON response object. + min_turn_length: int + minimum words required in each turn speaker_label: str Speaker label - turns_idxs: list - A list of tuples containing - the start and end indices of the turns in the JSON object. - turns: list - A list of turns extracted from the JSON object. + Returns: ........... turns_idxs: list @@ -214,148 +119,75 @@ def filter_speaker_turn(item_data, speaker_label, turns_idxs, turns): the start and end indices of the turns in the JSON object. turns: list A list of turns extracted from the JSON object. 
+ ------------------------------------------------------------------------------------------------------ """ start_idx = 0 + turns_idxs, turns = [], [] for i, item in enumerate(item_data): + try: - if ( - i > 0 - and item.get("speaker_label", "") == speaker_label - and item_data[i - 1].get("speaker_label", "") != speaker_label - ): + if (i > 0 and item.get("speaker_label", "") == speaker_label and item_data[i - 1].get("speaker_label", "") != speaker_label): start_idx = i - elif ( - i > 0 - and item.get("speaker_label", "") != speaker_label - and item_data[i - 1].get("speaker_label", "") == speaker_label - ): - turns_idxs.append((start_idx, i - 1)) - # create turns texts - turns.append( - " ".join( - [ - item["alternatives"][0]["content"] - for item in item_data[start_idx:i] - ] - ) - ) + + elif (i > 0 and item.get("speaker_label", "") != speaker_label and item_data[i - 1].get("speaker_label", "") == speaker_label): + turn_text = " ".join([item["alternatives"][0]["content"] for item in item_data[start_idx:i]]) + + if len(turn_text.split(" ")) >= min_turn_length: + turns_idxs.append((start_idx, i - 1)) + turns.append(turn_text) + except Exception as e: logger.error(f"Error in turn-split for speaker {speaker_label}: {e}") continue - # if the last item is the speaker label if start_idx not in [item[0] for item in turns_idxs]: - turns_idxs.append((start_idx, len(item_data) - 1)) - turns.append( - " ".join( - [ - item["alternatives"][0]["content"] - for item in item_data[start_idx:] - ] - ) - ) - return turns_idxs, turns + turn_text = " ".join([item["alternatives"][0]["content"] for item in item_data[start_idx:]]) + if len(turn_text.split(" ")) >= min_turn_length: + turns_idxs.append((start_idx, len(item_data) - 1)) + + turns.append(turn_text) + return turns_idxs, turns -def filter_speaker(item_data, speaker_label, turns_idxs, turns, phrases_idxs, phrases): +def filter_speaker_aws(item_data, min_turn_length, speaker_label): """ ------------------------------------------------------------------------------------------------------ - This function updates the turns and phrases lists - to only include the speaker label provided. + + This function updates the turns lists to only include the speaker label provided. + Parameters: ........... item_data: dict JSON response object. + min_turn_length: int + minimum words required in each turn speaker_label: str Speaker label - turns_idxs: list - A list of tuples containing - the start and end indices of the turns in the JSON object. - turns: list - A list of turns extracted from the JSON object. - phrases_idxs: list - A list of tuples containing - the start and end indices of the phrases in the JSON object. - phrases: list - A list of phrases extracted from the JSON object. + Returns: ........... turns_idxs: list - A list of tuples containing - the start and end indices of the turns in the JSON object. + A list of tuples containing the start and end indices of the turns in the JSON object. turns: list A list of turns extracted from the JSON object. - phrases_idxs: list - A list of tuples containing - the start and end indices of the phrases in the JSON object. - phrases: list - A list of phrases extracted from the JSON object. - Raises: - ........... - ValueError: If the speaker label is not found in the json response object. 
+ ------------------------------------------------------------------------------------------------------ """ - speaker_labels = [ - item["speaker_label"] for item - in item_data if "speaker_label" in item - ] + speaker_labels = [item["speaker_label"] for item in item_data if "speaker_label" in item] if speaker_label not in speaker_labels: - raise ValueError( - f"Speaker label {speaker_label} " - "not found in the json response object." - ) - - # phrase-split for the speaker label - phrases_idxs, phrases = filter_speaker_phrase( - item_data, speaker_label, phrases_idxs, phrases - ) - - # turn-split for the speaker label - turns_idxs, turns = filter_speaker_turn( - item_data, speaker_label, turns_idxs, turns - ) - - return turns_idxs, turns, phrases_idxs, phrases + logger.error(f"Speaker label {speaker_label} not found in the json response object.") + turns_idxs, turns = filter_turn_aws(item_data, min_turn_length, speaker_label) + return turns_idxs, turns -def phrase_split(text): +def filter_json_transcribe_aws(item_data, speaker_label, measures): """ ------------------------------------------------------------------------------------------------------ - This function splits the input text into phrases. - Parameters: - ........... - text: str - The input text. - Returns: - ........... - phrases: list - A list of phrases extracted from the input text. - phrases_idxs: list - A list of tuples containing - the start and end indices of the phrases in the input text. - ------------------------------------------------------------------------------------------------------ - """ - phrases = nltk.tokenize.sent_tokenize(text) - phrases_idxs = [] - - start_idx = 0 - for phrase in phrases: - end_idx = start_idx + len(phrase.split()) - 1 - phrases_idxs.append((start_idx, end_idx)) - start_idx = end_idx + 1 - - return phrases, phrases_idxs - -def filter_turns(item_data, speaker_label, measures): - """ - ------------------------------------------------------------------------------------------------------ - - This function updates the turns list - to only include the speaker label provided. + This function filters the JSON response object to only include items with start_time and end_time. Parameters: ........... @@ -368,85 +200,24 @@ def filter_turns(item_data, speaker_label, measures): Returns: ........... - turns_idxs: list - A list of tuples containing - the start and end indices of the turns in the JSON object. - turns: list - A list of turns extracted from the JSON object. - - Raises: - ........... - ValueError: If the speaker label is not found in the json response object. + filter_json: list + The updated JSON response object. ------------------------------------------------------------------------------------------------------ """ + filter_json = [item for item in item_data if "start_time" in item and "end_time" in item] + filter_json = pause_calculation(filter_json, measures, ['start_time', 'end_time']) - speaker_labels = [ - item["speaker"] for item - in item_data if "speaker" in item - ] - - if speaker_label not in speaker_labels: - raise ValueError( - f"Speaker label {speaker_label} " - "not found in the json response object." 
- ) - - turns_idxs, turns = [], [] - - start_idx = 0 - start_idx2 = 0 - for i, item in enumerate(item_data): - try: - if ( - i > 0 - and item.get("speaker", "") == speaker_label - and item_data[i - 1].get("speaker", "") != speaker_label - ): - start_idx = i - start_idx2 = item["words"][0][measures["old_index"]] - elif ( - i > 0 - and item.get("speaker", "") != speaker_label - and item_data[i - 1].get("speaker", "") == speaker_label - ): - end_idx = i-1 - end_idx2 = item["words"][-1][measures["old_index"]] - turns_idxs.append((start_idx2, end_idx2)) - # create turns texts - turns.append( - " ".join( - [ - item["text"] - for item in item_data[start_idx:(end_idx+1)] - ] - ) - ) - except Exception as e: - logger.error(f"Error in turn-split for speaker {speaker_label}: {e}") - continue - - # if the last item is the speaker label - if start_idx not in [item[0] for item in turns_idxs]: - end_idx2 = item_data[-1]["words"][-1][measures["old_index"]] - turns_idxs.append((start_idx2, end_idx2)) - turns.append( - " ".join( - [ - item["text"] - for item in item_data[start_idx:] - ] - ) - ) - - return turns_idxs, turns + if speaker_label is not None: + filter_json = [item for item in filter_json if item.get("speaker_label", "") == speaker_label] + return filter_json -def filter_phrases(item_data, speaker_label, measures): +def filter_turns(item_data, speaker_label, measures, min_turn_length): """ ------------------------------------------------------------------------------------------------------ - This function updates the phrases list + This function updates the turns list to only include the speaker label provided. Parameters: @@ -457,74 +228,60 @@ def filter_phrases(item_data, speaker_label, measures): Speaker label measures: dict A dictionary containing the names of the columns in the output dataframes. + min_turn_length: int + minimum words required in each turn Returns: ........... - phrases_idxs: list + turns_idxs: list A list of tuples containing - the start and end indices of the phrases in the JSON object. - phrases: list - A list of phrases extracted from the JSON object. + the start and end indices of the turns in the JSON object. + turns: list + A list of turns extracted from the JSON object. 
------------------------------------------------------------------------------------------------------ """ + turns_idxs, turns = [], [] + current_turn = None - - phrases_idxs, phrases = [], [] for item in item_data: try: - - start_idx = item["words"][0][measures["old_index"]] - end_idx = item["words"][-1][measures["old_index"]] - - if speaker_label is not None: + + if "speaker" in item: if item["speaker"] == speaker_label: - phrases.append(item["text"]) - phrases_idxs.append((start_idx, end_idx)) - else: - phrases.append(item["text"]) - phrases_idxs.append((start_idx, end_idx)) - + current_turn = [item] if current_turn is None else current_turn + [item] + + else: + if current_turn is not None: + + if len(current_turn)>0 and len(current_turn[0]["words"])>0: + start_idx2 = current_turn[0]["words"][0][measures["old_index"]] + + end_idx2 = current_turn[-1]["words"][-1][measures["old_index"]] + turn_text = " ".join(item["text"] for item in current_turn) + + if len(turn_text.split(" ")) >= min_turn_length: + turns_idxs.append((start_idx2, end_idx2)) + + turns.append(turn_text) + current_turn = None + except Exception as e: - logger.error(f"Failed to filter phrases: {e}") - return phrases_idxs, phrases - - -def create_index_column(item_data, measures): - """ - ------------------------------------------------------------------------------------------------------ - - This function creates an index column in the JSON response object. - - Parameters: - ........... - item_data: dict - JSON response object. - measures: dict - A dictionary containing the names of the columns in the output dataframes. - - Returns: - ........... - item_data: dict - The updated JSON response object. - - ------------------------------------------------------------------------------------------------------ - """ - i = 0 - i_p = 0 - while True: - for j, word in enumerate(item_data[i_p]["words"]): - item_data[i_p]["words"][j][measures["old_index"]] = i - i += 1 - - i_p += 1 - if i_p >= len(item_data): - break + logger.error(f"Error in turn calculation {e}") - return item_data - + if current_turn is not None: + start_idx2 = current_turn[0]["words"][0][measures["old_index"]] + + end_idx2 = current_turn[-1]["words"][-1][measures["old_index"]] + turn_text = " ".join(item["text"] for item in current_turn) + + if len(turn_text.split(" ")) >= min_turn_length: + turns_idxs.append((start_idx2, end_idx2)) + + turns.append(turn_text) + return turns_idxs, turns -def pause_calculation(filter_json, measures): +def pause_calculation(filter_json, measures, time_index): """ ------------------------------------------------------------------------------------------------------ @@ -546,15 +303,12 @@ def pause_calculation(filter_json, measures): """ for i, item in enumerate(filter_json): if i > 0: - item[measures["pause"]] = float(item["start"]) - float( - filter_json[i - 1]["end"] - ) + item[measures["pause"]] = float(item[time_index[0]]) - float(filter_json[i - 1][time_index[1]]) + else: item[measures["pause"]] = np.nan - return filter_json - def filter_json_transcribe(item_data, speaker_label, measures): """ ------------------------------------------------------------------------------------------------------ @@ -577,131 +331,113 @@ def filter_json_transcribe(item_data, speaker_label, measures): ------------------------------------------------------------------------------------------------------ """ - # phrase filtering item_data2 = [] for item in item_data: try: speaker = item["speaker"] words = item["words"] - - # update speaker labels - for 
j, w in enumerate(words): + + for j, w in enumerate(words):# update speaker labels words[j]["speaker"] = speaker item_data2 += words except Exception as e: logger.error(f"Failed to filter word: {e}") - filter_json = [ - item for item in item_data2 - if "start" in item and "end" in item - ] - - # calculate time difference between each word - filter_json = pause_calculation(filter_json, measures) + filter_json = [item for item in item_data2 if "start" in item and "end" in item] + filter_json = pause_calculation(filter_json, measures, ['start', 'end']) if speaker_label is not None: - filter_json = [ - item - for item in filter_json - if item.get("speaker", "") == speaker_label - ] - + filter_json = [item for item in filter_json if item.get("speaker", "") == speaker_label] return filter_json - -def download_nltk_resources(): +def get_num_of_syllables(text): """ ------------------------------------------------------------------------------------------------------ - This function downloads the - required NLTK resources for processing text data. + This function calculates the number of syllables in the input text. Parameters: ........... - None + text: str + The input text. Returns: ........... - None + syllable_count: int + The number of syllables in the input text. - ------------------------------------------------------------------------------------------------------ + --------------------------------------------------------------------------------------- """ - try: - nltk.data.find("tokenizers/punkt") - except LookupError: - nltk.download("punkt") - try: - nltk.data.find("averaged_perceptron_tagger") - except LookupError: - nltk.download("averaged_perceptron_tagger") + syllable_tokenizer = nltk.tokenize.SyllableTokenizer() + punctuation = "!\"#$%&()*+,-./:;<=>?@[\]^_`{|}~" # remove punctuation + + syllables = [syllable_tokenizer.tokenize(token) for token in nltk.word_tokenize(text) if token not in punctuation] + syllable_count = sum([len(token) for token in syllables]) + return syllable_count -def get_tag(json_conf, tag_dict, measures): +def get_pause_feature_word(word_df, df_diff, word_list, turn_index, measures): """ ------------------------------------------------------------------------------------------------------ - This function performs part-of-speech - tagging on the input text using NLTK, and returns an updated - json_conf list with the part-of-speech tags. + This function calculates various pause-related speech characteristic + features at the word level and adds them to the output dataframe word_df. Parameters: ........... - json_conf: list - JSON response object. - tag_dict: dict - A dictionary mapping the NLTK tags to more readable tags. + word_df: pandas dataframe + A dataframe containing word summary information + df_diff: pandas dataframe + A dataframe containing the word-level information + from the JSON response. + word_list: list + List of transcribed text at the word level. + turn_index: list + A list containing the indices of the first and last word measures: dict A dictionary containing the names of the columns in the output dataframes. Returns: ........... - json_conf: list - The updated json_conf list. + word_df: pandas dataframe + The updated word_df dataframe. 
------------------------------------------------------------------------------------------------------ """ - if len(json_conf) <= 0: - return json_conf - - if "alternatives" not in json_conf[0].keys(): - # local vosk transcriber - word_list = [word["word"] for word in json_conf if "word" in word] - else: - # aws transcriber - word_list = [item["alternatives"][0]["content"] for item in json_conf] - - tag_list = nltk.pos_tag(word_list) - - for i, tag in enumerate(tag_list): - if tag[1] in tag_dict.keys(): - json_conf[i][measures["tag"]] = tag_dict[tag[1]] - else: - json_conf[i][measures["tag"]] = "Other" - - return json_conf - + turn_starts = [pindex[0] for pindex in turn_index] + word_df[measures["word_pause"]] = df_diff[measures["pause"]].where(~df_diff[measures["old_index"]].isin(turn_starts), np.nan) + + word_df[measures["num_syllables"]] = [get_num_of_syllables(word) for word in word_list] + return word_df -def get_part_of_speech(df, tags, measures, index=0): +def process_pause_feature(df_diff, df, text_level, index_list, time_index, level_name, measures, language): """ ------------------------------------------------------------------------------------------------------ - This function calculates the proportions of verbs, - pronouns, adjectives, and nouns in the - transcribed text, and adds them to the output dataframe df. + This function calculates various pause-related speech + characteristic features at the turn + level and adds them to the output dataframe df. Parameters: ........... + df_diff: pandas dataframe + A dataframe containing the word-level information from the JSON response. df: pandas dataframe - A dataframe containing the speech characteristics of the input text. - tags: list - A list of part-of-speech tags for the input text. + A dataframe containing turn summary information + text_level: list + List of transcribed text at the turn level. + index_list: list + A list containing the indices of the first and last word in each turn. + time_index: list + A list containing the names of the columns in json that contain + the start and end times of each word. + level_name: str + The name of the level being analyzed turn. measures: dict A dictionary containing the names of the columns in the output dataframes. - index: int - The index of the row in the output dataframe df. Returns: ........... @@ -710,319 +446,96 @@ def get_part_of_speech(df, tags, measures, index=0): ------------------------------------------------------------------------------------------------------ """ - if len(tags) == 0: + + if level_name not in [measures["turn"]]: + logger.error(f"level_name must be turn") return df - df.loc[index, measures["speech_noun"]] = ( - 100 * len(tags[tags == "Noun"]) / len(tags) - ) - df.loc[index, measures["speech_verb"]] = ( - 100 * len(tags[tags == "Verb"]) / len(tags) - ) - df.loc[index, measures["speech_adj"]] = ( - 100 * len(tags[tags == "Adjective"]) / len(tags) - ) - df.loc[index, measures["speech_pronoun"]] = ( - 100 * len(tags[tags == "Pronoun"]) / len(tags) - ) - - return df - - -def get_tag_summ(json_conf, df_list, text_indices, measures): - """ - ------------------------------------------------------------------------------------------------------ - - This function calculates the proportions of verbs, - pronouns, adjectives, and nouns in the - transcribed text, and adds them to the output dataframe summ_df. - - Parameters: - ........... - json_conf: list - JSON response object. - df_list: list - List of pandas dataframes. 
- word_df, phrase_df, turn_df, summ_df - text_indices: list - List of indices for text_list. - for phrases and turns. - measures: dict - A dictionary containing the names of the columns in the output dataframes. - - Returns: - ........... - df_list: list - List of updated pandas dataframes. - - ------------------------------------------------------------------------------------------------------ - """ - - word_df, phrase_df, turn_df, summ_df = df_list - phrase_index, turn_index = text_indices - - df_conf = pd.DataFrame(json_conf) - - # word-level analysis - word_df[measures["part_of_speech"]] = df_conf[measures["tag"]] - - # phrase-level analysis - for j, pindex in enumerate(phrase_index): - prange = range(pindex[0], pindex[1] + 1) - phrase_tags = df_conf.loc[df_conf[measures["old_index"]].isin(prange), measures["tag"]] - - phrase_df = get_part_of_speech(phrase_df, phrase_tags, measures, j) - - # turn-level analysis - for j, uindex in enumerate(turn_index): - urange = range(uindex[0], uindex[1] + 1) - turn_tags = df_conf.loc[df_conf[measures["old_index"]].isin(urange), measures["tag"]] - - turn_df = get_part_of_speech(turn_df, turn_tags, measures, j) - - # file-level analysis - summ_df = get_part_of_speech(summ_df, df_conf[measures["tag"]], measures) - - df_list = [word_df, phrase_df, turn_df, summ_df] - - return df_list - - -def get_mattr(text): - """ - ------------------------------------------------------------------------------------------------------ - This function calculates the Moving Average Type-Token Ratio (MATTR) - of the input text using the - LexicalRichness library. - - Parameters: - ........... - text : str - The input text to be analyzed. - - Returns: - ........... - mattr : float - The calculated MATTR value. - - ------------------------------------------------------------------------------------------------------ - """ - word = nltk.word_tokenize(text) - filter_punc = list(value for value in word if value not in [".", "!", "?"]) - filter_punc = " ".join(filter_punc) - mattr = np.nan - - lex_richness = LexicalRichness(filter_punc) - if lex_richness.words > 0: - mattr = lex_richness.mattr(window_size=lex_richness.words) - - return mattr - - -def get_sentiment(df_list, text_list, measures): - """ - ------------------------------------------------------------------------------------------------------ - - This function calculates the sentiment scores of the input text using - VADER, and adds them to the output dataframe summ_df. - - Parameters: - ........... - df_list: list - List of pandas dataframes. - word_df, phrase_df, turn_df, summ_df - text_list: list - List of transcribed text. - split into words, phrases, turns, and full text. - measures: dict - A dictionary containing the names of the columns in the output dataframes. - - Returns: - ........... - df_list: list - List of updated pandas dataframes. 
- - ------------------------------------------------------------------------------------------------------ - """ - word_df, phrase_df, turn_df, summ_df = df_list - word_list, phrase_list, turn_list, full_text = text_list - - sentiment = SentimentIntensityAnalyzer() - - # column names - cols = [ - measures["neg"], - measures["neu"], - measures["pos"], - measures["compound"], - measures["speech_mattr"], - ] - - # word-level analysis - for idx, w in enumerate(word_list): - try: - sentiment_dict = sentiment.polarity_scores(w) - - word_df.loc[idx, cols[:-1]] = list(sentiment_dict.values()) - except Exception as e: - logger.error(f"Error in sentiment analysis for word {w}: {e}") - continue - - # phrase-level analysis - for idx, p in enumerate(phrase_list): + for j, index in enumerate(index_list): try: - sentiment_dict = sentiment.polarity_scores(p) - mattr = get_mattr(p) + + rng = range(index[0], index[1] + 1) + level_json = df_diff[df_diff[measures["old_index"]].isin(rng)] + + pauses = level_json[measures["pause"]].values[1:] + level_min_val = (float(level_json.iloc[-1][time_index[1]]) - float(level_json.iloc[0][time_index[0]])) / 60 + + df.loc[j, measures[f"{level_name}_minutes"]] = level_min_val + df.loc[j, measures[f"{level_name}_words"]] = len(level_json) - phrase_df.loc[idx, cols] = list(sentiment_dict.values()) + [mattr] - except Exception as e: - logger.error(f"Error in sentiment analysis for phrase {p}: {e}") - continue + if len(pauses) == 1: + df.loc[j, measures["pause_var"]] = 0 + df.loc[j, measures["pause_meandur"]] = np.mean(pauses) - # turn-level analysis - for idx, u in enumerate(turn_list): - try: - sentiment_dict = sentiment.polarity_scores(u) - mattr = get_mattr(u) + elif len(pauses) > 1: + df.loc[j, measures["pause_var"]] = np.var(pauses) + df.loc[j, measures["pause_meandur"]] = np.mean(pauses) - turn_df.loc[idx, cols] = list(sentiment_dict.values()) + [mattr] + if df.loc[j, measures[f"{level_name}_minutes"]] > 0: + speech_pct_val = 100 * (1 - np.sum(pauses) / (60 * df.loc[j, measures[f"{level_name}_minutes"]])) + df.loc[j, measures["speech_percentage"]] = speech_pct_val + + if language == 'en': + syllable_rate = (get_num_of_syllables(text_level[j]) / df.loc[j, measures[f"{level_name}_minutes"]]) + df.loc[j, measures["syllable_rate"]] = syllable_rate + + word_rate_val = (df.loc[j, measures[f"{level_name}_words"]] / df.loc[j, measures[f"{level_name}_minutes"]]) + df.loc[j, measures["word_rate"]] = word_rate_val + except Exception as e: - logger.error(f"Error in sentiment analysis for turn {u}: {e}") + logger.error(f"Error in pause feature calculation for {level_name} {j}: {e}") continue - # file-level analysis - sentiment_dict = sentiment.polarity_scores(full_text) - mattr = get_mattr(full_text) - - summ_df.loc[0, cols] = list(sentiment_dict.values()) + [mattr] - - df_list = [word_df, phrase_df, turn_df, summ_df] - - return df_list - - -def get_num_of_syllables(text): - """ - ------------------------------------------------------------------------------------------------------ - - This function calculates the number of syllables in the input text. - - Parameters: - ........... - text: str - The input text. - - Returns: - ........... - syllable_count: int - The number of syllables in the input text. 
- - --------------------------------------------------------------------------------------- - """ - - syllable_tokenizer = nltk.tokenize.SyllableTokenizer() - - # remove punctuation - punctuation = "!\"#$%&()*+,-./:;<=>?@[\]^_`{|}~" - syllables = [syllable_tokenizer.tokenize(token) for token in nltk.word_tokenize(text) if token not in punctuation] - # count the number of syllables in each word - syllable_count = sum([len(token) for token in syllables]) - - return syllable_count - + return df -def process_pause_feature(df_diff, df, text_level, index_list, time_index, level_name, measures): +def get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, measures, language): """ ------------------------------------------------------------------------------------------------------ - This function calculates various pause-related speech - characteristic features at the phrase or turn - level and adds them to the output dataframe df. + This function calculates various pause-related speech characteristic + features at the turn level and adds them to the output dataframe turn_df. Parameters: ........... + turn_df: pandas dataframe + A dataframe containing turn summary information df_diff: pandas dataframe A dataframe containing the word-level information - from the JSON response. - df: pandas dataframe - A dataframe containing phrase or turn summary information - text_level: list - List of transcribed text at the phrase or turn level. - index_list: list + from the JSON response. + turn_list: list + List of transcribed text at the turn level. + turn_index: list A list containing the indices of the first and last word - in each phrase or turn. + in each turn. time_index: list A list containing the names of the columns in json that contain - the start and end times of each word. - level_name: str - The name of the level being analyzed (phrase or turn). + the start and end times of each word. measures: dict A dictionary containing the names of the columns in the output dataframes. Returns: ........... - df: pandas dataframe - The updated df dataframe. + turn_df: pandas dataframe + The updated turn_df dataframe. 
------------------------------------------------------------------------------------------------------ """ + turn_starts = [uindex[0] for uindex in turn_index] + df_diff_turn = df_diff[df_diff[measures["old_index"]].isin(turn_starts)] - if level_name not in [measures["phrase"], measures["turn"]]: - logger.error( - f"level_name must be either {measures['phrase']} or {measures['turn']}" - ) - return df - - for j, index in enumerate(index_list): - try: - rng = range(index[0], index[1] + 1) - level_json = df_diff[df_diff[measures["old_index"]].isin(rng)] - - # remove first pause as it is the pre_pause - pauses = level_json[measures["pause"]].values[1:] - - df.loc[j, measures[f"{level_name}_minutes"]] = ( - float(level_json.iloc[-1][time_index[1]]) - - float(level_json.iloc[0][time_index[0]]) - ) / 60 - df.loc[j, measures[f"{level_name}_words"]] = len(level_json) - - # if there is 1 pause - if len(pauses) == 1: - df.loc[j, measures["pause_var"]] = 0 - df.loc[j, measures["pause_meandur"]] = np.mean(pauses) - # if there are more than 1 pauses - elif len(pauses) > 1: - df.loc[j, measures["pause_var"]] = np.var(pauses) - df.loc[j, measures["pause_meandur"]] = np.mean(pauses) - - if df.loc[j, measures[f"{level_name}_minutes"]] > 0: - df.loc[j, measures["speech_percentage"]] = 100 * ( - 1 - np.sum(pauses) / ( - 60 * df.loc[j, measures[f"{level_name}_minutes"]] - ) - ) - - # articulation rate - df.loc[j, measures["syllable_rate"]] = ( - get_num_of_syllables(text_level[j]) / df.loc[j, measures[f"{level_name}_minutes"]] - ) - - df.loc[j, measures["word_rate"]] = ( - df.loc[j, measures[f"{level_name}_words"]] / df.loc[j, measures[f"{level_name}_minutes"]] - ) - except Exception as e: - logger.error(f"Error in pause feature calculation for {level_name} {j}: {e}") - continue - - df[measures["pause_rate"]] = df[measures["word_rate"]] - - return df + turn_df[measures["turn_pause"]] = df_diff_turn[measures["pause"]] + turn_df[measures["interrupt_flag"]] = False + + negative_pause = turn_df[measures["turn_pause"]] <= 0 + turn_df.loc[negative_pause, measures["turn_pause"]] = 0 + + turn_df.loc[negative_pause, measures["interrupt_flag"]] = True + turn_df = turn_df.reset_index(drop=True) + turn_df = process_pause_feature(df_diff, turn_df, turn_list, turn_index, time_index, measures["turn"], measures, language) + return turn_df -def update_summ_df( - df_diff, summ_df, full_text, time_index, word_df, phrase_df, turn_df, measures -): +def update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, measures): """ ------------------------------------------------------------------------------------------------------ @@ -1041,8 +554,6 @@ def update_summ_df( that contain the start and end times of each word. 
word_df: pandas dataframe A dataframe containing word summary information - phrase_df: pandas dataframe - A dataframe containing phrase summary information turn_df: pandas dataframe A dataframe containing turn summary information measures: dict @@ -1055,302 +566,283 @@ def update_summ_df( ------------------------------------------------------------------------------------------------------ """ - if len(phrase_df) > 0: - speech_minutes = phrase_df[measures["phrase_minutes"]].sum() + if len(turn_df) > 0: + speech_minutes = turn_df[measures["turn_minutes"]].sum() else: speech_minutes = (float(df_diff.iloc[-1][time_index[1]]) - float(df_diff.iloc[0][time_index[0]])) / 60 - summ_df[measures["speech_minutes"]] = [speech_minutes] - + summ_df[measures["speech_words"]] = len(df_diff) if speech_minutes > 0: - summ_df[measures["word_rate"]] = ( - summ_df[measures["speech_words"]] / summ_df[measures["speech_minutes"]] - ) - summ_df[measures["syllable_rate"]] = ( - get_num_of_syllables(full_text) / summ_df[measures["speech_minutes"]] - ) - summ_df[measures["speech_percentage"]] = 100 * ( - 1 - - df_diff.loc[1:, measures["pause"]].sum() - / (60 * summ_df[measures["speech_minutes"]]) - ) - - summ_df[measures["pause_rate"]] = summ_df[measures["word_rate"]] - + + summ_df[measures["word_rate"]] = (summ_df[measures["speech_words"]] / summ_df[measures["speech_minutes"]]) + summ_df[measures["syllable_rate"]] = (get_num_of_syllables(full_text) / summ_df[measures["speech_minutes"]]) + summ_df[measures["speech_percentage"]] = 100 * (summ_df[measures["speech_minutes"]] / summ_df[measures["file_length"]]) + if len(word_df[measures["word_pause"]]) > 1: - summ_df[measures["word_pause_mean"]] = word_df[measures["word_pause"]].mean( - skipna=True - ) - summ_df[measures["word_pause_var"]] = word_df[measures["word_pause"]].var( - skipna=True - ) - - if len(phrase_df[measures["phrase_pause"]]) > 1: - summ_df[measures["phrase_pause_mean"]] = phrase_df[measures["phrase_pause"]].mean( - skipna=True - ) - summ_df[measures["phrase_pause_var"]] = phrase_df[measures["phrase_pause"]].var( - skipna=True - ) + summ_df[measures["word_pause_mean"]] = word_df[measures["word_pause"]].mean(skipna=True) + summ_df[measures["word_pause_var"]] = word_df[measures["word_pause"]].var(skipna=True) if len(turn_df) > 0: summ_df[measures["num_turns"]] = len(turn_df) - summ_df[measures["turn_minutes_mean"]] = turn_df[ - measures["turn_minutes"] - ].mean(skipna=True) - summ_df[measures["turn_words_mean"]] = turn_df[ - measures["turn_words"] - ].mean(skipna=True) - summ_df[measures["turn_pause_mean"]] = turn_df[ - measures["turn_pause"] - ].mean(skipna=True) - summ_df["num_one_word_turns"] = len( - turn_df[turn_df[measures["turn_words"]] == 1] - ) - summ_df[measures["num_interrupts"]] = sum(turn_df[measures["interrupt_flag"]]) + summ_df[measures["turn_minutes_mean"]] = turn_df[measures["turn_minutes"]].mean(skipna=True) + + summ_df[measures["turn_words_mean"]] = turn_df[measures["turn_words"]].mean(skipna=True) + summ_df[measures["turn_pause_mean"]] = turn_df[measures["turn_pause"]].mean(skipna=True) + + summ_df["num_one_word_turns"] = len(turn_df[turn_df[measures["turn_words"]] == 1]) + summ_df[measures["num_interrupts"]] = len(turn_df[turn_df[measures["interrupt_flag"]]==True]) return summ_df - -def get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures): +def get_pause_feature(json_conf, df_list, text_list, turn_index, measures, time_index, language): """ 
     ------------------------------------------------------------------------------------------------------
 
-    This function calculates various pause-related speech characteristic
-    features at the word level and adds them to the output dataframe word_df.
+    This function calculates various pause-related speech characteristic
+    features at the word, turn and file level.
 
     Parameters:
     ...........
-    word_df: pandas dataframe
-        A dataframe containing word summary information
-    df_diff: pandas dataframe
-        A dataframe containing the word-level information
-        from the JSON response.
-    word_list: list
-        List of transcribed text at the word level.
-    phrase_index: list
-        A list containing the indices of the first and last word
-        in each phrase or turn.
+    json_conf: list
+        JSON response object.
+    df_list: list
+        List of pandas dataframes: word_df, turn_df, summ_df
+    text_list: list
+        List of transcribed text: split into words, turns, and full text.
+    turn_index: list
+        List of the first and last word indices of each turn.
     measures: dict
         A dictionary containing the names of the columns in the output dataframes.
+    time_index: list
+        names of the keys holding each word's start and end times
+    language: str
+        Language of the transcribed text.
 
     Returns:
     ...........
-    word_df: pandas dataframe
-        The updated word_df dataframe.
+    df_feature: list
+        List of updated pandas dataframes (word_df, turn_df and summ_df)
 
     ------------------------------------------------------------------------------------------------------
     """
-    phrase_starts = [pindex[0] for pindex in phrase_index]
+    if len(json_conf) <= 0:
+        return df_list
 
-    word_df[measures["word_pause"]] = df_diff[measures["pause"]].where(
-        ~df_diff[measures["old_index"]].isin(phrase_starts), np.nan
-    )
+    word_df, turn_df, summ_df = df_list
+    word_list, turn_list, full_text = text_list
+    df_diff = pd.DataFrame(json_conf)
 
-    # calculate the number of syllables in each word from the word list
-    word_df[measures["num_syllables"]] = [
-        get_num_of_syllables(word) for word in word_list
-    ]
-    return word_df
+    # calculate the pause time between each word and store it in the pause column
+    if measures["pause"] not in df_diff.columns:
+        df_diff[measures["pause"]] = df_diff[time_index[0]].astype(float) - df_diff[time_index[1]].astype(float).shift(1)
+
+    # word-level analysis
+    word_df = get_pause_feature_word(word_df, df_diff, word_list, turn_index, measures)
+
+    # turn-level analysis
+    if len(turn_index) > 0:
+        turn_df = get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, measures, language)
 
+    # file-level analysis
+    summ_df = update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, measures)
+    df_feature = [word_df, turn_df, summ_df]
+    return df_feature
 
-def get_pause_feature_phrase(phrase_df, df_diff, phrase_list, phrase_index, turn_index, time_index, measures):
+def get_mattr(text):
     """
     ------------------------------------------------------------------------------------------------------
-
-    This function calculates various pause-related speech characteristic
-    features at the phrase level and adds them to the output dataframe phrase_df.
+    This function calculates the Moving Average Type-Token Ratio (MATTR)
+    of the input text using the
+    LexicalRichness library.
 
     Parameters:
     ...........
-    phrase_df: pandas dataframe
-        A dataframe containing phrase summary information
-    df_diff: pandas dataframe
-        A dataframe containing the word-level information
-        from the JSON response.
-    phrase_list: list
-        List of transcribed text at the phrase level.
- phrase_index: list - A list containing the indices of the first and last word - in each phrase - turn_index: list - A list containing the indices of the first and last word - in each turn. - time_index: list - A list containing the names of the columns in json that contain - the start and end times of each word. - measures: dict - A dictionary containing the names of the columns in the output dataframes. + text : str + The input text to be analyzed. Returns: ........... - phrase_df: pandas dataframe - The updated phrase_df dataframe. + mattr : float + The calculated MATTR value. ------------------------------------------------------------------------------------------------------ """ - phrase_starts = [pindex[0] for pindex in phrase_index] - - df_diff_phrase = df_diff[ - df_diff[measures["old_index"]].isin(phrase_starts) - ] # get the rows corresponding to the start of each phrase - - if len(turn_index) > 0: - turn_starts = [ - uindex[0] for uindex in turn_index - ] # get the start index of each turn - phrase_df[measures["phrase_pause"]] = df_diff_phrase[measures["pause"]].where( - ~df_diff_phrase[measures["old_index"]].isin(turn_starts), np.nan - ) - else: - phrase_df[measures["phrase_pause"]] = df_diff_phrase[measures["pause"]] - - phrase_df = phrase_df.reset_index(drop=True) - - phrase_df = process_pause_feature( - df_diff, phrase_df, phrase_list, phrase_index, time_index, measures["phrase"], measures - ) + word = nltk.word_tokenize(text) + filter_punc = list(value for value in word if value not in [".", "!", "?"]) + filter_punc = " ".join(filter_punc) + mattr = np.nan - return phrase_df + lex_richness = LexicalRichness(filter_punc) + if lex_richness.words > 0: + mattr = lex_richness.mattr(window_size=lex_richness.words) + return mattr -def get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, measures): +def get_tag(json_conf, tag_dict, measures): """ ------------------------------------------------------------------------------------------------------ - This function calculates various pause-related speech characteristic - features at the turn level and adds them to the output dataframe turn_df. + This function performs part-of-speech + tagging on the input text using NLTK, and returns an updated + json_conf list with the part-of-speech tags. Parameters: ........... - turn_df: pandas dataframe - A dataframe containing turn summary information - df_diff: pandas dataframe - A dataframe containing the word-level information - from the JSON response. - turn_list: list - List of transcribed text at the turn level. - turn_index: list - A list containing the indices of the first and last word - in each turn. - time_index: list - A list containing the names of the columns in json that contain - the start and end times of each word. + json_conf: list + JSON response object. + tag_dict: dict + A dictionary mapping the NLTK tags to more readable tags. measures: dict A dictionary containing the names of the columns in the output dataframes. Returns: ........... - turn_df: pandas dataframe - The updated turn_df dataframe. + json_conf: list + The updated json_conf list. 
------------------------------------------------------------------------------------------------------ """ + if len(json_conf) <= 0: + return json_conf - turn_starts = [uindex[0] for uindex in turn_index] + if "alternatives" not in json_conf[0].keys(): + word_list = [word["word"] for word in json_conf if "word" in word]# local vosk transcriber + + else: + word_list = [item["alternatives"][0]["content"] for item in json_conf]# aws transcriber - # get the rows corresponding to the start of each turn - df_diff_turn = df_diff[ - df_diff[measures["old_index"]].isin(turn_starts) - ] + tag_list = nltk.pos_tag(word_list) + for i, tag in enumerate(tag_list): + + if tag[1] in tag_dict.keys(): + json_conf[i][measures["tag"]] = tag_dict[tag[1]] + + else: + json_conf[i][measures["tag"]] = "Other" + return json_conf - turn_df[measures["turn_pause"]] = df_diff_turn[measures["pause"]] - turn_df[measures["interrupt_flag"]] = False - # set pre_turn_pause to 0 if negative (due to overlapping turns) - # and set interrupt_flag to True - negative_pause = turn_df[measures["turn_pause"]] < 0 - turn_df.loc[negative_pause, measures["turn_pause"]] = 0 - turn_df.loc[negative_pause, measures["interrupt_flag"]] = True +def get_tag_summ(json_conf, df_list, measures): + """ + ------------------------------------------------------------------------------------------------------ - turn_df = turn_df.reset_index(drop=True) + This function calculates the proportions of verbs, + pronouns, adjectives, and nouns in the + transcribed text, and adds them to the output dataframe summ_df. - turn_df = process_pause_feature( - df_diff, turn_df, turn_list, turn_index, time_index, measures["turn"], measures - ) + Parameters: + ........... + json_conf: list + JSON response object. + df_list: list + List of pandas dataframes: word_df, turn_df, summ_df + measures: dict + A dictionary containing the names of the columns in the output dataframes. - return turn_df + Returns: + ........... + df_list: list + List of updated pandas dataframes. + ------------------------------------------------------------------------------------------------------ + """ + word_df, turn_df, summ_df = df_list + df_conf = pd.DataFrame(json_conf) + word_df[measures["part_of_speech"]] = df_conf[measures["tag"]] + + df_list = [word_df, turn_df, summ_df] + return df_list -def get_pause_feature(json_conf, df_list, text_list, text_indices, measures): +def get_sentiment(df_list, text_list, measures): """ ------------------------------------------------------------------------------------------------------ - This function calculates various pause-related - speech characteristic features + This function calculates the sentiment scores of the input text using + VADER, and adds them to the output dataframe summ_df. Parameters: ........... - json_conf: list - JSON response object. df_list: list List of pandas dataframes. - word_df, phrase_df, turn_df, summ_df text_list: list List of transcribed text. - split into words, phrases, turns, and full text. - text_indices: list - List of indices for text_list. - for phrases and turns. measures: dict A dictionary containing the names of the columns in the output dataframes. Returns: ........... - df_feature: list + df_list: list List of updated pandas dataframes. 
-        word_df, phrase_df, turn_df, summ_df
 
     ------------------------------------------------------------------------------------------------------
     """
-
-    # Check if json_conf is empty
-    if len(json_conf) <= 0:
-        return df_list
-
-    word_df, phrase_df, turn_df, summ_df = df_list
-    word_list, phrase_list, turn_list, full_text = text_list
-    phrase_index, turn_index = text_indices
+    word_df, turn_df, summ_df = df_list
+    word_list, turn_list, full_text = text_list
 
-    # Convert json_conf to a pandas DataFrame
-    df_diff = pd.DataFrame(json_conf)
+    sentiment = SentimentIntensityAnalyzer()
+    cols = [measures["neg"], measures["neu"], measures["pos"], measures["compound"], measures["speech_mattr"]]
 
-    time_index = ["start", "end"]
+    for idx, u in enumerate(turn_list):
+        try:
+
+            sentiment_dict = sentiment.polarity_scores(u)
+            mattr = get_mattr(u)
+            turn_df.loc[idx, cols] = list(sentiment_dict.values()) + [mattr]
+
+        except Exception as e:
+            logger.error(f"Error in sentiment analysis: {e}")
+            continue
+
+    sentiment_dict = sentiment.polarity_scores(full_text)
+    mattr = get_mattr(full_text)
 
-    # Calculate the pause time between
-    # each word and add the results to pause_list
-    if measures["pause"] not in df_diff.columns:
-        df_diff[measures["pause"]] = df_diff[time_index[0]].astype(float) - df_diff[
-            time_index[1]
-        ].astype(float).shift(1)
+    summ_df.loc[0, cols] = list(sentiment_dict.values()) + [mattr]
+    df_list = [word_df, turn_df, summ_df]
+    return df_list
 
-    # word-level analysis
-    word_df = get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures)
+def calculate_file_feature(json_data, model, speakers):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Calculate file features based on JSON data.
+
+    Parameters:
+    ...........
+    json_data: dict
+        JSON response object.
+    model: str
+        model name (vosk/aws/whisper)
+    speakers: str
+        label(s) of the speaker(s) to include when computing speaking time
 
-    # phrase-level analysis
-    phrase_df = get_pause_feature_phrase(
-        phrase_df, df_diff, phrase_list, phrase_index, turn_index, time_index, measures
-    )
+    Returns:
+    ...........
+    tuple: A tuple containing two values - the total file length and the percentage of time spent speaking.
 
-    df_feature = [word_df, phrase_df, turn_df, summ_df]
+    ------------------------------------------------------------------------------------------------------
+    """
+
+    if model == 'aws':
+        segments = json_data.get('items', [])
+        file_length = max(float(segment.get("end_time", "0")) for segment in segments)
+
+        if speakers is None:
+            return file_length/60, np.NaN
 
-    return df_feature
+        speaking_time = sum(float(segment.get("end_time", "0") or "0") - float(segment.get("start_time", "0") or "0")
+                            for segment in segments if segment.get("speaker_label", "") in speakers)
 
+    else:
+        segments = json_data.get('segments', [])
+        file_length = max(segment.get('end', 0) for segment in segments)
+
+        if speakers is None:
+            return file_length/60, np.NaN
+        speaking_time = sum(segment['end'] - segment['start'] for segment in segments if segment.get('speaker', '') in speakers)
 
+    speaking_pct = (speaking_time / file_length) * 100
+    return file_length/60, speaking_pct
 
-def process_language_feature(
-    json_conf, df_list, text_list,
-    text_indices, language, measures,
-):
+def process_language_feature(df_list, transcribe_info, language, time_index, measures):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -1358,17 +850,12 @@
     Parameters:
     ...........
-    json_conf: list
-        JSON response object.
     df_list: list
         List of pandas dataframes.
-        word_df, phrase_df, turn_df, summ_df
-    text_list: list
-        List of transcribed text.
-        split into words, phrases, turns, and full text.
-    text_indices: list
-        List of indices for text_list.
-        for phrases and turns.
+    transcribe_info: list
+        A list containing the filtered JSON object, the text lists, and the turn indices.
+    time_index: list
+        names of the keys holding each word's start and end times
     language: str
         Language of the transcribed text.
     measures: dict
@@ -1376,25 +863,17 @@
     Returns:
     ...........
-    word_df: pandas dataframe
-        A dataframe containing word summary information
-    phrase_df: pandas dataframe
-        A dataframe containing phrase summary information
-    turn_df: pandas dataframe
-        A dataframe containing turn summary information
-    summ_df: pandas dataframe
-        A dataframe containing summary information on the speech
+    df_list: list
+        List of pandas dataframes (word_df, turn_df and summ_df)
 
     ------------------------------------------------------------------------------------------------------
     """
-
-    df_list = get_pause_feature(json_conf, df_list, text_list, text_indices, measures)
+    json_conf, text_list, turn_indices = transcribe_info
+    df_list = get_pause_feature(json_conf, df_list, text_list, turn_indices, measures, time_index, language)
 
     if language == "en":
         json_conf = get_tag(json_conf, TAG_DICT, measures)
-        df_list = get_tag_summ(json_conf, df_list, text_indices, measures)
+        df_list = get_tag_summ(json_conf, df_list, measures)
         df_list = get_sentiment(df_list, text_list, measures)
-
-    word_df, phrase_df, turn_df, summ_df = df_list
-    return word_df, phrase_df, turn_df, summ_df
+    return df_list
diff --git a/setup.py b/setup.py
index 7545ba1..b9f51fc 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@
 install_requires = fp.read()
 
 setuptools.setup(name='openwillis',
-                 version='1.5.2',
+                 version='1.6',
                  description='digital health measurement',
                  long_description=long_description,
                  long_description_content_type="text/markdown",
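
For reference, a minimal usage sketch of the updated speech_characteristics API, which now takes a min_turn_length argument and returns three dataframes (word, turn, summary) instead of four. The transcript path and speaker label below are illustrative assumptions, not values taken from this patch:

    import json
    import openwillis as ow

    # load a diarized transcript; 'transcript.json' is a hypothetical path
    with open("transcript.json") as f:
        transcript = json.load(f)

    # 'spk_0' is a hypothetical speaker label; turns shorter than
    # three words are excluded from the turn-level dataframe
    word_df, turn_df, summ_df = ow.speech_characteristics(
        transcript, language="en", speaker_label="spk_0", min_turn_length=3
    )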
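
The turn filters above (filter_turn_aws, filter_turns) keep a turn only when its whitespace-delimited word count reaches min_turn_length. A self-contained sketch of that rule with invented turns:

    min_turn_length = 3
    turns = ["yes", "I slept well last night", "not sure"]

    # keep turns that contain at least min_turn_length words
    kept = [t for t in turns if len(t.split(" ")) >= min_turn_length]
    print(kept)  # ['I slept well last night']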
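
Pause features are computed against the source-specific time keys returned by get_time_columns: a word's pause is its start time minus the previous word's end time, and the first word has no preceding pause. A toy illustration of that convention with invented timestamps:

    import pandas as pd

    # whisper/vosk transcripts use "start"/"end"; AWS uses "start_time"/"end_time"
    words = pd.DataFrame({"start": [0.0, 0.6, 2.1], "end": [0.5, 1.4, 2.8]})

    # pause[i] = start[i] - end[i-1]; the first word is left as NaN
    words["pause"] = words["start"] - words["end"].shift(1)
    print(words["pause"].tolist())  # [nan, ~0.1, ~0.7]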
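
calculate_file_feature reports the file length in minutes and, when speaker labels are supplied, the percentage of the file occupied by those speakers. A worked check of that arithmetic on a hand-made AWS-style payload (all values invented):

    # hand-made items: a 60-second file in which spk_0 talks for 35 seconds
    json_data = {"items": [
        {"speaker_label": "spk_0", "start_time": "0.0", "end_time": "20.0"},
        {"speaker_label": "spk_1", "start_time": "25.0", "end_time": "40.0"},
        {"speaker_label": "spk_0", "start_time": "45.0", "end_time": "60.0"},
    ]}

    segments = json_data["items"]
    file_length = max(float(s["end_time"]) for s in segments)  # 60.0 seconds
    speaking_time = sum(float(s["end_time"]) - float(s["start_time"])
                        for s in segments if s["speaker_label"] in ["spk_0"])  # 35.0 seconds

    print(file_length / 60, 100 * speaking_time / file_length)  # 1.0 minute, ~58.3 percent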