From 3cabd771019da5fef4e3f7445d0f6a9466f41478 Mon Sep 17 00:00:00 2001 From: Vijay Yadav Date: Mon, 30 Oct 2023 22:41:32 -0400 Subject: [PATCH 01/21] speech characteristic update --- openwillis/measures/text/config/text.json | 4 +- openwillis/measures/text/speech_attribute.py | 99 +- .../text/util/characteristics_util.py | 1252 ++++++----------- 3 files changed, 439 insertions(+), 916 deletions(-) diff --git a/openwillis/measures/text/config/text.json b/openwillis/measures/text/config/text.json index fe08030..443433d 100644 --- a/openwillis/measures/text/config/text.json +++ b/openwillis/measures/text/config/text.json @@ -21,8 +21,8 @@ "word_pause": "pre_word_pause", "phrase_pause": "pre_phrase_pause", "turn_pause": "pre_turn_pause", - "word_pause_mean": "word_pause_length_mean", - "word_pause_var": "word_pause_variability", + "word_pause_mean": "mean_pre_word_pause", + "word_pause_var": "mean_pause_variability", "phrase_pause_mean": "phrase_pause_length_mean", "phrase_pause_var": "phrase_pause_variability", "num_syllables": "num_syllables", diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py index e5bd708..b3eaba5 100644 --- a/openwillis/measures/text/speech_attribute.py +++ b/openwillis/measures/text/speech_attribute.py @@ -9,7 +9,8 @@ import nltk import numpy as np import pandas as pd -from openwillis.measures.text.util import characteristics_util as cutil +#from openwillis.measures.text.util import characteristics_util as cutil +from util import characteristics_util as cutil logging.basicConfig(level=logging.INFO) logger = logging.getLogger() @@ -155,7 +156,7 @@ def filter_transcribe(json_conf, measures, speaker_label=None): return filter_json, text_list, text_indices -def filter_whisper(json_conf, measures, speaker_label=None): +def filter_whisper(json_conf, measures, min_turn_length, speaker_label=None): """ ------------------------------------------------------------------------------------------------------ @@ -171,6 +172,8 @@ def filter_whisper(json_conf, measures, speaker_label=None): A dictionary containing the names of the columns in the output dataframes. speaker_label: str Speaker label + min_turn_length: int + minimum words required in each turn Returns: ........... @@ -192,38 +195,24 @@ def filter_whisper(json_conf, measures, speaker_label=None): item_data = json_conf["segments"] if speaker_label is not None: - # filter out segments that do not have speaker labels - item_data = [ - segment for segment in item_data if "speaker" in segment - ] - - # make a dictionary to map old indices to new indices + item_data = [segment for segment in item_data if "speaker" in segment] + item_data = cutil.create_index_column(item_data, measures) - - # phrase-split - phrases_idxs, phrases = cutil.filter_phrases( - item_data, speaker_label, measures - ) + phrases_idxs, phrases = cutil.filter_phrases(item_data, speaker_label, measures) # phrase-split # turn-split if speaker_label is not None: - turns_idxs, turns = cutil.filter_turns( - item_data, speaker_label, measures - ) + turns_idxs, turns = cutil.filter_turns(item_data, speaker_label, measures, min_turn_length) + else: turns_idxs, turns = [], [] - - + # filter json to only include items with start_time and end_time filter_json = cutil.filter_json_transcribe(item_data, speaker_label, measures) - - # extract words - words = [w["word"] for w in filter_json] - - # entire transcript - by joining all the phrases + words = [value["word"] for value in filter_json] text = " ".join(phrases) - - text_list = [words, phrases, turns, text] + + text_list = [words, turns, text] text_indices = [phrases_idxs, turns_idxs] return filter_json, text_list, text_indices @@ -258,12 +247,10 @@ def filter_vosk(json_conf, measures): # make a dictionary to map old indices to new indices for i, item in enumerate(json_conf): item[measures["old_index"]] = i - - + return words, text - -def speech_characteristics(json_conf, language="en", speaker_label=None): +def speech_characteristics(json_conf, language="en", speaker_label=None, min_turn_length=1): """ ------------------------------------------------------------------------------------------------------ @@ -277,6 +264,8 @@ def speech_characteristics(json_conf, language="en", speaker_label=None): Language type speaker_label: str Speaker label + min_turn_length: int + minimum words required in each turn Returns: ........... @@ -295,53 +284,31 @@ def speech_characteristics(json_conf, language="en", speaker_label=None): measures = get_config(os.path.abspath(__file__), "text.json") df_list = cutil.create_empty_dataframes(measures) - + try: - if not isinstance(language, str): - raise ValueError("Language should be a string") - if len(language) < 2: - # if language is not specified, then set it to "xx" - # run speech characteristics as not english - language = "xx" - else: - language = language[:2].lower() - if bool(json_conf): - cutil.download_nltk_resources() + language = "na" if language is None or len(language) < 2 else language[:2].lower() + + if language == 'en': + cutil.download_nltk_resources() if is_whisper_transcribe(json_conf): - filter_json, text_list, text_indices = filter_whisper( - json_conf, measures, speaker_label - ) + filter_json, text_list, text_indices = filter_whisper(json_conf, measures, min_turn_length, speaker_label) if len(filter_json) > 0 and len(text_list[-1]) > 0: - df_list = cutil.process_language_feature( - filter_json, df_list, text_list, - text_indices, language, measures, - ) + df_list = cutil.process_language_feature(filter_json, df_list, text_list, text_indices, language, measures) + else: words, text = filter_vosk(json_conf, measures) if len(text) > 0: - df_list = cutil.process_language_feature( - json_conf, df_list, [words, [], [], text], - [[], []], language, measures, - ) - + df_list = cutil.process_language_feature(json_conf, df_list, [words,[],[],text],[[],[]], language, measures) + + except Exception as e: logger.error(f"Error in Speech Characteristics {e}") finally: - # if word_df is empty, then add a row of NaNs - if df_list[0].empty: - df_list[0].loc[0] = np.nan - # if phrase_df is empty, then add a row of NaNs - if df_list[1].empty: - df_list[1].loc[0] = np.nan - # if turn_df is empty, then add a row of NaNs - if df_list[2].empty: - df_list[2].loc[0] = np.nan - # if summ_df is empty, then add a row of NaNs - if df_list[3].empty: - df_list[3].loc[0] = np.nan - - return df_list + for df in df_list: + df.loc[0] = np.nan if df.empty else df.loc[0] + + return df_list diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py index dcefbe6..4d939bb 100644 --- a/openwillis/measures/text/util/characteristics_util.py +++ b/openwillis/measures/text/util/characteristics_util.py @@ -70,29 +70,6 @@ def create_empty_dataframes(measures): ] ) - phrase_df = pd.DataFrame( - columns=[ - measures["phrase_pause"], - measures["phrase_minutes"], - measures["phrase_words"], - measures["word_rate"], - measures["syllable_rate"], - measures["pause_rate"], - measures["pause_var"], - measures["pause_meandur"], - measures["speech_percentage"], - measures["speech_noun"], - measures["speech_verb"], - measures["speech_adj"], - measures["speech_pronoun"], - measures["pos"], - measures["neg"], - measures["neu"], - measures["compound"], - measures["speech_mattr"], - ] - ) - turn_df = pd.DataFrame( columns=[ measures["turn_pause"], @@ -123,11 +100,8 @@ def create_empty_dataframes(measures): measures["speech_words"], measures["word_rate"], measures["syllable_rate"], - measures["pause_rate"], measures["word_pause_mean"], measures["word_pause_var"], - measures["phrase_pause_mean"], - measures["phrase_pause_var"], measures["speech_percentage"], measures["speech_noun"], measures["speech_verb"], @@ -147,301 +121,57 @@ def create_empty_dataframes(measures): ] ) - return word_df, phrase_df, turn_df, summ_df + return word_df, turn_df, summ_df - -def filter_speaker_phrase(item_data, speaker_label, phrases_idxs, phrases): - """ - ------------------------------------------------------------------------------------------------------ - This function updates the phrases list - to only include the speaker label provided. - Parameters: - ........... - item_data: dict - JSON response object. - speaker_label: str - Speaker label - phrases_idxs: list - A list of tuples containing - the start and end indices of the phrases in the JSON object. - phrases: list - A list of phrases extracted from the JSON object. - Returns: - ........... - phrases_idxs: list - A list of tuples containing - the start and end indices of the phrases in the JSON object. - phrases: list - A list of phrases extracted from the JSON object. - ------------------------------------------------------------------------------------------------------ +def create_index_column(item_data, measures): """ - phrases_idxs2 = [] - phrases2 = [] - for i, phrase in enumerate(phrases_idxs): - try: - start_idx = phrase[0] - if item_data[start_idx].get("speaker_label", "") == speaker_label: - phrases_idxs2.append(phrase) - phrases2.append(phrases[i]) - except Exception as e: - logger.error(f"Error in phrase-split for speaker {speaker_label}: {e}") - continue - - return phrases_idxs2, phrases2 - + This function creates an index column in the JSON response object. -def filter_speaker_turn(item_data, speaker_label, turns_idxs, turns): - """ - ------------------------------------------------------------------------------------------------------ - - This function updates the turns list - to only include the speaker label provided. Parameters: - ........... item_data: dict JSON response object. - speaker_label: str - Speaker label - turns_idxs: list - A list of tuples containing - the start and end indices of the turns in the JSON object. - turns: list - A list of turns extracted from the JSON object. - Returns: - ........... - turns_idxs: list - A list of tuples containing - the start and end indices of the turns in the JSON object. - turns: list - A list of turns extracted from the JSON object. - ------------------------------------------------------------------------------------------------------ - """ - start_idx = 0 - for i, item in enumerate(item_data): - try: - if ( - i > 0 - and item.get("speaker_label", "") == speaker_label - and item_data[i - 1].get("speaker_label", "") != speaker_label - ): - start_idx = i - elif ( - i > 0 - and item.get("speaker_label", "") != speaker_label - and item_data[i - 1].get("speaker_label", "") == speaker_label - ): - turns_idxs.append((start_idx, i - 1)) - # create turns texts - turns.append( - " ".join( - [ - item["alternatives"][0]["content"] - for item in item_data[start_idx:i] - ] - ) - ) - except Exception as e: - logger.error(f"Error in turn-split for speaker {speaker_label}: {e}") - continue - - # if the last item is the speaker label - if start_idx not in [item[0] for item in turns_idxs]: - turns_idxs.append((start_idx, len(item_data) - 1)) - turns.append( - " ".join( - [ - item["alternatives"][0]["content"] - for item in item_data[start_idx:] - ] - ) - ) - return turns_idxs, turns - + measures: dict + A dictionary containing the names of the columns in the output dataframes. -def filter_speaker(item_data, speaker_label, turns_idxs, turns, phrases_idxs, phrases): - """ - ------------------------------------------------------------------------------------------------------ - This function updates the turns and phrases lists - to only include the speaker label provided. - Parameters: - ........... - item_data: dict - JSON response object. - speaker_label: str - Speaker label - turns_idxs: list - A list of tuples containing - the start and end indices of the turns in the JSON object. - turns: list - A list of turns extracted from the JSON object. - phrases_idxs: list - A list of tuples containing - the start and end indices of the phrases in the JSON object. - phrases: list - A list of phrases extracted from the JSON object. Returns: - ........... - turns_idxs: list - A list of tuples containing - the start and end indices of the turns in the JSON object. - turns: list - A list of turns extracted from the JSON object. - phrases_idxs: list - A list of tuples containing - the start and end indices of the phrases in the JSON object. - phrases: list - A list of phrases extracted from the JSON object. - Raises: - ........... - ValueError: If the speaker label is not found in the json response object. - ------------------------------------------------------------------------------------------------------ + item_data: dict + The updated JSON response object. """ + index = 0 + for item in item_data: + for word in item.get("words", []): + word[measures["old_index"]] = index + index += 1 - speaker_labels = [ - item["speaker_label"] for item - in item_data if "speaker_label" in item - ] - - if speaker_label not in speaker_labels: - raise ValueError( - f"Speaker label {speaker_label} " - "not found in the json response object." - ) - - # phrase-split for the speaker label - phrases_idxs, phrases = filter_speaker_phrase( - item_data, speaker_label, phrases_idxs, phrases - ) - - # turn-split for the speaker label - turns_idxs, turns = filter_speaker_turn( - item_data, speaker_label, turns_idxs, turns - ) - - return turns_idxs, turns, phrases_idxs, phrases - + return item_data -def phrase_split(text): +def download_nltk_resources(): """ ------------------------------------------------------------------------------------------------------ - This function splits the input text into phrases. - Parameters: - ........... - text: str - The input text. - Returns: - ........... - phrases: list - A list of phrases extracted from the input text. - phrases_idxs: list - A list of tuples containing - the start and end indices of the phrases in the input text. - ------------------------------------------------------------------------------------------------------ - """ - phrases = nltk.tokenize.sent_tokenize(text) - phrases_idxs = [] - - start_idx = 0 - for phrase in phrases: - end_idx = start_idx + len(phrase.split()) - 1 - phrases_idxs.append((start_idx, end_idx)) - start_idx = end_idx + 1 - return phrases, phrases_idxs - - -def filter_turns(item_data, speaker_label, measures): - """ - ------------------------------------------------------------------------------------------------------ - - This function updates the turns list - to only include the speaker label provided. + This function downloads the + required NLTK resources for processing text data. Parameters: ........... - item_data: dict - JSON response object. - speaker_label: str - Speaker label - measures: dict - A dictionary containing the names of the columns in the output dataframes. + None Returns: ........... - turns_idxs: list - A list of tuples containing - the start and end indices of the turns in the JSON object. - turns: list - A list of turns extracted from the JSON object. - - Raises: - ........... - ValueError: If the speaker label is not found in the json response object. + None ------------------------------------------------------------------------------------------------------ """ + try: + nltk.data.find("tokenizers/punkt") + except LookupError: + nltk.download("punkt") - speaker_labels = [ - item["speaker"] for item - in item_data if "speaker" in item - ] - - if speaker_label not in speaker_labels: - raise ValueError( - f"Speaker label {speaker_label} " - "not found in the json response object." - ) - - turns_idxs, turns = [], [] - - start_idx = 0 - start_idx2 = 0 - for i, item in enumerate(item_data): - try: - if ( - i > 0 - and item.get("speaker", "") == speaker_label - and item_data[i - 1].get("speaker", "") != speaker_label - ): - start_idx = i - start_idx2 = item["words"][0][measures["old_index"]] - elif ( - i > 0 - and item.get("speaker", "") != speaker_label - and item_data[i - 1].get("speaker", "") == speaker_label - ): - end_idx = i-1 - end_idx2 = item["words"][-1][measures["old_index"]] - turns_idxs.append((start_idx2, end_idx2)) - # create turns texts - turns.append( - " ".join( - [ - item["text"] - for item in item_data[start_idx:(end_idx+1)] - ] - ) - ) - except Exception as e: - logger.error(f"Error in turn-split for speaker {speaker_label}: {e}") - continue - - # if the last item is the speaker label - if start_idx not in [item[0] for item in turns_idxs]: - end_idx2 = item_data[-1]["words"][-1][measures["old_index"]] - turns_idxs.append((start_idx2, end_idx2)) - turns.append( - " ".join( - [ - item["text"] - for item in item_data[start_idx:] - ] - ) - ) - - return turns_idxs, turns - - + try: + nltk.data.find("averaged_perceptron_tagger") + except LookupError: + nltk.download("averaged_perceptron_tagger") + def filter_phrases(item_data, speaker_label, measures): """ ------------------------------------------------------------------------------------------------------ @@ -489,40 +219,75 @@ def filter_phrases(item_data, speaker_label, measures): logger.error(f"Failed to filter phrases: {e}") return phrases_idxs, phrases - -def create_index_column(item_data, measures): +def filter_turns(item_data, speaker_label, measures, min_turn_length): """ ------------------------------------------------------------------------------------------------------ - - This function creates an index column in the JSON response object. + + This function updates the turns list + to only include the speaker label provided. Parameters: ........... item_data: dict JSON response object. + speaker_label: str + Speaker label measures: dict A dictionary containing the names of the columns in the output dataframes. + min_turn_length: int + minimum words required in each turn Returns: ........... - item_data: dict - The updated JSON response object. + turns_idxs: list + A list of tuples containing + the start and end indices of the turns in the JSON object. + turns: list + A list of turns extracted from the JSON object. + + Raises: + ........... + ValueError: If the speaker label is not found in the json response object. ------------------------------------------------------------------------------------------------------ """ - i = 0 - i_p = 0 - while True: - for j, word in enumerate(item_data[i_p]["words"]): - item_data[i_p]["words"][j][measures["old_index"]] = i - i += 1 - - i_p += 1 - if i_p >= len(item_data): - break + turns_idxs, turns = [], [] + current_turn = None + + for item in item_data: + try: + + if "speaker" in item: + if item["speaker"] == speaker_label: + current_turn = [item] if current_turn is None else current_turn + [item] + + else: + if current_turn is not None: + + start_idx2 = current_turn[0]["words"][0][measures["old_index"]] + end_idx2 = current_turn[-1]["words"][-1][measures["old_index"]] + turn_text = " ".join(item["text"] for item in current_turn) + + if len(turn_text.split(" ")) >= min_turn_length: + turns_idxs.append((start_idx2, end_idx2)) + + turns.append(turn_text) + current_turn = None + + except Exception as e: + logger.error(f"Error in turn calculation {e}") - return item_data + if current_turn is not None: + start_idx2 = current_turn[0]["words"][0][measures["old_index"]] + + end_idx2 = current_turn[-1]["words"][-1][measures["old_index"]] + turn_text = " ".join(item["text"] for item in current_turn) + + if len(turn_text.split(" ")) >= min_turn_length: + turns_idxs.append((start_idx2, end_idx2)) + turns.append(turn_text) + return turns_idxs, turns def pause_calculation(filter_json, measures): """ @@ -546,15 +311,13 @@ def pause_calculation(filter_json, measures): """ for i, item in enumerate(filter_json): if i > 0: - item[measures["pause"]] = float(item["start"]) - float( - filter_json[i - 1]["end"] - ) + item[measures["pause"]] = float(item["start"]) - float(filter_json[i - 1]["end"]) + else: item[measures["pause"]] = np.nan return filter_json - def filter_json_transcribe(item_data, speaker_label, measures): """ ------------------------------------------------------------------------------------------------------ @@ -584,352 +347,90 @@ def filter_json_transcribe(item_data, speaker_label, measures): speaker = item["speaker"] words = item["words"] - - # update speaker labels - for j, w in enumerate(words): + + for j, w in enumerate(words):# update speaker labels words[j]["speaker"] = speaker item_data2 += words except Exception as e: logger.error(f"Failed to filter word: {e}") - filter_json = [ - item for item in item_data2 - if "start" in item and "end" in item - ] - - # calculate time difference between each word - filter_json = pause_calculation(filter_json, measures) + filter_json = [item for item in item_data2 if "start" in item and "end" in item] + filter_json = pause_calculation(filter_json, measures) # calculate time difference between each word if speaker_label is not None: - filter_json = [ - item - for item in filter_json - if item.get("speaker", "") == speaker_label - ] - + filter_json = [item for item in filter_json if item.get("speaker", "") == speaker_label] return filter_json - -def download_nltk_resources(): +def get_num_of_syllables(text): """ ------------------------------------------------------------------------------------------------------ - This function downloads the - required NLTK resources for processing text data. + This function calculates the number of syllables in the input text. Parameters: ........... - None + text: str + The input text. Returns: ........... - None + syllable_count: int + The number of syllables in the input text. - ------------------------------------------------------------------------------------------------------ + --------------------------------------------------------------------------------------- """ - try: - nltk.data.find("tokenizers/punkt") - except LookupError: - nltk.download("punkt") - try: - nltk.data.find("averaged_perceptron_tagger") - except LookupError: - nltk.download("averaged_perceptron_tagger") - - -def get_tag(json_conf, tag_dict, measures): - """ - ------------------------------------------------------------------------------------------------------ - - This function performs part-of-speech - tagging on the input text using NLTK, and returns an updated - json_conf list with the part-of-speech tags. - - Parameters: - ........... - json_conf: list - JSON response object. - tag_dict: dict - A dictionary mapping the NLTK tags to more readable tags. - measures: dict - A dictionary containing the names of the columns in the output dataframes. - - Returns: - ........... - json_conf: list - The updated json_conf list. - - ------------------------------------------------------------------------------------------------------ - """ - if len(json_conf) <= 0: - return json_conf - - if "alternatives" not in json_conf[0].keys(): - # local vosk transcriber - word_list = [word["word"] for word in json_conf if "word" in word] - else: - # aws transcriber - word_list = [item["alternatives"][0]["content"] for item in json_conf] - - tag_list = nltk.pos_tag(word_list) - - for i, tag in enumerate(tag_list): - if tag[1] in tag_dict.keys(): - json_conf[i][measures["tag"]] = tag_dict[tag[1]] - else: - json_conf[i][measures["tag"]] = "Other" + syllable_tokenizer = nltk.tokenize.SyllableTokenizer() - return json_conf + # remove punctuation + punctuation = "!\"#$%&()*+,-./:;<=>?@[\]^_`{|}~" + syllables = [syllable_tokenizer.tokenize(token) for token in nltk.word_tokenize(text) if token not in punctuation] + # count the number of syllables in each word + syllable_count = sum([len(token) for token in syllables]) + return syllable_count -def get_part_of_speech(df, tags, measures, index=0): +def get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures): """ ------------------------------------------------------------------------------------------------------ - This function calculates the proportions of verbs, - pronouns, adjectives, and nouns in the - transcribed text, and adds them to the output dataframe df. + This function calculates various pause-related speech characteristic + features at the word level and adds them to the output dataframe word_df. Parameters: ........... - df: pandas dataframe - A dataframe containing the speech characteristics of the input text. - tags: list - A list of part-of-speech tags for the input text. + word_df: pandas dataframe + A dataframe containing word summary information + df_diff: pandas dataframe + A dataframe containing the word-level information + from the JSON response. + word_list: list + List of transcribed text at the word level. + phrase_index: list + A list containing the indices of the first and last word + in each phrase or turn. measures: dict A dictionary containing the names of the columns in the output dataframes. - index: int - The index of the row in the output dataframe df. Returns: ........... - df: pandas dataframe - The updated df dataframe. + word_df: pandas dataframe + The updated word_df dataframe. ------------------------------------------------------------------------------------------------------ """ - if len(tags) == 0: - return df + phrase_starts = [pindex[0] for pindex in phrase_index] - df.loc[index, measures["speech_noun"]] = ( - 100 * len(tags[tags == "Noun"]) / len(tags) - ) - df.loc[index, measures["speech_verb"]] = ( - 100 * len(tags[tags == "Verb"]) / len(tags) - ) - df.loc[index, measures["speech_adj"]] = ( - 100 * len(tags[tags == "Adjective"]) / len(tags) - ) - df.loc[index, measures["speech_pronoun"]] = ( - 100 * len(tags[tags == "Pronoun"]) / len(tags) + word_df[measures["word_pause"]] = df_diff[measures["pause"]].where( + ~df_diff[measures["old_index"]].isin(phrase_starts), np.nan ) - return df - - -def get_tag_summ(json_conf, df_list, text_indices, measures): - """ - ------------------------------------------------------------------------------------------------------ - - This function calculates the proportions of verbs, - pronouns, adjectives, and nouns in the - transcribed text, and adds them to the output dataframe summ_df. - - Parameters: - ........... - json_conf: list - JSON response object. - df_list: list - List of pandas dataframes. - word_df, phrase_df, turn_df, summ_df - text_indices: list - List of indices for text_list. - for phrases and turns. - measures: dict - A dictionary containing the names of the columns in the output dataframes. - - Returns: - ........... - df_list: list - List of updated pandas dataframes. - - ------------------------------------------------------------------------------------------------------ - """ - - word_df, phrase_df, turn_df, summ_df = df_list - phrase_index, turn_index = text_indices - - df_conf = pd.DataFrame(json_conf) - - # word-level analysis - word_df[measures["part_of_speech"]] = df_conf[measures["tag"]] - - # phrase-level analysis - for j, pindex in enumerate(phrase_index): - prange = range(pindex[0], pindex[1] + 1) - phrase_tags = df_conf.loc[df_conf[measures["old_index"]].isin(prange), measures["tag"]] - - phrase_df = get_part_of_speech(phrase_df, phrase_tags, measures, j) - - # turn-level analysis - for j, uindex in enumerate(turn_index): - urange = range(uindex[0], uindex[1] + 1) - turn_tags = df_conf.loc[df_conf[measures["old_index"]].isin(urange), measures["tag"]] - - turn_df = get_part_of_speech(turn_df, turn_tags, measures, j) - - # file-level analysis - summ_df = get_part_of_speech(summ_df, df_conf[measures["tag"]], measures) - - df_list = [word_df, phrase_df, turn_df, summ_df] - - return df_list - - -def get_mattr(text): - """ - ------------------------------------------------------------------------------------------------------ - This function calculates the Moving Average Type-Token Ratio (MATTR) - of the input text using the - LexicalRichness library. - - Parameters: - ........... - text : str - The input text to be analyzed. - - Returns: - ........... - mattr : float - The calculated MATTR value. - - ------------------------------------------------------------------------------------------------------ - """ - word = nltk.word_tokenize(text) - filter_punc = list(value for value in word if value not in [".", "!", "?"]) - filter_punc = " ".join(filter_punc) - mattr = np.nan - - lex_richness = LexicalRichness(filter_punc) - if lex_richness.words > 0: - mattr = lex_richness.mattr(window_size=lex_richness.words) - - return mattr - - -def get_sentiment(df_list, text_list, measures): - """ - ------------------------------------------------------------------------------------------------------ - - This function calculates the sentiment scores of the input text using - VADER, and adds them to the output dataframe summ_df. - - Parameters: - ........... - df_list: list - List of pandas dataframes. - word_df, phrase_df, turn_df, summ_df - text_list: list - List of transcribed text. - split into words, phrases, turns, and full text. - measures: dict - A dictionary containing the names of the columns in the output dataframes. - - Returns: - ........... - df_list: list - List of updated pandas dataframes. - - ------------------------------------------------------------------------------------------------------ - """ - word_df, phrase_df, turn_df, summ_df = df_list - word_list, phrase_list, turn_list, full_text = text_list - - sentiment = SentimentIntensityAnalyzer() - - # column names - cols = [ - measures["neg"], - measures["neu"], - measures["pos"], - measures["compound"], - measures["speech_mattr"], + # calculate the number of syllables in each word from the word list + word_df[measures["num_syllables"]] = [ + get_num_of_syllables(word) for word in word_list ] - - # word-level analysis - for idx, w in enumerate(word_list): - try: - sentiment_dict = sentiment.polarity_scores(w) - - word_df.loc[idx, cols[:-1]] = list(sentiment_dict.values()) - except Exception as e: - logger.error(f"Error in sentiment analysis for word {w}: {e}") - continue - - # phrase-level analysis - for idx, p in enumerate(phrase_list): - try: - sentiment_dict = sentiment.polarity_scores(p) - mattr = get_mattr(p) - - phrase_df.loc[idx, cols] = list(sentiment_dict.values()) + [mattr] - except Exception as e: - logger.error(f"Error in sentiment analysis for phrase {p}: {e}") - continue - - # turn-level analysis - for idx, u in enumerate(turn_list): - try: - sentiment_dict = sentiment.polarity_scores(u) - mattr = get_mattr(u) - - turn_df.loc[idx, cols] = list(sentiment_dict.values()) + [mattr] - except Exception as e: - logger.error(f"Error in sentiment analysis for turn {u}: {e}") - continue - - # file-level analysis - sentiment_dict = sentiment.polarity_scores(full_text) - mattr = get_mattr(full_text) - - summ_df.loc[0, cols] = list(sentiment_dict.values()) + [mattr] - - df_list = [word_df, phrase_df, turn_df, summ_df] - - return df_list - - -def get_num_of_syllables(text): - """ - ------------------------------------------------------------------------------------------------------ - - This function calculates the number of syllables in the input text. - - Parameters: - ........... - text: str - The input text. - - Returns: - ........... - syllable_count: int - The number of syllables in the input text. - - --------------------------------------------------------------------------------------- - """ - - syllable_tokenizer = nltk.tokenize.SyllableTokenizer() - - # remove punctuation - punctuation = "!\"#$%&()*+,-./:;<=>?@[\]^_`{|}~" - syllables = [syllable_tokenizer.tokenize(token) for token in nltk.word_tokenize(text) if token not in punctuation] - # count the number of syllables in each word - syllable_count = sum([len(token) for token in syllables]) - - return syllable_count - + return word_df def process_pause_feature(df_diff, df, text_level, index_list, time_index, level_name, measures): """ @@ -1017,203 +518,7 @@ def process_pause_feature(df_diff, df, text_level, index_list, time_index, level df[measures["pause_rate"]] = df[measures["word_rate"]] - return df - - -def update_summ_df( - df_diff, summ_df, full_text, time_index, word_df, phrase_df, turn_df, measures -): - """ - ------------------------------------------------------------------------------------------------------ - - This function calculates various pause-related speech characteristic - features at the file level and adds them to the output dataframe summ_df. - - Parameters: - ........... - df_diff: pandas dataframe - A dataframe containing the word-level information - from the JSON response. - summ_df: pandas dataframe - A dataframe containing the speech characteristics of the input text. - time_index: list - A list containing the names of the columns in json - that contain the start and end times of each word. - word_df: pandas dataframe - A dataframe containing word summary information - phrase_df: pandas dataframe - A dataframe containing phrase summary information - turn_df: pandas dataframe - A dataframe containing turn summary information - measures: dict - A dictionary containing the names of the columns in the output dataframes. - - Returns: - ........... - summ_df: pandas dataframe - The updated summ_df dataframe. - - ------------------------------------------------------------------------------------------------------ - """ - if len(phrase_df) > 0: - speech_minutes = phrase_df[measures["phrase_minutes"]].sum() - else: - speech_minutes = (float(df_diff.iloc[-1][time_index[1]]) - float(df_diff.iloc[0][time_index[0]])) / 60 - - summ_df[measures["speech_minutes"]] = [speech_minutes] - - summ_df[measures["speech_words"]] = len(df_diff) - if speech_minutes > 0: - summ_df[measures["word_rate"]] = ( - summ_df[measures["speech_words"]] / summ_df[measures["speech_minutes"]] - ) - summ_df[measures["syllable_rate"]] = ( - get_num_of_syllables(full_text) / summ_df[measures["speech_minutes"]] - ) - summ_df[measures["speech_percentage"]] = 100 * ( - 1 - - df_diff.loc[1:, measures["pause"]].sum() - / (60 * summ_df[measures["speech_minutes"]]) - ) - - summ_df[measures["pause_rate"]] = summ_df[measures["word_rate"]] - - if len(word_df[measures["word_pause"]]) > 1: - summ_df[measures["word_pause_mean"]] = word_df[measures["word_pause"]].mean( - skipna=True - ) - summ_df[measures["word_pause_var"]] = word_df[measures["word_pause"]].var( - skipna=True - ) - - if len(phrase_df[measures["phrase_pause"]]) > 1: - summ_df[measures["phrase_pause_mean"]] = phrase_df[measures["phrase_pause"]].mean( - skipna=True - ) - summ_df[measures["phrase_pause_var"]] = phrase_df[measures["phrase_pause"]].var( - skipna=True - ) - - if len(turn_df) > 0: - summ_df[measures["num_turns"]] = len(turn_df) - summ_df[measures["turn_minutes_mean"]] = turn_df[ - measures["turn_minutes"] - ].mean(skipna=True) - summ_df[measures["turn_words_mean"]] = turn_df[ - measures["turn_words"] - ].mean(skipna=True) - summ_df[measures["turn_pause_mean"]] = turn_df[ - measures["turn_pause"] - ].mean(skipna=True) - summ_df["num_one_word_turns"] = len( - turn_df[turn_df[measures["turn_words"]] == 1] - ) - summ_df[measures["num_interrupts"]] = sum(turn_df[measures["interrupt_flag"]]) - - return summ_df - - -def get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures): - """ - ------------------------------------------------------------------------------------------------------ - - This function calculates various pause-related speech characteristic - features at the word level and adds them to the output dataframe word_df. - - Parameters: - ........... - word_df: pandas dataframe - A dataframe containing word summary information - df_diff: pandas dataframe - A dataframe containing the word-level information - from the JSON response. - word_list: list - List of transcribed text at the word level. - phrase_index: list - A list containing the indices of the first and last word - in each phrase or turn. - measures: dict - A dictionary containing the names of the columns in the output dataframes. - - Returns: - ........... - word_df: pandas dataframe - The updated word_df dataframe. - - ------------------------------------------------------------------------------------------------------ - """ - phrase_starts = [pindex[0] for pindex in phrase_index] - - word_df[measures["word_pause"]] = df_diff[measures["pause"]].where( - ~df_diff[measures["old_index"]].isin(phrase_starts), np.nan - ) - - # calculate the number of syllables in each word from the word list - word_df[measures["num_syllables"]] = [ - get_num_of_syllables(word) for word in word_list - ] - return word_df - - -def get_pause_feature_phrase(phrase_df, df_diff, phrase_list, phrase_index, turn_index, time_index, measures): - """ - ------------------------------------------------------------------------------------------------------ - - This function calculates various pause-related speech characteristic - features at the phrase level and adds them to the output dataframe phrase_df. - - Parameters: - ........... - phrase_df: pandas dataframe - A dataframe containing phrase summary information - df_diff: pandas dataframe - A dataframe containing the word-level information - from the JSON response. - phrase_list: list - List of transcribed text at the phrase level. - phrase_index: list - A list containing the indices of the first and last word - in each phrase - turn_index: list - A list containing the indices of the first and last word - in each turn. - time_index: list - A list containing the names of the columns in json that contain - the start and end times of each word. - measures: dict - A dictionary containing the names of the columns in the output dataframes. - - Returns: - ........... - phrase_df: pandas dataframe - The updated phrase_df dataframe. - - ------------------------------------------------------------------------------------------------------ - """ - phrase_starts = [pindex[0] for pindex in phrase_index] - - df_diff_phrase = df_diff[ - df_diff[measures["old_index"]].isin(phrase_starts) - ] # get the rows corresponding to the start of each phrase - - if len(turn_index) > 0: - turn_starts = [ - uindex[0] for uindex in turn_index - ] # get the start index of each turn - phrase_df[measures["phrase_pause"]] = df_diff_phrase[measures["pause"]].where( - ~df_diff_phrase[measures["old_index"]].isin(turn_starts), np.nan - ) - else: - phrase_df[measures["phrase_pause"]] = df_diff_phrase[measures["pause"]] - - phrase_df = phrase_df.reset_index(drop=True) - - phrase_df = process_pause_feature( - df_diff, phrase_df, phrase_list, phrase_index, time_index, measures["phrase"], measures - ) - - return phrase_df - + return df def get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, measures): """ @@ -1271,6 +576,64 @@ def get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, return turn_df +def update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, measures): + """ + ------------------------------------------------------------------------------------------------------ + + This function calculates various pause-related speech characteristic + features at the file level and adds them to the output dataframe summ_df. + + Parameters: + ........... + df_diff: pandas dataframe + A dataframe containing the word-level information + from the JSON response. + summ_df: pandas dataframe + A dataframe containing the speech characteristics of the input text. + time_index: list + A list containing the names of the columns in json + that contain the start and end times of each word. + word_df: pandas dataframe + A dataframe containing word summary information + turn_df: pandas dataframe + A dataframe containing turn summary information + measures: dict + A dictionary containing the names of the columns in the output dataframes. + + Returns: + ........... + summ_df: pandas dataframe + The updated summ_df dataframe. + + ------------------------------------------------------------------------------------------------------ + """ + speech_minutes = (float(df_diff.iloc[-1][time_index[1]]) - float(df_diff.iloc[0][time_index[0]])) / 60 + summ_df[measures["speech_minutes"]] = [speech_minutes] + + summ_df[measures["speech_words"]] = len(df_diff) + if speech_minutes > 0: + + summ_df[measures["word_rate"]] = (summ_df[measures["speech_words"]] / summ_df[measures["speech_minutes"]]) + summ_df[measures["syllable_rate"]] = (get_num_of_syllables(full_text) / summ_df[measures["speech_minutes"]]) + + summ_df[measures["speech_percentage"]] = 100 * ( + 1 - df_diff.loc[1:, measures["pause"]].sum()/ (60 * summ_df[measures["speech_minutes"]])) + + if len(word_df[measures["word_pause"]]) > 1: + summ_df[measures["word_pause_mean"]] = word_df[measures["word_pause"]].mean(skipna=True) + summ_df[measures["word_pause_var"]] = word_df[measures["word_pause"]].var(skipna=True) + + if len(turn_df) > 0: + summ_df[measures["num_turns"]] = len(turn_df) + summ_df[measures["turn_minutes_mean"]] = turn_df[measures["turn_minutes"]].mean(skipna=True) + + summ_df[measures["turn_words_mean"]] = turn_df[measures["turn_words"]].mean(skipna=True) + summ_df[measures["turn_pause_mean"]] = turn_df[measures["turn_pause"]].mean(skipna=True) + + summ_df["num_one_word_turns"] = len(turn_df[turn_df[measures["turn_words"]] == 1]) + summ_df[measures["num_interrupts"]] = sum(turn_df[measures["interrupt_flag"]]) + + return summ_df def get_pause_feature(json_conf, df_list, text_list, text_indices, measures): """ @@ -1303,54 +666,249 @@ def get_pause_feature(json_conf, df_list, text_list, text_indices, measures): ------------------------------------------------------------------------------------------------------ """ - # Check if json_conf is empty if len(json_conf) <= 0: return df_list - word_df, phrase_df, turn_df, summ_df = df_list - word_list, phrase_list, turn_list, full_text = text_list + word_df, turn_df, summ_df = df_list + word_list, turn_list, full_text = text_list phrase_index, turn_index = text_indices - # Convert json_conf to a pandas DataFrame df_diff = pd.DataFrame(json_conf) - time_index = ["start", "end"] - # Calculate the pause time between - # each word and add the results to pause_list + # Calculate the pause time between; each word and add the results to pause_list if measures["pause"] not in df_diff.columns: - df_diff[measures["pause"]] = df_diff[time_index[0]].astype(float) - df_diff[ - time_index[1] - ].astype(float).shift(1) + df_diff[measures["pause"]] = df_diff[time_index[0]].astype(float) - df_diff[time_index[1]].astype(float).shift(1) # word-level analysis word_df = get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures) - # phrase-level analysis - phrase_df = get_pause_feature_phrase( - phrase_df, df_diff, phrase_list, phrase_index, turn_index, time_index, measures - ) - # turn-level analysis if len(turn_index) > 0: - turn_df = get_pause_feature_turn( - turn_df, df_diff, turn_list, turn_index, time_index, measures - ) + turn_df = get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, measures) # file-level analysis - summ_df = update_summ_df( - df_diff, summ_df, full_text, time_index, word_df, phrase_df, turn_df, measures - ) + summ_df = update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, measures) + df_feature = [word_df, turn_df, summ_df] + return df_feature - df_feature = [word_df, phrase_df, turn_df, summ_df] +def get_mattr(text): + """ + ------------------------------------------------------------------------------------------------------ + This function calculates the Moving Average Type-Token Ratio (MATTR) + of the input text using the + LexicalRichness library. - return df_feature + Parameters: + ........... + text : str + The input text to be analyzed. + + Returns: + ........... + mattr : float + The calculated MATTR value. + + ------------------------------------------------------------------------------------------------------ + """ + word = nltk.word_tokenize(text) + filter_punc = list(value for value in word if value not in [".", "!", "?"]) + filter_punc = " ".join(filter_punc) + mattr = np.nan + + lex_richness = LexicalRichness(filter_punc) + if lex_richness.words > 0: + mattr = lex_richness.mattr(window_size=lex_richness.words) + + return mattr + +def get_tag(json_conf, tag_dict, measures): + """ + ------------------------------------------------------------------------------------------------------ + + This function performs part-of-speech + tagging on the input text using NLTK, and returns an updated + json_conf list with the part-of-speech tags. + + Parameters: + ........... + json_conf: list + JSON response object. + tag_dict: dict + A dictionary mapping the NLTK tags to more readable tags. + measures: dict + A dictionary containing the names of the columns in the output dataframes. + + Returns: + ........... + json_conf: list + The updated json_conf list. + + ------------------------------------------------------------------------------------------------------ + """ + if len(json_conf) <= 0: + return json_conf + + if "alternatives" not in json_conf[0].keys(): + # local vosk transcriber + word_list = [word["word"] for word in json_conf if "word" in word] + else: + # aws transcriber + word_list = [item["alternatives"][0]["content"] for item in json_conf] + + tag_list = nltk.pos_tag(word_list) + for i, tag in enumerate(tag_list): + + if tag[1] in tag_dict.keys(): + json_conf[i][measures["tag"]] = tag_dict[tag[1]] + + else: + json_conf[i][measures["tag"]] = "Other" + return json_conf + +def get_part_of_speech(df, tags, measures, index=0): + """ + ------------------------------------------------------------------------------------------------------ + + This function calculates the proportions of verbs, + pronouns, adjectives, and nouns in the + transcribed text, and adds them to the output dataframe df. + + Parameters: + ........... + df: pandas dataframe + A dataframe containing the speech characteristics of the input text. + tags: list + A list of part-of-speech tags for the input text. + measures: dict + A dictionary containing the names of the columns in the output dataframes. + index: int + The index of the row in the output dataframe df. + + Returns: + ........... + df: pandas dataframe + The updated df dataframe. + + ------------------------------------------------------------------------------------------------------ + """ + if len(tags) == 0: + return df + + df.loc[index, measures["speech_noun"]] = (100 * len(tags[tags == "Noun"]) / len(tags)) + df.loc[index, measures["speech_verb"]] = (100 * len(tags[tags == "Verb"]) / len(tags)) + + df.loc[index, measures["speech_adj"]] = (100 * len(tags[tags == "Adjective"]) / len(tags)) + df.loc[index, measures["speech_pronoun"]] = (100 * len(tags[tags == "Pronoun"]) / len(tags)) + return df + +def get_tag_summ(json_conf, df_list, text_indices, measures): + """ + ------------------------------------------------------------------------------------------------------ + + This function calculates the proportions of verbs, + pronouns, adjectives, and nouns in the + transcribed text, and adds them to the output dataframe summ_df. + + Parameters: + ........... + json_conf: list + JSON response object. + df_list: list + List of pandas dataframes. + word_df, phrase_df, turn_df, summ_df + text_indices: list + List of indices for text_list. + for phrases and turns. + measures: dict + A dictionary containing the names of the columns in the output dataframes. + + Returns: + ........... + df_list: list + List of updated pandas dataframes. + + ------------------------------------------------------------------------------------------------------ + """ + + word_df, turn_df, summ_df = df_list + _ , turn_index = text_indices + + df_conf = pd.DataFrame(json_conf) + word_df[measures["part_of_speech"]] = df_conf[measures["tag"]] + + # turn-level analysis + for j, uindex in enumerate(turn_index): + urange = range(uindex[0], uindex[1] + 1) + + turn_tags = df_conf.loc[df_conf[measures["old_index"]].isin(urange), measures["tag"]] + turn_df = get_part_of_speech(turn_df, turn_tags, measures, j) + + # file-level analysis + summ_df = get_part_of_speech(summ_df, df_conf[measures["tag"]], measures) + df_list = [word_df, turn_df, summ_df] + return df_list + +def get_sentiment(df_list, text_list, measures): + """ + ------------------------------------------------------------------------------------------------------ + + This function calculates the sentiment scores of the input text using + VADER, and adds them to the output dataframe summ_df. + + Parameters: + ........... + df_list: list + List of pandas dataframes. + word_df, phrase_df, turn_df, summ_df + text_list: list + List of transcribed text. + split into words, phrases, turns, and full text. + measures: dict + A dictionary containing the names of the columns in the output dataframes. + + Returns: + ........... + df_list: list + List of updated pandas dataframes. + + ------------------------------------------------------------------------------------------------------ + """ + word_df, turn_df, summ_df = df_list + word_list, turn_list, full_text = text_list + + sentiment = SentimentIntensityAnalyzer() + cols = [measures["neg"], measures["neu"], measures["pos"], measures["compound"], measures["speech_mattr"]] + + for idx, w in enumerate(word_list): + try: + + sentiment_dict = sentiment.polarity_scores(w) + word_df.loc[idx, cols[:-1]] = list(sentiment_dict.values()) + + except Exception as e: + logger.error(f"Error in sentiment analysis: {e}") + continue + for idx, u in enumerate(turn_list): + try: + + sentiment_dict = sentiment.polarity_scores(u) + mattr = get_mattr(u) + turn_df.loc[idx, cols] = list(sentiment_dict.values()) + [mattr] + + except Exception as e: + logger.error(f"Error in sentiment analysis: {e}") + continue + + sentiment_dict = sentiment.polarity_scores(full_text) + mattr = get_mattr(full_text) -def process_language_feature( - json_conf, df_list, text_list, - text_indices, language, measures, -): + summ_df.loc[0, cols] = list(sentiment_dict.values()) + [mattr] + df_list = [word_df, turn_df, summ_df] + return df_list + +def process_language_feature(json_conf, df_list, text_list, text_indices, language, measures): """ ------------------------------------------------------------------------------------------------------ @@ -1395,6 +953,4 @@ def process_language_feature( df_list = get_tag_summ(json_conf, df_list, text_indices, measures) df_list = get_sentiment(df_list, text_list, measures) - - word_df, phrase_df, turn_df, summ_df = df_list - return word_df, phrase_df, turn_df, summ_df + return df_list \ No newline at end of file From 552ac96a11de4b8ad27947bd2c2bb95a23ef65a7 Mon Sep 17 00:00:00 2001 From: Vijay Yadav Date: Wed, 1 Nov 2023 12:52:50 -0400 Subject: [PATCH 02/21] speech characteristics update --- openwillis/measures/text/config/text.json | 2 + openwillis/measures/text/speech_attribute.py | 179 +++++-- .../text/util/characteristics_util.py | 494 +++++++++--------- 3 files changed, 381 insertions(+), 294 deletions(-) diff --git a/openwillis/measures/text/config/text.json b/openwillis/measures/text/config/text.json index 443433d..897ace2 100644 --- a/openwillis/measures/text/config/text.json +++ b/openwillis/measures/text/config/text.json @@ -33,6 +33,8 @@ "speech_words": "speech_length_words", "turn_minutes": "turn_length_minutes", "turn_words": "turn_length_words", + "file_length": "file_length", + "speaker_percentage": "speaker_percentage", "word_rate": "words_per_min", "syllable_rate": "syllables_per_min", "pause_rate": "pauses_per_min", diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py index b3eaba5..5b44096 100644 --- a/openwillis/measures/text/speech_attribute.py +++ b/openwillis/measures/text/speech_attribute.py @@ -9,7 +9,7 @@ import nltk import numpy as np import pandas as pd -#from openwillis.measures.text.util import characteristics_util as cutil +from openwillis.measures.text.util import characteristics_util as cutil from util import characteristics_util as cutil logging.basicConfig(level=logging.INFO) @@ -83,7 +83,6 @@ def is_whisper_transcribe(json_conf): return True return False - def filter_transcribe(json_conf, measures, speaker_label=None): """ ------------------------------------------------------------------------------------------------------ @@ -115,44 +114,25 @@ def filter_transcribe(json_conf, measures, speaker_label=None): ------------------------------------------------------------------------------------------------------ """ item_data = json_conf["results"]["items"] - - # make a dictionary to map old indices to new indices - item_data = cutil.create_index_column(item_data, measures) + + for i, item in enumerate(item_data): # create_index_column + item[measures["old_index"]] = i # extract text - text = " ".join( - [ - item["alternatives"][0]["content"] - for item in item_data - if "alternatives" in item - ] - ) - - # phrase-split + text = " ".join([item["alternatives"][0]["content"] for item in item_data if "alternatives" in item]) phrases, phrases_idxs = cutil.phrase_split(text) - # turn-split - turns = [] - turns_idxs = [] - if speaker_label is not None: + turns_idxs, turns = cutil.filter_speaker_aws(item_data, speaker_label) + else: + turns_idxs, turns = [], [] - turns_idxs, turns, phrases_idxs, phrases = cutil.filter_speaker( - item_data, speaker_label, turns_idxs, turns, phrases_idxs, phrases - ) - - # entire transcript - by joining all the phrases text = " ".join(phrases) - - # filter json to only include items with start_time and end_time - filter_json = cutil.filter_json_transcribe(item_data, speaker_label, measures) - - # extract words + filter_json = cutil.filter_json_transcribe_aws(item_data, speaker_label, measures) words = [word["alternatives"][0]["content"] for word in filter_json] - text_list = [words, phrases, turns, text] + text_list = [words, turns, text] text_indices = [phrases_idxs, turns_idxs] - return filter_json, text_list, text_indices @@ -250,6 +230,118 @@ def filter_vosk(json_conf, measures): return words, text +def common_summary_feature(df_summ, json_data, model): + """ + ------------------------------------------------------------------------------------------------------ + + Calculate file features based on JSON data. + + Parameters: + ........... + json_conf: list + JSON response object. + summ_df: pandas dataframe + A dataframe containing summary information on the speech + model: str + model name + + Returns: + ........... + summ_df: pandas dataframe + A dataframe containing summary information on the speech + + ------------------------------------------------------------------------------------------------------ + """ + try: + if model == 'vosk': + if len(json_data) > 0 and 'end' in json_data[-1]: + + last_dict = json_data[-1] + df_summ['file_length'] = [last_dict['end']] + + else: + if model == 'aws': + json_data = json_data["results"] + fl_length, spk_pct = cutil.calculate_file_feature(json_data, model) + + else: + fl_length, spk_pct = cutil.calculate_file_feature(json_data, model) + df_summ['file_length'] = [fl_length] + df_summ['speaker_percentage'] = [spk_pct] + + except Exception as e: + logger.error("Error in file length calculation") + return df_summ + +def process_transcript(df_list, json_conf, measures, min_turn_length, speaker_label, source, language): + """ + ------------------------------------------------------------------------------------------------------ + + Process transcript + + Parameters: + ........... + df_list: list, : + contains pandas dataframe + json_conf: dict + Transcribed json file + measures: dict + A dictionary containing the names of the columns in the output dataframes. + min_turn_length: int + minimum words required in each turn + speaker_label: str + Speaker label + source: str + model name + language: str + Language type + + Returns: + ........... + df_list: list + contains pandas dataframe + + ------------------------------------------------------------------------------------------------------ + """ + common_summary_feature(df_list[2], json_conf, source) + + if source == 'whisper': + info = filter_whisper(json_conf, measures, min_turn_length, speaker_label) + + elif source == 'aws': + info = filter_transcribe(json_conf, measures, speaker_label) + + else: + words, text = filter_vosk(json_conf, measures) + info = (json_conf, [words, [], text], [[], []]) + + if len(info[0]) > 0 and len(info[1][-1]) > 0: + df_list = cutil.process_language_feature(df_list, info, language, get_time_columns(source), measures) + return df_list + +def get_time_columns(source): + """ + ------------------------------------------------------------------------------------------------------ + + get time columns + + Parameters: + ........... + source: str + model name + + Returns: + ........... + object: list + time index name + + ------------------------------------------------------------------------------------------------------ + """ + if source == 'aws': + return ["start_time", "end_time"] + else: + return ["start", "end"] + def speech_characteristics(json_conf, language="en", speaker_label=None, min_turn_length=1): """ ------------------------------------------------------------------------------------------------------ @@ -272,8 +364,6 @@ def speech_characteristics(json_conf, language="en", speaker_label=None, min_tur df_list: list, contains: word_df: pandas dataframe A dataframe containing word summary information - phrase_df: pandas dataframe - A dataframe containing phrase summary information turn_df: pandas dataframe A dataframe containing turn summary information summ_df: pandas dataframe @@ -281,34 +371,31 @@ def speech_characteristics(json_conf, language="en", speaker_label=None, min_tur ------------------------------------------------------------------------------------------------------ """ - - measures = get_config(os.path.abspath(__file__), "text.json") - df_list = cutil.create_empty_dataframes(measures) - try: + # Load configuration measures + measures = get_config(os.path.abspath(__file__), "text.json") + df_list = cutil.create_empty_dataframes(measures) + if bool(json_conf): - language = "na" if language is None or len(language) < 2 else language[:2].lower() + language = language[:2].lower() if (language and len(language) >= 2) else "na" if language == 'en': cutil.download_nltk_resources() if is_whisper_transcribe(json_conf): - filter_json, text_list, text_indices = filter_whisper(json_conf, measures, min_turn_length, speaker_label) + df_list = process_transcript(df_list, json_conf, measures, min_turn_length, speaker_label, 'whisper', language) - if len(filter_json) > 0 and len(text_list[-1]) > 0: - df_list = cutil.process_language_feature(filter_json, df_list, text_list, text_indices, language, measures) + elif is_amazon_transcribe(json_conf): + df_list = process_transcript(df_list, json_conf, measures, min_turn_length, speaker_label, 'aws', language) else: - words, text = filter_vosk(json_conf, measures) - if len(text) > 0: - df_list = cutil.process_language_feature(json_conf, df_list, [words,[],[],text],[[],[]], language, measures) - - + df_list = process_transcript(df_list, json_conf, measures, min_turn_length, speaker_label, 'vosk', language) + except Exception as e: logger.error(f"Error in Speech Characteristics {e}") finally: for df in df_list: df.loc[0] = np.nan if df.empty else df.loc[0] - + return df_list diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py index 4d939bb..0a60710 100644 --- a/openwillis/measures/text/util/characteristics_util.py +++ b/openwillis/measures/text/util/characteristics_util.py @@ -15,23 +15,8 @@ logger = logging.getLogger() # NLTK Tag list -TAG_DICT = { - "PRP": "Pronoun", - "PRP$": "Pronoun", - "VB": "Verb", - "VBD": "Verb", - "VBG": "Verb", - "VBN": "Verb", - "VBP": "Verb", - "VBZ": "Verb", - "JJ": "Adjective", - "JJR": "Adjective", - "JJS": "Adjective", - "NN": "Noun", - "NNP": "Noun", - "NNS": "Noun", -} - +TAG_DICT = {"PRP": "Pronoun", "PRP$": "Pronoun", "VB": "Verb", "VBD": "Verb", "VBG": "Verb", "VBN": "Verb", "VBP": "Verb", + "VBZ": "Verb", "JJ": "Adjective", "JJR": "Adjective", "JJS": "Adjective", "NN": "Noun", "NNP": "Noun", "NNS": "Noun"} def create_empty_dataframes(measures): """ @@ -46,80 +31,26 @@ def create_empty_dataframes(measures): Returns: ........... - word_df: pandas dataframe - A dataframe containing word summary information - phrase_df: pandas dataframe - A dataframe containing phrase summary information - turn_df: pandas dataframe - A dataframe containing turn summary information - summ_df: pandas dataframe - A dataframe containing summary information on the speech - - ------------------------------------------------------------------------------------------------------ - """ - - word_df = pd.DataFrame( - columns=[ - measures["word_pause"], - measures["num_syllables"], - measures["part_of_speech"], - measures["pos"], - measures["neg"], - measures["neu"], - measures["compound"], - ] - ) - - turn_df = pd.DataFrame( - columns=[ - measures["turn_pause"], - measures["turn_minutes"], - measures["turn_words"], - measures["word_rate"], - measures["syllable_rate"], - measures["pause_rate"], - measures["pause_var"], - measures["pause_meandur"], - measures["speech_percentage"], - measures["speech_noun"], - measures["speech_verb"], - measures["speech_adj"], - measures["speech_pronoun"], - measures["pos"], - measures["neg"], - measures["neu"], - measures["compound"], - measures["speech_mattr"], - measures["interrupt_flag"], - ] - ) + tuple: pandas dataframe + An empty dataframe for word, turn and summary measures + + ------------------------------------------------------------------------------------------------------ + """ + + word_df = pd.DataFrame(columns=[measures["word_pause"], measures["num_syllables"], measures["part_of_speech"]]) + turn_df = pd.DataFrame(columns=[measures["turn_pause"], measures["turn_minutes"], measures["turn_words"], + measures["word_rate"], measures["syllable_rate"], measures["speech_percentage"], + measures["pause_meandur"], measures["pause_var"], measures["pos"], measures["neg"], + measures["neu"], measures["compound"], measures["speech_mattr"], + measures["interrupt_flag"]]) summ_df = pd.DataFrame( - columns=[ - measures["speech_minutes"], - measures["speech_words"], - measures["word_rate"], - measures["syllable_rate"], - measures["word_pause_mean"], - measures["word_pause_var"], - measures["speech_percentage"], - measures["speech_noun"], - measures["speech_verb"], - measures["speech_adj"], - measures["speech_pronoun"], - measures["pos"], - measures["neg"], - measures["neu"], - measures["compound"], - measures["speech_mattr"], - measures["num_turns"], - measures["turn_minutes_mean"], - measures["turn_words_mean"], - measures["turn_pause_mean"], - measures["num_one_word_turns"], - measures["num_interrupts"], - ] - ) + columns=[measures["file_length"], measures["speech_minutes"], measures["speech_words"], measures["word_rate"], + measures["syllable_rate"], measures["word_pause_mean"], measures["word_pause_var"], + measures["speech_percentage"], measures["pos"], measures["neg"], measures["neu"], measures["compound"], + measures["speech_mattr"], measures["num_turns"], measures["num_one_word_turns"], measures["turn_minutes_mean"], + measures["turn_words_mean"], measures["turn_pause_mean"], measures["speaker_percentage"], + measures["num_interrupts"]]) return word_df, turn_df, summ_df @@ -139,6 +70,7 @@ def create_index_column(item_data, measures): """ index = 0 for item in item_data: + for word in item.get("words", []): word[measures["old_index"]] = index index += 1 @@ -152,14 +84,6 @@ def download_nltk_resources(): This function downloads the required NLTK resources for processing text data. - Parameters: - ........... - None - - Returns: - ........... - None - ------------------------------------------------------------------------------------------------------ """ try: @@ -172,6 +96,151 @@ def download_nltk_resources(): except LookupError: nltk.download("averaged_perceptron_tagger") +def phrase_split(text): + """ + ------------------------------------------------------------------------------------------------------ + + This function splits the input text into phrases. + + Parameters: + ........... + text: str + The input text. + + Returns: + ........... + phrases: list + A list of phrases extracted from the input text. + phrases_idxs: list + A list of tuples containing + the start and end indices of the phrases in the input text. + + ------------------------------------------------------------------------------------------------------ + """ + phrases = nltk.tokenize.sent_tokenize(text) + phrases_idxs = [] + + start_idx = 0 + for phrase in phrases: + end_idx = start_idx + len(phrase.split()) - 1 + + phrases_idxs.append((start_idx, end_idx)) + start_idx = end_idx + 1 + + return phrases, phrases_idxs + +def filter_turn_aws(item_data, speaker_label): + """ + ------------------------------------------------------------------------------------------------------ + + This function updates the turns list + to only include the speaker label provided. + + Parameters: + ........... + item_data: dict + JSON response object. + speaker_label: str + Speaker label + turns_idxs: list + A list of tuples containing + the start and end indices of the turns in the JSON object. + turns: list + A list of turns extracted from the JSON object. + + Returns: + ........... + turns_idxs: list + A list of tuples containing + the start and end indices of the turns in the JSON object. + turns: list + A list of turns extracted from the JSON object. + + ------------------------------------------------------------------------------------------------------ + """ + start_idx = 0 + turns_idxs, turns = [], [] + for i, item in enumerate(item_data): + + try: + if (i > 0 and item.get("speaker_label", "") == speaker_label and item_data[i - 1].get("speaker_label", "") != speaker_label): + start_idx = i + elif (i > 0 and item.get("speaker_label", "") != speaker_label and item_data[i - 1].get("speaker_label", "") == speaker_label): + turns_idxs.append((start_idx, i - 1)) + turns.append(" ".join([item["alternatives"][0]["content"] for item in item_data[start_idx:i]])) + + except Exception as e: + logger.error(f"Error in turn-split for speaker {speaker_label}: {e}") + continue + + if start_idx not in [item[0] for item in turns_idxs]: + turns_idxs.append((start_idx, len(item_data) - 1)) + + turns.append(" ".join([item["alternatives"][0]["content"] for item in item_data[start_idx:]])) + return turns_idxs, turns + +def filter_speaker_aws(item_data, speaker_label): + """ + ------------------------------------------------------------------------------------------------------ + + This function updates the turns and phrases lists + to only include the speaker label provided. + + Parameters: + ........... + item_data: dict + JSON response object. + speaker_label: str + Speaker label + Returns: + ........... + turns_idxs: list + A list of tuples containing + the start and end indices of the turns in the JSON object. + turns: list + A list of turns extracted from the JSON object. + + ------------------------------------------------------------------------------------------------------ + """ + + speaker_labels = [item["speaker_label"] for item in item_data if "speaker_label" in item] + + if speaker_label not in speaker_labels: + logger.error(f"Speaker label {speaker_label} not found in the json response object.") + + turns_idxs, turns = filter_turn_aws(item_data, speaker_label) + return turns_idxs, turns + +def filter_json_transcribe_aws(item_data, speaker_label, measures): + """ + ------------------------------------------------------------------------------------------------------ + + This function filters the JSON response object to only include items with start_time and end_time. + + Parameters: + ........... + item_data: dict + JSON response object. + speaker_label: str + Speaker label + measures: dict + A dictionary containing the names of the columns in the output dataframes. + + Returns: + ........... + filter_json: list + The updated JSON response object. + + ------------------------------------------------------------------------------------------------------ + """ + filter_json = [item for item in item_data if "start_time" in item and "end_time" in item] + filter_json = pause_calculation(filter_json, measures, ['start_time', 'end_time']) + + if speaker_label is not None: + filter_json = [item for item in filter_json if item.get("speaker_label", "") == speaker_label] + + return filter_json + def filter_phrases(item_data, speaker_label, measures): """ ------------------------------------------------------------------------------------------------------ @@ -245,10 +314,6 @@ def filter_turns(item_data, speaker_label, measures, min_turn_length): turns: list A list of turns extracted from the JSON object. - Raises: - ........... - ValueError: If the speaker label is not found in the json response object. - ------------------------------------------------------------------------------------------------------ """ turns_idxs, turns = [], [] @@ -285,11 +350,11 @@ def filter_turns(item_data, speaker_label, measures, min_turn_length): if len(turn_text.split(" ")) >= min_turn_length: turns_idxs.append((start_idx2, end_idx2)) + turns.append(turn_text) - return turns_idxs, turns -def pause_calculation(filter_json, measures): +def pause_calculation(filter_json, measures, time_index): """ ------------------------------------------------------------------------------------------------------ @@ -311,11 +376,10 @@ def pause_calculation(filter_json, measures): """ for i, item in enumerate(filter_json): if i > 0: - item[measures["pause"]] = float(item["start"]) - float(filter_json[i - 1]["end"]) + item[measures["pause"]] = float(item[time_index[0]]) - float(filter_json[i - 1][time_index[0]]) else: item[measures["pause"]] = np.nan - return filter_json def filter_json_transcribe(item_data, speaker_label, measures): @@ -340,7 +404,6 @@ def filter_json_transcribe(item_data, speaker_label, measures): ------------------------------------------------------------------------------------------------------ """ - # phrase filtering item_data2 = [] for item in item_data: try: @@ -356,7 +419,7 @@ def filter_json_transcribe(item_data, speaker_label, measures): logger.error(f"Failed to filter word: {e}") filter_json = [item for item in item_data2 if "start" in item and "end" in item] - filter_json = pause_calculation(filter_json, measures) # calculate time difference between each word + filter_json = pause_calculation(filter_json, measures, ['start', 'end']) if speaker_label is not None: filter_json = [item for item in filter_json if item.get("speaker", "") == speaker_label] @@ -382,11 +445,9 @@ def get_num_of_syllables(text): """ syllable_tokenizer = nltk.tokenize.SyllableTokenizer() - - # remove punctuation - punctuation = "!\"#$%&()*+,-./:;<=>?@[\]^_`{|}~" + punctuation = "!\"#$%&()*+,-./:;<=>?@[\]^_`{|}~" # remove punctuation + syllables = [syllable_tokenizer.tokenize(token) for token in nltk.word_tokenize(text) if token not in punctuation] - # count the number of syllables in each word syllable_count = sum([len(token) for token in syllables]) return syllable_count @@ -421,18 +482,12 @@ def get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures): ------------------------------------------------------------------------------------------------------ """ phrase_starts = [pindex[0] for pindex in phrase_index] - - word_df[measures["word_pause"]] = df_diff[measures["pause"]].where( - ~df_diff[measures["old_index"]].isin(phrase_starts), np.nan - ) - - # calculate the number of syllables in each word from the word list - word_df[measures["num_syllables"]] = [ - get_num_of_syllables(word) for word in word_list - ] + word_df[measures["word_pause"]] = df_diff[measures["pause"]].where(~df_diff[measures["old_index"]].isin(phrase_starts), np.nan) + + word_df[measures["num_syllables"]] = [get_num_of_syllables(word) for word in word_list] return word_df -def process_pause_feature(df_diff, df, text_level, index_list, time_index, level_name, measures): +def process_pause_feature(df_diff, df, text_level, index_list, time_index, level_name, measures, language): """ ------------------------------------------------------------------------------------------------------ @@ -469,58 +524,47 @@ def process_pause_feature(df_diff, df, text_level, index_list, time_index, level """ if level_name not in [measures["phrase"], measures["turn"]]: - logger.error( - f"level_name must be either {measures['phrase']} or {measures['turn']}" - ) + logger.error(f"level_name must be either in phrase or turn") return df for j, index in enumerate(index_list): try: + rng = range(index[0], index[1] + 1) level_json = df_diff[df_diff[measures["old_index"]].isin(rng)] - - # remove first pause as it is the pre_pause + pauses = level_json[measures["pause"]].values[1:] - - df.loc[j, measures[f"{level_name}_minutes"]] = ( - float(level_json.iloc[-1][time_index[1]]) - - float(level_json.iloc[0][time_index[0]]) - ) / 60 + level_min_val = (float(level_json.iloc[-1][time_index[1]]) - float(level_json.iloc[0][time_index[0]])) / 60 + + df.loc[j, measures[f"{level_name}_minutes"]] = level_min_val df.loc[j, measures[f"{level_name}_words"]] = len(level_json) - # if there is 1 pause if len(pauses) == 1: df.loc[j, measures["pause_var"]] = 0 df.loc[j, measures["pause_meandur"]] = np.mean(pauses) - # if there are more than 1 pauses + elif len(pauses) > 1: df.loc[j, measures["pause_var"]] = np.var(pauses) df.loc[j, measures["pause_meandur"]] = np.mean(pauses) if df.loc[j, measures[f"{level_name}_minutes"]] > 0: - df.loc[j, measures["speech_percentage"]] = 100 * ( - 1 - np.sum(pauses) / ( - 60 * df.loc[j, measures[f"{level_name}_minutes"]] - ) - ) - - # articulation rate - df.loc[j, measures["syllable_rate"]] = ( - get_num_of_syllables(text_level[j]) / df.loc[j, measures[f"{level_name}_minutes"]] - ) - - df.loc[j, measures["word_rate"]] = ( - df.loc[j, measures[f"{level_name}_words"]] / df.loc[j, measures[f"{level_name}_minutes"]] - ) + speech_pct_val = 100 * (1 - np.sum(pauses) / (60 * df.loc[j, measures[f"{level_name}_minutes"]])) + df.loc[j, measures["speech_percentage"]] = speech_pct_val + + if language == 'en': + syllable_rate = (get_num_of_syllables(text_level[j]) / df.loc[j, measures[f"{level_name}_minutes"]]) + df.loc[j, measures["syllable_rate"]] = syllable_rate + + word_rate_val = (df.loc[j, measures[f"{level_name}_words"]] / df.loc[j, measures[f"{level_name}_minutes"]]) + df.loc[j, measures["word_rate"]] = word_rate_val + except Exception as e: logger.error(f"Error in pause feature calculation for {level_name} {j}: {e}") continue - df[measures["pause_rate"]] = df[measures["word_rate"]] - return df -def get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, measures): +def get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, measures, language): """ ------------------------------------------------------------------------------------------------------ @@ -552,28 +596,19 @@ def get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, ------------------------------------------------------------------------------------------------------ """ - turn_starts = [uindex[0] for uindex in turn_index] - - # get the rows corresponding to the start of each turn - df_diff_turn = df_diff[ - df_diff[measures["old_index"]].isin(turn_starts) - ] + df_diff_turn = df_diff[df_diff[measures["old_index"]].isin(turn_starts)] turn_df[measures["turn_pause"]] = df_diff_turn[measures["pause"]] turn_df[measures["interrupt_flag"]] = False - # set pre_turn_pause to 0 if negative (due to overlapping turns) - # and set interrupt_flag to True - negative_pause = turn_df[measures["turn_pause"]] < 0 + + negative_pause = turn_df[measures["turn_pause"]] <= 0 turn_df.loc[negative_pause, measures["turn_pause"]] = 0 + turn_df.loc[negative_pause, measures["interrupt_flag"]] = True - turn_df = turn_df.reset_index(drop=True) - turn_df = process_pause_feature( - df_diff, turn_df, turn_list, turn_index, time_index, measures["turn"], measures - ) - + turn_df = process_pause_feature(df_diff, turn_df, turn_list, turn_index, time_index, measures["turn"], measures, language) return turn_df def update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, measures): @@ -631,11 +666,11 @@ def update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, me summ_df[measures["turn_pause_mean"]] = turn_df[measures["turn_pause"]].mean(skipna=True) summ_df["num_one_word_turns"] = len(turn_df[turn_df[measures["turn_words"]] == 1]) - summ_df[measures["num_interrupts"]] = sum(turn_df[measures["interrupt_flag"]]) + summ_df[measures["num_interrupts"]] = len(turn_df[turn_df[measures["interrupt_flag"]]==True]) return summ_df -def get_pause_feature(json_conf, df_list, text_list, text_indices, measures): +def get_pause_feature(json_conf, df_list, text_list, text_indices, measures, time_index, language): """ ------------------------------------------------------------------------------------------------------ @@ -661,8 +696,7 @@ def get_pause_feature(json_conf, df_list, text_list, text_indices, measures): Returns: ........... df_feature: list - List of updated pandas dataframes. - word_df, phrase_df, turn_df, summ_df + List of updated pandas dataframes (word_df, turn_df and summ_df) ------------------------------------------------------------------------------------------------------ """ @@ -674,7 +708,7 @@ def get_pause_feature(json_conf, df_list, text_list, text_indices, measures): phrase_index, turn_index = text_indices df_diff = pd.DataFrame(json_conf) - time_index = ["start", "end"] + time_index = [time_index[0], time_index[1]] # Calculate the pause time between; each word and add the results to pause_list if measures["pause"] not in df_diff.columns: @@ -685,7 +719,7 @@ def get_pause_feature(json_conf, df_list, text_list, text_indices, measures): # turn-level analysis if len(turn_index) > 0: - turn_df = get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, measures) + turn_df = get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, measures, language) # file-level analysis summ_df = update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, measures) @@ -750,11 +784,10 @@ def get_tag(json_conf, tag_dict, measures): return json_conf if "alternatives" not in json_conf[0].keys(): - # local vosk transcriber - word_list = [word["word"] for word in json_conf if "word" in word] + word_list = [word["word"] for word in json_conf if "word" in word]# local vosk transcriber + else: - # aws transcriber - word_list = [item["alternatives"][0]["content"] for item in json_conf] + word_list = [item["alternatives"][0]["content"] for item in json_conf]# aws transcriber tag_list = nltk.pos_tag(word_list) for i, tag in enumerate(tag_list): @@ -766,42 +799,6 @@ def get_tag(json_conf, tag_dict, measures): json_conf[i][measures["tag"]] = "Other" return json_conf -def get_part_of_speech(df, tags, measures, index=0): - """ - ------------------------------------------------------------------------------------------------------ - - This function calculates the proportions of verbs, - pronouns, adjectives, and nouns in the - transcribed text, and adds them to the output dataframe df. - - Parameters: - ........... - df: pandas dataframe - A dataframe containing the speech characteristics of the input text. - tags: list - A list of part-of-speech tags for the input text. - measures: dict - A dictionary containing the names of the columns in the output dataframes. - index: int - The index of the row in the output dataframe df. - - Returns: - ........... - df: pandas dataframe - The updated df dataframe. - - ------------------------------------------------------------------------------------------------------ - """ - if len(tags) == 0: - return df - - df.loc[index, measures["speech_noun"]] = (100 * len(tags[tags == "Noun"]) / len(tags)) - df.loc[index, measures["speech_verb"]] = (100 * len(tags[tags == "Verb"]) / len(tags)) - - df.loc[index, measures["speech_adj"]] = (100 * len(tags[tags == "Adjective"]) / len(tags)) - df.loc[index, measures["speech_pronoun"]] = (100 * len(tags[tags == "Pronoun"]) / len(tags)) - return df - def get_tag_summ(json_conf, df_list, text_indices, measures): """ ------------------------------------------------------------------------------------------------------ @@ -837,15 +834,6 @@ def get_tag_summ(json_conf, df_list, text_indices, measures): df_conf = pd.DataFrame(json_conf) word_df[measures["part_of_speech"]] = df_conf[measures["tag"]] - # turn-level analysis - for j, uindex in enumerate(turn_index): - urange = range(uindex[0], uindex[1] + 1) - - turn_tags = df_conf.loc[df_conf[measures["old_index"]].isin(urange), measures["tag"]] - turn_df = get_part_of_speech(turn_df, turn_tags, measures, j) - - # file-level analysis - summ_df = get_part_of_speech(summ_df, df_conf[measures["tag"]], measures) df_list = [word_df, turn_df, summ_df] return df_list @@ -880,16 +868,6 @@ def get_sentiment(df_list, text_list, measures): sentiment = SentimentIntensityAnalyzer() cols = [measures["neg"], measures["neu"], measures["pos"], measures["compound"], measures["speech_mattr"]] - for idx, w in enumerate(word_list): - try: - - sentiment_dict = sentiment.polarity_scores(w) - word_df.loc[idx, cols[:-1]] = list(sentiment_dict.values()) - - except Exception as e: - logger.error(f"Error in sentiment analysis: {e}") - continue - for idx, u in enumerate(turn_list): try: @@ -908,25 +886,51 @@ def get_sentiment(df_list, text_list, measures): df_list = [word_df, turn_df, summ_df] return df_list -def process_language_feature(json_conf, df_list, text_list, text_indices, language, measures): +def calculate_file_feature(json_data, model): """ ------------------------------------------------------------------------------------------------------ - This function processes the language features from json response. + Calculate file features based on JSON data. Parameters: ........... json_conf: list JSON response object. + + Returns: + ........... + tuple: A tuple containing two values - the total file length and the percentage of time spent speaking. + + ------------------------------------------------------------------------------------------------------ + """ + speakers = ['clinician', 'speaker0'] + + if model == 'aws': + segments = json_data.get('items', []) + file_length = max(float(segment.get("end_time", "0")) for segment in segments) + + speaking_time = sum(float(segment.get("end_time", "0") or "0") - float(segment.get("start_time", "0") or "0") + for segment in segments if segment.get("speaker_label", "") in speakers) + else: + segments = json_data.get('segments', []) + file_length = max(segment.get('end', 0) for segment in segments) + speaking_time = sum(segment['end'] - segment['start'] for segment in segments if segment.get('speaker', '') in speakers) + + speaking_pct = (speaking_time / file_length) * 100 + return file_length, speaking_pct + +def process_language_feature(df_list, transcribe_info, language, time_index, measures): + """ + ------------------------------------------------------------------------------------------------------ + + This function processes the language features from json response. + + Parameters: + ........... df_list: list List of pandas dataframes. - word_df, phrase_df, turn_df, summ_df - text_list: list - List of transcribed text. - split into words, phrases, turns, and full text. - text_indices: list - List of indices for text_list. - for phrases and turns. + transcribe_info: list + transcribed info language: str Language of the transcribed text. measures: dict @@ -934,19 +938,13 @@ def process_language_feature(json_conf, df_list, text_list, text_indices, langua Returns: ........... - word_df: pandas dataframe - A dataframe containing word summary information - phrase_df: pandas dataframe - A dataframe containing phrase summary information - turn_df: pandas dataframe - A dataframe containing turn summary information - summ_df: pandas dataframe - A dataframe containing summary information on the speech + df_list: list + List of pandas dataframes (word_df, turn_df and summ_df) ------------------------------------------------------------------------------------------------------ """ - - df_list = get_pause_feature(json_conf, df_list, text_list, text_indices, measures) + json_conf, text_list, text_indices = transcribe_info + df_list = get_pause_feature(json_conf, df_list, text_list, text_indices, measures, time_index, language) if language == "en": json_conf = get_tag(json_conf, TAG_DICT, measures) From d9b74e4335fbf55983845447dd5a07be4b04fa76 Mon Sep 17 00:00:00 2001 From: Vijay Yadav Date: Wed, 1 Nov 2023 13:12:51 -0400 Subject: [PATCH 03/21] speech update --- openwillis/measures/text/speech_attribute.py | 14 ++++++++------ .../measures/text/util/characteristics_util.py | 9 +++++++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py index 5b44096..934b016 100644 --- a/openwillis/measures/text/speech_attribute.py +++ b/openwillis/measures/text/speech_attribute.py @@ -10,7 +10,6 @@ import numpy as np import pandas as pd from openwillis.measures.text.util import characteristics_util as cutil -from util import characteristics_util as cutil logging.basicConfig(level=logging.INFO) logger = logging.getLogger() @@ -230,7 +229,7 @@ def filter_vosk(json_conf, measures): return words, text -def common_summary_feature(df_summ, json_data, model): +def common_summary_feature(df_summ, json_data, model, speaker_label): """ ------------------------------------------------------------------------------------------------------ @@ -244,6 +243,8 @@ def common_summary_feature(df_summ, json_data, model): A dataframe containing summary information on the speech model: str model name + speaker_label: str + Speaker label Returns: ........... @@ -262,12 +263,13 @@ def common_summary_feature(df_summ, json_data, model): else: if model == 'aws': json_data = json_data["results"] - fl_length, spk_pct = cutil.calculate_file_feature(json_data, model) + fl_length, spk_pct = cutil.calculate_file_feature(json_data, model, speaker_label) else: - fl_length, spk_pct = cutil.calculate_file_feature(json_data, model) + fl_length, spk_pct = cutil.calculate_file_feature(json_data, model, speaker_label) + df_summ['file_length'] = [fl_length] - df_summ['speaker_percentage'] = [spk_pct] + df_summ['speaker_percentage'] = [spk_pct]# if speaker_label is not None else df_summ['speaker_percentage'] except Exception as e: logger.error("Error in file length calculation") @@ -303,7 +305,7 @@ def process_transcript(df_list, json_conf, measures, min_turn_length, speaker_la ------------------------------------------------------------------------------------------------------ """ - common_summary_feature(df_list[2], json_conf, source) + common_summary_feature(df_list[2], json_conf, source, speaker_label) if source == 'whisper': info = filter_whisper(json_conf, measures, min_turn_length, speaker_label) diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py index 0a60710..397fd3b 100644 --- a/openwillis/measures/text/util/characteristics_util.py +++ b/openwillis/measures/text/util/characteristics_util.py @@ -886,7 +886,7 @@ def get_sentiment(df_list, text_list, measures): df_list = [word_df, turn_df, summ_df] return df_list -def calculate_file_feature(json_data, model): +def calculate_file_feature(json_data, model, speakers): """ ------------------------------------------------------------------------------------------------------ @@ -903,17 +903,22 @@ def calculate_file_feature(json_data, model): ------------------------------------------------------------------------------------------------------ """ - speakers = ['clinician', 'speaker0'] if model == 'aws': segments = json_data.get('items', []) file_length = max(float(segment.get("end_time", "0")) for segment in segments) + + if speakers is None: + return file_length, np.NaN speaking_time = sum(float(segment.get("end_time", "0") or "0") - float(segment.get("start_time", "0") or "0") for segment in segments if segment.get("speaker_label", "") in speakers) else: segments = json_data.get('segments', []) file_length = max(segment.get('end', 0) for segment in segments) + + if speakers is None: + return file_length, np.NaN speaking_time = sum(segment['end'] - segment['start'] for segment in segments if segment.get('speaker', '') in speakers) speaking_pct = (speaking_time / file_length) * 100 From d1ab70826ea0e1dbf56c39aad39a6dfbe1d6f7a4 Mon Sep 17 00:00:00 2001 From: vjbytes102 Date: Wed, 1 Nov 2023 23:33:37 -0400 Subject: [PATCH 04/21] Update openwillis/measures/text/util/characteristics_util.py Co-authored-by: GeorgiosEfstathiadis <54844705+GeorgeEfstathiadis@users.noreply.github.com> --- openwillis/measures/text/util/characteristics_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py index 397fd3b..d79544b 100644 --- a/openwillis/measures/text/util/characteristics_util.py +++ b/openwillis/measures/text/util/characteristics_util.py @@ -922,7 +922,7 @@ def calculate_file_feature(json_data, model, speakers): speaking_time = sum(segment['end'] - segment['start'] for segment in segments if segment.get('speaker', '') in speakers) speaking_pct = (speaking_time / file_length) * 100 - return file_length, speaking_pct + return file_length/60, speaking_pct def process_language_feature(df_list, transcribe_info, language, time_index, measures): """ From 1bcee4331a767dc8f78097c98175b24deb6a21b5 Mon Sep 17 00:00:00 2001 From: vjbytes102 Date: Wed, 1 Nov 2023 23:35:16 -0400 Subject: [PATCH 05/21] Update characteristics_util Co-authored-by: GeorgiosEfstathiadis <54844705+GeorgeEfstathiadis@users.noreply.github.com> --- openwillis/measures/text/util/characteristics_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py index d79544b..91ff69a 100644 --- a/openwillis/measures/text/util/characteristics_util.py +++ b/openwillis/measures/text/util/characteristics_util.py @@ -376,7 +376,7 @@ def pause_calculation(filter_json, measures, time_index): """ for i, item in enumerate(filter_json): if i > 0: - item[measures["pause"]] = float(item[time_index[0]]) - float(filter_json[i - 1][time_index[0]]) + item[measures["pause"]] = float(item[time_index[0]]) - float(filter_json[i - 1][time_index[1]]) else: item[measures["pause"]] = np.nan From 8078d7bc8b8227d9855d346af2691892eb4fea6e Mon Sep 17 00:00:00 2001 From: vjbytes102 Date: Wed, 1 Nov 2023 23:37:14 -0400 Subject: [PATCH 06/21] Update characteristics_util Co-authored-by: GeorgiosEfstathiadis <54844705+GeorgeEfstathiadis@users.noreply.github.com> --- openwillis/measures/text/util/characteristics_util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py index 91ff69a..776d877 100644 --- a/openwillis/measures/text/util/characteristics_util.py +++ b/openwillis/measures/text/util/characteristics_util.py @@ -642,7 +642,10 @@ def update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, me ------------------------------------------------------------------------------------------------------ """ - speech_minutes = (float(df_diff.iloc[-1][time_index[1]]) - float(df_diff.iloc[0][time_index[0]])) / 60 + if len(turn_df) > 0: + speech_minutes = turn_df[measures["turn_minutes"]].sum() + else: + speech_minutes = (float(df_diff.iloc[-1][time_index[1]]) - float(df_diff.iloc[0][time_index[0]])) / 60 summ_df[measures["speech_minutes"]] = [speech_minutes] summ_df[measures["speech_words"]] = len(df_diff) From 42de460474cf7d0e881e0638f87e5895381f195e Mon Sep 17 00:00:00 2001 From: vjbytes102 Date: Wed, 1 Nov 2023 23:43:38 -0400 Subject: [PATCH 07/21] Update speech_attribute --- openwillis/measures/text/speech_attribute.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py index 934b016..957b5a7 100644 --- a/openwillis/measures/text/speech_attribute.py +++ b/openwillis/measures/text/speech_attribute.py @@ -85,9 +85,8 @@ def is_whisper_transcribe(json_conf): def filter_transcribe(json_conf, measures, speaker_label=None): """ ------------------------------------------------------------------------------------------------------ - This function extracts the text and filters the JSON data - for Amazon Transcribe json response objects. - Also, it filters the JSON data based on the speaker label if provided. + This function extracts the text and filters the JSON data for Amazon Transcribe json response objects. + Also, it filters the JSON data based on the speaker label if provided. Parameters: ........... json_conf: dict @@ -103,13 +102,9 @@ def filter_transcribe(json_conf, measures, speaker_label=None): only the relevant data for processing. text_list: list List of transcribed text. - split into words, phrases, turns, and full text. + split into words, turns, and full text. text_indices: list List of indices for text_list. - for phrases and turns. - Raises: - ........... - ValueError: If the speaker label is not found in the json response object. ------------------------------------------------------------------------------------------------------ """ item_data = json_conf["results"]["items"] From 3be0122ff3fb4afb694af8446f03edfdbaa62e00 Mon Sep 17 00:00:00 2001 From: vjbytes102 Date: Wed, 1 Nov 2023 23:49:33 -0400 Subject: [PATCH 08/21] Update characteristics_util --- .../measures/text/util/characteristics_util.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py index 776d877..8c413ac 100644 --- a/openwillis/measures/text/util/characteristics_util.py +++ b/openwillis/measures/text/util/characteristics_util.py @@ -142,12 +142,7 @@ def filter_turn_aws(item_data, speaker_label): JSON response object. speaker_label: str Speaker label - turns_idxs: list - A list of tuples containing - the start and end indices of the turns in the JSON object. - turns: list - A list of turns extracted from the JSON object. - + Returns: ........... turns_idxs: list @@ -899,6 +894,10 @@ def calculate_file_feature(json_data, model, speakers): ........... json_conf: list JSON response object. + model: str + model name (vosk/aws/whisper) + speakers: str + speakers label Returns: ........... @@ -939,6 +938,8 @@ def process_language_feature(df_list, transcribe_info, language, time_index, mea List of pandas dataframes. transcribe_info: list transcribed info + time_index: list + timepoint index (start/end) language: str Language of the transcribed text. measures: dict @@ -959,4 +960,4 @@ def process_language_feature(df_list, transcribe_info, language, time_index, mea df_list = get_tag_summ(json_conf, df_list, text_indices, measures) df_list = get_sentiment(df_list, text_list, measures) - return df_list \ No newline at end of file + return df_list From 97f237dc296db20d3480d2efa5a8b0177fc4f25f Mon Sep 17 00:00:00 2001 From: Vijay Yadav Date: Thu, 2 Nov 2023 17:10:16 -0400 Subject: [PATCH 09/21] speech update --- openwillis/measures/text/speech_attribute.py | 31 ++- .../text/util/characteristics_util.py | 204 ++++++------------ 2 files changed, 74 insertions(+), 161 deletions(-) diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py index 957b5a7..674b20e 100644 --- a/openwillis/measures/text/speech_attribute.py +++ b/openwillis/measures/text/speech_attribute.py @@ -78,11 +78,13 @@ def is_whisper_transcribe(json_conf): ------------------------------------------------------------------------------------------------------ """ if "segments" in json_conf: - if "words" in json_conf["segments"][0]: - return True + if len(json_conf["segments"])>0: + + if "words" in json_conf["segments"][0]: + return True return False -def filter_transcribe(json_conf, measures, speaker_label=None): +def filter_transcribe(json_conf, measures, min_turn_length, speaker_label=None): """ ------------------------------------------------------------------------------------------------------ This function extracts the text and filters the JSON data for Amazon Transcribe json response objects. @@ -93,6 +95,8 @@ def filter_transcribe(json_conf, measures, speaker_label=None): aws transcribe json response. measures: dict A dictionary containing the names of the columns in the output dataframes. + min_turn_length: int + minimum words required in each turn speaker_label: str Speaker label Returns: @@ -114,20 +118,18 @@ def filter_transcribe(json_conf, measures, speaker_label=None): # extract text text = " ".join([item["alternatives"][0]["content"] for item in item_data if "alternatives" in item]) - phrases, phrases_idxs = cutil.phrase_split(text) if speaker_label is not None: - turns_idxs, turns = cutil.filter_speaker_aws(item_data, speaker_label) + turns_idxs, turns = cutil.filter_speaker_aws(item_data, min_turn_length, speaker_label) else: turns_idxs, turns = [], [] - text = " ".join(phrases) + text = " ".join(turns) filter_json = cutil.filter_json_transcribe_aws(item_data, speaker_label, measures) words = [word["alternatives"][0]["content"] for word in filter_json] text_list = [words, turns, text] - text_indices = [phrases_idxs, turns_idxs] - return filter_json, text_list, text_indices + return filter_json, text_list, turns_idxs def filter_whisper(json_conf, measures, min_turn_length, speaker_label=None): @@ -172,9 +174,6 @@ def filter_whisper(json_conf, measures, min_turn_length, speaker_label=None): item_data = [segment for segment in item_data if "speaker" in segment] item_data = cutil.create_index_column(item_data, measures) - phrases_idxs, phrases = cutil.filter_phrases(item_data, speaker_label, measures) # phrase-split - - # turn-split if speaker_label is not None: turns_idxs, turns = cutil.filter_turns(item_data, speaker_label, measures, min_turn_length) @@ -184,12 +183,10 @@ def filter_whisper(json_conf, measures, min_turn_length, speaker_label=None): # filter json to only include items with start_time and end_time filter_json = cutil.filter_json_transcribe(item_data, speaker_label, measures) words = [value["word"] for value in filter_json] - text = " ".join(phrases) + text = " ".join(turns) text_list = [words, turns, text] - text_indices = [phrases_idxs, turns_idxs] - - return filter_json, text_list, text_indices + return filter_json, text_list, turns_idxs def filter_vosk(json_conf, measures): @@ -306,11 +303,11 @@ def process_transcript(df_list, json_conf, measures, min_turn_length, speaker_la info = filter_whisper(json_conf, measures, min_turn_length, speaker_label) elif source == 'aws': - info = filter_transcribe(json_conf, measures, speaker_label) + info = filter_transcribe(json_conf, measures, min_turn_length, speaker_label) else: words, text = filter_vosk(json_conf, measures) - info = (json_conf, [words, [], text], [[], []]) + info = (json_conf, [words, [], text], []) if len(info[0]) > 0 and len(info[1][-1]) > 0: df_list = cutil.process_language_feature(df_list, info, language, get_time_columns(source), measures) diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py index 8c413ac..d09d8c1 100644 --- a/openwillis/measures/text/util/characteristics_util.py +++ b/openwillis/measures/text/util/characteristics_util.py @@ -95,41 +95,8 @@ def download_nltk_resources(): nltk.data.find("averaged_perceptron_tagger") except LookupError: nltk.download("averaged_perceptron_tagger") - -def phrase_split(text): - """ - ------------------------------------------------------------------------------------------------------ - - This function splits the input text into phrases. - - Parameters: - ........... - text: str - The input text. - - Returns: - ........... - phrases: list - A list of phrases extracted from the input text. - phrases_idxs: list - A list of tuples containing - the start and end indices of the phrases in the input text. - ------------------------------------------------------------------------------------------------------ - """ - phrases = nltk.tokenize.sent_tokenize(text) - phrases_idxs = [] - - start_idx = 0 - for phrase in phrases: - end_idx = start_idx + len(phrase.split()) - 1 - - phrases_idxs.append((start_idx, end_idx)) - start_idx = end_idx + 1 - - return phrases, phrases_idxs - -def filter_turn_aws(item_data, speaker_label): +def filter_turn_aws(item_data, min_turn_length, speaker_label): """ ------------------------------------------------------------------------------------------------------ @@ -140,6 +107,8 @@ def filter_turn_aws(item_data, speaker_label): ........... item_data: dict JSON response object. + min_turn_length: int + minimum words required in each turn speaker_label: str Speaker label @@ -160,38 +129,46 @@ def filter_turn_aws(item_data, speaker_label): try: if (i > 0 and item.get("speaker_label", "") == speaker_label and item_data[i - 1].get("speaker_label", "") != speaker_label): start_idx = i + elif (i > 0 and item.get("speaker_label", "") != speaker_label and item_data[i - 1].get("speaker_label", "") == speaker_label): - turns_idxs.append((start_idx, i - 1)) - turns.append(" ".join([item["alternatives"][0]["content"] for item in item_data[start_idx:i]])) + turn_text = " ".join([item["alternatives"][0]["content"] for item in item_data[start_idx:i]]) + + if len(turn_text.split(" ")) >= min_turn_length: + turns_idxs.append((start_idx, i - 1)) + turns.append(turn_text) except Exception as e: logger.error(f"Error in turn-split for speaker {speaker_label}: {e}") continue if start_idx not in [item[0] for item in turns_idxs]: - turns_idxs.append((start_idx, len(item_data) - 1)) - - turns.append(" ".join([item["alternatives"][0]["content"] for item in item_data[start_idx:]])) + turn_text = " ".join([item["alternatives"][0]["content"] for item in item_data[start_idx:]]) + + if len(turn_text.split(" ")) >= min_turn_length: + turns_idxs.append((start_idx, len(item_data) - 1)) + + turns.append(turn_text) return turns_idxs, turns -def filter_speaker_aws(item_data, speaker_label): +def filter_speaker_aws(item_data, min_turn_length, speaker_label): """ ------------------------------------------------------------------------------------------------------ - This function updates the turns and phrases lists - to only include the speaker label provided. + This function updates the turns lists to only include the speaker label provided. Parameters: ........... item_data: dict JSON response object. + min_turn_length: int + minimum words required in each turn speaker_label: str Speaker label + Returns: ........... turns_idxs: list - A list of tuples containing - the start and end indices of the turns in the JSON object. + A list of tuples containing the start and end indices of the turns in the JSON object. turns: list A list of turns extracted from the JSON object. @@ -203,7 +180,7 @@ def filter_speaker_aws(item_data, speaker_label): if speaker_label not in speaker_labels: logger.error(f"Speaker label {speaker_label} not found in the json response object.") - turns_idxs, turns = filter_turn_aws(item_data, speaker_label) + turns_idxs, turns = filter_turn_aws(item_data, min_turn_length, speaker_label) return turns_idxs, turns def filter_json_transcribe_aws(item_data, speaker_label, measures): @@ -235,53 +212,6 @@ def filter_json_transcribe_aws(item_data, speaker_label, measures): filter_json = [item for item in filter_json if item.get("speaker_label", "") == speaker_label] return filter_json - -def filter_phrases(item_data, speaker_label, measures): - """ - ------------------------------------------------------------------------------------------------------ - - This function updates the phrases list - to only include the speaker label provided. - - Parameters: - ........... - item_data: dict - JSON response object. - speaker_label: str - Speaker label - measures: dict - A dictionary containing the names of the columns in the output dataframes. - - Returns: - ........... - phrases_idxs: list - A list of tuples containing - the start and end indices of the phrases in the JSON object. - phrases: list - A list of phrases extracted from the JSON object. - - ------------------------------------------------------------------------------------------------------ - """ - - - phrases_idxs, phrases = [], [] - for item in item_data: - try: - - start_idx = item["words"][0][measures["old_index"]] - end_idx = item["words"][-1][measures["old_index"]] - - if speaker_label is not None: - if item["speaker"] == speaker_label: - phrases.append(item["text"]) - phrases_idxs.append((start_idx, end_idx)) - else: - phrases.append(item["text"]) - phrases_idxs.append((start_idx, end_idx)) - - except Exception as e: - logger.error(f"Failed to filter phrases: {e}") - return phrases_idxs, phrases def filter_turns(item_data, speaker_label, measures, min_turn_length): """ @@ -323,15 +253,17 @@ def filter_turns(item_data, speaker_label, measures, min_turn_length): else: if current_turn is not None: - - start_idx2 = current_turn[0]["words"][0][measures["old_index"]] - end_idx2 = current_turn[-1]["words"][-1][measures["old_index"]] - turn_text = " ".join(item["text"] for item in current_turn) - if len(turn_text.split(" ")) >= min_turn_length: - turns_idxs.append((start_idx2, end_idx2)) + if len(current_turn)>0 and len(current_turn[0]["words"])>0: + start_idx2 = current_turn[0]["words"][0][measures["old_index"]] - turns.append(turn_text) + end_idx2 = current_turn[-1]["words"][-1][measures["old_index"]] + turn_text = " ".join(item["text"] for item in current_turn) + + if len(turn_text.split(" ")) >= min_turn_length: + turns_idxs.append((start_idx2, end_idx2)) + + turns.append(turn_text) current_turn = None except Exception as e: @@ -447,7 +379,7 @@ def get_num_of_syllables(text): return syllable_count -def get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures): +def get_pause_feature_word(word_df, df_diff, word_list, turn_index, measures): """ ------------------------------------------------------------------------------------------------------ @@ -463,9 +395,8 @@ def get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures): from the JSON response. word_list: list List of transcribed text at the word level. - phrase_index: list + turn_index: list A list containing the indices of the first and last word - in each phrase or turn. measures: dict A dictionary containing the names of the columns in the output dataframes. @@ -476,8 +407,8 @@ def get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures): ------------------------------------------------------------------------------------------------------ """ - phrase_starts = [pindex[0] for pindex in phrase_index] - word_df[measures["word_pause"]] = df_diff[measures["pause"]].where(~df_diff[measures["old_index"]].isin(phrase_starts), np.nan) + turn_starts = [pindex[0] for pindex in turn_index] + word_df[measures["word_pause"]] = df_diff[measures["pause"]].where(~df_diff[measures["old_index"]].isin(turn_starts), np.nan) word_df[measures["num_syllables"]] = [get_num_of_syllables(word) for word in word_list] return word_df @@ -487,26 +418,24 @@ def process_pause_feature(df_diff, df, text_level, index_list, time_index, level ------------------------------------------------------------------------------------------------------ This function calculates various pause-related speech - characteristic features at the phrase or turn + characteristic features at the turn level and adds them to the output dataframe df. Parameters: ........... df_diff: pandas dataframe - A dataframe containing the word-level information - from the JSON response. + A dataframe containing the word-level information from the JSON response. df: pandas dataframe - A dataframe containing phrase or turn summary information + A dataframe containing turn summary information text_level: list - List of transcribed text at the phrase or turn level. + List of transcribed text at the turn level. index_list: list - A list containing the indices of the first and last word - in each phrase or turn. + A list containing the indices of the first and last word in each turn. time_index: list A list containing the names of the columns in json that contain the start and end times of each word. level_name: str - The name of the level being analyzed (phrase or turn). + The name of the level being analyzed turn. measures: dict A dictionary containing the names of the columns in the output dataframes. @@ -518,8 +447,8 @@ def process_pause_feature(df_diff, df, text_level, index_list, time_index, level ------------------------------------------------------------------------------------------------------ """ - if level_name not in [measures["phrase"], measures["turn"]]: - logger.error(f"level_name must be either in phrase or turn") + if level_name not in [measures["turn"]]: + logger.error(f"level_name must be turn") return df for j, index in enumerate(index_list): @@ -648,9 +577,7 @@ def update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, me summ_df[measures["word_rate"]] = (summ_df[measures["speech_words"]] / summ_df[measures["speech_minutes"]]) summ_df[measures["syllable_rate"]] = (get_num_of_syllables(full_text) / summ_df[measures["speech_minutes"]]) - - summ_df[measures["speech_percentage"]] = 100 * ( - 1 - df_diff.loc[1:, measures["pause"]].sum()/ (60 * summ_df[measures["speech_minutes"]])) + summ_df[measures["speech_percentage"]] = 100 * (summ_df[measures["speech_minutes"]] / summ_df[measures["file_length"]]) if len(word_df[measures["word_pause"]]) > 1: summ_df[measures["word_pause_mean"]] = word_df[measures["word_pause"]].mean(skipna=True) @@ -668,7 +595,7 @@ def update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, me return summ_df -def get_pause_feature(json_conf, df_list, text_list, text_indices, measures, time_index, language): +def get_pause_feature(json_conf, df_list, text_list, turn_index, measures, time_index, language): """ ------------------------------------------------------------------------------------------------------ @@ -680,16 +607,17 @@ def get_pause_feature(json_conf, df_list, text_list, text_indices, measures, tim json_conf: list JSON response object. df_list: list - List of pandas dataframes. - word_df, phrase_df, turn_df, summ_df + List of pandas dataframes: word_df, turn_df, summ_df text_list: list - List of transcribed text. - split into words, phrases, turns, and full text. - text_indices: list + List of transcribed text: split into words, turns, and full text. + turn_index: list List of indices for text_list. - for phrases and turns. measures: dict A dictionary containing the names of the columns in the output dataframes. + time_index: list + timepoint index (start/end) + language: str + Language of the transcribed text. Returns: ........... @@ -703,17 +631,14 @@ def get_pause_feature(json_conf, df_list, text_list, text_indices, measures, tim word_df, turn_df, summ_df = df_list word_list, turn_list, full_text = text_list - phrase_index, turn_index = text_indices - df_diff = pd.DataFrame(json_conf) - time_index = [time_index[0], time_index[1]] # Calculate the pause time between; each word and add the results to pause_list if measures["pause"] not in df_diff.columns: df_diff[measures["pause"]] = df_diff[time_index[0]].astype(float) - df_diff[time_index[1]].astype(float).shift(1) # word-level analysis - word_df = get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures) + word_df = get_pause_feature_word(word_df, df_diff, word_list, turn_index, measures) # turn-level analysis if len(turn_index) > 0: @@ -797,7 +722,7 @@ def get_tag(json_conf, tag_dict, measures): json_conf[i][measures["tag"]] = "Other" return json_conf -def get_tag_summ(json_conf, df_list, text_indices, measures): +def get_tag_summ(json_conf, df_list, measures): """ ------------------------------------------------------------------------------------------------------ @@ -810,11 +735,7 @@ def get_tag_summ(json_conf, df_list, text_indices, measures): json_conf: list JSON response object. df_list: list - List of pandas dataframes. - word_df, phrase_df, turn_df, summ_df - text_indices: list - List of indices for text_list. - for phrases and turns. + List of pandas dataframes: word_df, turn_df, summ_df measures: dict A dictionary containing the names of the columns in the output dataframes. @@ -825,10 +746,7 @@ def get_tag_summ(json_conf, df_list, text_indices, measures): ------------------------------------------------------------------------------------------------------ """ - word_df, turn_df, summ_df = df_list - _ , turn_index = text_indices - df_conf = pd.DataFrame(json_conf) word_df[measures["part_of_speech"]] = df_conf[measures["tag"]] @@ -846,10 +764,8 @@ def get_sentiment(df_list, text_list, measures): ........... df_list: list List of pandas dataframes. - word_df, phrase_df, turn_df, summ_df text_list: list List of transcribed text. - split into words, phrases, turns, and full text. measures: dict A dictionary containing the names of the columns in the output dataframes. @@ -911,7 +827,7 @@ def calculate_file_feature(json_data, model, speakers): file_length = max(float(segment.get("end_time", "0")) for segment in segments) if speakers is None: - return file_length, np.NaN + return file_length/60, np.NaN speaking_time = sum(float(segment.get("end_time", "0") or "0") - float(segment.get("start_time", "0") or "0") for segment in segments if segment.get("speaker_label", "") in speakers) @@ -920,7 +836,7 @@ def calculate_file_feature(json_data, model, speakers): file_length = max(segment.get('end', 0) for segment in segments) if speakers is None: - return file_length, np.NaN + return file_length/60, np.NaN speaking_time = sum(segment['end'] - segment['start'] for segment in segments if segment.get('speaker', '') in speakers) speaking_pct = (speaking_time / file_length) * 100 @@ -952,12 +868,12 @@ def process_language_feature(df_list, transcribe_info, language, time_index, mea ------------------------------------------------------------------------------------------------------ """ - json_conf, text_list, text_indices = transcribe_info - df_list = get_pause_feature(json_conf, df_list, text_list, text_indices, measures, time_index, language) + json_conf, text_list, turn_indices = transcribe_info + df_list = get_pause_feature(json_conf, df_list, text_list, turn_indices, measures, time_index, language) if language == "en": json_conf = get_tag(json_conf, TAG_DICT, measures) - df_list = get_tag_summ(json_conf, df_list, text_indices, measures) + df_list = get_tag_summ(json_conf, df_list, measures) df_list = get_sentiment(df_list, text_list, measures) return df_list From 5d1433154311b1fc60bf95f705151ff371bf5358 Mon Sep 17 00:00:00 2001 From: vjbytes102 Date: Thu, 2 Nov 2023 17:40:00 -0400 Subject: [PATCH 10/21] Update speech_attribute --- openwillis/measures/text/speech_attribute.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py index 674b20e..004cbb1 100644 --- a/openwillis/measures/text/speech_attribute.py +++ b/openwillis/measures/text/speech_attribute.py @@ -121,10 +121,11 @@ def filter_transcribe(json_conf, measures, min_turn_length, speaker_label=None): if speaker_label is not None: turns_idxs, turns = cutil.filter_speaker_aws(item_data, min_turn_length, speaker_label) + text = " ".join(turns) + else: turns_idxs, turns = [], [] - text = " ".join(turns) filter_json = cutil.filter_json_transcribe_aws(item_data, speaker_label, measures) words = [word["alternatives"][0]["content"] for word in filter_json] @@ -183,7 +184,7 @@ def filter_whisper(json_conf, measures, min_turn_length, speaker_label=None): # filter json to only include items with start_time and end_time filter_json = cutil.filter_json_transcribe(item_data, speaker_label, measures) words = [value["word"] for value in filter_json] - text = " ".join(turns) + text = " ".join(words) text_list = [words, turns, text] return filter_json, text_list, turns_idxs From af3fd098c1fcbb22b4c05a203cf60883447e30bf Mon Sep 17 00:00:00 2001 From: GeorgiosEfstathiadis <54844705+GeorgeEfstathiadis@users.noreply.github.com> Date: Thu, 2 Nov 2023 17:58:25 -0400 Subject: [PATCH 11/21] text joining in whisper doesnt word when multiple speakers and minimum turn length --- openwillis/measures/text/speech_attribute.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py index 004cbb1..b8ce3f7 100644 --- a/openwillis/measures/text/speech_attribute.py +++ b/openwillis/measures/text/speech_attribute.py @@ -171,20 +171,21 @@ def filter_whisper(json_conf, measures, min_turn_length, speaker_label=None): """ item_data = json_conf["segments"] + text = " ".join(item["text"] for item in item_data) + if speaker_label is not None: item_data = [segment for segment in item_data if "speaker" in segment] item_data = cutil.create_index_column(item_data, measures) if speaker_label is not None: turns_idxs, turns = cutil.filter_turns(item_data, speaker_label, measures, min_turn_length) - + text = " ".join(turns) else: turns_idxs, turns = [], [] # filter json to only include items with start_time and end_time filter_json = cutil.filter_json_transcribe(item_data, speaker_label, measures) words = [value["word"] for value in filter_json] - text = " ".join(words) text_list = [words, turns, text] return filter_json, text_list, turns_idxs From 5c1acd8167d8f9e9d831f894ccedb6d46da03b6b Mon Sep 17 00:00:00 2001 From: vjbytes102 Date: Thu, 2 Nov 2023 18:20:43 -0400 Subject: [PATCH 12/21] Update speech_attribute --- openwillis/measures/text/speech_attribute.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py index b8ce3f7..b030e41 100644 --- a/openwillis/measures/text/speech_attribute.py +++ b/openwillis/measures/text/speech_attribute.py @@ -170,15 +170,15 @@ def filter_whisper(json_conf, measures, min_turn_length, speaker_label=None): ------------------------------------------------------------------------------------------------------ """ item_data = json_conf["segments"] - - text = " ".join(item["text"] for item in item_data) + text = " ".join(item.get("text", "") for item in item_data) if speaker_label is not None: item_data = [segment for segment in item_data if "speaker" in segment] item_data = cutil.create_index_column(item_data, measures) - if speaker_label is not None: + if speaker_label is not None: turns_idxs, turns = cutil.filter_turns(item_data, speaker_label, measures, min_turn_length) + text = " ".join(turns) else: turns_idxs, turns = [], [] From 247005ebd93b218d175bff0e669e15a977e73dac Mon Sep 17 00:00:00 2001 From: vjbytes102 Date: Thu, 2 Nov 2023 19:57:10 -0400 Subject: [PATCH 13/21] Update version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7545ba1..b9f51fc 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ install_requires = fp.read() setuptools.setup(name='openwillis', - version='1.5.2', + version='1.6', description='digital health measurement', long_description=long_description, long_description_content_type="text/markdown", From 5167eae89505891215694eeed2c3cd524699de9f Mon Sep 17 00:00:00 2001 From: Vijay Yadav Date: Mon, 6 Nov 2023 12:57:30 -0500 Subject: [PATCH 14/21] speech transcription vosk --- openwillis/__init__.py | 3 +- openwillis/measures/api.py | 1 + openwillis/measures/audio/__init__.py | 6 +- .../measures/audio/speech_transcribe_vosk.py | 262 ++++++++++++++++++ 4 files changed, 270 insertions(+), 2 deletions(-) create mode 100644 openwillis/measures/audio/speech_transcribe_vosk.py diff --git a/openwillis/__init__.py b/openwillis/__init__.py index dd36a91..5188a2c 100644 --- a/openwillis/__init__.py +++ b/openwillis/__init__.py @@ -13,7 +13,8 @@ speaker_separation, speaker_separation_cloud, speech_transcription_cloud, + speech_transcription_vosk, to_audio ) -__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "to_audio"] +__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "speech_transcription_vosk", "to_audio"] diff --git a/openwillis/measures/api.py b/openwillis/measures/api.py index ecc0897..b25367a 100644 --- a/openwillis/measures/api.py +++ b/openwillis/measures/api.py @@ -13,6 +13,7 @@ speaker_separation, speaker_separation_cloud, speech_transcription_cloud, + speech_transcription_vosk ) from openwillis.measures.text import ( speech_characteristics diff --git a/openwillis/measures/audio/__init__.py b/openwillis/measures/audio/__init__.py index d53a3af..b146e8c 100644 --- a/openwillis/measures/audio/__init__.py +++ b/openwillis/measures/audio/__init__.py @@ -18,4 +18,8 @@ speech_transcription_cloud, ) -__all__ = ["vocal_acoustics", "speech_transcription", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud"] +from openwillis.measures.audio.speech_transcribe_vosk import ( + speech_transcription_vosk, +) + +__all__ = ["vocal_acoustics", "speech_transcription", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "speech_transcription_vosk"] diff --git a/openwillis/measures/audio/speech_transcribe_vosk.py b/openwillis/measures/audio/speech_transcribe_vosk.py new file mode 100644 index 0000000..f18ae9f --- /dev/null +++ b/openwillis/measures/audio/speech_transcribe_vosk.py @@ -0,0 +1,262 @@ +# website: http://www.brooklyn.health + +# import the required packages +import os +import wave +import json +import logging +import json + +from vosk import Model, KaldiRecognizer +from pydub import AudioSegment +from openwillis.measures.audio.util import util as ut + +logging.basicConfig(level=logging.INFO) +logger=logging.getLogger() + +def get_config(): + """ + ------------------------------------------------------------------------------------------------------ + + Load the configuration settings for the speech transcription. + + Parameters: + ........... + None + + Returns: + ........... + measures : dict + A dictionary containing the configuration settings. + + ------------------------------------------------------------------------------------------------------ + """ + #Loading json config + dir_name = os.path.dirname(os.path.abspath(__file__)) + measure_path = os.path.abspath(os.path.join(dir_name, 'config/speech.json')) + + file = open(measure_path) + measures = json.load(file) + return measures + +def filter_audio(filepath, t_interval): + """ + ------------------------------------------------------------------------------------------------------ + + Filter an audio file to extract a segment based on the specified time interval. + + Parameters: + ............ + filepath : str + The path to the audio file to be filtered. + t_interval : list + A list of tuples representing the start and end times (in seconds) of the segment to extract. + + Returns: + ............ + sound : AudioSegment + The filtered audio segment. + + ------------------------------------------------------------------------------------------------------ + """ + sound = AudioSegment.from_wav(filepath) + + if len(t_interval)==2: + sound = sound[int(t_interval[0])*1000 : int(t_interval[1])*1000] + + elif len(t_interval)==1: + sound = sound[int(t_interval[0])*1000:] + + sound = sound.set_channels(1) + return sound + +def filter_speech(measures, results): + """ + ------------------------------------------------------------------------------------------------------ + + Filter the speech transcription results to extract the transcript. + + Parameters: + ........... + measures : dict + A dictionary containing the configuration settings for the speech transcription. + results : list of dict + The raw transcription results returned by the transcription service. + + Returns: + ........... + result_key : list + A list containing the framewise transcription of the audio file. + transcript : str + The transcript of the audio file. + + ------------------------------------------------------------------------------------------------------ + """ + result_key = [] + text_key = [] + transcript_dict = {} + + for res in results: + dict_keys = res.keys() + + if 'result' in dict_keys and 'text' in dict_keys: + result_key.extend(res['result']) + text_key.append(res['text']) + + transcript_dict['result'] = result_key + transcript_dict['text'] = ' '.join(text_key) + return result_key, ' '.join(text_key) + +def get_vosk(audio_path, lang): + """ + ------------------------------------------------------------------------------------------------------ + + Recognize speech using the Vosk model. + + Parameters: + ............ + audio_path : str + The path to the audio file to be transcribed. + lang : str + The language of the audio file (e.g. 'en-us', 'es', 'fr'). + + Returns: + ............ + results : list of dict + The raw transcription results returned by the Vosk model. + + ------------------------------------------------------------------------------------------------------ + """ + model = Model(lang=lang) + wf = wave.open(audio_path, "rb") + + recog = KaldiRecognizer(model, wf.getframerate()) + recog.SetWords(True) + + results = [] + while True: + + data = wf.readframes(4000) #Future work + if len(data) == 0: + break + + if recog.AcceptWaveform(data): + partial_result = json.loads(recog.Result()) + results.append(partial_result) + + partial_result = json.loads(recog.FinalResult()) + results.append(partial_result) + return results + +def stereo_to_mono(filepath, t_interval): + """ + ------------------------------------------------------------------------------------------------------ + + Convert a stereo audio file to a mono audio file. + + Parameters: + ............ + filepath : str + The path to the stereo audio file to be converted. + t_interval : list + A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed. + + Returns: + ............ + mono_filepath : str + The path to the mono audio file. + + ------------------------------------------------------------------------------------------------------ + """ + sound = filter_audio(filepath, t_interval) + + filename, _ = os.path.splitext(os.path.basename(filepath)) + dir_name = os.path.join(os.path.dirname(filepath), 'temp_mono_' + filename) + + ut.make_dir(dir_name) + mono_filepath = os.path.join(dir_name, filename + '.wav') + sound.export(mono_filepath, format="wav") + return mono_filepath + +def run_vosk(filepath, language, transcribe_interval = []): + """ + ------------------------------------------------------------------------------------------------------ + + Transcribe speech in an audio file using the Vosk model. + + Parameters: + ............ + filepath : str + The path to the audio file to be transcribed. + language : str, optional + The language of the audio file (e.g. 'en-us', 'es', 'fr'). Default is 'en-us'. + transcribe_interval : list, optional + A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed. + Default is an empty list. + + Returns: + ............ + json_response : str + The JSON response from the Vosk transcription service. + transcript : str + The transcript of the audio file. + + ------------------------------------------------------------------------------------------------------ + """ + json_response = json.dumps({}) + transcript = mono_filepath = '' + + try: + if os.path.exists(filepath): + + measures = get_config() + mono_filepath = stereo_to_mono(filepath, transcribe_interval) + results = get_vosk(mono_filepath, language) + + ut.remove_dir(os.path.dirname(mono_filepath)) #Clean temp directory + json_response, transcript = filter_speech(measures, results) + + else: + logger.info(f'Audio file not available. File: {filepath}') + + except Exception as e: + ut.remove_dir(os.path.dirname(mono_filepath))#Clean temp directory + logger.error(f'Error in speech Transcription: {e} & File: {filepath}') + + finally: + return json_response, transcript + + + +def speech_transcription_vosk(filepath, **kwargs): + """ + ------------------------------------------------------------------------------------------------------ + + Speech transcription function that transcribes an audio file using vosk. + + Parameters: + ........... + filepath : str + The path to the audio file to be transcribed. + language : str, optional + The language of the audio file (e.g. 'en-us', 'es', 'fr'). Default is 'en-us'. + transcribe_interval : list, optional + A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed. + Only applicable if model is 'vosk'. Default is an empty list. + + Returns: + ........... + json_response : JSON Object + A transcription response object in JSON format + transcript : str + The transcript of the recording. + + ------------------------------------------------------------------------------------------------------ + """ + + measures = get_config() + language = kwargs.get('language', 'en-us') + transcribe_interval = kwargs.get('transcribe_interval', []) + + json_response, transcript = run_vosk(filepath, language, transcribe_interval) + return json_response, transcript From 6d6707662abbf6d8c22cf48c702a534242bd7608 Mon Sep 17 00:00:00 2001 From: Vijay Yadav Date: Mon, 6 Nov 2023 16:05:50 -0500 Subject: [PATCH 15/21] whisperx update --- openwillis/__init__.py | 4 +- openwillis/measures/api.py | 2 +- openwillis/measures/audio/__init__.py | 6 +- .../measures/audio/speech_transcribe.py | 330 ------------------ .../audio/speech_transcribe_whisper.py | 149 ++++++++ .../measures/audio/util/whisperx_util.py | 52 ++- 6 files changed, 178 insertions(+), 365 deletions(-) delete mode 100644 openwillis/measures/audio/speech_transcribe.py create mode 100644 openwillis/measures/audio/speech_transcribe_whisper.py diff --git a/openwillis/__init__.py b/openwillis/__init__.py index 5188a2c..61e0c35 100644 --- a/openwillis/__init__.py +++ b/openwillis/__init__.py @@ -8,7 +8,7 @@ emotional_expressivity, eye_blink_rate, vocal_acoustics, - speech_transcription, + speech_transcription_whisper, speech_characteristics, speaker_separation, speaker_separation_cloud, @@ -17,4 +17,4 @@ to_audio ) -__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "speech_transcription_vosk", "to_audio"] +__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "speech_transcription_vosk", "to_audio"] diff --git a/openwillis/measures/api.py b/openwillis/measures/api.py index b25367a..f76db67 100644 --- a/openwillis/measures/api.py +++ b/openwillis/measures/api.py @@ -9,7 +9,7 @@ ) from openwillis.measures.audio import ( vocal_acoustics, - speech_transcription, + speech_transcription_whisper, speaker_separation, speaker_separation_cloud, speech_transcription_cloud, diff --git a/openwillis/measures/audio/__init__.py b/openwillis/measures/audio/__init__.py index b146e8c..f7448e6 100644 --- a/openwillis/measures/audio/__init__.py +++ b/openwillis/measures/audio/__init__.py @@ -2,8 +2,8 @@ vocal_acoustics, ) -from openwillis.measures.audio.speech_transcribe import ( - speech_transcription, +from openwillis.measures.audio.speech_transcribe_whisper import ( + speech_transcription_whisper, ) from openwillis.measures.audio.speech_separation import ( @@ -22,4 +22,4 @@ speech_transcription_vosk, ) -__all__ = ["vocal_acoustics", "speech_transcription", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "speech_transcription_vosk"] +__all__ = ["vocal_acoustics", "speech_transcription_whisper", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "speech_transcription_vosk"] diff --git a/openwillis/measures/audio/speech_transcribe.py b/openwillis/measures/audio/speech_transcribe.py deleted file mode 100644 index 6be8eb5..0000000 --- a/openwillis/measures/audio/speech_transcribe.py +++ /dev/null @@ -1,330 +0,0 @@ -# author: Vijay Yadav -# website: http://www.bklynhlth.com - -# import the required packages - -import numpy as np -import pandas as pd -import os -import wave -import json -import logging - -from pydub import AudioSegment -from openwillis.measures.audio.util import util as ut -from openwillis.measures.audio.util import transcribe_util as tutil - -logging.basicConfig(level=logging.INFO) -logger=logging.getLogger() - -def run_vosk(filepath, language='en-us', transcribe_interval = []): - """ - ------------------------------------------------------------------------------------------------------ - - Transcribe speech in an audio file using the Vosk model. - - Parameters: - ............ - filepath : str - The path to the audio file to be transcribed. - language : str, optional - The language of the audio file (e.g. 'en-us', 'es', 'fr'). Default is 'en-us'. - transcribe_interval : list, optional - A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed. - Default is an empty list. - - Returns: - ............ - json_response : str - The JSON response from the Vosk transcription service. - transcript : str - The transcript of the audio file. - - ------------------------------------------------------------------------------------------------------ - """ - json_response = '{}' - transcript = mono_filepath = '' - - try: - if os.path.exists(filepath): - - measures = get_config() - mono_filepath = stereo_to_mono(filepath, transcribe_interval) - results = get_vosk(mono_filepath, language) - - ut.remove_dir(os.path.dirname(mono_filepath)) #Clean temp directory - json_response, transcript = filter_speech(measures, results) - - else: - logger.info(f'Audio file not available. File: {filepath}') - - except Exception as e: - ut.remove_dir(os.path.dirname(mono_filepath))#Clean temp directory - logger.error(f'Error in speech Transcription: {e} & File: {filepath}') - - finally: - return json_response, transcript - -def filter_audio(filepath, t_interval): - """ - ------------------------------------------------------------------------------------------------------ - - Filter an audio file to extract a segment based on the specified time interval. - - Parameters: - ............ - filepath : str - The path to the audio file to be filtered. - t_interval : list - A list of tuples representing the start and end times (in seconds) of the segment to extract. - - Returns: - ............ - sound : AudioSegment - The filtered audio segment. - - ------------------------------------------------------------------------------------------------------ - """ - sound = AudioSegment.from_wav(filepath) - - if len(t_interval)==2: - sound = sound[int(t_interval[0])*1000 : int(t_interval[1])*1000] - - elif len(t_interval)==1: - sound = sound[int(t_interval[0])*1000:] - - sound = sound.set_channels(1) - return sound - -def stereo_to_mono(filepath, t_interval): - """ - ------------------------------------------------------------------------------------------------------ - - Convert a stereo audio file to a mono audio file. - - Parameters: - ............ - filepath : str - The path to the stereo audio file to be converted. - t_interval : list - A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed. - - Returns: - ............ - mono_filepath : str - The path to the mono audio file. - - ------------------------------------------------------------------------------------------------------ - """ - sound = filter_audio(filepath, t_interval) - - filename, _ = os.path.splitext(os.path.basename(filepath)) - dir_name = os.path.join(os.path.dirname(filepath), 'temp_mono_' + filename) - - ut.make_dir(dir_name) - mono_filepath = os.path.join(dir_name, filename + '.wav') - sound.export(mono_filepath, format="wav") - return mono_filepath - -def get_vosk(audio_path, lang): - """ - ------------------------------------------------------------------------------------------------------ - - Recognize speech using the Vosk model. - - Parameters: - ............ - audio_path : str - The path to the audio file to be transcribed. - lang : str - The language of the audio file (e.g. 'en-us', 'es', 'fr'). - - Returns: - ............ - results : list of dict - The raw transcription results returned by the Vosk model. - - ------------------------------------------------------------------------------------------------------ - """ - #import in-case of model=vosk - from vosk import Model, KaldiRecognizer - - model = Model(lang=lang) - wf = wave.open(audio_path, "rb") - - recog = KaldiRecognizer(model, wf.getframerate()) - recog.SetWords(True) - - results = [] - while True: - - data = wf.readframes(4000) #Future work - if len(data) == 0: - break - - if recog.AcceptWaveform(data): - partial_result = json.loads(recog.Result()) - results.append(partial_result) - - partial_result = json.loads(recog.FinalResult()) - results.append(partial_result) - return results - -def filter_speech(measures, results): - """ - ------------------------------------------------------------------------------------------------------ - - Filter the speech transcription results to extract the transcript. - - Parameters: - ........... - measures : dict - A dictionary containing the configuration settings for the speech transcription. - results : list of dict - The raw transcription results returned by the transcription service. - - Returns: - ........... - result_key : list - A list containing the framewise transcription of the audio file. - transcript : str - The transcript of the audio file. - - ------------------------------------------------------------------------------------------------------ - """ - result_key = [] - text_key = [] - transcript_dict = {} - - for res in results: - dict_keys = res.keys() - - if 'result' in dict_keys and 'text' in dict_keys: - result_key.extend(res['result']) - text_key.append(res['text']) - - transcript_dict['result'] = result_key - transcript_dict['text'] = ' '.join(text_key) - return result_key, ' '.join(text_key) - - -def get_config(): - """ - ------------------------------------------------------------------------------------------------------ - - Load the configuration settings for the speech transcription. - - Parameters: - ........... - None - - Returns: - ........... - measures : dict - A dictionary containing the configuration settings. - - ------------------------------------------------------------------------------------------------------ - """ - #Loading json config - dir_name = os.path.dirname(os.path.abspath(__file__)) - measure_path = os.path.abspath(os.path.join(dir_name, 'config/speech.json')) - - file = open(measure_path) - measures = json.load(file) - return measures - -def run_whisperx(filepath, hf_token, del_model, num_speakers, infra_model, language): - """ - ------------------------------------------------------------------------------------------------------ - - Transcribe audio data using the WhisperX model. - - Parameters: - ........... - filepath : str - The path to the audio file to be transcribed. - hf_token : str - The Hugging Face token for model authentication. - del_model: boolean - Boolean indicator to delete model if low on GPU resources - num_speakers: int - Number of speaker - infra_model:list - whisper model artifacts (this is optional param: to optimize willisInfra) - language: str - language code - - Returns: - ........... - json_response : JSON Object - A transcription response object in JSON format - transcript : str - The transcript of the recording. - - ------------------------------------------------------------------------------------------------------ - """ - json_response = '{}' - transcript = '' - - if os.path.exists(filepath)== False or hf_token == '': - return json_response, transcript - - from openwillis.measures.audio.util import whisperx_util as wutil #import in-case of model=whisperx - json_response, transcript = wutil.get_whisperx_diariazation(filepath, hf_token, del_model, num_speakers, infra_model, language) - - if str(json_response) != '{}': - json_response = tutil.replace_whisperx_speaker_labels(json_response, ['SPEAKER_00', 'SPEAKER_01'], - ['speaker0', 'speaker1']) - return json_response, transcript - - -def speech_transcription(filepath, **kwargs): - """ - ------------------------------------------------------------------------------------------------------ - - Speech transcription function that transcribes an audio file using vosk/whisperx. - - Parameters: - ........... - filepath : str - The path to the audio file to be transcribed. - model : str, optional - The transcription model to use ('vosk'). Default is 'vosk'. - language : str, optional - The language of the audio file (e.g. 'en-us', 'es', 'fr'). Default is 'en-us'. - transcribe_interval : list, optional - A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed. - Only applicable if model is 'vosk'. Default is an empty list. - - Returns: - ........... - json_response : JSON Object - A transcription response object in JSON format - transcript : str - The transcript of the recording. - - ------------------------------------------------------------------------------------------------------ - """ - - measures = get_config() - model = kwargs.get('model', 'vosk') - - language = kwargs.get('language', 'en-us') - scale = kwargs.get('c_scale', '') - num_speakers = kwargs.get('num_speakers', None) - - transcribe_interval = kwargs.get('transcribe_interval', []) - hf_token = kwargs.get('hf_token', '') - del_model = kwargs.get('del_model', False) - infra_model = kwargs.get('infra_model', [True, None, None]) - - if model == 'whisperx': - json_response, transcript = run_whisperx(filepath, hf_token, del_model, num_speakers, infra_model, language) - - if scale.lower() in measures['scale'].split(','): - content_dict = tutil.get_whisperx_content(json_response) - json_response = tutil.get_whisperx_clinical_labels(scale, measures, content_dict, json_response) - - else: - json_response, transcript = run_vosk(filepath, language, transcribe_interval) - return json_response, transcript diff --git a/openwillis/measures/audio/speech_transcribe_whisper.py b/openwillis/measures/audio/speech_transcribe_whisper.py new file mode 100644 index 0000000..ed434c8 --- /dev/null +++ b/openwillis/measures/audio/speech_transcribe_whisper.py @@ -0,0 +1,149 @@ +# author: Vijay Yadav +# website: http://www.bklynhlth.com + +# import the required packages + +import numpy as np +import pandas as pd +import os +import json +import logging + +from pydub import AudioSegment +from openwillis.measures.audio.util import util as ut +from openwillis.measures.audio.util import transcribe_util as tutil + +logging.basicConfig(level=logging.INFO) +logger=logging.getLogger() + + +def get_config(): + """ + ------------------------------------------------------------------------------------------------------ + + Load the configuration settings for the speech transcription. + + Parameters: + ........... + None + + Returns: + ........... + measures : dict + A dictionary containing the configuration settings. + + ------------------------------------------------------------------------------------------------------ + """ + #Loading json config + dir_name = os.path.dirname(os.path.abspath(__file__)) + measure_path = os.path.abspath(os.path.join(dir_name, 'config/speech.json')) + + file = open(measure_path) + measures = json.load(file) + return measures + +def read_kwargs(kwargs): + """ + ------------------------------------------------------------------------------------------------------ + + Reads keyword arguments and returns a dictionary containing input parameters. + + Parameters: + ........... + kwargs : dict + Keyword arguments to be processed. + + Returns: + ........... + input_param: dict + A dictionary containing input parameters with their corresponding values. + + ------------------------------------------------------------------------------------------------------ + """ + input_param = {} + input_param['model'] = kwargs.get('model', 'tiny') + input_param['language'] = kwargs.get('language', 'en') + + input_param['context'] = kwargs.get('context', '') + input_param['max_speakers'] = kwargs.get('max_speakers', None) + input_param['min_speakers'] = kwargs.get('min_speakers', None) + + input_param['hf_token'] = kwargs.get('hf_token', '') + input_param['del_model'] = kwargs.get('del_model', False) #Temp filter + input_param['infra_model'] = kwargs.get('infra_model', [True, None, None]) #Temp filter + + return input_param + +def run_whisperx(filepath, input_param): + """ + ------------------------------------------------------------------------------------------------------ + + Transcribe audio data using the WhisperX model. + + Parameters: + ........... + filepath : str + The path to the audio file to be transcribed. + input_param : dict + A dictionary containing input parameters + + Returns: + ........... + json_response : JSON Object + A transcription response object in JSON format + transcript : str + The transcript of the recording. + + ------------------------------------------------------------------------------------------------------ + """ + json_response = json.dumps({}) + transcript = '' + + if os.path.exists(filepath)== False or input_param['hf_token'] == '': + return json_response, transcript + + from openwillis.measures.audio.util import whisperx_util as wutil #import in-case of model=whisperx + json_response, transcript = wutil.get_whisperx_diariazation(filepath, input_param) + + if str(json_response) != '{}': + json_response = tutil.replace_whisperx_speaker_labels(json_response, ['SPEAKER_00', 'SPEAKER_01'], + ['speaker0', 'speaker1']) + return json_response, transcript + + +def speech_transcription_whisper(filepath, **kwargs): + """ + ------------------------------------------------------------------------------------------------------ + + Speech transcription function that transcribes an audio file using whisperx. + + Parameters: + ........... + filepath : str + The path to the audio file to be transcribed. + model : str, optional + The transcription model to use ('vosk'). Default is 'vosk'. + language : str, optional + The language of the audio file (e.g. 'en-us', 'es', 'fr'). Default is 'en-us'. + transcribe_interval : list, optional + A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed. + Only applicable if model is 'vosk'. Default is an empty list. + + Returns: + ........... + json_response : JSON Object + A transcription response object in JSON format + transcript : str + The transcript of the recording. + + ------------------------------------------------------------------------------------------------------ + """ + measures = get_config() + input_param = read_kwargs(kwargs) + + json_response, transcript = run_whisperx(filepath, input_param) + if input_param['context'].lower() in measures['scale'].split(','): + + content_dict = tutil.get_whisperx_content(json_response) + json_response = tutil.get_whisperx_clinical_labels(input_param['context'], measures, content_dict, json_response) + return json_response, transcript diff --git a/openwillis/measures/audio/util/whisperx_util.py b/openwillis/measures/audio/util/whisperx_util.py index bd71064..a7bae73 100644 --- a/openwillis/measures/audio/util/whisperx_util.py +++ b/openwillis/measures/audio/util/whisperx_util.py @@ -27,7 +27,7 @@ def delete_model(model): torch.cuda.empty_cache() del model -def get_diarization(audio, align_json, HF_TOKEN, device, num_speakers, infra_model): +def get_diarization(audio, align_json, device, input_param): """ ------------------------------------------------------------------------------------------------------ @@ -38,12 +38,10 @@ def get_diarization(audio, align_json, HF_TOKEN, device, num_speakers, infra_mod audio signal object align_json: json aligned whisper transcribed output - HF_TOKEN : str - The Hugging Face token for model authentication. device : str device type - num_speakers: int - Number of speaker + input_param : dict + A dictionary containing input parameters Returns: ........... @@ -53,17 +51,23 @@ def get_diarization(audio, align_json, HF_TOKEN, device, num_speakers, infra_mod ------------------------------------------------------------------------------------------------------ """ # Assign speaker labels - if infra_model[0]: - diarize_model = whisperx.DiarizationPipeline(use_auth_token=HF_TOKEN, device=device) - + if input_param['infra_model'][0]: + diarize_model = whisperx.DiarizationPipeline(use_auth_token=input_param['hf_token'], device=device) else: - diarize_model = infra_model[2] + diarize_model = input_param['infra_model'][2] - if num_speakers == None: + if input_param['min_speakers'] == None and input_param['max_speakers'] == None: diarize_segments = diarize_model(audio) + elif input_param['min_speakers'] == None and input_param['max_speakers'] != None: + diarize_segments = diarize_model(audio, max_speakers = input_param['max_speakers']) + + elif input_param['min_speakers'] != None and input_param['max_speakers'] == None: + diarize_segments = diarize_model(audio, min_speakers= input_param['min_speakers']) + else: - diarize_segments = diarize_model(audio, min_speakers=num_speakers, max_speakers=num_speakers) + diarize_segments = diarize_model(audio, min_speakers=input_param['min_speakers'], max_speakers=input_param['max_speakers']) + json_response = whisperx.assign_word_speakers(diarize_segments, align_json) return json_response @@ -126,7 +130,7 @@ def transcribe_whisper(filepath, model, device, compute_type, batch_size, infra_ transcribe_json = model_whisp.transcribe(audio, batch_size=batch_size, language=language) return transcribe_json, audio -def get_whisperx_diariazation(filepath, HF_TOKEN, del_model, num_speakers, infra_model, language): +def get_whisperx_diariazation(filepath, input_param): """ ------------------------------------------------------------------------------------------------------ @@ -136,16 +140,8 @@ def get_whisperx_diariazation(filepath, HF_TOKEN, del_model, num_speakers, infra ........... filepath : str The path to the audio file to be transcribed. - HF_TOKEN : str - The Hugging Face token for model authentication. - del_model: boolean - Boolean indicator to delete model if low on GPU resources - num_speakers: int - Number of speaker - infra_model: list - whisper model artifacts (this is optional param: to optimize willisInfra) - language: str - language code + input_param : dict + A dictionary containing input parameters Returns: ........... @@ -158,11 +154,9 @@ def get_whisperx_diariazation(filepath, HF_TOKEN, del_model, num_speakers, infra """ device = 'cpu' compute_type = "int16" - - model = 'large-v2' batch_size = 16 - json_response = '{}' + json_response = json.dumps({}) transcript = '' try: @@ -170,16 +164,16 @@ def get_whisperx_diariazation(filepath, HF_TOKEN, del_model, num_speakers, infra device = 'cuda' compute_type = "float16" - transcribe_json, audio = transcribe_whisper(filepath, model, device, compute_type, batch_size, infra_model, language) + transcribe_json, audio = transcribe_whisper(filepath, input_param['model'], device, compute_type, batch_size, input_param['infra_model'], input_param['language']) # Align whisper output - model_a, metadata = whisperx.load_align_model(language_code=language, device=device) + model_a, metadata = whisperx.load_align_model(language_code=input_param['language'], device=device) align_json = whisperx.align(transcribe_json["segments"], model_a, metadata, audio, device, return_char_alignments=False) - if del_model: + if input_param['del_model']: delete_model(model_a) - json_response = get_diarization(audio, align_json, HF_TOKEN, device, num_speakers, infra_model) + json_response = get_diarization(audio, align_json, device, input_param) transcript = get_transcribe_summary(json_response) except Exception as e: From d0e97165d81e0ac4004ea822d0a75b4fbe5fa252 Mon Sep 17 00:00:00 2001 From: Vijay Yadav Date: Tue, 7 Nov 2023 14:17:03 -0500 Subject: [PATCH 16/21] aws support --- openwillis/__init__.py | 6 +-- openwillis/measures/api.py | 4 +- openwillis/measures/audio/__init__.py | 4 +- .../measures/audio/speech_transcribe_cloud.py | 40 ++++++++---------- .../measures/audio/util/transcribe_util.py | 41 ++++++++++--------- 5 files changed, 46 insertions(+), 49 deletions(-) diff --git a/openwillis/__init__.py b/openwillis/__init__.py index 61e0c35..724a34a 100644 --- a/openwillis/__init__.py +++ b/openwillis/__init__.py @@ -1,5 +1,5 @@ # author: Vijay Yadav -# website: http://www.bklynhlth.com +# website: http://www.brooklyn.health # import the required packages @@ -12,9 +12,9 @@ speech_characteristics, speaker_separation, speaker_separation_cloud, - speech_transcription_cloud, + speech_transcription_aws, speech_transcription_vosk, to_audio ) -__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "speech_transcription_vosk", "to_audio"] +__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_aws", "speech_transcription_vosk", "to_audio"] diff --git a/openwillis/measures/api.py b/openwillis/measures/api.py index f76db67..026b153 100644 --- a/openwillis/measures/api.py +++ b/openwillis/measures/api.py @@ -1,5 +1,5 @@ # author: Vijay Yadav -# website: http://www.bklynhlth.com +# website: http://www.brooklyn.health # import the required packages from openwillis.measures.video import ( @@ -12,7 +12,7 @@ speech_transcription_whisper, speaker_separation, speaker_separation_cloud, - speech_transcription_cloud, + speech_transcription_aws, speech_transcription_vosk ) from openwillis.measures.text import ( diff --git a/openwillis/measures/audio/__init__.py b/openwillis/measures/audio/__init__.py index f7448e6..84245fb 100644 --- a/openwillis/measures/audio/__init__.py +++ b/openwillis/measures/audio/__init__.py @@ -15,11 +15,11 @@ ) from openwillis.measures.audio.speech_transcribe_cloud import ( - speech_transcription_cloud, + speech_transcription_aws, ) from openwillis.measures.audio.speech_transcribe_vosk import ( speech_transcription_vosk, ) -__all__ = ["vocal_acoustics", "speech_transcription_whisper", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "speech_transcription_vosk"] +__all__ = ["vocal_acoustics", "speech_transcription_whisper", "speaker_separation", "speaker_separation_cloud", "speech_transcription_aws", "speech_transcription_vosk"] diff --git a/openwillis/measures/audio/speech_transcribe_cloud.py b/openwillis/measures/audio/speech_transcribe_cloud.py index 513e7a5..6156262 100644 --- a/openwillis/measures/audio/speech_transcribe_cloud.py +++ b/openwillis/measures/audio/speech_transcribe_cloud.py @@ -1,5 +1,5 @@ # author: Vijay Yadav -# website: http://www.bklynhlth.com +# website: http://www.brooklyn.health # import the required packages import os @@ -53,20 +53,19 @@ def read_kwargs(kwargs): ------------------------------------------------------------------------------------------------------ """ input_param = {} - input_param['model'] = kwargs.get('model', 'pyannote') input_param['language'] = kwargs.get('language', 'en-US') input_param['region'] = kwargs.get('region', 'us-east-1') input_param['job_name'] = kwargs.get('job_name', 'transcribe_job_01') - input_param['ShowSpeakerLabels'] = kwargs.get('ShowSpeakerLabels', True) - input_param['MaxSpeakerLabels'] = kwargs.get('MaxSpeakerLabels', 2) + input_param['speaker_labels'] = kwargs.get('speaker_labels', False) + input_param['max_speakers'] = kwargs.get('max_speakers', 2) - input_param['c_scale'] = kwargs.get('c_scale', '') + input_param['context'] = kwargs.get('context', '') input_param['access_key'] = kwargs.get('access_key', '') input_param['secret_key'] = kwargs.get('secret_key', '') return input_param -def speech_transcription_cloud(filepath, **kwargs): +def speech_transcription_aws(s3_uri, **kwargs): """ ------------------------------------------------------------------------------------------------------ @@ -74,29 +73,26 @@ def speech_transcription_cloud(filepath, **kwargs): Parameters: ........... - filepath : str + s3_uri : str The S3 uri for the recording to be transcribed. kwargs: Object - model : str, optional - The transcription model to use ('aws'). Default is 'aws'. language : str, optional The language of the audio file (e.g. 'en-US', 'en-IN'). Default is 'en-US'. region : str, optional The AWS region to use (e.g. 'us-east-1'). Only applicable if model is 'aws'. Default is 'us-east-1'. job_name : str, optional The name of the transcription job. Only applicable if model is 'aws'. Default is 'transcribe_job_01'. - ShowSpeakerLabels : boolean, optional - Show speaker labels - MaxSpeakerLabels : int, optional - Max number of speakers - c_scale : str, optional - Clinical scale to use for slicing the separated audio files, if any. access_key : str, optional AWS access key secret_key : str, optional AWS secret key - - + speaker_labels : boolean, optional + Show speaker labels + max_speakers : int, optional + Max number of speakers + context : str, optional + scale to use for slicing the separated audio files, if any. + Returns: ........... json_response : JSON Object @@ -108,10 +104,10 @@ def speech_transcription_cloud(filepath, **kwargs): """ input_param = read_kwargs(kwargs) measures = get_config() - json_response, transcript = tutil.transcribe_audio(filepath, input_param) - - if input_param['ShowSpeakerLabels'] == True and input_param['c_scale']: + json_response, transcript = tutil.transcribe_audio(s3_uri, input_param) + + if input_param['speaker_labels'] == True and input_param['context'].lower() in measures['scale'].split(','): content_dict = tutil.extract_content(json_response) - json_response = tutil.get_clinical_labels(input_param['c_scale'], measures, content_dict, json_response) - + + json_response = tutil.get_clinical_labels(input_param['context'], measures, content_dict, json_response) return json_response, transcript diff --git a/openwillis/measures/audio/util/transcribe_util.py b/openwillis/measures/audio/util/transcribe_util.py index 5e4049c..c510a2b 100644 --- a/openwillis/measures/audio/util/transcribe_util.py +++ b/openwillis/measures/audio/util/transcribe_util.py @@ -1,5 +1,5 @@ # author: Vijay Yadav -# website: http://www.bklynhlth.com +# website: http://www.brooklyn.health # import the required packages @@ -117,20 +117,19 @@ def get_clinical_labels(scale, measures, content_dict, json_response): ------------------------------------------------------------------------------------------------------ """ #Check if content is available for all the speaker - if content_dict and content_dict['speaker0'] and content_dict['speaker1']: - if scale.lower() not in measures['scale'].split(","): - return json_response + if len(content_dict) <2: + return json_response - score_string = scale.lower()+'_string' - spk1_score = sutil.match_transcript(measures[score_string], content_dict['speaker0']) - spk2_score = sutil.match_transcript(measures[score_string], content_dict['speaker1']) - - if spk1_score > spk2_score: - json_response = replace_speaker_labels(json_response, ['speaker0', 'speaker1'], ['clinician', 'participant']) + score_string = scale.lower()+'_string' + spk1_score = sutil.match_transcript(measures[score_string], content_dict['speaker0']) + spk2_score = sutil.match_transcript(measures[score_string], content_dict['speaker1']) - else: - json_response = replace_speaker_labels(json_response, ['speaker0', 'speaker1'], ['participant', 'clinician']) + if spk1_score > spk2_score: + json_response = replace_speaker_labels(json_response, ['speaker0', 'speaker1'], ['clinician', 'participant']) + else: + json_response = replace_speaker_labels(json_response, ['speaker0', 'speaker1'], ['participant', 'clinician']) + return json_response def get_job_status(transcribe, input_param): @@ -193,7 +192,7 @@ def filter_transcript_response(status, input_param): response = json.loads(read_data.read().decode('utf-8')) transcript = response['results']['transcripts'][0]['transcript'] - if input_param['ShowSpeakerLabels'] == True:#replace speaker labels with standard names + if input_param['speaker_labels'] == True:#replace speaker labels with standard names response = replace_speaker_labels(response, ['spk_0', 'spk_1'], ['speaker0', 'speaker1']) return response, transcript @@ -222,24 +221,26 @@ def transcribe_audio(s3uri, input_param): ------------------------------------------------------------------------------------------------------ """ - response = json.loads("{}") + response = json.dumps({}) + settings = {} transcript = "" try: if input_param['access_key'] and input_param['secret_key']: - transcribe = boto3.client('transcribe', region_name = input_param['region'], aws_access_key_id = input_param['access_key'], aws_secret_access_key = input_param['secret_key']) + transcribe = boto3.client('transcribe', region_name = input_param['region'], + aws_access_key_id = input_param['access_key'], + aws_secret_access_key = input_param['secret_key']) else: transcribe = boto3.client('transcribe', region_name = input_param['region']) - settings = {'ShowSpeakerLabels': input_param['ShowSpeakerLabels'], 'MaxSpeakerLabels': input_param['MaxSpeakerLabels']} + if input_param['speaker_labels'] == True and input_param['max_speakers']>=2: + settings = {'ShowSpeakerLabels': input_param['speaker_labels'], 'MaxSpeakerLabels': input_param['max_speakers']} + transcribe.start_transcription_job( TranscriptionJobName=input_param['job_name'], Media={'MediaFileUri': s3uri}, - - #IdentifyMultipleLanguages=True, LanguageCode=input_param['language'], - Settings=settings - ) + Settings=settings) status = get_job_status(transcribe, input_param) if status['TranscriptionJob']['TranscriptionJobStatus'] == 'COMPLETED': From c0427309d75283313fbb5fa319f45182c1364d07 Mon Sep 17 00:00:00 2001 From: Vijay Yadav Date: Wed, 8 Nov 2023 12:52:34 -0500 Subject: [PATCH 17/21] speaker separation --- openwillis/__init__.py | 6 ++-- openwillis/measures/api.py | 4 +-- openwillis/measures/audio/__init__.py | 4 +-- ...ration.py => speech_separation_nlabels.py} | 35 +++++++------------ openwillis/measures/commons/common.py | 12 +++---- 5 files changed, 26 insertions(+), 35 deletions(-) rename openwillis/measures/audio/{speech_separation.py => speech_separation_nlabels.py} (83%) diff --git a/openwillis/__init__.py b/openwillis/__init__.py index 724a34a..157d244 100644 --- a/openwillis/__init__.py +++ b/openwillis/__init__.py @@ -1,5 +1,5 @@ # author: Vijay Yadav -# website: http://www.brooklyn.health +# website: http://www.bklynhlth.com # import the required packages @@ -10,11 +10,11 @@ vocal_acoustics, speech_transcription_whisper, speech_characteristics, - speaker_separation, + speaker_separation_nolabels, speaker_separation_cloud, speech_transcription_aws, speech_transcription_vosk, to_audio ) -__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_aws", "speech_transcription_vosk", "to_audio"] +__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation_nolabels", "speaker_separation_cloud", "speech_transcription_aws", "speech_transcription_vosk", "to_audio"] diff --git a/openwillis/measures/api.py b/openwillis/measures/api.py index 026b153..3cbea8f 100644 --- a/openwillis/measures/api.py +++ b/openwillis/measures/api.py @@ -1,5 +1,5 @@ # author: Vijay Yadav -# website: http://www.brooklyn.health +# website: http://www.bklynhlth.com # import the required packages from openwillis.measures.video import ( @@ -10,7 +10,7 @@ from openwillis.measures.audio import ( vocal_acoustics, speech_transcription_whisper, - speaker_separation, + speaker_separation_nolabels, speaker_separation_cloud, speech_transcription_aws, speech_transcription_vosk diff --git a/openwillis/measures/audio/__init__.py b/openwillis/measures/audio/__init__.py index 84245fb..dd3096c 100644 --- a/openwillis/measures/audio/__init__.py +++ b/openwillis/measures/audio/__init__.py @@ -6,8 +6,8 @@ speech_transcription_whisper, ) -from openwillis.measures.audio.speech_separation import ( - speaker_separation, +from openwillis.measures.audio.speech_separation_nlabels import ( + speaker_separation_nolabels, ) from openwillis.measures.audio.speech_separation_cloud import ( diff --git a/openwillis/measures/audio/speech_separation.py b/openwillis/measures/audio/speech_separation_nlabels.py similarity index 83% rename from openwillis/measures/audio/speech_separation.py rename to openwillis/measures/audio/speech_separation_nlabels.py index 58d6777..62409eb 100644 --- a/openwillis/measures/audio/speech_separation.py +++ b/openwillis/measures/audio/speech_separation_nlabels.py @@ -3,13 +3,11 @@ # import the required packages from pyannote.audio import Pipeline -from openwillis.measures.audio.util import util as ut from openwillis.measures.audio.util import separation_util as sutil from pydub import AudioSegment import os import json -import shutil import pandas as pd import logging @@ -89,11 +87,10 @@ def read_kwargs(kwargs): ------------------------------------------------------------------------------------------------------ """ input_param = {} - input_param['model'] = kwargs.get('model', 'pyannote') - input_param['hf_token'] = kwargs.get('hf_token', '') - input_param['json_response'] = kwargs.get('json_response', json.loads("{}")) - input_param['c_scale'] = kwargs.get('c_scale', '') + + input_param['transcript_json'] = kwargs.get('transcript_json', json.dumps({})) + input_param['context'] = kwargs.get('context', '') return input_param def get_pyannote(input_param, file_name, filepath): @@ -122,12 +119,12 @@ def get_pyannote(input_param, file_name, filepath): """ diart_df = run_pyannote(filepath, input_param['hf_token']) - transcribe_df = pd.DataFrame(input_param['json_response']) + transcribe_df = pd.DataFrame(input_param['transcript_json']) speaker_df, speaker_count = sutil.get_speaker_identification(diart_df, transcribe_df) return speaker_df, speaker_count -def speaker_separation(filepath, **kwargs): +def speaker_separation_nolabels(filepath, **kwargs): """ ------------------------------------------------------------------------------------------------------ @@ -137,14 +134,12 @@ def speaker_separation(filepath, **kwargs): ........... filepath : str Path to the input audio file. + transcript_json : json + Speech transcription json response. hf_token : str Access token for HuggingFace to access pre-trained models. - json_response : json - Speech transcription json response. - model : str, optional - Model to use for speech diarization, default is 'pyannote'. - c_scale : str, optional - Clinical scale to use for slicing the separated audio files, if any. + context : str, optional + scale to use for slicing the separated audio files, if any. Returns: ........... @@ -160,18 +155,14 @@ def speaker_separation(filepath, **kwargs): measures = get_config() try: - if not os.path.exists(filepath) or 'json_response' not in kwargs: + if not os.path.exists(filepath) or 'transcript_json' not in kwargs: return signal_label - if input_param['model'] == 'whisperx': - input_param['c_scale'] = '' - speaker_df, speaker_count = sutil.whisperx_to_dataframe(input_param['json_response']) - else: - speaker_df, speaker_count = get_pyannote(input_param, file_name, filepath) - + speaker_df, speaker_count = get_pyannote(input_param, file_name, filepath) audio_signal = AudioSegment.from_file(file = filepath, format = "wav") + if len(speaker_df)>0 and speaker_count>1: - signal_label = sutil.generate_audio_signal(speaker_df , audio_signal, input_param['c_scale'], measures) + signal_label = sutil.generate_audio_signal(speaker_df , audio_signal, input_param['context'], measures) except Exception as e: logger.error(f'Error in diard processing: {e} & File: {filepath}') diff --git a/openwillis/measures/commons/common.py b/openwillis/measures/commons/common.py index 9199a2e..d0c4951 100644 --- a/openwillis/measures/commons/common.py +++ b/openwillis/measures/commons/common.py @@ -26,7 +26,7 @@ def make_dir(dir_name): if not os.path.exists(dir_name): os.makedirs(dir_name) -def to_audio(filepath, speaker_label, out_dir): +def to_audio(filepath, speaker_dict, output_dir): """ ------------------------------------------------------------------------------------------------------ @@ -36,22 +36,22 @@ def to_audio(filepath, speaker_label, out_dir): ---------- filepath : str The path to the input audio file. - speaker_label : dict + speaker_dict : dict A dictionary containing speaker labels as keys and corresponding segments (NumPy arrays) as values. - out_dir : str + output_dir : str The directory where the output audio files will be saved. ------------------------------------------------------------------------------------------------------ """ - make_dir(out_dir) - for key, value in speaker_label.items(): + make_dir(output_dir) + for key, value in speaker_dict.items(): file_name, _ = os.path.splitext(os.path.basename(filepath)) audio_signal = AudioSegment.from_file(file = filepath, format = "wav") spk_signal = AudioSegment(value.tobytes(), frame_rate=audio_signal.frame_rate, sample_width=audio_signal.sample_width, channels=audio_signal.channels) - output_file = os.path.join(out_dir, file_name + '_' + key + '.wav') + output_file = os.path.join(output_dir, file_name + '_' + key + '.wav') spk_signal.export(output_file, format="wav") def get_config(filepath, json_file): From 755c744246ac09464263ebd466374ffb5c4c3e87 Mon Sep 17 00:00:00 2001 From: Vijay Yadav Date: Wed, 8 Nov 2023 15:27:28 -0500 Subject: [PATCH 18/21] speaker separation update --- openwillis/__init__.py | 4 +-- openwillis/measures/api.py | 2 +- openwillis/measures/audio/__init__.py | 4 +-- ...n_cloud.py => speech_separation_labels.py} | 27 ++++++++++++++++--- .../measures/audio/util/separation_util.py | 3 ++- 5 files changed, 31 insertions(+), 9 deletions(-) rename openwillis/measures/audio/{speech_separation_cloud.py => speech_separation_labels.py} (68%) diff --git a/openwillis/__init__.py b/openwillis/__init__.py index 157d244..41d7d3b 100644 --- a/openwillis/__init__.py +++ b/openwillis/__init__.py @@ -11,10 +11,10 @@ speech_transcription_whisper, speech_characteristics, speaker_separation_nolabels, - speaker_separation_cloud, + speaker_separation_labels, speech_transcription_aws, speech_transcription_vosk, to_audio ) -__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation_nolabels", "speaker_separation_cloud", "speech_transcription_aws", "speech_transcription_vosk", "to_audio"] +__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation_nolabels", "speaker_separation_labels", "speech_transcription_aws", "speech_transcription_vosk", "to_audio"] diff --git a/openwillis/measures/api.py b/openwillis/measures/api.py index 3cbea8f..38dad3e 100644 --- a/openwillis/measures/api.py +++ b/openwillis/measures/api.py @@ -11,7 +11,7 @@ vocal_acoustics, speech_transcription_whisper, speaker_separation_nolabels, - speaker_separation_cloud, + speaker_separation_labels, speech_transcription_aws, speech_transcription_vosk ) diff --git a/openwillis/measures/audio/__init__.py b/openwillis/measures/audio/__init__.py index dd3096c..355a3bd 100644 --- a/openwillis/measures/audio/__init__.py +++ b/openwillis/measures/audio/__init__.py @@ -10,8 +10,8 @@ speaker_separation_nolabels, ) -from openwillis.measures.audio.speech_separation_cloud import ( - speaker_separation_cloud, +from openwillis.measures.audio.speech_separation_labels import ( + speaker_separation_labels, ) from openwillis.measures.audio.speech_transcribe_cloud import ( diff --git a/openwillis/measures/audio/speech_separation_cloud.py b/openwillis/measures/audio/speech_separation_labels.py similarity index 68% rename from openwillis/measures/audio/speech_separation_cloud.py rename to openwillis/measures/audio/speech_separation_labels.py index f314c4d..6d82291 100644 --- a/openwillis/measures/audio/speech_separation_cloud.py +++ b/openwillis/measures/audio/speech_separation_labels.py @@ -38,7 +38,23 @@ def get_config(): measures = json.load(file) return measures -def speaker_separation_cloud(filepath, json_response): +def is_amazon_transcribe(json_conf): + """ + ------------------------------------------------------------------------------------------------------ + This function checks if the json response object is from Amazon Transcribe. + Parameters: + ........... + json_conf: dict + JSON response object. + Returns: + ........... + bool: True if the json response object + is from Amazon Transcribe, False otherwise. + ------------------------------------------------------------------------------------------------------ + """ + return "jobName" in json_conf and "results" in json_conf + +def speaker_separation_labels(filepath, transcript_json): """ ------------------------------------------------------------------------------------------------------ @@ -48,7 +64,7 @@ def speaker_separation_cloud(filepath, json_response): ........... filepath : str Path to the input audio file. - json_response : json + transcript_json : json Speech transcription json response. Returns: @@ -66,8 +82,13 @@ def speaker_separation_cloud(filepath, json_response): return signal_label audio_signal = AudioSegment.from_file(file = filepath, format = "wav") - speaker_df, speaker_count = sutil.transcribe_response_to_dataframe(json_response) + if not is_amazon_transcribe(transcript_json): + speaker_df, speaker_count = sutil.whisperx_to_dataframe(transcript_json) + else: + speaker_df, speaker_count = sutil.transcribe_response_to_dataframe(transcript_json) + print(speaker_df) + print(speaker_count) if len(speaker_df)>0 and speaker_count>1: signal_label = sutil.generate_audio_signal(speaker_df , audio_signal, '', measures) diff --git a/openwillis/measures/audio/util/separation_util.py b/openwillis/measures/audio/util/separation_util.py index 0959a4c..9f623f7 100644 --- a/openwillis/measures/audio/util/separation_util.py +++ b/openwillis/measures/audio/util/separation_util.py @@ -322,7 +322,7 @@ def transcribe_response_to_dataframe(response): speakers = 0 df = pd.DataFrame() - if 'segments' in response: + if 'results' in response: if 'speaker_labels' in response['results']: if 'speakers' in response['results']['speaker_labels']: @@ -338,6 +338,7 @@ def transcribe_response_to_dataframe(response): df = df[df["confidence"] > 0].reset_index(drop=True) df = df[["start_time", "end_time", "confidence", "speaker_label", "content"]] + return df, speakers def extract_data(segment_info): From 5a477b3887f752648e88fe33ac29a71bf1489a05 Mon Sep 17 00:00:00 2001 From: vjbytes102 Date: Wed, 8 Nov 2023 15:36:33 -0500 Subject: [PATCH 19/21] Update speech_separation_labels --- openwillis/measures/audio/speech_separation_labels.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/openwillis/measures/audio/speech_separation_labels.py b/openwillis/measures/audio/speech_separation_labels.py index 6d82291..d251e59 100644 --- a/openwillis/measures/audio/speech_separation_labels.py +++ b/openwillis/measures/audio/speech_separation_labels.py @@ -87,8 +87,7 @@ def speaker_separation_labels(filepath, transcript_json): speaker_df, speaker_count = sutil.whisperx_to_dataframe(transcript_json) else: speaker_df, speaker_count = sutil.transcribe_response_to_dataframe(transcript_json) - print(speaker_df) - print(speaker_count) + if len(speaker_df)>0 and speaker_count>1: signal_label = sutil.generate_audio_signal(speaker_df , audio_signal, '', measures) From 2403e697eee908e0cc8f67e09f0571abcc6edddb Mon Sep 17 00:00:00 2001 From: Vijay Yadav Date: Thu, 9 Nov 2023 12:48:32 -0500 Subject: [PATCH 20/21] whisper update --- .../measures/audio/util/separation_util.py | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/openwillis/measures/audio/util/separation_util.py b/openwillis/measures/audio/util/separation_util.py index 9f623f7..fff47d4 100644 --- a/openwillis/measures/audio/util/separation_util.py +++ b/openwillis/measures/audio/util/separation_util.py @@ -359,12 +359,18 @@ def extract_data(segment_info): ------------------------------------------------------------------------------------------------------ """ - phrase = segment_info["text"] - start = segment_info["start"] - end = segment_info["end"] + phrase = segment_info.get("text", "") + start = segment_info.get("start", np.nan) - score = segment_info["words"][0]["score"] if segment_info["words"] and len(segment_info["words"]) > 0 else 0 - speaker = segment_info["speaker"] if "speaker" in segment_info else "no_speaker" + end = segment_info.get("end", np.nan) + words = segment_info.get("words", None) + + if words is not None and len(words) > 0: + score = words[0].get("score", 0) + else: + score = 0 + + speaker = segment_info.get("speaker", "no_speaker") return pd.Series([start, end, phrase, score, speaker], index=["start", "end", "phrase", "score", "speaker"]) def whisperx_to_dataframe(json_response): @@ -387,23 +393,17 @@ def whisperx_to_dataframe(json_response): ------------------------------------------------------------------------------------------------------ """ - # Initialize an empty DataFrame - df = pd.DataFrame(columns=["start", "end", "phrase", "score", "speaker"]) + df = pd.DataFrame(columns=["start_time", "end_time", "content", "confidence", "speaker_label"]) if 'segments' in json_response: - for segment_info in json_response["segments"]: - try: - - segment_df = extract_data(segment_info) - df = df.append(segment_df, ignore_index=True) - - except Exception as e: - logger.info("Some segments have no speaker labels.") - - df = df[df["score"] > 0].reset_index(drop=True) - df = df[df["speaker"] != "no_speaker"].reset_index(drop=True) - df = df.rename(columns={"start": "start_time", "end": "end_time", "score":"confidence", "speaker":"speaker_label", - "phrase":"content"}) - + segment_infos = json_response["segments"] + df = pd.DataFrame(segment_infos).apply(extract_data, axis=1) + + df = df[df["score"] > 0].reset_index(drop=True) + df = df.dropna(subset=["start", "end"]).reset_index(drop=True) + + df = df[df["speaker"] != "no_speaker"].reset_index(drop=True) + df = df.rename(columns={"start": "start_time", "end": "end_time", "score": "confidence", "speaker": "speaker_label", "phrase": "content"}) + speakers = df['speaker_label'].nunique() return df, speakers \ No newline at end of file From 6b3652695b7feba152925bbbbfef7906dd4abf27 Mon Sep 17 00:00:00 2001 From: Vijay Yadav Date: Fri, 10 Nov 2023 15:38:55 -0500 Subject: [PATCH 21/21] transcription update --- .../audio/speech_transcribe_whisper.py | 3 +- .../measures/audio/util/transcribe_util.py | 85 ++++++++++++++++++- 2 files changed, 85 insertions(+), 3 deletions(-) diff --git a/openwillis/measures/audio/speech_transcribe_whisper.py b/openwillis/measures/audio/speech_transcribe_whisper.py index ed434c8..5e5371d 100644 --- a/openwillis/measures/audio/speech_transcribe_whisper.py +++ b/openwillis/measures/audio/speech_transcribe_whisper.py @@ -106,8 +106,7 @@ def run_whisperx(filepath, input_param): json_response, transcript = wutil.get_whisperx_diariazation(filepath, input_param) if str(json_response) != '{}': - json_response = tutil.replace_whisperx_speaker_labels(json_response, ['SPEAKER_00', 'SPEAKER_01'], - ['speaker0', 'speaker1']) + json_response = tutil.filter_labels_whisper(json_response) return json_response, transcript diff --git a/openwillis/measures/audio/util/transcribe_util.py b/openwillis/measures/audio/util/transcribe_util.py index c510a2b..9b99542 100644 --- a/openwillis/measures/audio/util/transcribe_util.py +++ b/openwillis/measures/audio/util/transcribe_util.py @@ -58,6 +58,89 @@ def replace_speaker_labels(data, check_labels, speaker_labels): return data +def filter_labels_aws(data): + """ + ------------------------------------------------------------------------------------------------------ + + replaces speaker labels in AWS JSON. + + Parameters: + ........... + data : JSON + The JSON containing speaker labels. + + Returns: + ........... + data : JSON + The modified JSON with replaced speaker labels. + + ------------------------------------------------------------------------------------------------------ + """ + if 'results' in data: + speaker_labels = data['results'].get('speaker_labels', {}) + segments = speaker_labels.get('segments', []) + + for segment in segments: + seg_speaker_label = segment.get('speaker_label', '') + + if 'spk_' in seg_speaker_label: + segment['speaker_label'] = seg_speaker_label.replace("spk_", "speaker") + + seg_items = segment.get('items', []) + for seg_item in seg_items: + + seg_item_speaker_label = seg_item.get('speaker_label', '') + if 'spk_' in seg_item_speaker_label: + + seg_item['speaker_label'] = seg_item_speaker_label.replace("spk_", "speaker") + items = data['results'].get('items', []) + + for item in items: + item_speaker_label = item.get('speaker_label', '') + + if 'spk_' in item_speaker_label: + item['speaker_label'] = item_speaker_label.replace("spk_", "speaker") + + return data + +def filter_labels_whisper(data): + """ + ------------------------------------------------------------------------------------------------------ + + replaces speaker labels in Whisper JSON. + + Parameters: + ........... + data : JSON + The JSON containing speaker labels. + + Returns: + ........... + data : JSON + The modified JSON with replaced speaker labels. + + ------------------------------------------------------------------------------------------------------ + """ + for segment in data.get('segments', []): + current_speaker = segment.get('speaker', '') + + if 'SPEAKER_0' in current_speaker: + segment["speaker"] = current_speaker.replace("SPEAKER_0", "speaker") + + for word in segment["words"]: + word_speaker = word.get('speaker', '') + + if 'SPEAKER_0' in word_speaker: + word["speaker"] = word_speaker.replace("SPEAKER_0", "speaker") + + for word_segment in data.get('word_segments', []): + word_seg_speaker = word_segment.get('speaker', '') + + if 'SPEAKER_0' in word_seg_speaker: + word_segment["speaker"] = word_seg_speaker.replace("SPEAKER_0", "speaker") + + return data + def extract_content(data): """ ------------------------------------------------------------------------------------------------------ @@ -194,7 +277,7 @@ def filter_transcript_response(status, input_param): transcript = response['results']['transcripts'][0]['transcript'] if input_param['speaker_labels'] == True:#replace speaker labels with standard names - response = replace_speaker_labels(response, ['spk_0', 'spk_1'], ['speaker0', 'speaker1']) + response = filter_labels_aws(response) return response, transcript def transcribe_audio(s3uri, input_param):