From 3cabd771019da5fef4e3f7445d0f6a9466f41478 Mon Sep 17 00:00:00 2001
From: Vijay Yadav <vijayyadav@vijays-mbp.myfiosgateway.com>
Date: Mon, 30 Oct 2023 22:41:32 -0400
Subject: [PATCH 01/21] speech characteristic update

---
 openwillis/measures/text/config/text.json     |    4 +-
 openwillis/measures/text/speech_attribute.py  |   99 +-
 .../text/util/characteristics_util.py         | 1252 ++++++-----------
 3 files changed, 439 insertions(+), 916 deletions(-)

diff --git a/openwillis/measures/text/config/text.json b/openwillis/measures/text/config/text.json
index fe08030..443433d 100644
--- a/openwillis/measures/text/config/text.json
+++ b/openwillis/measures/text/config/text.json
@@ -21,8 +21,8 @@
   "word_pause": "pre_word_pause",
   "phrase_pause": "pre_phrase_pause",
   "turn_pause": "pre_turn_pause",
-  "word_pause_mean": "word_pause_length_mean",
-  "word_pause_var": "word_pause_variability",
+  "word_pause_mean": "mean_pre_word_pause",
+  "word_pause_var": "mean_pause_variability",
   "phrase_pause_mean": "phrase_pause_length_mean",
   "phrase_pause_var": "phrase_pause_variability",
   "num_syllables": "num_syllables",
diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py
index e5bd708..b3eaba5 100644
--- a/openwillis/measures/text/speech_attribute.py
+++ b/openwillis/measures/text/speech_attribute.py
@@ -9,7 +9,8 @@
 import nltk
 import numpy as np
 import pandas as pd
-from openwillis.measures.text.util import characteristics_util as cutil
+# Package-absolute import so the module also loads when imported as openwillis.measures.text.speech_attribute
+from openwillis.measures.text.util import characteristics_util as cutil
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger()
@@ -155,7 +156,7 @@ def filter_transcribe(json_conf, measures, speaker_label=None):
     return filter_json, text_list, text_indices
 
 
-def filter_whisper(json_conf, measures, speaker_label=None):
+def filter_whisper(json_conf, measures, min_turn_length, speaker_label=None):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -171,6 +172,8 @@ def filter_whisper(json_conf, measures, speaker_label=None):
         A dictionary containing the names of the columns in the output dataframes.
     speaker_label: str
         Speaker label
+    min_turn_length: int
+        minimum words required in each turn
 
     Returns:
     ...........
@@ -192,38 +195,24 @@ def filter_whisper(json_conf, measures, speaker_label=None):
     item_data = json_conf["segments"]
 
     if speaker_label is not None:
-        # filter out segments that do not have speaker labels
-        item_data = [
-            segment for segment in item_data if "speaker" in segment
-        ]
-
-    # make a dictionary to map old indices to new indices
+        item_data = [segment for segment in item_data if "speaker" in segment]
+        
     item_data = cutil.create_index_column(item_data, measures)
-
-    # phrase-split
-    phrases_idxs, phrases = cutil.filter_phrases(
-        item_data, speaker_label, measures
-    )
+    phrases_idxs, phrases = cutil.filter_phrases(item_data, speaker_label, measures) # phrase-split
 
     # turn-split
     if speaker_label is not None:
-        turns_idxs, turns = cutil.filter_turns(
-            item_data, speaker_label, measures
-        )
+        turns_idxs, turns = cutil.filter_turns(item_data, speaker_label, measures, min_turn_length)
+        
     else:
         turns_idxs, turns = [], []
-
-
+    
     # filter json to only include items with start_time and end_time
     filter_json = cutil.filter_json_transcribe(item_data, speaker_label, measures)
-
-    # extract words
-    words = [w["word"] for w in filter_json]
-
-    # entire transcript - by joining all the phrases
+    words = [value["word"] for value in filter_json]
     text = " ".join(phrases)
-
-    text_list = [words, phrases, turns, text]
+    
+    text_list = [words, turns, text]
     text_indices = [phrases_idxs, turns_idxs]
 
     return filter_json, text_list, text_indices
@@ -258,12 +247,10 @@ def filter_vosk(json_conf, measures):
     # make a dictionary to map old indices to new indices
     for i, item in enumerate(json_conf):
         item[measures["old_index"]] = i
-
-
+        
     return words, text
 
-
-def speech_characteristics(json_conf, language="en", speaker_label=None):
+def speech_characteristics(json_conf, language="en", speaker_label=None, min_turn_length=1):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -277,6 +264,8 @@ def speech_characteristics(json_conf, language="en", speaker_label=None):
         Language type
     speaker_label: str
         Speaker label
+    min_turn_length: int
+        minimum words required in each turn
 
     Returns:
     ...........
@@ -295,53 +284,31 @@ def speech_characteristics(json_conf, language="en", speaker_label=None):
 
     measures = get_config(os.path.abspath(__file__), "text.json")
     df_list = cutil.create_empty_dataframes(measures)
-
+    
     try:
-        if not isinstance(language, str):
-            raise ValueError("Language should be a string")
-        if len(language) < 2:
-            # if language is not specified, then set it to "xx"
-            # run speech characteristics as not english
-            language = "xx"
-        else:
-            language = language[:2].lower()
-
         if bool(json_conf):
-            cutil.download_nltk_resources()
+            language = "na" if language is None or len(language) < 2 else language[:2].lower()
+
+            if language == 'en':
+                cutil.download_nltk_resources()
 
             if is_whisper_transcribe(json_conf):
-                filter_json, text_list, text_indices = filter_whisper(
-                    json_conf, measures, speaker_label
-                )
+                filter_json, text_list, text_indices = filter_whisper(json_conf, measures, min_turn_length, speaker_label)
 
                 if len(filter_json) > 0 and len(text_list[-1]) > 0:
-                    df_list = cutil.process_language_feature(
-                        filter_json, df_list, text_list,
-                        text_indices, language, measures,
-                    )
+                    df_list = cutil.process_language_feature(filter_json, df_list, text_list, text_indices, language, measures)
+
             else:
                 words, text = filter_vosk(json_conf, measures)
                 if len(text) > 0:
-                    df_list = cutil.process_language_feature(
-                        json_conf, df_list, [words, [], [], text],
-                        [[], []], language, measures,
-                    )
-            
+                    df_list = cutil.process_language_feature(json_conf, df_list, [words, [], text], [[], []], language, measures)
+        
+        
     except Exception as e:
         logger.error(f"Error in Speech Characteristics {e}")
 
     finally:
-        # if word_df is empty, then add a row of NaNs
-        if df_list[0].empty:
-            df_list[0].loc[0] = np.nan
-        # if phrase_df is empty, then add a row of NaNs
-        if df_list[1].empty:
-            df_list[1].loc[0] = np.nan
-        # if turn_df is empty, then add a row of NaNs
-        if df_list[2].empty:
-            df_list[2].loc[0] = np.nan
-        # if summ_df is empty, then add a row of NaNs
-        if df_list[3].empty:
-            df_list[3].loc[0] = np.nan
-
-        return df_list
+        for df in df_list:
+            if df.empty:
+                df.loc[0] = np.nan  # placeholder row so an empty output frame still has one row
+    return df_list
diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py
index dcefbe6..4d939bb 100644
--- a/openwillis/measures/text/util/characteristics_util.py
+++ b/openwillis/measures/text/util/characteristics_util.py
@@ -70,29 +70,6 @@ def create_empty_dataframes(measures):
         ]
     )
 
-    phrase_df = pd.DataFrame(
-        columns=[
-            measures["phrase_pause"],
-            measures["phrase_minutes"],
-            measures["phrase_words"],
-            measures["word_rate"],
-            measures["syllable_rate"],
-            measures["pause_rate"],
-            measures["pause_var"],
-            measures["pause_meandur"],
-            measures["speech_percentage"],
-            measures["speech_noun"],
-            measures["speech_verb"],
-            measures["speech_adj"],
-            measures["speech_pronoun"],
-            measures["pos"],
-            measures["neg"],
-            measures["neu"],
-            measures["compound"],
-            measures["speech_mattr"],
-        ]
-    )
-
     turn_df = pd.DataFrame(
         columns=[
             measures["turn_pause"],
@@ -123,11 +100,8 @@ def create_empty_dataframes(measures):
             measures["speech_words"],
             measures["word_rate"],
             measures["syllable_rate"],
-            measures["pause_rate"],
             measures["word_pause_mean"],
             measures["word_pause_var"],
-            measures["phrase_pause_mean"],
-            measures["phrase_pause_var"],
             measures["speech_percentage"],
             measures["speech_noun"],
             measures["speech_verb"],
@@ -147,301 +121,57 @@ def create_empty_dataframes(measures):
         ]
     )
 
-    return word_df, phrase_df, turn_df, summ_df
+    return word_df, turn_df, summ_df
 
-
-def filter_speaker_phrase(item_data, speaker_label, phrases_idxs, phrases):
-    """
-    ------------------------------------------------------------------------------------------------------
-    This function updates the phrases list
-        to only include the speaker label provided.
-    Parameters:
-    ...........
-    item_data: dict
-        JSON response object.
-    speaker_label: str
-        Speaker label
-    phrases_idxs: list
-        A list of tuples containing
-            the start and end indices of the phrases in the JSON object.
-    phrases: list
-        A list of phrases extracted from the JSON object.
-    Returns:
-    ...........
-    phrases_idxs: list
-        A list of tuples containing
-            the start and end indices of the phrases in the JSON object.
-    phrases: list
-        A list of phrases extracted from the JSON object.
-    ------------------------------------------------------------------------------------------------------
+def create_index_column(item_data, measures):
     """
-    phrases_idxs2 = []
-    phrases2 = []
-    for i, phrase in enumerate(phrases_idxs):
-        try:
-            start_idx = phrase[0]
-            if item_data[start_idx].get("speaker_label", "") == speaker_label:
-                phrases_idxs2.append(phrase)
-                phrases2.append(phrases[i])
-        except Exception as e:
-            logger.error(f"Error in phrase-split for speaker {speaker_label}: {e}")
-            continue
-
-    return phrases_idxs2, phrases2
-
+    This function creates an index column in the JSON response object.
 
-def filter_speaker_turn(item_data, speaker_label, turns_idxs, turns):
-    """
-    ------------------------------------------------------------------------------------------------------
-    
-    This function updates the turns list
-        to only include the speaker label provided.
     Parameters:
-    ...........
     item_data: dict
         JSON response object.
-    speaker_label: str
-        Speaker label
-    turns_idxs: list
-        A list of tuples containing
-            the start and end indices of the turns in the JSON object.
-    turns: list
-        A list of turns extracted from the JSON object.
-    Returns:
-    ...........
-    turns_idxs: list
-        A list of tuples containing
-            the start and end indices of the turns in the JSON object.
-    turns: list
-        A list of turns extracted from the JSON object.
-    ------------------------------------------------------------------------------------------------------
-    """
-    start_idx = 0
-    for i, item in enumerate(item_data):
-        try:
-            if (
-                i > 0
-                and item.get("speaker_label", "") == speaker_label
-                and item_data[i - 1].get("speaker_label", "") != speaker_label
-            ):
-                start_idx = i
-            elif (
-                i > 0
-                and item.get("speaker_label", "") != speaker_label
-                and item_data[i - 1].get("speaker_label", "") == speaker_label
-            ):
-                turns_idxs.append((start_idx, i - 1))
-                # create turns texts
-                turns.append(
-                    " ".join(
-                        [
-                            item["alternatives"][0]["content"]
-                            for item in item_data[start_idx:i]
-                        ]
-                    )
-                )
-        except Exception as e:
-            logger.error(f"Error in turn-split for speaker {speaker_label}: {e}")
-            continue
-
-    # if the last item is the speaker label
-    if start_idx not in [item[0] for item in turns_idxs]:
-        turns_idxs.append((start_idx, len(item_data) - 1))
-        turns.append(
-            " ".join(
-                [
-                    item["alternatives"][0]["content"]
-                    for item in item_data[start_idx:]
-                ]
-            )
-        )
-    return turns_idxs, turns
-
+    measures: dict
+        A dictionary containing the names of the columns in the output dataframes.
 
-def filter_speaker(item_data, speaker_label, turns_idxs, turns, phrases_idxs, phrases):
-    """
-    ------------------------------------------------------------------------------------------------------
-    This function updates the turns and phrases lists
-        to only include the speaker label provided.
-    Parameters:
-    ...........
-    item_data: dict
-        JSON response object.
-    speaker_label: str
-        Speaker label
-    turns_idxs: list
-        A list of tuples containing
-            the start and end indices of the turns in the JSON object.
-    turns: list
-        A list of turns extracted from the JSON object.
-    phrases_idxs: list
-        A list of tuples containing
-            the start and end indices of the phrases in the JSON object.
-    phrases: list
-        A list of phrases extracted from the JSON object.
     Returns:
-    ...........
-    turns_idxs: list
-        A list of tuples containing
-            the start and end indices of the turns in the JSON object.
-    turns: list
-        A list of turns extracted from the JSON object.
-    phrases_idxs: list
-        A list of tuples containing
-            the start and end indices of the phrases in the JSON object.
-    phrases: list
-        A list of phrases extracted from the JSON object.
-    Raises:
-    ...........
-        ValueError: If the speaker label is not found in the json response object.
-    ------------------------------------------------------------------------------------------------------
+    item_data: dict
+        The updated JSON response object.
     """
+    index = 0
+    for item in item_data:
+        for word in item.get("words", []):
+            word[measures["old_index"]] = index
+            index += 1
 
-    speaker_labels = [
-        item["speaker_label"] for item
-        in item_data if "speaker_label" in item
-    ]
-
-    if speaker_label not in speaker_labels:
-        raise ValueError(
-            f"Speaker label {speaker_label} "
-            "not found in the json response object."
-        )
-
-    # phrase-split for the speaker label
-    phrases_idxs, phrases = filter_speaker_phrase(
-        item_data, speaker_label, phrases_idxs, phrases
-    )
-
-    # turn-split for the speaker label
-    turns_idxs, turns = filter_speaker_turn(
-        item_data, speaker_label, turns_idxs, turns
-    )
-
-    return turns_idxs, turns, phrases_idxs, phrases
-
+    return item_data
 
-def phrase_split(text):
+def download_nltk_resources():
     """
     ------------------------------------------------------------------------------------------------------
-    This function splits the input text into phrases.
-    Parameters:
-    ...........
-    text: str
-        The input text.
-    Returns:
-    ...........
-    phrases: list
-        A list of phrases extracted from the input text.
-    phrases_idxs: list
-        A list of tuples containing
-            the start and end indices of the phrases in the input text.
-    ------------------------------------------------------------------------------------------------------
-    """
-    phrases = nltk.tokenize.sent_tokenize(text)
-    phrases_idxs = []
-
-    start_idx = 0
-    for phrase in phrases:
-        end_idx = start_idx + len(phrase.split()) - 1
-        phrases_idxs.append((start_idx, end_idx))
-        start_idx = end_idx + 1
 
-    return phrases, phrases_idxs
-
-
-def filter_turns(item_data, speaker_label, measures):
-    """
-    ------------------------------------------------------------------------------------------------------
-    
-    This function updates the turns list
-        to only include the speaker label provided.
+    This function downloads the
+     required NLTK resources for processing text data.
 
     Parameters:
     ...........
-    item_data: dict
-        JSON response object.
-    speaker_label: str
-        Speaker label
-    measures: dict
-        A dictionary containing the names of the columns in the output dataframes.
+    None
 
     Returns:
     ...........
-    turns_idxs: list
-        A list of tuples containing
-            the start and end indices of the turns in the JSON object.
-    turns: list
-        A list of turns extracted from the JSON object.
-
-    Raises:
-    ...........
-        ValueError: If the speaker label is not found in the json response object.
+    None
 
     ------------------------------------------------------------------------------------------------------
     """
+    try:
+        nltk.data.find("tokenizers/punkt")
+    except LookupError:
+        nltk.download("punkt")
 
-    speaker_labels = [
-        item["speaker"] for item
-        in item_data if "speaker" in item
-    ]
-
-    if speaker_label not in speaker_labels:
-        raise ValueError(
-            f"Speaker label {speaker_label} "
-            "not found in the json response object."
-        )
-    
-    turns_idxs, turns = [], []
-
-    start_idx = 0
-    start_idx2 = 0
-    for i, item in enumerate(item_data):
-        try:
-            if (
-                i > 0
-                and item.get("speaker", "") == speaker_label
-                and item_data[i - 1].get("speaker", "") != speaker_label
-            ):
-                start_idx = i
-                start_idx2 = item["words"][0][measures["old_index"]]
-            elif (
-                i > 0
-                and item.get("speaker", "") != speaker_label
-                and item_data[i - 1].get("speaker", "") == speaker_label
-            ):
-                end_idx = i-1
-                end_idx2 = item["words"][-1][measures["old_index"]]
-                turns_idxs.append((start_idx2, end_idx2))
-                # create turns texts
-                turns.append(
-                    " ".join(
-                        [
-                            item["text"]
-                            for item in item_data[start_idx:(end_idx+1)]
-                        ]
-                    )
-                )
-        except Exception as e:
-            logger.error(f"Error in turn-split for speaker {speaker_label}: {e}")
-            continue
-
-    # if the last item is the speaker label
-    if start_idx not in [item[0] for item in turns_idxs]:
-        end_idx2 = item_data[-1]["words"][-1][measures["old_index"]]
-        turns_idxs.append((start_idx2, end_idx2))
-        turns.append(
-            " ".join(
-                [
-                    item["text"]
-                    for item in item_data[start_idx:]
-                ]
-            )
-        )
-
-    return turns_idxs, turns
-
-
+    try:
+        nltk.data.find("taggers/averaged_perceptron_tagger")
+    except LookupError:
+        nltk.download("averaged_perceptron_tagger")
+
 def filter_phrases(item_data, speaker_label, measures):
     """
     ------------------------------------------------------------------------------------------------------
@@ -489,40 +219,75 @@ def filter_phrases(item_data, speaker_label, measures):
             logger.error(f"Failed to filter phrases: {e}")
     return phrases_idxs, phrases
 
-
-def create_index_column(item_data, measures):
+def filter_turns(item_data, speaker_label, measures, min_turn_length):
     """
     ------------------------------------------------------------------------------------------------------
-
-    This function creates an index column in the JSON response object.
+    
+    This function updates the turns list
+        to only include the speaker label provided.
 
     Parameters:
     ...........
     item_data: dict
         JSON response object.
+    speaker_label: str
+        Speaker label
     measures: dict
         A dictionary containing the names of the columns in the output dataframes.
+    min_turn_length: int
+        minimum words required in each turn
 
     Returns:
     ...........
-    item_data: dict
-        The updated JSON response object.
+    turns_idxs: list
+        A list of tuples containing
+            the start and end indices of the turns in the JSON object.
+    turns: list
+        A list of turns extracted from the JSON object.
+
+    Notes:
+    ...........
+        No exception is raised when speaker_label never occurs; both returned lists are then empty.
 
     ------------------------------------------------------------------------------------------------------
     """
-    i = 0
-    i_p = 0
-    while True:
-        for j, word in enumerate(item_data[i_p]["words"]):
-            item_data[i_p]["words"][j][measures["old_index"]] = i
-            i += 1
-        
-        i_p += 1
-        if i_p >= len(item_data):
-            break
+    turns_idxs, turns = [], []
+    current_turn = None
+
+    for item in item_data:
+        try:
+            # segments without a speaker tag neither extend nor close a turn
+            if "speaker" in item:
+                if item["speaker"] == speaker_label:
+                    current_turn = [item] if current_turn is None else current_turn + [item]
+
+                else:
+                    if current_turn is not None:
+                        # another speaker starts: close the turn collected so far
+                        start_idx2 = current_turn[0]["words"][0][measures["old_index"]]
+                        end_idx2 = current_turn[-1]["words"][-1][measures["old_index"]]
+                        turn_text = " ".join(item["text"] for item in current_turn)
+                        # split() ignores leading/repeated spaces, so only real words are counted
+                        if len(turn_text.split()) >= min_turn_length:
+                            turns_idxs.append((start_idx2, end_idx2))
+                            turns.append(turn_text)
+
+                        current_turn = None
+
+        except Exception as e:
+            logger.error(f"Error in turn calculation {e}")
     
-    return item_data
+    # the transcript may end while the target speaker is still talking
+    if current_turn is not None:
+        start_idx2 = current_turn[0]["words"][0][measures["old_index"]]
+        end_idx2 = current_turn[-1]["words"][-1][measures["old_index"]]
+        turn_text = " ".join(item["text"] for item in current_turn)
+
+        if len(turn_text.split()) >= min_turn_length:
+            turns_idxs.append((start_idx2, end_idx2))
+            turns.append(turn_text)
 
+    return turns_idxs, turns
 
 def pause_calculation(filter_json, measures):
     """
@@ -546,15 +311,13 @@ def pause_calculation(filter_json, measures):
     """
     for i, item in enumerate(filter_json):
         if i > 0:
-            item[measures["pause"]] = float(item["start"]) - float(
-                filter_json[i - 1]["end"]
-            )
+            item[measures["pause"]] = float(item["start"]) - float(filter_json[i - 1]["end"])
+        
         else:
             item[measures["pause"]] = np.nan
     
     return filter_json
 
-
 def filter_json_transcribe(item_data, speaker_label, measures):
     """
     ------------------------------------------------------------------------------------------------------
@@ -584,352 +347,90 @@ def filter_json_transcribe(item_data, speaker_label, measures):
             
             speaker = item["speaker"]
             words = item["words"]
-    
-            # update speaker labels
-            for j, w in enumerate(words):
+            
+            for j, w in enumerate(words):# update speaker labels
                 words[j]["speaker"] = speaker
             
             item_data2 += words
         except Exception as e:
             logger.error(f"Failed to filter word: {e}")
     
-    filter_json = [
-        item for item in item_data2
-        if "start" in item and "end" in item
-    ]
-
-    # calculate time difference between each word
-    filter_json = pause_calculation(filter_json, measures)
+    filter_json = [item for item in item_data2 if "start" in item and "end" in item]
+    filter_json = pause_calculation(filter_json, measures) # calculate time difference between each word
 
     if speaker_label is not None:
-        filter_json = [
-            item
-            for item in filter_json
-            if item.get("speaker", "") == speaker_label
-        ]
-
+        filter_json = [item for item in filter_json if item.get("speaker", "") == speaker_label]
     return filter_json
 
-
-def download_nltk_resources():
+def get_num_of_syllables(text):
     """
     ------------------------------------------------------------------------------------------------------
 
-    This function downloads the
-     required NLTK resources for processing text data.
+    This function calculates the number of syllables in the input text.
 
     Parameters:
     ...........
-    None
+    text: str
+        The input text.
 
     Returns:
     ...........
-    None
+    syllable_count: int
+        The number of syllables in the input text.
 
-    ------------------------------------------------------------------------------------------------------
+    ---------------------------------------------------------------------------------------
     """
-    try:
-        nltk.data.find("tokenizers/punkt")
-    except LookupError:
-        nltk.download("punkt")
 
-    try:
-        nltk.data.find("averaged_perceptron_tagger")
-    except LookupError:
-        nltk.download("averaged_perceptron_tagger")
-
-
-def get_tag(json_conf, tag_dict, measures):
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    This function performs part-of-speech
-     tagging on the input text using NLTK, and returns an updated
-     json_conf list with the part-of-speech tags.
-
-    Parameters:
-    ...........
-    json_conf: list
-        JSON response object.
-    tag_dict: dict
-        A dictionary mapping the NLTK tags to more readable tags.
-    measures: dict
-        A dictionary containing the names of the columns in the output dataframes.
-
-    Returns:
-    ...........
-    json_conf: list
-        The updated json_conf list.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-    if len(json_conf) <= 0:
-        return json_conf
-
-    if "alternatives" not in json_conf[0].keys():
-        # local vosk transcriber
-        word_list = [word["word"] for word in json_conf if "word" in word]
-    else:
-        # aws transcriber
-        word_list = [item["alternatives"][0]["content"] for item in json_conf]
-
-    tag_list = nltk.pos_tag(word_list)
-
-    for i, tag in enumerate(tag_list):
-        if tag[1] in tag_dict.keys():
-            json_conf[i][measures["tag"]] = tag_dict[tag[1]]
-        else:
-            json_conf[i][measures["tag"]] = "Other"
+    syllable_tokenizer = nltk.tokenize.SyllableTokenizer()
 
-    return json_conf
+    # remove punctuation
+    punctuation = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~"  # "\\]" replaces the invalid escape "\]"; same characters
+    syllables = [syllable_tokenizer.tokenize(token) for token in nltk.word_tokenize(text) if token not in punctuation]
+    # count the number of syllables in each word
+    syllable_count = sum(len(token) for token in syllables)
 
+    return syllable_count
 
-def get_part_of_speech(df, tags, measures, index=0):
+def get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures):
     """
     ------------------------------------------------------------------------------------------------------
 
-    This function calculates the proportions of verbs,
-     pronouns, adjectives, and nouns in the
-     transcribed text, and adds them to the output dataframe df.
+    This function calculates various pause-related speech characteristic
+        features at the word level and adds them to the output dataframe word_df.
 
     Parameters:
     ...........
-    df: pandas dataframe
-        A dataframe containing the speech characteristics of the input text.
-    tags: list
-        A list of part-of-speech tags for the input text.
+    word_df: pandas dataframe
+        A dataframe containing word summary information
+    df_diff: pandas dataframe
+        A dataframe containing the word-level information
+            from the JSON response.
+    word_list: list
+        List of transcribed text at the word level.
+    phrase_index: list
+        A list containing the indices of the first and last word
+            in each phrase or turn.
     measures: dict
         A dictionary containing the names of the columns in the output dataframes.
-    index: int
-        The index of the row in the output dataframe df.
 
     Returns:
     ...........
-    df: pandas dataframe
-        The updated df dataframe.
+    word_df: pandas dataframe
+        The updated word_df dataframe.
 
     ------------------------------------------------------------------------------------------------------
     """
-    if len(tags) == 0:
-        return df
+    phrase_starts = [pindex[0] for pindex in phrase_index]
 
-    df.loc[index, measures["speech_noun"]] = (
-        100 * len(tags[tags == "Noun"]) / len(tags)
-    )
-    df.loc[index, measures["speech_verb"]] = (
-        100 * len(tags[tags == "Verb"]) / len(tags)
-    )
-    df.loc[index, measures["speech_adj"]] = (
-        100 * len(tags[tags == "Adjective"]) / len(tags)
-    )
-    df.loc[index, measures["speech_pronoun"]] = (
-        100 * len(tags[tags == "Pronoun"]) / len(tags)
+    word_df[measures["word_pause"]] = df_diff[measures["pause"]].where(
+        ~df_diff[measures["old_index"]].isin(phrase_starts), np.nan
     )
 
-    return df
-
-
-def get_tag_summ(json_conf, df_list, text_indices, measures):
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    This function calculates the proportions of verbs,
-     pronouns, adjectives, and nouns in the
-     transcribed text, and adds them to the output dataframe summ_df.
-
-    Parameters:
-    ...........
-    json_conf: list
-        JSON response object.
-    df_list: list
-        List of pandas dataframes.
-            word_df, phrase_df, turn_df, summ_df
-    text_indices: list
-        List of indices for text_list.
-            for phrases and turns.
-    measures: dict
-        A dictionary containing the names of the columns in the output dataframes.
-
-    Returns:
-    ...........
-    df_list: list
-        List of updated pandas dataframes.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-
-    word_df, phrase_df, turn_df, summ_df = df_list
-    phrase_index, turn_index = text_indices
-
-    df_conf = pd.DataFrame(json_conf)
-
-    # word-level analysis
-    word_df[measures["part_of_speech"]] = df_conf[measures["tag"]]
-
-    # phrase-level analysis
-    for j, pindex in enumerate(phrase_index):
-        prange = range(pindex[0], pindex[1] + 1)
-        phrase_tags = df_conf.loc[df_conf[measures["old_index"]].isin(prange), measures["tag"]]
-
-        phrase_df = get_part_of_speech(phrase_df, phrase_tags, measures, j)
-
-    # turn-level analysis
-    for j, uindex in enumerate(turn_index):
-        urange = range(uindex[0], uindex[1] + 1)
-        turn_tags = df_conf.loc[df_conf[measures["old_index"]].isin(urange), measures["tag"]]
-
-        turn_df = get_part_of_speech(turn_df, turn_tags, measures, j)
-
-    # file-level analysis
-    summ_df = get_part_of_speech(summ_df, df_conf[measures["tag"]], measures)
-
-    df_list = [word_df, phrase_df, turn_df, summ_df]
-
-    return df_list
-
-
-def get_mattr(text):
-    """
-    ------------------------------------------------------------------------------------------------------
-    This function calculates the Moving Average Type-Token Ratio (MATTR)
-     of the input text using the
-     LexicalRichness library.
-
-    Parameters:
-    ...........
-    text : str
-        The input text to be analyzed.
-
-    Returns:
-    ...........
-    mattr : float
-        The calculated MATTR value.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-    word = nltk.word_tokenize(text)
-    filter_punc = list(value for value in word if value not in [".", "!", "?"])
-    filter_punc = " ".join(filter_punc)
-    mattr = np.nan
-
-    lex_richness = LexicalRichness(filter_punc)
-    if lex_richness.words > 0:
-        mattr = lex_richness.mattr(window_size=lex_richness.words)
-
-    return mattr
-
-
-def get_sentiment(df_list, text_list, measures):
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    This function calculates the sentiment scores of the input text using
-     VADER, and adds them to the output dataframe summ_df.
-
-    Parameters:
-    ...........
-    df_list: list
-        List of pandas dataframes.
-            word_df, phrase_df, turn_df, summ_df
-    text_list: list
-        List of transcribed text.
-            split into words, phrases, turns, and full text.
-    measures: dict
-        A dictionary containing the names of the columns in the output dataframes.
-
-    Returns:
-    ...........
-    df_list: list
-        List of updated pandas dataframes.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-    word_df, phrase_df, turn_df, summ_df = df_list
-    word_list, phrase_list, turn_list, full_text = text_list
-
-    sentiment = SentimentIntensityAnalyzer()
-
-    # column names
-    cols = [
-        measures["neg"],
-        measures["neu"],
-        measures["pos"],
-        measures["compound"],
-        measures["speech_mattr"],
+    # calculate the number of syllables in each word from the word list
+    word_df[measures["num_syllables"]] = [
+        get_num_of_syllables(word) for word in word_list
     ]
-
-    # word-level analysis
-    for idx, w in enumerate(word_list):
-        try:
-            sentiment_dict = sentiment.polarity_scores(w)
-
-            word_df.loc[idx, cols[:-1]] = list(sentiment_dict.values())
-        except Exception as e:
-            logger.error(f"Error in sentiment analysis for word {w}: {e}")
-            continue
-
-    # phrase-level analysis
-    for idx, p in enumerate(phrase_list):
-        try:
-            sentiment_dict = sentiment.polarity_scores(p)
-            mattr = get_mattr(p)
-
-            phrase_df.loc[idx, cols] = list(sentiment_dict.values()) + [mattr]
-        except Exception as e:
-            logger.error(f"Error in sentiment analysis for phrase {p}: {e}")
-            continue
-
-    # turn-level analysis
-    for idx, u in enumerate(turn_list):
-        try:
-            sentiment_dict = sentiment.polarity_scores(u)
-            mattr = get_mattr(u)
-
-            turn_df.loc[idx, cols] = list(sentiment_dict.values()) + [mattr]
-        except Exception as e:
-            logger.error(f"Error in sentiment analysis for turn {u}: {e}")
-            continue
-
-    # file-level analysis
-    sentiment_dict = sentiment.polarity_scores(full_text)
-    mattr = get_mattr(full_text)
-
-    summ_df.loc[0, cols] = list(sentiment_dict.values()) + [mattr]
-
-    df_list = [word_df, phrase_df, turn_df, summ_df]
-
-    return df_list
-
-
-def get_num_of_syllables(text):
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    This function calculates the number of syllables in the input text.
-
-    Parameters:
-    ...........
-    text: str
-        The input text.
-
-    Returns:
-    ...........
-    syllable_count: int
-        The number of syllables in the input text.
-
-    ---------------------------------------------------------------------------------------
-    """
-
-    syllable_tokenizer = nltk.tokenize.SyllableTokenizer()
-
-    # remove punctuation
-    punctuation = "!\"#$%&()*+,-./:;<=>?@[\]^_`{|}~"
-    syllables = [syllable_tokenizer.tokenize(token) for token in nltk.word_tokenize(text) if token not in punctuation]
-    # count the number of syllables in each word
-    syllable_count = sum([len(token) for token in syllables])
-
-    return syllable_count
-
+    return word_df
 
 def process_pause_feature(df_diff, df, text_level, index_list, time_index, level_name, measures):
     """
@@ -1017,203 +518,7 @@ def process_pause_feature(df_diff, df, text_level, index_list, time_index, level
 
     df[measures["pause_rate"]] = df[measures["word_rate"]]
 
-    return df
-
-
-def update_summ_df(
-    df_diff, summ_df, full_text, time_index, word_df, phrase_df, turn_df, measures
-):
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    This function calculates various pause-related speech characteristic
-     features at the file level and adds them to the output dataframe summ_df.
-
-    Parameters:
-    ...........
-    df_diff: pandas dataframe
-        A dataframe containing the word-level information
-         from the JSON response.
-    summ_df: pandas dataframe
-        A dataframe containing the speech characteristics of the input text.
-    time_index: list
-        A list containing the names of the columns in json
-         that contain the start and end times of each word.
-    word_df: pandas dataframe
-        A dataframe containing word summary information
-    phrase_df: pandas dataframe
-        A dataframe containing phrase summary information
-    turn_df: pandas dataframe
-        A dataframe containing turn summary information
-    measures: dict
-        A dictionary containing the names of the columns in the output dataframes.
-
-    Returns:
-    ...........
-    summ_df: pandas dataframe
-        The updated summ_df dataframe.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-    if len(phrase_df) > 0:
-        speech_minutes = phrase_df[measures["phrase_minutes"]].sum()
-    else:
-        speech_minutes = (float(df_diff.iloc[-1][time_index[1]]) - float(df_diff.iloc[0][time_index[0]])) / 60
-
-    summ_df[measures["speech_minutes"]] = [speech_minutes]
-
-    summ_df[measures["speech_words"]] = len(df_diff)
-    if speech_minutes > 0:
-        summ_df[measures["word_rate"]] = (
-            summ_df[measures["speech_words"]] / summ_df[measures["speech_minutes"]]
-        )
-        summ_df[measures["syllable_rate"]] = (
-            get_num_of_syllables(full_text) / summ_df[measures["speech_minutes"]]
-        )
-        summ_df[measures["speech_percentage"]] = 100 * (
-        1
-        - df_diff.loc[1:, measures["pause"]].sum()
-        / (60 * summ_df[measures["speech_minutes"]])
-    )
-
-    summ_df[measures["pause_rate"]] = summ_df[measures["word_rate"]]
-    
-    if len(word_df[measures["word_pause"]]) > 1:
-        summ_df[measures["word_pause_mean"]] = word_df[measures["word_pause"]].mean(
-            skipna=True
-        )
-        summ_df[measures["word_pause_var"]] = word_df[measures["word_pause"]].var(
-            skipna=True
-        )
-    
-    if len(phrase_df[measures["phrase_pause"]]) > 1:
-        summ_df[measures["phrase_pause_mean"]] = phrase_df[measures["phrase_pause"]].mean(
-            skipna=True
-        )
-        summ_df[measures["phrase_pause_var"]] = phrase_df[measures["phrase_pause"]].var(
-            skipna=True
-        )
-    
-    if len(turn_df) > 0:
-        summ_df[measures["num_turns"]] = len(turn_df)
-        summ_df[measures["turn_minutes_mean"]] = turn_df[
-            measures["turn_minutes"]
-        ].mean(skipna=True)
-        summ_df[measures["turn_words_mean"]] = turn_df[
-            measures["turn_words"]
-        ].mean(skipna=True)
-        summ_df[measures["turn_pause_mean"]] = turn_df[
-            measures["turn_pause"]
-        ].mean(skipna=True)
-        summ_df["num_one_word_turns"] = len(
-            turn_df[turn_df[measures["turn_words"]] == 1]
-        )
-        summ_df[measures["num_interrupts"]] = sum(turn_df[measures["interrupt_flag"]])
-
-    return summ_df
-
-
-def get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures):
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    This function calculates various pause-related speech characteristic
-        features at the word level and adds them to the output dataframe word_df.
-
-    Parameters:
-    ...........
-    word_df: pandas dataframe
-        A dataframe containing word summary information
-    df_diff: pandas dataframe
-        A dataframe containing the word-level information
-            from the JSON response.
-    word_list: list
-        List of transcribed text at the word level.
-    phrase_index: list
-        A list containing the indices of the first and last word
-            in each phrase or turn.
-    measures: dict
-        A dictionary containing the names of the columns in the output dataframes.
-
-    Returns:
-    ...........
-    word_df: pandas dataframe
-        The updated word_df dataframe.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-    phrase_starts = [pindex[0] for pindex in phrase_index]
-
-    word_df[measures["word_pause"]] = df_diff[measures["pause"]].where(
-        ~df_diff[measures["old_index"]].isin(phrase_starts), np.nan
-    )
-
-    # calculate the number of syllables in each word from the word list
-    word_df[measures["num_syllables"]] = [
-        get_num_of_syllables(word) for word in word_list
-    ]
-    return word_df
-
-
-def get_pause_feature_phrase(phrase_df, df_diff, phrase_list, phrase_index, turn_index, time_index, measures):
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    This function calculates various pause-related speech characteristic
-        features at the phrase level and adds them to the output dataframe phrase_df.
-
-    Parameters:
-    ...........
-    phrase_df: pandas dataframe
-        A dataframe containing phrase summary information
-    df_diff: pandas dataframe
-        A dataframe containing the word-level information
-            from the JSON response.
-    phrase_list: list
-        List of transcribed text at the phrase level.
-    phrase_index: list
-        A list containing the indices of the first and last word
-            in each phrase
-    turn_index: list
-        A list containing the indices of the first and last word
-            in each turn.
-    time_index: list
-        A list containing the names of the columns in json that contain
-            the start and end times of each word.
-    measures: dict
-        A dictionary containing the names of the columns in the output dataframes.
-
-    Returns:
-    ...........
-    phrase_df: pandas dataframe
-        The updated phrase_df dataframe.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-    phrase_starts = [pindex[0] for pindex in phrase_index]
-
-    df_diff_phrase = df_diff[
-        df_diff[measures["old_index"]].isin(phrase_starts)
-    ]  # get the rows corresponding to the start of each phrase
-
-    if len(turn_index) > 0:
-        turn_starts = [
-            uindex[0] for uindex in turn_index
-        ]  # get the start index of each turn
-        phrase_df[measures["phrase_pause"]] = df_diff_phrase[measures["pause"]].where(
-            ~df_diff_phrase[measures["old_index"]].isin(turn_starts), np.nan
-        )
-    else:
-        phrase_df[measures["phrase_pause"]] = df_diff_phrase[measures["pause"]]
-
-    phrase_df = phrase_df.reset_index(drop=True)
-
-    phrase_df = process_pause_feature(
-        df_diff, phrase_df, phrase_list, phrase_index, time_index, measures["phrase"], measures
-    )
-
-    return phrase_df
-
+    return df
 
 def get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, measures):
     """
@@ -1271,6 +576,64 @@ def get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index,
 
     return turn_df
 
+def update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, measures):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    This function calculates various pause-related speech characteristic
+     features at the file level and adds them to the output dataframe summ_df.
+
+    Parameters:
+    ...........
+    df_diff: pandas dataframe
+        A dataframe containing the word-level information
+         from the JSON response.
+    summ_df: pandas dataframe
+        A dataframe containing the speech characteristics of the input text.
+    time_index: list
+        A list containing the names of the columns in json
+         that contain the start and end times of each word.
+    word_df: pandas dataframe
+        A dataframe containing word summary information
+    turn_df: pandas dataframe
+        A dataframe containing turn summary information
+    measures: dict
+        A dictionary containing the names of the columns in the output dataframes.
+
+    Returns:
+    ...........
+    summ_df: pandas dataframe
+        The updated summ_df dataframe.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    speech_minutes = (float(df_diff.iloc[-1][time_index[1]]) - float(df_diff.iloc[0][time_index[0]])) / 60
+    summ_df[measures["speech_minutes"]] = [speech_minutes]
+    
+    summ_df[measures["speech_words"]] = len(df_diff)
+    if speech_minutes > 0:
+        
+        summ_df[measures["word_rate"]] = (summ_df[measures["speech_words"]] / summ_df[measures["speech_minutes"]])
+        summ_df[measures["syllable_rate"]] = (get_num_of_syllables(full_text) / summ_df[measures["speech_minutes"]])
+        
+        summ_df[measures["speech_percentage"]] = 100 * (
+        1 - df_diff.loc[1:, measures["pause"]].sum()/ (60 * summ_df[measures["speech_minutes"]]))
+
+    if len(word_df[measures["word_pause"]]) > 1:
+        summ_df[measures["word_pause_mean"]] = word_df[measures["word_pause"]].mean(skipna=True)
+        summ_df[measures["word_pause_var"]] = word_df[measures["word_pause"]].var(skipna=True)
+    
+    if len(turn_df) > 0:
+        summ_df[measures["num_turns"]] = len(turn_df)
+        summ_df[measures["turn_minutes_mean"]] = turn_df[measures["turn_minutes"]].mean(skipna=True)
+        
+        summ_df[measures["turn_words_mean"]] = turn_df[measures["turn_words"]].mean(skipna=True)
+        summ_df[measures["turn_pause_mean"]] = turn_df[measures["turn_pause"]].mean(skipna=True)
+        
+        summ_df["num_one_word_turns"] = len(turn_df[turn_df[measures["turn_words"]] == 1])
+        summ_df[measures["num_interrupts"]] = sum(turn_df[measures["interrupt_flag"]])
+
+    return summ_df
 
 def get_pause_feature(json_conf, df_list, text_list, text_indices, measures):
     """
@@ -1303,54 +666,249 @@ def get_pause_feature(json_conf, df_list, text_list, text_indices, measures):
 
     ------------------------------------------------------------------------------------------------------
     """
-    # Check if json_conf is empty
     if len(json_conf) <= 0:
         return df_list
 
-    word_df, phrase_df, turn_df, summ_df = df_list
-    word_list, phrase_list, turn_list, full_text = text_list
+    word_df, turn_df, summ_df = df_list
+    word_list, turn_list, full_text = text_list
     phrase_index, turn_index = text_indices
 
-    # Convert json_conf to a pandas DataFrame
     df_diff = pd.DataFrame(json_conf)
-
     time_index = ["start", "end"]
 
-    # Calculate the pause time between
-    # each word and add the results to pause_list
+    # Calculate the pause between consecutive words, unless the pause column already exists
     if measures["pause"] not in df_diff.columns:
-        df_diff[measures["pause"]] = df_diff[time_index[0]].astype(float) - df_diff[
-            time_index[1]
-        ].astype(float).shift(1)
+        df_diff[measures["pause"]] = df_diff[time_index[0]].astype(float) - df_diff[time_index[1]].astype(float).shift(1)
 
     # word-level analysis
     word_df = get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures)
 
-    # phrase-level analysis
-    phrase_df = get_pause_feature_phrase(
-        phrase_df, df_diff, phrase_list, phrase_index, turn_index, time_index, measures
-    )
-
     # turn-level analysis
     if len(turn_index) > 0:
-        turn_df = get_pause_feature_turn(
-            turn_df, df_diff, turn_list, turn_index, time_index, measures
-        )
+        turn_df = get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, measures)
 
     # file-level analysis
-    summ_df = update_summ_df(
-        df_diff, summ_df, full_text, time_index, word_df, phrase_df, turn_df, measures
-    )
+    summ_df = update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, measures)
+    df_feature = [word_df, turn_df, summ_df]
+    return df_feature
 
-    df_feature = [word_df, phrase_df, turn_df, summ_df]
+def get_mattr(text):
+    """
+    ------------------------------------------------------------------------------------------------------
+    This function calculates the Moving Average Type-Token Ratio (MATTR)
+     of the input text using the
+     LexicalRichness library.
 
-    return df_feature
+    Parameters:
+    ...........
+    text : str
+        The input text to be analyzed.
+
+    Returns:
+    ...........
+    mattr : float
+        The calculated MATTR value.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    word = nltk.word_tokenize(text)
+    filter_punc = list(value for value in word if value not in [".", "!", "?"])
+    filter_punc = " ".join(filter_punc)
+    mattr = np.nan
+
+    lex_richness = LexicalRichness(filter_punc)
+    if lex_richness.words > 0:
+        mattr = lex_richness.mattr(window_size=lex_richness.words)
+
+    return mattr
+
+def get_tag(json_conf, tag_dict, measures):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    This function performs part-of-speech
+     tagging on the input text using NLTK, and returns an updated
+     json_conf list with the part-of-speech tags.
+
+    Parameters:
+    ...........
+    json_conf: list
+        JSON response object.
+    tag_dict: dict
+        A dictionary mapping the NLTK tags to more readable tags.
+    measures: dict
+        A dictionary containing the names of the columns in the output dataframes.
+
+    Returns:
+    ...........
+    json_conf: list
+        The updated json_conf list.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    if len(json_conf) <= 0:
+        return json_conf
+
+    if "alternatives" not in json_conf[0].keys():
+        # local vosk transcriber
+        word_list = [word["word"] for word in json_conf if "word" in word]
+    else:
+        # aws transcriber
+        word_list = [item["alternatives"][0]["content"] for item in json_conf]
+
+    tag_list = nltk.pos_tag(word_list)
+    for i, tag in enumerate(tag_list):
+        
+        if tag[1] in tag_dict.keys():
+            json_conf[i][measures["tag"]] = tag_dict[tag[1]]
+        
+        else:
+            json_conf[i][measures["tag"]] = "Other"
+    return json_conf
+
+def get_part_of_speech(df, tags, measures, index=0):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    This function calculates the proportions of verbs,
+     pronouns, adjectives, and nouns in the
+     transcribed text, and adds them to the output dataframe df.
+
+    Parameters:
+    ...........
+    df: pandas dataframe
+        A dataframe containing the speech characteristics of the input text.
+    tags: pandas series
+        A list of part-of-speech tags for the input text.
+    measures: dict
+        A dictionary containing the names of the columns in the output dataframes.
+    index: int
+        The index of the row in the output dataframe df.
+
+    Returns:
+    ...........
+    df: pandas dataframe
+        The updated df dataframe.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    if len(tags) == 0:
+        return df
+
+    df.loc[index, measures["speech_noun"]] = (100 * len(tags[tags == "Noun"]) / len(tags))
+    df.loc[index, measures["speech_verb"]] = (100 * len(tags[tags == "Verb"]) / len(tags))
+    
+    df.loc[index, measures["speech_adj"]] = (100 * len(tags[tags == "Adjective"]) / len(tags))
+    df.loc[index, measures["speech_pronoun"]] = (100 * len(tags[tags == "Pronoun"]) / len(tags))
+    return df
+
+def get_tag_summ(json_conf, df_list, text_indices, measures):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    This function calculates the proportions of verbs,
+     pronouns, adjectives, and nouns in the
+     transcribed text, and adds them to the output dataframe summ_df.
+
+    Parameters:
+    ...........
+    json_conf: list
+        JSON response object.
+    df_list: list
+        List of pandas dataframes.
+            word_df, turn_df, summ_df
+    text_indices: list
+        List of indices for text_list.
+            for phrases and turns.
+    measures: dict
+        A dictionary containing the names of the columns in the output dataframes.
+
+    Returns:
+    ...........
+    df_list: list
+        List of updated pandas dataframes.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+
+    word_df, turn_df, summ_df = df_list
+    _ , turn_index = text_indices
+
+    df_conf = pd.DataFrame(json_conf)
+    word_df[measures["part_of_speech"]] = df_conf[measures["tag"]]
+    
+    # turn-level analysis
+    for j, uindex in enumerate(turn_index):
+        urange = range(uindex[0], uindex[1] + 1)
+        
+        turn_tags = df_conf.loc[df_conf[measures["old_index"]].isin(urange), measures["tag"]]
+        turn_df = get_part_of_speech(turn_df, turn_tags, measures, j)
+
+    # file-level analysis
+    summ_df = get_part_of_speech(summ_df, df_conf[measures["tag"]], measures)
+    df_list = [word_df, turn_df, summ_df]
+    return df_list
+
+def get_sentiment(df_list, text_list, measures):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    This function calculates VADER sentiment scores for each word, each turn and the
+     full text (plus MATTR for turns and the full text) and adds them to the dataframes.
+
+    Parameters:
+    ...........
+    df_list: list
+        List of pandas dataframes.
+            word_df, turn_df, summ_df
+    text_list: list
+        List of transcribed text.
+            split into words, turns, and full text.
+    measures: dict
+        A dictionary containing the names of the columns in the output dataframes.
+
+    Returns:
+    ...........
+    df_list: list
+        List of updated pandas dataframes.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    word_df, turn_df, summ_df = df_list
+    word_list, turn_list, full_text = text_list
+
+    sentiment = SentimentIntensityAnalyzer()
+    cols = [measures["neg"], measures["neu"], measures["pos"], measures["compound"], measures["speech_mattr"]]
+
+    for idx, w in enumerate(word_list):
+        try:
+            
+            sentiment_dict = sentiment.polarity_scores(w)
+            word_df.loc[idx, cols[:-1]] = list(sentiment_dict.values())
+            
+        except Exception as e:
+            logger.error(f"Error in sentiment analysis: {e}")
+            continue
 
+    for idx, u in enumerate(turn_list):
+        try:
+            
+            sentiment_dict = sentiment.polarity_scores(u)
+            mattr = get_mattr(u)
+            turn_df.loc[idx, cols] = list(sentiment_dict.values()) + [mattr]
+            
+        except Exception as e:
+            logger.error(f"Error in sentiment analysis: {e}")
+            continue
+            
+    sentiment_dict = sentiment.polarity_scores(full_text)
+    mattr = get_mattr(full_text)
 
-def process_language_feature(
-    json_conf, df_list, text_list,
-    text_indices, language, measures,
-):
+    summ_df.loc[0, cols] = list(sentiment_dict.values()) + [mattr]
+    df_list = [word_df, turn_df, summ_df]
+    return df_list
+
+def process_language_feature(json_conf, df_list, text_list, text_indices, language, measures):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -1395,6 +953,4 @@ def process_language_feature(
         df_list = get_tag_summ(json_conf, df_list, text_indices, measures)
 
         df_list = get_sentiment(df_list, text_list, measures)
-
-    word_df, phrase_df, turn_df, summ_df = df_list
-    return word_df, phrase_df, turn_df, summ_df
+    return df_list
\ No newline at end of file

From 552ac96a11de4b8ad27947bd2c2bb95a23ef65a7 Mon Sep 17 00:00:00 2001
From: Vijay Yadav <vijayyadav@vijays-mbp.myfiosgateway.com>
Date: Wed, 1 Nov 2023 12:52:50 -0400
Subject: [PATCH 02/21] speech characteristics update

---
 openwillis/measures/text/config/text.json     |   2 +
 openwillis/measures/text/speech_attribute.py  | 179 +++++--
 .../text/util/characteristics_util.py         | 494 +++++++++---------
 3 files changed, 381 insertions(+), 294 deletions(-)

diff --git a/openwillis/measures/text/config/text.json b/openwillis/measures/text/config/text.json
index 443433d..897ace2 100644
--- a/openwillis/measures/text/config/text.json
+++ b/openwillis/measures/text/config/text.json
@@ -33,6 +33,8 @@
   "speech_words": "speech_length_words",
   "turn_minutes": "turn_length_minutes",
   "turn_words": "turn_length_words",
+  "file_length": "file_length",
+  "speaker_percentage": "speaker_percentage",
   "word_rate": "words_per_min",
   "syllable_rate": "syllables_per_min",
   "pause_rate": "pauses_per_min",
diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py
index b3eaba5..5b44096 100644
--- a/openwillis/measures/text/speech_attribute.py
+++ b/openwillis/measures/text/speech_attribute.py
@@ -9,7 +9,7 @@
 import nltk
 import numpy as np
 import pandas as pd
-#from openwillis.measures.text.util import characteristics_util as cutil
+from openwillis.measures.text.util import characteristics_util as cutil
 from util import characteristics_util as cutil
 
 logging.basicConfig(level=logging.INFO)
@@ -83,7 +83,6 @@ def is_whisper_transcribe(json_conf):
             return True
     return False
 
-
 def filter_transcribe(json_conf, measures, speaker_label=None):
     """
     ------------------------------------------------------------------------------------------------------
@@ -115,44 +114,25 @@ def filter_transcribe(json_conf, measures, speaker_label=None):
     ------------------------------------------------------------------------------------------------------
     """
     item_data = json_conf["results"]["items"]
-
-    # make a dictionary to map old indices to new indices
-    item_data = cutil.create_index_column(item_data, measures)
+    
+    for i, item in enumerate(item_data): # create_index_column
+        item[measures["old_index"]] = i
 
     # extract text
-    text = " ".join(
-        [
-            item["alternatives"][0]["content"]
-            for item in item_data
-            if "alternatives" in item
-        ]
-    )
-
-    # phrase-split
+    text = " ".join([item["alternatives"][0]["content"] for item in item_data if "alternatives" in item])
     phrases, phrases_idxs = cutil.phrase_split(text)
 
-    # turn-split
-    turns = []
-    turns_idxs = []
-
     if speaker_label is not None:
+        turns_idxs, turns = cutil.filter_speaker_aws(item_data, speaker_label)
+    else:
+        turns_idxs, turns = [], []
 
-        turns_idxs, turns, phrases_idxs, phrases = cutil.filter_speaker(
-            item_data, speaker_label, turns_idxs, turns, phrases_idxs, phrases
-        )
-
-    # entire transcript - by joining all the phrases
     text = " ".join(phrases)
-
-    # filter json to only include items with start_time and end_time
-    filter_json = cutil.filter_json_transcribe(item_data, speaker_label, measures)
-
-    # extract words
+    filter_json = cutil.filter_json_transcribe_aws(item_data, speaker_label, measures)
     words = [word["alternatives"][0]["content"] for word in filter_json]
 
-    text_list = [words, phrases, turns, text]
+    text_list = [words, turns, text]
     text_indices = [phrases_idxs, turns_idxs]
-
     return filter_json, text_list, text_indices
 
 
@@ -250,6 +230,118 @@ def filter_vosk(json_conf, measures):
         
     return words, text
 
+def common_summary_feature(df_summ, json_data, model):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Calculate file features based on JSON data.
+
+    Parameters:
+    ...........
+    json_conf: list
+        JSON response object.
+    summ_df: pandas dataframe
+        A dataframe containing summary information on the speech
+    model: str
+        model name
+
+    Returns:
+    ...........
+    summ_df: pandas dataframe
+        A dataframe containing summary information on the speech
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    try:
+        if model == 'vosk':
+            if len(json_data) > 0 and 'end' in json_data[-1]:
+
+                last_dict = json_data[-1]
+                df_summ['file_length'] = [last_dict['end']]
+
+        else:
+            if model == 'aws':
+                json_data = json_data["results"]
+                fl_length, spk_pct = cutil.calculate_file_feature(json_data, model)
+
+            else:
+                fl_length, spk_pct = cutil.calculate_file_feature(json_data, model)
+            df_summ['file_length'] = [fl_length]
+            df_summ['speaker_percentage'] = [spk_pct]
+            
+    except Exception as e:
+        logger.error("Error in file length calculation")
+    return df_summ
+
+def process_transcript(df_list, json_conf, measures, min_turn_length, speaker_label, source, language):
+    """
+    ------------------------------------------------------------------------------------------------------
+    
+    Process transcript
+    
+    Parameters:
+    ...........
+    df_list: list, :
+        contains pandas dataframe
+    json_conf: dict
+        Transcribed json file
+    measures: dict
+        A dictionary containing the names of the columns in the output dataframes.
+    min_turn_length: int
+        minimum words required in each turn
+    speaker_label: str
+        Speaker label
+    source: str
+        model name
+    language: str
+        Language type
+    
+    Returns:
+    ...........
+    df_list: list
+        contains pandas dataframe
+    
+    ------------------------------------------------------------------------------------------------------
+    """
+    common_summary_feature(df_list[2], json_conf, source)
+
+    if source == 'whisper':
+        info = filter_whisper(json_conf, measures, min_turn_length, speaker_label)
+        
+    elif source == 'aws':
+        info = filter_transcribe(json_conf, measures, speaker_label)
+        
+    else:
+        words, text = filter_vosk(json_conf, measures)
+        info = (json_conf, [words, [], text], [[], []])
+
+    if len(info[0]) > 0 and len(info[1][-1]) > 0:
+        df_list = cutil.process_language_feature(df_list, info, language, get_time_columns(source), measures)
+    return df_list
+
+def get_time_columns(source):
+    """
+    ------------------------------------------------------------------------------------------------------
+    
+    get time columns
+    
+    Parameters:
+    ...........
+    source: str
+        model name
+    
+    Returns:
+    ...........
+    object: list
+        time index name
+        
+    ------------------------------------------------------------------------------------------------------
+    """
+    if source == 'aws':
+        return ["start_time", "end_time"]
+    else:
+        return ["start", "end"]
+
 def speech_characteristics(json_conf, language="en", speaker_label=None, min_turn_length=1):
     """
     ------------------------------------------------------------------------------------------------------
@@ -272,8 +364,6 @@ def speech_characteristics(json_conf, language="en", speaker_label=None, min_tur
     df_list: list, contains:
         word_df: pandas dataframe
             A dataframe containing word summary information
-        phrase_df: pandas dataframe
-            A dataframe containing phrase summary information
         turn_df: pandas dataframe
             A dataframe containing turn summary information
         summ_df: pandas dataframe
@@ -281,34 +371,31 @@ def speech_characteristics(json_conf, language="en", speaker_label=None, min_tur
 
     ------------------------------------------------------------------------------------------------------
     """
-
-    measures = get_config(os.path.abspath(__file__), "text.json")
-    df_list = cutil.create_empty_dataframes(measures)
-    
     try:
+        # Load configuration measures
+        measures = get_config(os.path.abspath(__file__), "text.json")
+        df_list = cutil.create_empty_dataframes(measures)
+
         if bool(json_conf):
-            language = "na" if language is None or len(language) < 2 else language[:2].lower()
+            language = language[:2].lower() if (language and len(language) >= 2) else "na"
 
             if language == 'en':
                 cutil.download_nltk_resources()
 
             if is_whisper_transcribe(json_conf):
-                filter_json, text_list, text_indices = filter_whisper(json_conf, measures, min_turn_length, speaker_label)
+                df_list = process_transcript(df_list, json_conf, measures, min_turn_length, speaker_label, 'whisper', language)
 
-                if len(filter_json) > 0 and len(text_list[-1]) > 0:
-                    df_list = cutil.process_language_feature(filter_json, df_list, text_list, text_indices, language, measures)
+            elif is_amazon_transcribe(json_conf):
+                df_list = process_transcript(df_list, json_conf, measures, min_turn_length, speaker_label, 'aws', language)
 
             else:
-                words, text = filter_vosk(json_conf, measures)
-                if len(text) > 0:
-                    df_list = cutil.process_language_feature(json_conf, df_list, [words,[],[],text],[[],[]], language, measures)
-        
-        
+                df_list = process_transcript(df_list, json_conf, measures, min_turn_length, speaker_label, 'vosk', language)
+
     except Exception as e:
         logger.error(f"Error in Speech Characteristics {e}")
 
     finally:
         for df in df_list:
             df.loc[0] = np.nan if df.empty else df.loc[0]
-    
+
     return df_list
diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py
index 4d939bb..0a60710 100644
--- a/openwillis/measures/text/util/characteristics_util.py
+++ b/openwillis/measures/text/util/characteristics_util.py
@@ -15,23 +15,8 @@
 logger = logging.getLogger()
 
 # NLTK Tag list
-TAG_DICT = {
-    "PRP": "Pronoun",
-    "PRP$": "Pronoun",
-    "VB": "Verb",
-    "VBD": "Verb",
-    "VBG": "Verb",
-    "VBN": "Verb",
-    "VBP": "Verb",
-    "VBZ": "Verb",
-    "JJ": "Adjective",
-    "JJR": "Adjective",
-    "JJS": "Adjective",
-    "NN": "Noun",
-    "NNP": "Noun",
-    "NNS": "Noun",
-}
-
+TAG_DICT = {"PRP": "Pronoun", "PRP$": "Pronoun", "VB": "Verb", "VBD": "Verb", "VBG": "Verb", "VBN": "Verb", "VBP": "Verb", 
+            "VBZ": "Verb", "JJ": "Adjective", "JJR": "Adjective", "JJS": "Adjective", "NN": "Noun", "NNP": "Noun", "NNS": "Noun"}
 
 def create_empty_dataframes(measures):
     """
@@ -46,80 +31,26 @@ def create_empty_dataframes(measures):
 
     Returns:
     ...........
-    word_df: pandas dataframe
-        A dataframe containing word summary information
-    phrase_df: pandas dataframe
-        A dataframe containing phrase summary information
-    turn_df: pandas dataframe
-        A dataframe containing turn summary information
-    summ_df: pandas dataframe
-        A dataframe containing summary information on the speech
-
-    ------------------------------------------------------------------------------------------------------
-    """
-
-    word_df = pd.DataFrame(
-        columns=[
-            measures["word_pause"],
-            measures["num_syllables"],
-            measures["part_of_speech"],
-            measures["pos"],
-            measures["neg"],
-            measures["neu"],
-            measures["compound"],
-        ]
-    )
-
-    turn_df = pd.DataFrame(
-        columns=[
-            measures["turn_pause"],
-            measures["turn_minutes"],
-            measures["turn_words"],
-            measures["word_rate"],
-            measures["syllable_rate"],
-            measures["pause_rate"],
-            measures["pause_var"],
-            measures["pause_meandur"],
-            measures["speech_percentage"],
-            measures["speech_noun"],
-            measures["speech_verb"],
-            measures["speech_adj"],
-            measures["speech_pronoun"],
-            measures["pos"],
-            measures["neg"],
-            measures["neu"],
-            measures["compound"],
-            measures["speech_mattr"],
-            measures["interrupt_flag"],
-        ]
-    )
+    tuple: pandas dataframe
+        An empty dataframe for word, turn and summary measures
+
+    ------------------------------------------------------------------------------------------------------
+    """
+
+    word_df = pd.DataFrame(columns=[measures["word_pause"], measures["num_syllables"], measures["part_of_speech"]])
+    turn_df = pd.DataFrame(columns=[measures["turn_pause"], measures["turn_minutes"], measures["turn_words"], 
+                                    measures["word_rate"], measures["syllable_rate"], measures["speech_percentage"], 
+                                    measures["pause_meandur"], measures["pause_var"], measures["pos"], measures["neg"], 
+                                    measures["neu"], measures["compound"], measures["speech_mattr"], 
+                                    measures["interrupt_flag"]])
 
     summ_df = pd.DataFrame(
-        columns=[
-            measures["speech_minutes"],
-            measures["speech_words"],
-            measures["word_rate"],
-            measures["syllable_rate"],
-            measures["word_pause_mean"],
-            measures["word_pause_var"],
-            measures["speech_percentage"],
-            measures["speech_noun"],
-            measures["speech_verb"],
-            measures["speech_adj"],
-            measures["speech_pronoun"],
-            measures["pos"],
-            measures["neg"],
-            measures["neu"],
-            measures["compound"],
-            measures["speech_mattr"],
-            measures["num_turns"],
-            measures["turn_minutes_mean"],
-            measures["turn_words_mean"],
-            measures["turn_pause_mean"],
-            measures["num_one_word_turns"],
-            measures["num_interrupts"],
-        ]
-    )
+        columns=[measures["file_length"], measures["speech_minutes"], measures["speech_words"], measures["word_rate"],
+                 measures["syllable_rate"], measures["word_pause_mean"], measures["word_pause_var"], 
+                 measures["speech_percentage"], measures["pos"], measures["neg"], measures["neu"], measures["compound"], 
+                 measures["speech_mattr"], measures["num_turns"], measures["num_one_word_turns"], measures["turn_minutes_mean"],
+                 measures["turn_words_mean"], measures["turn_pause_mean"], measures["speaker_percentage"], 
+                 measures["num_interrupts"]])
 
     return word_df, turn_df, summ_df
 
@@ -139,6 +70,7 @@ def create_index_column(item_data, measures):
     """
     index = 0
     for item in item_data:
+        
         for word in item.get("words", []):
             word[measures["old_index"]] = index
             index += 1
@@ -152,14 +84,6 @@ def download_nltk_resources():
     This function downloads the
      required NLTK resources for processing text data.
 
-    Parameters:
-    ...........
-    None
-
-    Returns:
-    ...........
-    None
-
     ------------------------------------------------------------------------------------------------------
     """
     try:
@@ -172,6 +96,151 @@ def download_nltk_resources():
     except LookupError:
         nltk.download("averaged_perceptron_tagger")
         
+def phrase_split(text):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    This function splits the input text into phrases.
+
+    Parameters:
+    ...........
+    text: str
+        The input text.
+
+    Returns:
+    ...........
+    phrases: list
+        A list of phrases extracted from the input text.
+    phrases_idxs: list
+        A list of tuples containing
+            the start and end indices of the phrases in the input text.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    phrases = nltk.tokenize.sent_tokenize(text)
+    phrases_idxs = []
+
+    start_idx = 0
+    for phrase in phrases:
+        end_idx = start_idx + len(phrase.split()) - 1
+        
+        phrases_idxs.append((start_idx, end_idx))
+        start_idx = end_idx + 1
+
+    return phrases, phrases_idxs
+
+def filter_turn_aws(item_data, speaker_label):
+    """
+    ------------------------------------------------------------------------------------------------------
+    
+    This function updates the turns list
+        to only include the speaker label provided.
+
+    Parameters:
+    ...........
+    item_data: dict
+        JSON response object.
+    speaker_label: str
+        Speaker label
+    turns_idxs: list
+        A list of tuples containing
+            the start and end indices of the turns in the JSON object.
+    turns: list
+        A list of turns extracted from the JSON object.
+
+    Returns:
+    ...........
+    turns_idxs: list
+        A list of tuples containing
+            the start and end indices of the turns in the JSON object.
+    turns: list
+        A list of turns extracted from the JSON object.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    start_idx = 0
+    turns_idxs, turns = [], []
+    for i, item in enumerate(item_data):
+        
+        try:
+            if (i > 0 and item.get("speaker_label", "") == speaker_label and item_data[i - 1].get("speaker_label", "") != speaker_label):
+                start_idx = i
+            elif (i > 0 and item.get("speaker_label", "") != speaker_label and item_data[i - 1].get("speaker_label", "") == speaker_label):
+                turns_idxs.append((start_idx, i - 1))
+                turns.append(" ".join([item["alternatives"][0]["content"] for item in item_data[start_idx:i]]))
+                
+        except Exception as e:
+            logger.error(f"Error in turn-split for speaker {speaker_label}: {e}")
+            continue
+
+    if start_idx not in [item[0] for item in turns_idxs]:
+        turns_idxs.append((start_idx, len(item_data) - 1))
+        
+        turns.append(" ".join([item["alternatives"][0]["content"] for item in item_data[start_idx:]]))
+    return turns_idxs, turns
+
+def filter_speaker_aws(item_data, speaker_label):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    This function updates the turns and phrases lists
+        to only include the speaker label provided.
+
+    Parameters:
+    ...........
+    item_data: dict
+        JSON response object.
+    speaker_label: str
+        Speaker label
+    Returns:
+    ...........
+    turns_idxs: list
+        A list of tuples containing
+            the start and end indices of the turns in the JSON object.
+    turns: list
+        A list of turns extracted from the JSON object.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+
+    speaker_labels = [item["speaker_label"] for item in item_data if "speaker_label" in item]
+
+    if speaker_label not in speaker_labels:
+        logger.error(f"Speaker label {speaker_label} not found in the json response object.")
+
+    turns_idxs, turns = filter_turn_aws(item_data, speaker_label)
+    return turns_idxs, turns
+
+def filter_json_transcribe_aws(item_data, speaker_label, measures):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    This function filters the JSON response object to only include items with start_time and end_time.
+
+    Parameters:
+    ...........
+    item_data: dict
+        JSON response object.
+    speaker_label: str
+        Speaker label
+    measures: dict
+        A dictionary containing the names of the columns in the output dataframes.
+
+    Returns:
+    ...........
+    filter_json: list
+        The updated JSON response object.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    filter_json = [item for item in item_data if "start_time" in item and "end_time" in item]
+    filter_json = pause_calculation(filter_json, measures, ['start_time', 'end_time'])
+
+    if speaker_label is not None:
+        filter_json = [item for item in filter_json if item.get("speaker_label", "") == speaker_label]
+
+    return filter_json
+        
 def filter_phrases(item_data, speaker_label, measures):
     """
     ------------------------------------------------------------------------------------------------------
@@ -245,10 +314,6 @@ def filter_turns(item_data, speaker_label, measures, min_turn_length):
     turns: list
         A list of turns extracted from the JSON object.
 
-    Raises:
-    ...........
-        ValueError: If the speaker label is not found in the json response object.
-
     ------------------------------------------------------------------------------------------------------
     """
     turns_idxs, turns = [], []
@@ -285,11 +350,11 @@ def filter_turns(item_data, speaker_label, measures, min_turn_length):
         
         if len(turn_text.split(" ")) >= min_turn_length: 
             turns_idxs.append((start_idx2, end_idx2))
+            
             turns.append(turn_text)
-
     return turns_idxs, turns
 
-def pause_calculation(filter_json, measures):
+def pause_calculation(filter_json, measures, time_index):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -311,11 +376,10 @@ def pause_calculation(filter_json, measures):
     """
     for i, item in enumerate(filter_json):
         if i > 0:
-            item[measures["pause"]] = float(item["start"]) - float(filter_json[i - 1]["end"])
+            item[measures["pause"]] = float(item[time_index[0]]) - float(filter_json[i - 1][time_index[0]])
         
         else:
             item[measures["pause"]] = np.nan
-    
     return filter_json
 
 def filter_json_transcribe(item_data, speaker_label, measures):
@@ -340,7 +404,6 @@ def filter_json_transcribe(item_data, speaker_label, measures):
 
     ------------------------------------------------------------------------------------------------------
     """
-    # phrase filtering
     item_data2 = []
     for item in item_data:
         try:
@@ -356,7 +419,7 @@ def filter_json_transcribe(item_data, speaker_label, measures):
             logger.error(f"Failed to filter word: {e}")
     
     filter_json = [item for item in item_data2 if "start" in item and "end" in item]
-    filter_json = pause_calculation(filter_json, measures) # calculate time difference between each word
+    filter_json = pause_calculation(filter_json, measures, ['start', 'end'])
 
     if speaker_label is not None:
         filter_json = [item for item in filter_json if item.get("speaker", "") == speaker_label]
@@ -382,11 +445,9 @@ def get_num_of_syllables(text):
     """
 
     syllable_tokenizer = nltk.tokenize.SyllableTokenizer()
-
-    # remove punctuation
-    punctuation = "!\"#$%&()*+,-./:;<=>?@[\]^_`{|}~"
+    punctuation = "!\"#$%&()*+,-./:;<=>?@[\]^_`{|}~" # remove punctuation
+    
     syllables = [syllable_tokenizer.tokenize(token) for token in nltk.word_tokenize(text) if token not in punctuation]
-    # count the number of syllables in each word
     syllable_count = sum([len(token) for token in syllables])
 
     return syllable_count
@@ -421,18 +482,12 @@ def get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures):
     ------------------------------------------------------------------------------------------------------
     """
     phrase_starts = [pindex[0] for pindex in phrase_index]
-
-    word_df[measures["word_pause"]] = df_diff[measures["pause"]].where(
-        ~df_diff[measures["old_index"]].isin(phrase_starts), np.nan
-    )
-
-    # calculate the number of syllables in each word from the word list
-    word_df[measures["num_syllables"]] = [
-        get_num_of_syllables(word) for word in word_list
-    ]
+    word_df[measures["word_pause"]] = df_diff[measures["pause"]].where(~df_diff[measures["old_index"]].isin(phrase_starts), np.nan)
+    
+    word_df[measures["num_syllables"]] = [get_num_of_syllables(word) for word in word_list]
     return word_df
 
-def process_pause_feature(df_diff, df, text_level, index_list, time_index, level_name, measures):
+def process_pause_feature(df_diff, df, text_level, index_list, time_index, level_name, measures, language):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -469,58 +524,47 @@ def process_pause_feature(df_diff, df, text_level, index_list, time_index, level
     """
 
     if level_name not in [measures["phrase"], measures["turn"]]:
-        logger.error(
-            f"level_name must be either {measures['phrase']} or {measures['turn']}"
-        )
+        logger.error(f"level_name must be either in phrase or turn")
         return df
 
     for j, index in enumerate(index_list):
         try:
+            
             rng = range(index[0], index[1] + 1)
             level_json = df_diff[df_diff[measures["old_index"]].isin(rng)]
-
-            # remove first pause as it is the pre_pause
+            
             pauses = level_json[measures["pause"]].values[1:]
-
-            df.loc[j, measures[f"{level_name}_minutes"]] = (
-                float(level_json.iloc[-1][time_index[1]])
-                - float(level_json.iloc[0][time_index[0]])
-            ) / 60
+            level_min_val = (float(level_json.iloc[-1][time_index[1]]) - float(level_json.iloc[0][time_index[0]])) / 60
+            
+            df.loc[j, measures[f"{level_name}_minutes"]] = level_min_val
             df.loc[j, measures[f"{level_name}_words"]] = len(level_json)
 
-            # if there is 1 pause
             if len(pauses) == 1:
                 df.loc[j, measures["pause_var"]] = 0
                 df.loc[j, measures["pause_meandur"]] = np.mean(pauses)
-            # if there are more than 1 pauses
+
             elif len(pauses) > 1:
                 df.loc[j, measures["pause_var"]] = np.var(pauses)
                 df.loc[j, measures["pause_meandur"]] = np.mean(pauses)
 
             if df.loc[j, measures[f"{level_name}_minutes"]] > 0:
-                df.loc[j, measures["speech_percentage"]] = 100 * (
-                    1 - np.sum(pauses) / (
-                        60 * df.loc[j, measures[f"{level_name}_minutes"]]
-                    )
-                )
-
-                # articulation rate
-                df.loc[j, measures["syllable_rate"]] = (
-                    get_num_of_syllables(text_level[j]) / df.loc[j, measures[f"{level_name}_minutes"]]
-                )
-
-                df.loc[j, measures["word_rate"]] = (
-                    df.loc[j, measures[f"{level_name}_words"]] / df.loc[j, measures[f"{level_name}_minutes"]]
-                )
+                speech_pct_val = 100 * (1 - np.sum(pauses) / (60 * df.loc[j, measures[f"{level_name}_minutes"]]))
+                df.loc[j, measures["speech_percentage"]] = speech_pct_val
+
+                if language == 'en':
+                    syllable_rate = (get_num_of_syllables(text_level[j]) / df.loc[j, measures[f"{level_name}_minutes"]])
+                    df.loc[j, measures["syllable_rate"]] = syllable_rate
+                
+                word_rate_val = (df.loc[j, measures[f"{level_name}_words"]] / df.loc[j, measures[f"{level_name}_minutes"]])
+                df.loc[j, measures["word_rate"]] = word_rate_val
+                
         except Exception as e:
             logger.error(f"Error in pause feature calculation for {level_name} {j}: {e}")
             continue
 
-    df[measures["pause_rate"]] = df[measures["word_rate"]]
-
     return df
 
-def get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, measures):
+def get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, measures, language):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -552,28 +596,19 @@ def get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index,
 
     ------------------------------------------------------------------------------------------------------
     """
-
     turn_starts = [uindex[0] for uindex in turn_index]
-
-    # get the rows corresponding to the start of each turn
-    df_diff_turn = df_diff[
-        df_diff[measures["old_index"]].isin(turn_starts)
-    ]
+    df_diff_turn = df_diff[df_diff[measures["old_index"]].isin(turn_starts)]
 
     turn_df[measures["turn_pause"]] = df_diff_turn[measures["pause"]]
     turn_df[measures["interrupt_flag"]] = False
-    # set pre_turn_pause to 0 if negative (due to overlapping turns)
-    # and set interrupt_flag to True
-    negative_pause = turn_df[measures["turn_pause"]] < 0
+    
+    negative_pause = turn_df[measures["turn_pause"]] <= 0
     turn_df.loc[negative_pause, measures["turn_pause"]] = 0
+    
     turn_df.loc[negative_pause, measures["interrupt_flag"]] = True
-
     turn_df = turn_df.reset_index(drop=True)
 
-    turn_df = process_pause_feature(
-        df_diff, turn_df, turn_list, turn_index, time_index, measures["turn"], measures
-    )
-
+    turn_df = process_pause_feature(df_diff, turn_df, turn_list, turn_index, time_index, measures["turn"], measures, language)
     return turn_df
 
 def update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, measures):
@@ -631,11 +666,11 @@ def update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, me
         summ_df[measures["turn_pause_mean"]] = turn_df[measures["turn_pause"]].mean(skipna=True)
         
         summ_df["num_one_word_turns"] = len(turn_df[turn_df[measures["turn_words"]] == 1])
-        summ_df[measures["num_interrupts"]] = sum(turn_df[measures["interrupt_flag"]])
+        summ_df[measures["num_interrupts"]] = len(turn_df[turn_df[measures["interrupt_flag"]]==True])
 
     return summ_df
 
-def get_pause_feature(json_conf, df_list, text_list, text_indices, measures):
+def get_pause_feature(json_conf, df_list, text_list, text_indices, measures, time_index, language):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -661,8 +696,7 @@ def get_pause_feature(json_conf, df_list, text_list, text_indices, measures):
     Returns:
     ...........
     df_feature: list
-        List of updated pandas dataframes.
-            word_df, phrase_df, turn_df, summ_df
+        List of updated pandas dataframes (word_df, turn_df and summ_df)
 
     ------------------------------------------------------------------------------------------------------
     """
@@ -674,7 +708,7 @@ def get_pause_feature(json_conf, df_list, text_list, text_indices, measures):
     phrase_index, turn_index = text_indices
 
     df_diff = pd.DataFrame(json_conf)
-    time_index = ["start", "end"]
+    time_index = [time_index[0], time_index[1]]
 
     # Calculate the pause time between; each word and add the results to pause_list
     if measures["pause"] not in df_diff.columns:
@@ -685,7 +719,7 @@ def get_pause_feature(json_conf, df_list, text_list, text_indices, measures):
 
     # turn-level analysis
     if len(turn_index) > 0:
-        turn_df = get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, measures)
+        turn_df = get_pause_feature_turn(turn_df, df_diff, turn_list, turn_index, time_index, measures, language)
 
     # file-level analysis
     summ_df = update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, measures)
@@ -750,11 +784,10 @@ def get_tag(json_conf, tag_dict, measures):
         return json_conf
 
     if "alternatives" not in json_conf[0].keys():
-        # local vosk transcriber
-        word_list = [word["word"] for word in json_conf if "word" in word]
+        word_list = [word["word"] for word in json_conf if "word" in word]# local vosk transcriber
+    
     else:
-        # aws transcriber
-        word_list = [item["alternatives"][0]["content"] for item in json_conf]
+        word_list = [item["alternatives"][0]["content"] for item in json_conf]# aws transcriber
 
     tag_list = nltk.pos_tag(word_list)
     for i, tag in enumerate(tag_list):
@@ -766,42 +799,6 @@ def get_tag(json_conf, tag_dict, measures):
             json_conf[i][measures["tag"]] = "Other"
     return json_conf
 
-def get_part_of_speech(df, tags, measures, index=0):
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    This function calculates the proportions of verbs,
-     pronouns, adjectives, and nouns in the
-     transcribed text, and adds them to the output dataframe df.
-
-    Parameters:
-    ...........
-    df: pandas dataframe
-        A dataframe containing the speech characteristics of the input text.
-    tags: list
-        A list of part-of-speech tags for the input text.
-    measures: dict
-        A dictionary containing the names of the columns in the output dataframes.
-    index: int
-        The index of the row in the output dataframe df.
-
-    Returns:
-    ...........
-    df: pandas dataframe
-        The updated df dataframe.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-    if len(tags) == 0:
-        return df
-
-    df.loc[index, measures["speech_noun"]] = (100 * len(tags[tags == "Noun"]) / len(tags))
-    df.loc[index, measures["speech_verb"]] = (100 * len(tags[tags == "Verb"]) / len(tags))
-    
-    df.loc[index, measures["speech_adj"]] = (100 * len(tags[tags == "Adjective"]) / len(tags))
-    df.loc[index, measures["speech_pronoun"]] = (100 * len(tags[tags == "Pronoun"]) / len(tags))
-    return df
-
 def get_tag_summ(json_conf, df_list, text_indices, measures):
     """
     ------------------------------------------------------------------------------------------------------
@@ -837,15 +834,6 @@ def get_tag_summ(json_conf, df_list, text_indices, measures):
     df_conf = pd.DataFrame(json_conf)
     word_df[measures["part_of_speech"]] = df_conf[measures["tag"]]
     
-    # turn-level analysis
-    for j, uindex in enumerate(turn_index):
-        urange = range(uindex[0], uindex[1] + 1)
-        
-        turn_tags = df_conf.loc[df_conf[measures["old_index"]].isin(urange), measures["tag"]]
-        turn_df = get_part_of_speech(turn_df, turn_tags, measures, j)
-
-    # file-level analysis
-    summ_df = get_part_of_speech(summ_df, df_conf[measures["tag"]], measures)
     df_list = [word_df, turn_df, summ_df]
     return df_list
 
@@ -880,16 +868,6 @@ def get_sentiment(df_list, text_list, measures):
     sentiment = SentimentIntensityAnalyzer()
     cols = [measures["neg"], measures["neu"], measures["pos"], measures["compound"], measures["speech_mattr"]]
 
-    for idx, w in enumerate(word_list):
-        try:
-            
-            sentiment_dict = sentiment.polarity_scores(w)
-            word_df.loc[idx, cols[:-1]] = list(sentiment_dict.values())
-            
-        except Exception as e:
-            logger.error(f"Error in sentiment analysis: {e}")
-            continue
-
     for idx, u in enumerate(turn_list):
         try:
             
@@ -908,25 +886,51 @@ def get_sentiment(df_list, text_list, measures):
     df_list = [word_df, turn_df, summ_df]
     return df_list
 
-def process_language_feature(json_conf, df_list, text_list, text_indices, language, measures):
+def calculate_file_feature(json_data, model):
     """
     ------------------------------------------------------------------------------------------------------
 
-    This function processes the language features from json response.
+    Calculate file features based on JSON data.
 
     Parameters:
     ...........
     json_conf: list
         JSON response object.
+
+    Returns:
+    ...........
+    tuple: A tuple containing two values - the total file length and the percentage of time spent speaking.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    speakers = ['clinician', 'speaker0']
+    
+    if model == 'aws':
+        segments = json_data.get('items', [])
+        file_length = max(float(segment.get("end_time", "0")) for segment in segments)
+
+        speaking_time = sum(float(segment.get("end_time", "0") or "0") - float(segment.get("start_time", "0") or "0")
+                           for segment in segments if segment.get("speaker_label", "") in speakers)
+    else:
+        segments = json_data.get('segments', [])
+        file_length = max(segment.get('end', 0) for segment in segments)
+        speaking_time = sum(segment['end'] - segment['start'] for segment in segments if segment.get('speaker', '') in speakers)
+
+    speaking_pct = (speaking_time / file_length) * 100
+    return file_length, speaking_pct
+
+def process_language_feature(df_list, transcribe_info, language, time_index, measures):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    This function processes the language features from json response.
+
+    Parameters:
+    ...........
     df_list: list
         List of pandas dataframes.
-         word_df, phrase_df, turn_df, summ_df
-    text_list: list
-        List of transcribed text.
-         split into words, phrases, turns, and full text.
-    text_indices: list
-        List of indices for text_list.
-         for phrases and turns.
+    transcribe_info: list
+        transcribed info
     language: str
         Language of the transcribed text.
     measures: dict
@@ -934,19 +938,13 @@ def process_language_feature(json_conf, df_list, text_list, text_indices, langua
 
     Returns:
     ...........
-    word_df: pandas dataframe
-        A dataframe containing word summary information
-    phrase_df: pandas dataframe
-        A dataframe containing phrase summary information
-    turn_df: pandas dataframe
-        A dataframe containing turn summary information
-    summ_df: pandas dataframe
-        A dataframe containing summary information on the speech
+    df_list: list
+        List of pandas dataframes (word_df, turn_df and summ_df)
 
     ------------------------------------------------------------------------------------------------------
     """
-
-    df_list = get_pause_feature(json_conf, df_list, text_list, text_indices, measures)
+    json_conf, text_list, text_indices = transcribe_info
+    df_list = get_pause_feature(json_conf, df_list, text_list, text_indices, measures, time_index, language)
 
     if language == "en":
         json_conf = get_tag(json_conf, TAG_DICT, measures)

From d9b74e4335fbf55983845447dd5a07be4b04fa76 Mon Sep 17 00:00:00 2001
From: Vijay Yadav <vijayyadav@vijays-mbp.myfiosgateway.com>
Date: Wed, 1 Nov 2023 13:12:51 -0400
Subject: [PATCH 03/21] speech update

---
 openwillis/measures/text/speech_attribute.py       | 14 ++++++++------
 .../measures/text/util/characteristics_util.py     |  9 +++++++--
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py
index 5b44096..934b016 100644
--- a/openwillis/measures/text/speech_attribute.py
+++ b/openwillis/measures/text/speech_attribute.py
@@ -10,7 +10,6 @@
 import numpy as np
 import pandas as pd
 from openwillis.measures.text.util import characteristics_util as cutil
-from util import characteristics_util as cutil
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger()
@@ -230,7 +229,7 @@ def filter_vosk(json_conf, measures):
         
     return words, text
 
-def common_summary_feature(df_summ, json_data, model):
+def common_summary_feature(df_summ, json_data, model, speaker_label):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -244,6 +243,8 @@ def common_summary_feature(df_summ, json_data, model):
         A dataframe containing summary information on the speech
     model: str
         model name
+    speaker_label: str
+        Speaker label
 
     Returns:
     ...........
@@ -262,12 +263,13 @@ def common_summary_feature(df_summ, json_data, model):
         else:
             if model == 'aws':
                 json_data = json_data["results"]
-                fl_length, spk_pct = cutil.calculate_file_feature(json_data, model)
+                fl_length, spk_pct = cutil.calculate_file_feature(json_data, model, speaker_label)
 
             else:
-                fl_length, spk_pct = cutil.calculate_file_feature(json_data, model)
+                fl_length, spk_pct = cutil.calculate_file_feature(json_data, model, speaker_label)
+            
             df_summ['file_length'] = [fl_length]
-            df_summ['speaker_percentage'] = [spk_pct]
+            df_summ['speaker_percentage'] = [spk_pct]# if speaker_label is not None else df_summ['speaker_percentage']
             
     except Exception as e:
         logger.error("Error in file length calculation")
@@ -303,7 +305,7 @@ def process_transcript(df_list, json_conf, measures, min_turn_length, speaker_la
     
     ------------------------------------------------------------------------------------------------------
     """
-    common_summary_feature(df_list[2], json_conf, source)
+    common_summary_feature(df_list[2], json_conf, source, speaker_label)
 
     if source == 'whisper':
         info = filter_whisper(json_conf, measures, min_turn_length, speaker_label)
diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py
index 0a60710..397fd3b 100644
--- a/openwillis/measures/text/util/characteristics_util.py
+++ b/openwillis/measures/text/util/characteristics_util.py
@@ -886,7 +886,7 @@ def get_sentiment(df_list, text_list, measures):
     df_list = [word_df, turn_df, summ_df]
     return df_list
 
-def calculate_file_feature(json_data, model):
+def calculate_file_feature(json_data, model, speakers):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -903,17 +903,22 @@ def calculate_file_feature(json_data, model):
 
     ------------------------------------------------------------------------------------------------------
     """
-    speakers = ['clinician', 'speaker0']
     
     if model == 'aws':
         segments = json_data.get('items', [])
         file_length = max(float(segment.get("end_time", "0")) for segment in segments)
+        
+        if speakers is None:
+            return file_length, np.NaN
 
         speaking_time = sum(float(segment.get("end_time", "0") or "0") - float(segment.get("start_time", "0") or "0")
                            for segment in segments if segment.get("speaker_label", "") in speakers)
     else:
         segments = json_data.get('segments', [])
         file_length = max(segment.get('end', 0) for segment in segments)
+        
+        if speakers is None:
+            return file_length, np.NaN
         speaking_time = sum(segment['end'] - segment['start'] for segment in segments if segment.get('speaker', '') in speakers)
 
     speaking_pct = (speaking_time / file_length) * 100

From d1ab70826ea0e1dbf56c39aad39a6dfbe1d6f7a4 Mon Sep 17 00:00:00 2001
From: vjbytes102 <vy386@nyu.edu>
Date: Wed, 1 Nov 2023 23:33:37 -0400
Subject: [PATCH 04/21] Update
 openwillis/measures/text/util/characteristics_util.py

Co-authored-by: GeorgiosEfstathiadis <54844705+GeorgeEfstathiadis@users.noreply.github.com>
---
 openwillis/measures/text/util/characteristics_util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py
index 397fd3b..d79544b 100644
--- a/openwillis/measures/text/util/characteristics_util.py
+++ b/openwillis/measures/text/util/characteristics_util.py
@@ -922,7 +922,7 @@ def calculate_file_feature(json_data, model, speakers):
         speaking_time = sum(segment['end'] - segment['start'] for segment in segments if segment.get('speaker', '') in speakers)
 
     speaking_pct = (speaking_time / file_length) * 100
-    return file_length, speaking_pct
+    return file_length/60, speaking_pct
 
 def process_language_feature(df_list, transcribe_info, language, time_index, measures):
     """

From 1bcee4331a767dc8f78097c98175b24deb6a21b5 Mon Sep 17 00:00:00 2001
From: vjbytes102 <vy386@nyu.edu>
Date: Wed, 1 Nov 2023 23:35:16 -0400
Subject: [PATCH 05/21] Update characteristics_util

Co-authored-by: GeorgiosEfstathiadis <54844705+GeorgeEfstathiadis@users.noreply.github.com>
---
 openwillis/measures/text/util/characteristics_util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py
index d79544b..91ff69a 100644
--- a/openwillis/measures/text/util/characteristics_util.py
+++ b/openwillis/measures/text/util/characteristics_util.py
@@ -376,7 +376,7 @@ def pause_calculation(filter_json, measures, time_index):
     """
     for i, item in enumerate(filter_json):
         if i > 0:
-            item[measures["pause"]] = float(item[time_index[0]]) - float(filter_json[i - 1][time_index[0]])
+            item[measures["pause"]] = float(item[time_index[0]]) - float(filter_json[i - 1][time_index[1]])
         
         else:
             item[measures["pause"]] = np.nan

From 8078d7bc8b8227d9855d346af2691892eb4fea6e Mon Sep 17 00:00:00 2001
From: vjbytes102 <vy386@nyu.edu>
Date: Wed, 1 Nov 2023 23:37:14 -0400
Subject: [PATCH 06/21] Update characteristics_util

Co-authored-by: GeorgiosEfstathiadis <54844705+GeorgeEfstathiadis@users.noreply.github.com>
---
 openwillis/measures/text/util/characteristics_util.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py
index 91ff69a..776d877 100644
--- a/openwillis/measures/text/util/characteristics_util.py
+++ b/openwillis/measures/text/util/characteristics_util.py
@@ -642,7 +642,10 @@ def update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, me
 
     ------------------------------------------------------------------------------------------------------
     """
-    speech_minutes = (float(df_diff.iloc[-1][time_index[1]]) - float(df_diff.iloc[0][time_index[0]])) / 60
+    if len(turn_df) > 0:
+        speech_minutes = turn_df[measures["turn_minutes"]].sum()
+    else:
+        speech_minutes = (float(df_diff.iloc[-1][time_index[1]]) - float(df_diff.iloc[0][time_index[0]])) / 60
     summ_df[measures["speech_minutes"]] = [speech_minutes]
     
     summ_df[measures["speech_words"]] = len(df_diff)

From 42de460474cf7d0e881e0638f87e5895381f195e Mon Sep 17 00:00:00 2001
From: vjbytes102 <vy386@nyu.edu>
Date: Wed, 1 Nov 2023 23:43:38 -0400
Subject: [PATCH 07/21] Update speech_attribute

---
 openwillis/measures/text/speech_attribute.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py
index 934b016..957b5a7 100644
--- a/openwillis/measures/text/speech_attribute.py
+++ b/openwillis/measures/text/speech_attribute.py
@@ -85,9 +85,8 @@ def is_whisper_transcribe(json_conf):
 def filter_transcribe(json_conf, measures, speaker_label=None):
     """
     ------------------------------------------------------------------------------------------------------
-    This function extracts the text and filters the JSON data
-     for Amazon Transcribe json response objects.
-     Also, it filters the JSON data based on the speaker label if provided.
+    This function extracts the text and filters the JSON data for Amazon Transcribe json response objects.
+    Also, it filters the JSON data based on the speaker label if provided.
     Parameters:
     ...........
     json_conf: dict
@@ -103,13 +102,9 @@ def filter_transcribe(json_conf, measures, speaker_label=None):
         only the relevant data for processing.
     text_list: list
         List of transcribed text.
-         split into words, phrases, turns, and full text.
+         split into words, turns, and full text.
     text_indices: list
         List of indices for text_list.
-         for phrases and turns.
-    Raises:
-    ...........
-    ValueError: If the speaker label is not found in the json response object.
     ------------------------------------------------------------------------------------------------------
     """
     item_data = json_conf["results"]["items"]

From 3be0122ff3fb4afb694af8446f03edfdbaa62e00 Mon Sep 17 00:00:00 2001
From: vjbytes102 <vy386@nyu.edu>
Date: Wed, 1 Nov 2023 23:49:33 -0400
Subject: [PATCH 08/21] Update characteristics_util

---
 .../measures/text/util/characteristics_util.py    | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py
index 776d877..8c413ac 100644
--- a/openwillis/measures/text/util/characteristics_util.py
+++ b/openwillis/measures/text/util/characteristics_util.py
@@ -142,12 +142,7 @@ def filter_turn_aws(item_data, speaker_label):
         JSON response object.
     speaker_label: str
         Speaker label
-    turns_idxs: list
-        A list of tuples containing
-            the start and end indices of the turns in the JSON object.
-    turns: list
-        A list of turns extracted from the JSON object.
-
+        
     Returns:
     ...........
     turns_idxs: list
@@ -899,6 +894,10 @@ def calculate_file_feature(json_data, model, speakers):
     ...........
     json_conf: list
         JSON response object.
+    model: str
+        model name (vosk/aws/whisper)
+    speakers: str
+        speakers label
 
     Returns:
     ...........
@@ -939,6 +938,8 @@ def process_language_feature(df_list, transcribe_info, language, time_index, mea
         List of pandas dataframes.
     transcribe_info: list
         transcribed info
+    time_index: list
+        timepoint index (start/end)
     language: str
         Language of the transcribed text.
     measures: dict
@@ -959,4 +960,4 @@ def process_language_feature(df_list, transcribe_info, language, time_index, mea
         df_list = get_tag_summ(json_conf, df_list, text_indices, measures)
 
         df_list = get_sentiment(df_list, text_list, measures)
-    return df_list
\ No newline at end of file
+    return df_list

From 97f237dc296db20d3480d2efa5a8b0177fc4f25f Mon Sep 17 00:00:00 2001
From: Vijay Yadav <vijayyadav@vijays-mbp.myfiosgateway.com>
Date: Thu, 2 Nov 2023 17:10:16 -0400
Subject: [PATCH 09/21] speech update

---
 openwillis/measures/text/speech_attribute.py  |  31 ++-
 .../text/util/characteristics_util.py         | 204 ++++++------------
 2 files changed, 74 insertions(+), 161 deletions(-)

diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py
index 957b5a7..674b20e 100644
--- a/openwillis/measures/text/speech_attribute.py
+++ b/openwillis/measures/text/speech_attribute.py
@@ -78,11 +78,13 @@ def is_whisper_transcribe(json_conf):
     ------------------------------------------------------------------------------------------------------
     """
     if "segments" in json_conf:
-        if "words" in json_conf["segments"][0]:
-            return True
+        if len(json_conf["segments"])>0:
+
+            if "words" in json_conf["segments"][0]:
+                return True
     return False
 
-def filter_transcribe(json_conf, measures, speaker_label=None):
+def filter_transcribe(json_conf, measures, min_turn_length, speaker_label=None):
     """
     ------------------------------------------------------------------------------------------------------
     This function extracts the text and filters the JSON data for Amazon Transcribe json response objects.
@@ -93,6 +95,8 @@ def filter_transcribe(json_conf, measures, speaker_label=None):
         aws transcribe json response.
     measures: dict
         A dictionary containing the names of the columns in the output dataframes.
+    min_turn_length: int
+        minimum words required in each turn
     speaker_label: str
         Speaker label
     Returns:
@@ -114,20 +118,18 @@ def filter_transcribe(json_conf, measures, speaker_label=None):
 
     # extract text
     text = " ".join([item["alternatives"][0]["content"] for item in item_data if "alternatives" in item])
-    phrases, phrases_idxs = cutil.phrase_split(text)
 
     if speaker_label is not None:
-        turns_idxs, turns = cutil.filter_speaker_aws(item_data, speaker_label)
+        turns_idxs, turns = cutil.filter_speaker_aws(item_data, min_turn_length, speaker_label)
     else:
         turns_idxs, turns = [], []
 
-    text = " ".join(phrases)
+    text = " ".join(turns)
     filter_json = cutil.filter_json_transcribe_aws(item_data, speaker_label, measures)
     words = [word["alternatives"][0]["content"] for word in filter_json]
 
     text_list = [words, turns, text]
-    text_indices = [phrases_idxs, turns_idxs]
-    return filter_json, text_list, text_indices
+    return filter_json, text_list, turns_idxs
 
 
 def filter_whisper(json_conf, measures, min_turn_length, speaker_label=None):
@@ -172,9 +174,6 @@ def filter_whisper(json_conf, measures, min_turn_length, speaker_label=None):
         item_data = [segment for segment in item_data if "speaker" in segment]
         
     item_data = cutil.create_index_column(item_data, measures)
-    phrases_idxs, phrases = cutil.filter_phrases(item_data, speaker_label, measures) # phrase-split
-
-    # turn-split
     if speaker_label is not None:
         turns_idxs, turns = cutil.filter_turns(item_data, speaker_label, measures, min_turn_length)
         
@@ -184,12 +183,10 @@ def filter_whisper(json_conf, measures, min_turn_length, speaker_label=None):
     # filter json to only include items with start_time and end_time
     filter_json = cutil.filter_json_transcribe(item_data, speaker_label, measures)
     words = [value["word"] for value in filter_json]
-    text = " ".join(phrases)
+    text = " ".join(turns)
     
     text_list = [words, turns, text]
-    text_indices = [phrases_idxs, turns_idxs]
-
-    return filter_json, text_list, text_indices
+    return filter_json, text_list, turns_idxs
 
 
 def filter_vosk(json_conf, measures):
@@ -306,11 +303,11 @@ def process_transcript(df_list, json_conf, measures, min_turn_length, speaker_la
         info = filter_whisper(json_conf, measures, min_turn_length, speaker_label)
         
     elif source == 'aws':
-        info = filter_transcribe(json_conf, measures, speaker_label)
+        info = filter_transcribe(json_conf, measures, min_turn_length, speaker_label)
         
     else:
         words, text = filter_vosk(json_conf, measures)
-        info = (json_conf, [words, [], text], [[], []])
+        info = (json_conf, [words, [], text], [])
 
     if len(info[0]) > 0 and len(info[1][-1]) > 0:
         df_list = cutil.process_language_feature(df_list, info, language, get_time_columns(source), measures)
diff --git a/openwillis/measures/text/util/characteristics_util.py b/openwillis/measures/text/util/characteristics_util.py
index 8c413ac..d09d8c1 100644
--- a/openwillis/measures/text/util/characteristics_util.py
+++ b/openwillis/measures/text/util/characteristics_util.py
@@ -95,41 +95,8 @@ def download_nltk_resources():
         nltk.data.find("averaged_perceptron_tagger")
     except LookupError:
         nltk.download("averaged_perceptron_tagger")
-        
-def phrase_split(text):
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    This function splits the input text into phrases.
-
-    Parameters:
-    ...........
-    text: str
-        The input text.
-
-    Returns:
-    ...........
-    phrases: list
-        A list of phrases extracted from the input text.
-    phrases_idxs: list
-        A list of tuples containing
-            the start and end indices of the phrases in the input text.
 
-    ------------------------------------------------------------------------------------------------------
-    """
-    phrases = nltk.tokenize.sent_tokenize(text)
-    phrases_idxs = []
-
-    start_idx = 0
-    for phrase in phrases:
-        end_idx = start_idx + len(phrase.split()) - 1
-        
-        phrases_idxs.append((start_idx, end_idx))
-        start_idx = end_idx + 1
-
-    return phrases, phrases_idxs
-
-def filter_turn_aws(item_data, speaker_label):
+def filter_turn_aws(item_data, min_turn_length, speaker_label):
     """
     ------------------------------------------------------------------------------------------------------
     
@@ -140,6 +107,8 @@ def filter_turn_aws(item_data, speaker_label):
     ...........
     item_data: dict
         JSON response object.
+    min_turn_length: int
+        minimum words required in each turn
     speaker_label: str
         Speaker label
         
@@ -160,38 +129,46 @@ def filter_turn_aws(item_data, speaker_label):
         try:
             if (i > 0 and item.get("speaker_label", "") == speaker_label and item_data[i - 1].get("speaker_label", "") != speaker_label):
                 start_idx = i
+            
             elif (i > 0 and item.get("speaker_label", "") != speaker_label and item_data[i - 1].get("speaker_label", "") == speaker_label):
-                turns_idxs.append((start_idx, i - 1))
-                turns.append(" ".join([item["alternatives"][0]["content"] for item in item_data[start_idx:i]]))
+                turn_text = " ".join([item["alternatives"][0]["content"] for item in item_data[start_idx:i]])
+
+                if len(turn_text.split(" ")) >= min_turn_length:
+                    turns_idxs.append((start_idx, i - 1))
+                    turns.append(turn_text)
                 
         except Exception as e:
             logger.error(f"Error in turn-split for speaker {speaker_label}: {e}")
             continue
 
     if start_idx not in [item[0] for item in turns_idxs]:
-        turns_idxs.append((start_idx, len(item_data) - 1))
-        
-        turns.append(" ".join([item["alternatives"][0]["content"] for item in item_data[start_idx:]]))
+        turn_text = " ".join([item["alternatives"][0]["content"] for item in item_data[start_idx:]])
+
+        if len(turn_text.split(" ")) >= min_turn_length:
+            turns_idxs.append((start_idx, len(item_data) - 1))
+
+            turns.append(turn_text)
     return turns_idxs, turns
 
-def filter_speaker_aws(item_data, speaker_label):
+def filter_speaker_aws(item_data, min_turn_length, speaker_label):
     """
     ------------------------------------------------------------------------------------------------------
 
-    This function updates the turns and phrases lists
-        to only include the speaker label provided.
+    This function updates the turns lists to only include the speaker label provided.
 
     Parameters:
     ...........
     item_data: dict
         JSON response object.
+    min_turn_length: int
+        minimum words required in each turn
     speaker_label: str
         Speaker label
+
     Returns:
     ...........
     turns_idxs: list
-        A list of tuples containing
-            the start and end indices of the turns in the JSON object.
+        A list of tuples containing the start and end indices of the turns in the JSON object.
     turns: list
         A list of turns extracted from the JSON object.
 
@@ -203,7 +180,7 @@ def filter_speaker_aws(item_data, speaker_label):
     if speaker_label not in speaker_labels:
         logger.error(f"Speaker label {speaker_label} not found in the json response object.")
 
-    turns_idxs, turns = filter_turn_aws(item_data, speaker_label)
+    turns_idxs, turns = filter_turn_aws(item_data, min_turn_length, speaker_label)
     return turns_idxs, turns
 
 def filter_json_transcribe_aws(item_data, speaker_label, measures):
@@ -235,53 +212,6 @@ def filter_json_transcribe_aws(item_data, speaker_label, measures):
         filter_json = [item for item in filter_json if item.get("speaker_label", "") == speaker_label]
 
     return filter_json
-        
-def filter_phrases(item_data, speaker_label, measures):
-    """
-    ------------------------------------------------------------------------------------------------------
-    
-    This function updates the phrases list
-        to only include the speaker label provided.
-
-    Parameters:
-    ...........
-    item_data: dict
-        JSON response object.
-    speaker_label: str
-        Speaker label
-    measures: dict
-        A dictionary containing the names of the columns in the output dataframes.
-
-    Returns:
-    ...........
-    phrases_idxs: list
-        A list of tuples containing
-            the start and end indices of the phrases in the JSON object.
-    phrases: list
-        A list of phrases extracted from the JSON object.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-
-
-    phrases_idxs, phrases = [], []
-    for item in item_data:
-        try:
-
-            start_idx = item["words"][0][measures["old_index"]]
-            end_idx = item["words"][-1][measures["old_index"]]
-
-            if speaker_label is not None:
-                if item["speaker"] == speaker_label:
-                    phrases.append(item["text"])
-                    phrases_idxs.append((start_idx, end_idx))
-            else:
-                phrases.append(item["text"])
-                phrases_idxs.append((start_idx, end_idx))
-
-        except Exception as e:
-            logger.error(f"Failed to filter phrases: {e}")
-    return phrases_idxs, phrases
 
 def filter_turns(item_data, speaker_label, measures, min_turn_length):
     """
@@ -323,15 +253,17 @@ def filter_turns(item_data, speaker_label, measures, min_turn_length):
                     
                 else:
                     if current_turn is not None:
-
-                        start_idx2 = current_turn[0]["words"][0][measures["old_index"]]
-                        end_idx2 = current_turn[-1]["words"][-1][measures["old_index"]]
-                        turn_text = " ".join(item["text"] for item in current_turn)
                         
-                        if len(turn_text.split(" ")) >= min_turn_length:
-                            turns_idxs.append((start_idx2, end_idx2))
+                        if len(current_turn)>0 and len(current_turn[0]["words"])>0: 
+                            start_idx2 = current_turn[0]["words"][0][measures["old_index"]]
                             
-                            turns.append(turn_text)
+                            end_idx2 = current_turn[-1]["words"][-1][measures["old_index"]]
+                            turn_text = " ".join(item["text"] for item in current_turn)
+                            
+                            if len(turn_text.split(" ")) >= min_turn_length:
+                                turns_idxs.append((start_idx2, end_idx2))
+
+                                turns.append(turn_text)
                         current_turn = None
                         
         except Exception as e:
@@ -447,7 +379,7 @@ def get_num_of_syllables(text):
 
     return syllable_count
 
-def get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures):
+def get_pause_feature_word(word_df, df_diff, word_list, turn_index, measures):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -463,9 +395,8 @@ def get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures):
             from the JSON response.
     word_list: list
         List of transcribed text at the word level.
-    phrase_index: list
+    turn_index: list
         A list containing the indices of the first and last word
-            in each phrase or turn.
     measures: dict
         A dictionary containing the names of the columns in the output dataframes.
 
@@ -476,8 +407,8 @@ def get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures):
 
     ------------------------------------------------------------------------------------------------------
     """
-    phrase_starts = [pindex[0] for pindex in phrase_index]
-    word_df[measures["word_pause"]] = df_diff[measures["pause"]].where(~df_diff[measures["old_index"]].isin(phrase_starts), np.nan)
+    turn_starts = [pindex[0] for pindex in turn_index]
+    word_df[measures["word_pause"]] = df_diff[measures["pause"]].where(~df_diff[measures["old_index"]].isin(turn_starts), np.nan)
     
     word_df[measures["num_syllables"]] = [get_num_of_syllables(word) for word in word_list]
     return word_df
@@ -487,26 +418,24 @@ def process_pause_feature(df_diff, df, text_level, index_list, time_index, level
     ------------------------------------------------------------------------------------------------------
 
     This function calculates various pause-related speech
-     characteristic features at the phrase or turn
+     characteristic features at the turn
      level and adds them to the output dataframe df.
 
     Parameters:
     ...........
     df_diff: pandas dataframe
-        A dataframe containing the word-level information
-         from the JSON response.
+        A dataframe containing the word-level information from the JSON response.
     df: pandas dataframe
-        A dataframe containing phrase or turn summary information
+        A dataframe containing turn summary information
     text_level: list
-        List of transcribed text at the phrase or turn level.
+        List of transcribed text at the turn level.
     index_list: list
-        A list containing the indices of the first and last word
-         in each phrase or turn.
+        A list containing the indices of the first and last word in each turn.
     time_index: list
         A list containing the names of the columns in json that contain
          the start and end times of each word.
     level_name: str
-        The name of the level being analyzed (phrase or turn).
+        The name of the level being analyzed turn.
     measures: dict
         A dictionary containing the names of the columns in the output dataframes.
 
@@ -518,8 +447,8 @@ def process_pause_feature(df_diff, df, text_level, index_list, time_index, level
     ------------------------------------------------------------------------------------------------------
     """
 
-    if level_name not in [measures["phrase"], measures["turn"]]:
-        logger.error(f"level_name must be either in phrase or turn")
+    if level_name not in [measures["turn"]]:
+        logger.error(f"level_name must be turn")
         return df
 
     for j, index in enumerate(index_list):
@@ -648,9 +577,7 @@ def update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, me
         
         summ_df[measures["word_rate"]] = (summ_df[measures["speech_words"]] / summ_df[measures["speech_minutes"]])
         summ_df[measures["syllable_rate"]] = (get_num_of_syllables(full_text) / summ_df[measures["speech_minutes"]])
-        
-        summ_df[measures["speech_percentage"]] = 100 * (
-        1 - df_diff.loc[1:, measures["pause"]].sum()/ (60 * summ_df[measures["speech_minutes"]]))
+        summ_df[measures["speech_percentage"]] = 100 * (summ_df[measures["speech_minutes"]] / summ_df[measures["file_length"]])
 
     if len(word_df[measures["word_pause"]]) > 1:
         summ_df[measures["word_pause_mean"]] = word_df[measures["word_pause"]].mean(skipna=True)
@@ -668,7 +595,7 @@ def update_summ_df(df_diff, summ_df, full_text, time_index, word_df, turn_df, me
 
     return summ_df
 
-def get_pause_feature(json_conf, df_list, text_list, text_indices, measures, time_index, language):
+def get_pause_feature(json_conf, df_list, text_list, turn_index, measures, time_index, language):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -680,16 +607,17 @@ def get_pause_feature(json_conf, df_list, text_list, text_indices, measures, tim
     json_conf: list
         JSON response object.
     df_list: list
-        List of pandas dataframes.
-            word_df, phrase_df, turn_df, summ_df
+        List of pandas dataframes: word_df, turn_df, summ_df
     text_list: list
-        List of transcribed text.
-            split into words, phrases, turns, and full text.
-    text_indices: list
+        List of transcribed text: split into words, turns, and full text.
+    turn_index: list
         List of indices for text_list.
-            for phrases and turns.
     measures: dict
         A dictionary containing the names of the columns in the output dataframes.
+    time_index: list
+        timepoint index (start/end)
+    language: str
+        Language of the transcribed text.
 
     Returns:
     ...........
@@ -703,17 +631,14 @@ def get_pause_feature(json_conf, df_list, text_list, text_indices, measures, tim
 
     word_df, turn_df, summ_df = df_list
     word_list, turn_list, full_text = text_list
-    phrase_index, turn_index = text_indices
-
     df_diff = pd.DataFrame(json_conf)
-    time_index = [time_index[0], time_index[1]]
 
     # Calculate the pause time between; each word and add the results to pause_list
     if measures["pause"] not in df_diff.columns:
         df_diff[measures["pause"]] = df_diff[time_index[0]].astype(float) - df_diff[time_index[1]].astype(float).shift(1)
 
     # word-level analysis
-    word_df = get_pause_feature_word(word_df, df_diff, word_list, phrase_index, measures)
+    word_df = get_pause_feature_word(word_df, df_diff, word_list, turn_index, measures)
 
     # turn-level analysis
     if len(turn_index) > 0:
@@ -797,7 +722,7 @@ def get_tag(json_conf, tag_dict, measures):
             json_conf[i][measures["tag"]] = "Other"
     return json_conf
 
-def get_tag_summ(json_conf, df_list, text_indices, measures):
+def get_tag_summ(json_conf, df_list, measures):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -810,11 +735,7 @@ def get_tag_summ(json_conf, df_list, text_indices, measures):
     json_conf: list
         JSON response object.
     df_list: list
-        List of pandas dataframes.
-            word_df, phrase_df, turn_df, summ_df
-    text_indices: list
-        List of indices for text_list.
-            for phrases and turns.
+        List of pandas dataframes: word_df, turn_df, summ_df
     measures: dict
         A dictionary containing the names of the columns in the output dataframes.
 
@@ -825,10 +746,7 @@ def get_tag_summ(json_conf, df_list, text_indices, measures):
 
     ------------------------------------------------------------------------------------------------------
     """
-
     word_df, turn_df, summ_df = df_list
-    _ , turn_index = text_indices
-
     df_conf = pd.DataFrame(json_conf)
     word_df[measures["part_of_speech"]] = df_conf[measures["tag"]]
     
@@ -846,10 +764,8 @@ def get_sentiment(df_list, text_list, measures):
     ...........
     df_list: list
         List of pandas dataframes.
-            word_df, phrase_df, turn_df, summ_df
     text_list: list
         List of transcribed text.
-            split into words, phrases, turns, and full text.
     measures: dict
         A dictionary containing the names of the columns in the output dataframes.
 
@@ -911,7 +827,7 @@ def calculate_file_feature(json_data, model, speakers):
         file_length = max(float(segment.get("end_time", "0")) for segment in segments)
         
         if speakers is None:
-            return file_length, np.NaN
+            return file_length/60, np.NaN
 
         speaking_time = sum(float(segment.get("end_time", "0") or "0") - float(segment.get("start_time", "0") or "0")
                            for segment in segments if segment.get("speaker_label", "") in speakers)
@@ -920,7 +836,7 @@ def calculate_file_feature(json_data, model, speakers):
         file_length = max(segment.get('end', 0) for segment in segments)
         
         if speakers is None:
-            return file_length, np.NaN
+            return file_length/60, np.NaN
         speaking_time = sum(segment['end'] - segment['start'] for segment in segments if segment.get('speaker', '') in speakers)
 
     speaking_pct = (speaking_time / file_length) * 100
@@ -952,12 +868,12 @@ def process_language_feature(df_list, transcribe_info, language, time_index, mea
 
     ------------------------------------------------------------------------------------------------------
     """
-    json_conf, text_list, text_indices = transcribe_info
-    df_list = get_pause_feature(json_conf, df_list, text_list, text_indices, measures, time_index, language)
+    json_conf, text_list, turn_indices = transcribe_info
+    df_list = get_pause_feature(json_conf, df_list, text_list, turn_indices, measures, time_index, language)
 
     if language == "en":
         json_conf = get_tag(json_conf, TAG_DICT, measures)
-        df_list = get_tag_summ(json_conf, df_list, text_indices, measures)
+        df_list = get_tag_summ(json_conf, df_list, measures)
 
         df_list = get_sentiment(df_list, text_list, measures)
     return df_list

From 5d1433154311b1fc60bf95f705151ff371bf5358 Mon Sep 17 00:00:00 2001
From: vjbytes102 <vy386@nyu.edu>
Date: Thu, 2 Nov 2023 17:40:00 -0400
Subject: [PATCH 10/21] Update speech_attribute

---
 openwillis/measures/text/speech_attribute.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py
index 674b20e..004cbb1 100644
--- a/openwillis/measures/text/speech_attribute.py
+++ b/openwillis/measures/text/speech_attribute.py
@@ -121,10 +121,11 @@ def filter_transcribe(json_conf, measures, min_turn_length, speaker_label=None):
 
     if speaker_label is not None:
         turns_idxs, turns = cutil.filter_speaker_aws(item_data, min_turn_length, speaker_label)
+        text = " ".join(turns)
+        
     else:
         turns_idxs, turns = [], []
 
-    text = " ".join(turns)
     filter_json = cutil.filter_json_transcribe_aws(item_data, speaker_label, measures)
     words = [word["alternatives"][0]["content"] for word in filter_json]
 
@@ -183,7 +184,7 @@ def filter_whisper(json_conf, measures, min_turn_length, speaker_label=None):
     # filter json to only include items with start_time and end_time
     filter_json = cutil.filter_json_transcribe(item_data, speaker_label, measures)
     words = [value["word"] for value in filter_json]
-    text = " ".join(turns)
+    text = " ".join(words)
     
     text_list = [words, turns, text]
     return filter_json, text_list, turns_idxs

From af3fd098c1fcbb22b4c05a203cf60883447e30bf Mon Sep 17 00:00:00 2001
From: GeorgiosEfstathiadis
 <54844705+GeorgeEfstathiadis@users.noreply.github.com>
Date: Thu, 2 Nov 2023 17:58:25 -0400
Subject: [PATCH 11/21] text joining in whisper doesn't work when multiple
 speakers and minimum turn length

---
 openwillis/measures/text/speech_attribute.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py
index 004cbb1..b8ce3f7 100644
--- a/openwillis/measures/text/speech_attribute.py
+++ b/openwillis/measures/text/speech_attribute.py
@@ -171,20 +171,21 @@ def filter_whisper(json_conf, measures, min_turn_length, speaker_label=None):
     """
     item_data = json_conf["segments"]
 
+    text = " ".join(item["text"] for item in item_data)
+
     if speaker_label is not None:
         item_data = [segment for segment in item_data if "speaker" in segment]
         
     item_data = cutil.create_index_column(item_data, measures)
     if speaker_label is not None:
         turns_idxs, turns = cutil.filter_turns(item_data, speaker_label, measures, min_turn_length)
-        
+        text = " ".join(turns)
     else:
         turns_idxs, turns = [], []
     
     # filter json to only include items with start_time and end_time
     filter_json = cutil.filter_json_transcribe(item_data, speaker_label, measures)
     words = [value["word"] for value in filter_json]
-    text = " ".join(words)
     
     text_list = [words, turns, text]
     return filter_json, text_list, turns_idxs

From 5c1acd8167d8f9e9d831f894ccedb6d46da03b6b Mon Sep 17 00:00:00 2001
From: vjbytes102 <vy386@nyu.edu>
Date: Thu, 2 Nov 2023 18:20:43 -0400
Subject: [PATCH 12/21] Update speech_attribute

---
 openwillis/measures/text/speech_attribute.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/openwillis/measures/text/speech_attribute.py b/openwillis/measures/text/speech_attribute.py
index b8ce3f7..b030e41 100644
--- a/openwillis/measures/text/speech_attribute.py
+++ b/openwillis/measures/text/speech_attribute.py
@@ -170,15 +170,15 @@ def filter_whisper(json_conf, measures, min_turn_length, speaker_label=None):
     ------------------------------------------------------------------------------------------------------
     """
     item_data = json_conf["segments"]
-
-    text = " ".join(item["text"] for item in item_data)
+    text = " ".join(item.get("text", "") for item in item_data)
 
     if speaker_label is not None:
         item_data = [segment for segment in item_data if "speaker" in segment]
         
     item_data = cutil.create_index_column(item_data, measures)
-    if speaker_label is not None:
+    if speaker_label is not None:    
         turns_idxs, turns = cutil.filter_turns(item_data, speaker_label, measures, min_turn_length)
+        
         text = " ".join(turns)
     else:
         turns_idxs, turns = [], []

From 247005ebd93b218d175bff0e669e15a977e73dac Mon Sep 17 00:00:00 2001
From: vjbytes102 <vy386@nyu.edu>
Date: Thu, 2 Nov 2023 19:57:10 -0400
Subject: [PATCH 13/21] Update version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 7545ba1..b9f51fc 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@
     install_requires = fp.read()
 
 setuptools.setup(name='openwillis',
-                 version='1.5.2',
+                 version='1.6',
                  description='digital health measurement',
                  long_description=long_description,
                  long_description_content_type="text/markdown",

From 5167eae89505891215694eeed2c3cd524699de9f Mon Sep 17 00:00:00 2001
From: Vijay Yadav <vijayyadav@vijays-mbp.myfiosgateway.com>
Date: Mon, 6 Nov 2023 12:57:30 -0500
Subject: [PATCH 14/21] speech transcription vosk

---
 openwillis/__init__.py                        |   3 +-
 openwillis/measures/api.py                    |   1 +
 openwillis/measures/audio/__init__.py         |   6 +-
 .../measures/audio/speech_transcribe_vosk.py  | 262 ++++++++++++++++++
 4 files changed, 270 insertions(+), 2 deletions(-)
 create mode 100644 openwillis/measures/audio/speech_transcribe_vosk.py

diff --git a/openwillis/__init__.py b/openwillis/__init__.py
index dd36a91..5188a2c 100644
--- a/openwillis/__init__.py
+++ b/openwillis/__init__.py
@@ -13,7 +13,8 @@
     speaker_separation,
     speaker_separation_cloud,
     speech_transcription_cloud,
+    speech_transcription_vosk,
     to_audio
 )
 
-__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "to_audio"]
+__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "speech_transcription_vosk", "to_audio"]
diff --git a/openwillis/measures/api.py b/openwillis/measures/api.py
index ecc0897..b25367a 100644
--- a/openwillis/measures/api.py
+++ b/openwillis/measures/api.py
@@ -13,6 +13,7 @@
     speaker_separation,
     speaker_separation_cloud,
     speech_transcription_cloud,
+    speech_transcription_vosk
 )
 from openwillis.measures.text import (
     speech_characteristics
diff --git a/openwillis/measures/audio/__init__.py b/openwillis/measures/audio/__init__.py
index d53a3af..b146e8c 100644
--- a/openwillis/measures/audio/__init__.py
+++ b/openwillis/measures/audio/__init__.py
@@ -18,4 +18,8 @@
     speech_transcription_cloud,
 )
 
-__all__ = ["vocal_acoustics", "speech_transcription", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud"]
+from openwillis.measures.audio.speech_transcribe_vosk import (
+    speech_transcription_vosk,
+)
+
+__all__ = ["vocal_acoustics", "speech_transcription", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "speech_transcription_vosk"]
diff --git a/openwillis/measures/audio/speech_transcribe_vosk.py b/openwillis/measures/audio/speech_transcribe_vosk.py
new file mode 100644
index 0000000..f18ae9f
--- /dev/null
+++ b/openwillis/measures/audio/speech_transcribe_vosk.py
@@ -0,0 +1,262 @@
+# website:   http://www.brooklyn.health
+
+# import the required packages
+import os
+import wave
+import json
+import logging
+import json
+
+from vosk import Model, KaldiRecognizer
+from pydub import AudioSegment
+from openwillis.measures.audio.util import util as ut
+
+logging.basicConfig(level=logging.INFO)
+logger=logging.getLogger()
+
+def get_config():
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Load the configuration settings for the speech transcription.
+
+    Parameters:
+    ...........
+    None
+
+    Returns:
+    ...........
+    measures : dict
+        A dictionary containing the configuration settings.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    #Loading json config
+    dir_name = os.path.dirname(os.path.abspath(__file__))
+    measure_path = os.path.abspath(os.path.join(dir_name, 'config/speech.json'))
+
+    file = open(measure_path)
+    measures = json.load(file)
+    return measures
+
+def filter_audio(filepath, t_interval):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Filter an audio file to extract a segment based on the specified time interval.
+
+    Parameters:
+    ............
+    filepath : str
+        The path to the audio file to be filtered.
+    t_interval : list
+        A list of tuples representing the start and end times (in seconds) of the segment to extract.
+
+    Returns:
+    ............
+    sound : AudioSegment
+        The filtered audio segment.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    sound = AudioSegment.from_wav(filepath)
+
+    if len(t_interval)==2:
+        sound = sound[int(t_interval[0])*1000 : int(t_interval[1])*1000]
+
+    elif len(t_interval)==1:
+        sound = sound[int(t_interval[0])*1000:]
+
+    sound = sound.set_channels(1)
+    return sound
+
+def filter_speech(measures, results):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Filter the speech transcription results to extract the transcript.
+
+    Parameters:
+    ...........
+    measures : dict
+        A dictionary containing the configuration settings for the speech transcription.
+    results : list of dict
+        The raw transcription results returned by the transcription service.
+
+    Returns:
+    ...........
+    result_key : list
+        A list containing the framewise transcription of the audio file.
+    transcript : str
+        The transcript of the audio file.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    result_key = []
+    text_key = []
+    transcript_dict = {}
+
+    for res in results:
+        dict_keys = res.keys()
+
+        if 'result' in dict_keys and 'text' in dict_keys:
+            result_key.extend(res['result'])
+            text_key.append(res['text'])
+
+    transcript_dict['result'] = result_key
+    transcript_dict['text'] = ' '.join(text_key)
+    return result_key, ' '.join(text_key)
+
+def get_vosk(audio_path, lang):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Recognize speech using the Vosk model.
+
+    Parameters:
+    ............
+    audio_path : str
+        The path to the audio file to be transcribed.
+    lang : str
+        The language of the audio file (e.g. 'en-us', 'es', 'fr').
+
+    Returns:
+    ............
+    results : list of dict
+        The raw transcription results returned by the Vosk model.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    model = Model(lang=lang)
+    wf = wave.open(audio_path, "rb")
+
+    recog = KaldiRecognizer(model, wf.getframerate())
+    recog.SetWords(True)
+
+    results = []
+    while True:
+
+        data = wf.readframes(4000) #Future work
+        if len(data) == 0:
+            break
+
+        if recog.AcceptWaveform(data):
+            partial_result = json.loads(recog.Result())
+            results.append(partial_result)
+
+    partial_result = json.loads(recog.FinalResult())
+    results.append(partial_result)
+    return results
+
+def stereo_to_mono(filepath, t_interval):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Convert a stereo audio file to a mono audio file.
+
+    Parameters:
+    ............
+    filepath : str
+        The path to the stereo audio file to be converted.
+    t_interval : list
+        A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed.
+
+    Returns:
+    ............
+    mono_filepath : str
+        The path to the mono audio file.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    sound = filter_audio(filepath, t_interval)
+
+    filename, _ = os.path.splitext(os.path.basename(filepath))
+    dir_name = os.path.join(os.path.dirname(filepath), 'temp_mono_' + filename)
+
+    ut.make_dir(dir_name)
+    mono_filepath = os.path.join(dir_name, filename + '.wav')
+    sound.export(mono_filepath, format="wav")
+    return mono_filepath
+
+def run_vosk(filepath, language, transcribe_interval = []):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Transcribe speech in an audio file using the Vosk model.
+
+    Parameters:
+    ............
+    filepath : str
+        The path to the audio file to be transcribed.
+    language : str, optional
+        The language of the audio file (e.g. 'en-us', 'es', 'fr'). Default is 'en-us'.
+    transcribe_interval : list, optional
+        A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed.
+        Default is an empty list.
+
+    Returns:
+    ............
+    json_response : str
+        The JSON response from the Vosk transcription service.
+    transcript : str
+        The transcript of the audio file.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    json_response = json.dumps({})
+    transcript = mono_filepath = ''
+
+    try:
+        if os.path.exists(filepath):
+
+            measures = get_config()
+            mono_filepath = stereo_to_mono(filepath, transcribe_interval)
+            results = get_vosk(mono_filepath, language)
+
+            ut.remove_dir(os.path.dirname(mono_filepath)) #Clean temp directory
+            json_response, transcript = filter_speech(measures, results)
+
+        else:
+            logger.info(f'Audio file not available. File: {filepath}')
+
+    except Exception as e:
+        ut.remove_dir(os.path.dirname(mono_filepath))#Clean temp directory
+        logger.error(f'Error in speech Transcription: {e} & File: {filepath}')
+
+    finally:
+        return json_response, transcript
+
+    
+
+def speech_transcription_vosk(filepath, **kwargs):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Speech transcription function that transcribes an audio file using vosk.
+
+    Parameters:
+    ...........
+    filepath : str
+        The path to the audio file to be transcribed.
+    language : str, optional
+        The language of the audio file (e.g. 'en-us', 'es', 'fr'). Default is 'en-us'.
+    transcribe_interval : list, optional
+        A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed.
+        Only applicable if model is 'vosk'. Default is an empty list.
+
+    Returns:
+    ...........
+    json_response : JSON Object
+        A transcription response object in JSON format
+    transcript : str
+        The transcript of the recording.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+
+    measures = get_config()
+    language = kwargs.get('language', 'en-us')
+    transcribe_interval = kwargs.get('transcribe_interval', [])
+    
+    json_response, transcript = run_vosk(filepath, language, transcribe_interval)
+    return json_response, transcript

From 6d6707662abbf6d8c22cf48c702a534242bd7608 Mon Sep 17 00:00:00 2001
From: Vijay Yadav <vijayyadav@vijays-mbp.myfiosgateway.com>
Date: Mon, 6 Nov 2023 16:05:50 -0500
Subject: [PATCH 15/21] whisperx update

---
 openwillis/__init__.py                        |   4 +-
 openwillis/measures/api.py                    |   2 +-
 openwillis/measures/audio/__init__.py         |   6 +-
 .../measures/audio/speech_transcribe.py       | 330 ------------------
 .../audio/speech_transcribe_whisper.py        | 149 ++++++++
 .../measures/audio/util/whisperx_util.py      |  52 ++-
 6 files changed, 178 insertions(+), 365 deletions(-)
 delete mode 100644 openwillis/measures/audio/speech_transcribe.py
 create mode 100644 openwillis/measures/audio/speech_transcribe_whisper.py

diff --git a/openwillis/__init__.py b/openwillis/__init__.py
index 5188a2c..61e0c35 100644
--- a/openwillis/__init__.py
+++ b/openwillis/__init__.py
@@ -8,7 +8,7 @@
     emotional_expressivity,
     eye_blink_rate,
     vocal_acoustics,
-    speech_transcription,
+    speech_transcription_whisper,
     speech_characteristics,
     speaker_separation,
     speaker_separation_cloud,
@@ -17,4 +17,4 @@
     to_audio
 )
 
-__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "speech_transcription_vosk", "to_audio"]
+__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "speech_transcription_vosk", "to_audio"]
diff --git a/openwillis/measures/api.py b/openwillis/measures/api.py
index b25367a..f76db67 100644
--- a/openwillis/measures/api.py
+++ b/openwillis/measures/api.py
@@ -9,7 +9,7 @@
 )
 from openwillis.measures.audio import (
     vocal_acoustics,
-    speech_transcription,
+    speech_transcription_whisper,
     speaker_separation,
     speaker_separation_cloud,
     speech_transcription_cloud,
diff --git a/openwillis/measures/audio/__init__.py b/openwillis/measures/audio/__init__.py
index b146e8c..f7448e6 100644
--- a/openwillis/measures/audio/__init__.py
+++ b/openwillis/measures/audio/__init__.py
@@ -2,8 +2,8 @@
     vocal_acoustics,
 )
 
-from openwillis.measures.audio.speech_transcribe import (
-    speech_transcription,
+from openwillis.measures.audio.speech_transcribe_whisper import (
+    speech_transcription_whisper,
 )
 
 from openwillis.measures.audio.speech_separation import (
@@ -22,4 +22,4 @@
     speech_transcription_vosk,
 )
 
-__all__ = ["vocal_acoustics", "speech_transcription", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "speech_transcription_vosk"]
+__all__ = ["vocal_acoustics", "speech_transcription_whisper", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "speech_transcription_vosk"]
diff --git a/openwillis/measures/audio/speech_transcribe.py b/openwillis/measures/audio/speech_transcribe.py
deleted file mode 100644
index 6be8eb5..0000000
--- a/openwillis/measures/audio/speech_transcribe.py
+++ /dev/null
@@ -1,330 +0,0 @@
-# author:    Vijay Yadav
-# website:   http://www.bklynhlth.com
-
-# import the required packages
-
-import numpy as np
-import pandas as pd
-import os
-import wave
-import json
-import logging
-
-from pydub import AudioSegment
-from openwillis.measures.audio.util import util as ut
-from openwillis.measures.audio.util import transcribe_util as tutil
-
-logging.basicConfig(level=logging.INFO)
-logger=logging.getLogger()
-
-def run_vosk(filepath, language='en-us', transcribe_interval = []):
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    Transcribe speech in an audio file using the Vosk model.
-
-    Parameters:
-    ............
-    filepath : str
-        The path to the audio file to be transcribed.
-    language : str, optional
-        The language of the audio file (e.g. 'en-us', 'es', 'fr'). Default is 'en-us'.
-    transcribe_interval : list, optional
-        A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed.
-        Default is an empty list.
-
-    Returns:
-    ............
-    json_response : str
-        The JSON response from the Vosk transcription service.
-    transcript : str
-        The transcript of the audio file.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-    json_response = '{}'
-    transcript = mono_filepath = ''
-
-    try:
-        if os.path.exists(filepath):
-
-            measures = get_config()
-            mono_filepath = stereo_to_mono(filepath, transcribe_interval)
-            results = get_vosk(mono_filepath, language)
-
-            ut.remove_dir(os.path.dirname(mono_filepath)) #Clean temp directory
-            json_response, transcript = filter_speech(measures, results)
-
-        else:
-            logger.info(f'Audio file not available. File: {filepath}')
-
-    except Exception as e:
-        ut.remove_dir(os.path.dirname(mono_filepath))#Clean temp directory
-        logger.error(f'Error in speech Transcription: {e} & File: {filepath}')
-
-    finally:
-        return json_response, transcript
-
-def filter_audio(filepath, t_interval):
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    Filter an audio file to extract a segment based on the specified time interval.
-
-    Parameters:
-    ............
-    filepath : str
-        The path to the audio file to be filtered.
-    t_interval : list
-        A list of tuples representing the start and end times (in seconds) of the segment to extract.
-
-    Returns:
-    ............
-    sound : AudioSegment
-        The filtered audio segment.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-    sound = AudioSegment.from_wav(filepath)
-
-    if len(t_interval)==2:
-        sound = sound[int(t_interval[0])*1000 : int(t_interval[1])*1000]
-
-    elif len(t_interval)==1:
-        sound = sound[int(t_interval[0])*1000:]
-
-    sound = sound.set_channels(1)
-    return sound
-
-def stereo_to_mono(filepath, t_interval):
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    Convert a stereo audio file to a mono audio file.
-
-    Parameters:
-    ............
-    filepath : str
-        The path to the stereo audio file to be converted.
-    t_interval : list
-        A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed.
-
-    Returns:
-    ............
-    mono_filepath : str
-        The path to the mono audio file.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-    sound = filter_audio(filepath, t_interval)
-
-    filename, _ = os.path.splitext(os.path.basename(filepath))
-    dir_name = os.path.join(os.path.dirname(filepath), 'temp_mono_' + filename)
-
-    ut.make_dir(dir_name)
-    mono_filepath = os.path.join(dir_name, filename + '.wav')
-    sound.export(mono_filepath, format="wav")
-    return mono_filepath
-
-def get_vosk(audio_path, lang):
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    Recognize speech using the Vosk model.
-
-    Parameters:
-    ............
-    audio_path : str
-        The path to the audio file to be transcribed.
-    lang : str
-        The language of the audio file (e.g. 'en-us', 'es', 'fr').
-
-    Returns:
-    ............
-    results : list of dict
-        The raw transcription results returned by the Vosk model.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-    #import in-case of model=vosk
-    from vosk import Model, KaldiRecognizer
-    
-    model = Model(lang=lang)
-    wf = wave.open(audio_path, "rb")
-
-    recog = KaldiRecognizer(model, wf.getframerate())
-    recog.SetWords(True)
-
-    results = []
-    while True:
-
-        data = wf.readframes(4000) #Future work
-        if len(data) == 0:
-            break
-
-        if recog.AcceptWaveform(data):
-            partial_result = json.loads(recog.Result())
-            results.append(partial_result)
-
-    partial_result = json.loads(recog.FinalResult())
-    results.append(partial_result)
-    return results
-
-def filter_speech(measures, results):
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    Filter the speech transcription results to extract the transcript.
-
-    Parameters:
-    ...........
-    measures : dict
-        A dictionary containing the configuration settings for the speech transcription.
-    results : list of dict
-        The raw transcription results returned by the transcription service.
-
-    Returns:
-    ...........
-    result_key : list
-        A list containing the framewise transcription of the audio file.
-    transcript : str
-        The transcript of the audio file.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-    result_key = []
-    text_key = []
-    transcript_dict = {}
-
-    for res in results:
-        dict_keys = res.keys()
-
-        if 'result' in dict_keys and 'text' in dict_keys:
-            result_key.extend(res['result'])
-            text_key.append(res['text'])
-
-    transcript_dict['result'] = result_key
-    transcript_dict['text'] = ' '.join(text_key)
-    return result_key, ' '.join(text_key)
-
-
-def get_config():
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    Load the configuration settings for the speech transcription.
-
-    Parameters:
-    ...........
-    None
-
-    Returns:
-    ...........
-    measures : dict
-        A dictionary containing the configuration settings.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-    #Loading json config
-    dir_name = os.path.dirname(os.path.abspath(__file__))
-    measure_path = os.path.abspath(os.path.join(dir_name, 'config/speech.json'))
-
-    file = open(measure_path)
-    measures = json.load(file)
-    return measures
-
-def run_whisperx(filepath, hf_token, del_model, num_speakers, infra_model, language):
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    Transcribe audio data using the WhisperX model.
-
-    Parameters:
-    ...........
-    filepath : str
-        The path to the audio file to be transcribed.
-    hf_token : str
-        The Hugging Face token for model authentication.
-    del_model: boolean
-        Boolean indicator to delete model if low on GPU resources 
-    num_speakers: int
-        Number of speaker
-    infra_model:list
-        whisper model artifacts (this is optional param: to optimize willisInfra) 
-    language: str
-        language code
-
-    Returns:
-    ...........
-    json_response : JSON Object
-        A transcription response object in JSON format
-    transcript : str
-        The transcript of the recording.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-    json_response = '{}'
-    transcript = ''
-    
-    if os.path.exists(filepath)== False or hf_token == '':
-        return json_response, transcript
-    
-    from openwillis.measures.audio.util import whisperx_util as wutil #import in-case of model=whisperx
-    json_response, transcript = wutil.get_whisperx_diariazation(filepath, hf_token, del_model, num_speakers, infra_model, language)
-    
-    if str(json_response) != '{}':
-        json_response = tutil.replace_whisperx_speaker_labels(json_response, ['SPEAKER_00', 'SPEAKER_01'], 
-                                                              ['speaker0', 'speaker1'])
-    return json_response, transcript
-    
-
-def speech_transcription(filepath, **kwargs):
-    """
-    ------------------------------------------------------------------------------------------------------
-
-    Speech transcription function that transcribes an audio file using vosk/whisperx.
-
-    Parameters:
-    ...........
-    filepath : str
-        The path to the audio file to be transcribed.
-    model : str, optional
-        The transcription model to use ('vosk'). Default is 'vosk'.
-    language : str, optional
-        The language of the audio file (e.g. 'en-us', 'es', 'fr'). Default is 'en-us'.
-    transcribe_interval : list, optional
-        A list of tuples representing the start and end times (in seconds) of segments of the audio file to be transcribed.
-        Only applicable if model is 'vosk'. Default is an empty list.
-
-    Returns:
-    ...........
-    json_response : JSON Object
-        A transcription response object in JSON format
-    transcript : str
-        The transcript of the recording.
-
-    ------------------------------------------------------------------------------------------------------
-    """
-
-    measures = get_config()
-    model = kwargs.get('model', 'vosk')
-    
-    language = kwargs.get('language', 'en-us')
-    scale = kwargs.get('c_scale', '')
-    num_speakers = kwargs.get('num_speakers', None)
-    
-    transcribe_interval = kwargs.get('transcribe_interval', [])
-    hf_token = kwargs.get('hf_token', '')
-    del_model = kwargs.get('del_model', False)
-    infra_model = kwargs.get('infra_model', [True, None, None])
-    
-    if model == 'whisperx':
-        json_response, transcript = run_whisperx(filepath, hf_token, del_model, num_speakers, infra_model, language)
-        
-        if scale.lower() in measures['scale'].split(','):
-            content_dict = tutil.get_whisperx_content(json_response)
-            json_response = tutil.get_whisperx_clinical_labels(scale, measures, content_dict, json_response)
-        
-    else:
-        json_response, transcript = run_vosk(filepath, language, transcribe_interval)
-    return json_response, transcript
diff --git a/openwillis/measures/audio/speech_transcribe_whisper.py b/openwillis/measures/audio/speech_transcribe_whisper.py
new file mode 100644
index 0000000..ed434c8
--- /dev/null
+++ b/openwillis/measures/audio/speech_transcribe_whisper.py
@@ -0,0 +1,149 @@
+# author:    Vijay Yadav
+# website:   http://www.bklynhlth.com
+
+# import the required packages
+
+import numpy as np
+import pandas as pd
+import os
+import json
+import logging
+
+from pydub import AudioSegment
+from openwillis.measures.audio.util import util as ut
+from openwillis.measures.audio.util import transcribe_util as tutil
+
+logging.basicConfig(level=logging.INFO)
+logger=logging.getLogger()
+
+
+def get_config():
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Load the configuration settings for the speech transcription.
+
+    Parameters:
+    ...........
+    None
+
+    Returns:
+    ...........
+    measures : dict
+        A dictionary containing the configuration settings.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    #Loading json config
+    dir_name = os.path.dirname(os.path.abspath(__file__))
+    measure_path = os.path.abspath(os.path.join(dir_name, 'config/speech.json'))
+
+    file = open(measure_path)
+    measures = json.load(file)
+    return measures
+
+def read_kwargs(kwargs):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Reads keyword arguments and returns a dictionary containing input parameters.
+
+    Parameters:
+    ...........
+    kwargs : dict
+        Keyword arguments to be processed.
+
+    Returns:
+    ...........
+    input_param: dict
+        A dictionary containing input parameters with their corresponding values.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    input_param = {}
+    input_param['model'] = kwargs.get('model', 'tiny')
+    input_param['language'] = kwargs.get('language', 'en')
+    
+    input_param['context'] = kwargs.get('context', '')
+    input_param['max_speakers'] = kwargs.get('max_speakers', None)
+    input_param['min_speakers'] = kwargs.get('min_speakers', None)
+
+    input_param['hf_token'] = kwargs.get('hf_token', '')
+    input_param['del_model'] = kwargs.get('del_model', False) #Temp filter
+    input_param['infra_model'] = kwargs.get('infra_model', [True, None, None]) #Temp filter
+    
+    return input_param
+
+def run_whisperx(filepath, input_param):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Transcribe audio data using the WhisperX model.
+
+    Parameters:
+    ...........
+    filepath : str
+        The path to the audio file to be transcribed.
+    input_param : dict
+        A dictionary containing input parameters
+
+    Returns:
+    ...........
+    json_response : JSON Object
+        A transcription response object in JSON format
+    transcript : str
+        The transcript of the recording.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    json_response = json.dumps({})
+    transcript = ''
+    
+    if os.path.exists(filepath)== False or input_param['hf_token'] == '':
+        return json_response, transcript
+    
+    from openwillis.measures.audio.util import whisperx_util as wutil #import in-case of model=whisperx
+    json_response, transcript = wutil.get_whisperx_diariazation(filepath, input_param)
+    
+    if str(json_response) != '{}':
+        json_response = tutil.replace_whisperx_speaker_labels(json_response, ['SPEAKER_00', 'SPEAKER_01'], 
+                                                              ['speaker0', 'speaker1'])
+    return json_response, transcript
+    
+
+def speech_transcription_whisper(filepath, **kwargs):
+    """
+    ------------------------------------------------------------------------------------------------------
+
+    Speech transcription function that transcribes an audio file using whisperx.
+
+    Parameters:
+    ...........
+    filepath : str
+        The path to the audio file to be transcribed.
+    model : str, optional
+        The WhisperX model to use. Default is 'tiny'.
+    language : str, optional
+        The language code of the audio file (e.g. 'en'). Default is 'en'.
+    context : str, optional
+        Scale name; if it is listed in the configured scales, clinical speaker labels are applied. Default is ''.
+        Other kwargs read: min_speakers, max_speakers, hf_token, del_model, infra_model.
+
+    Returns:
+    ...........
+    json_response : JSON Object
+        A transcription response object in JSON format
+    transcript : str
+        The transcript of the recording.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    measures = get_config()
+    input_param = read_kwargs(kwargs)
+    
+    json_response, transcript = run_whisperx(filepath, input_param)
+    if input_param['context'].lower() in measures['scale'].split(','):
+        
+        content_dict = tutil.get_whisperx_content(json_response)
+        json_response = tutil.get_whisperx_clinical_labels(input_param['context'], measures, content_dict, json_response)
+    return json_response, transcript
diff --git a/openwillis/measures/audio/util/whisperx_util.py b/openwillis/measures/audio/util/whisperx_util.py
index bd71064..a7bae73 100644
--- a/openwillis/measures/audio/util/whisperx_util.py
+++ b/openwillis/measures/audio/util/whisperx_util.py
@@ -27,7 +27,7 @@ def delete_model(model):
     torch.cuda.empty_cache()
     del model
 
-def get_diarization(audio, align_json, HF_TOKEN, device, num_speakers, infra_model):
+def get_diarization(audio, align_json, device, input_param):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -38,12 +38,10 @@ def get_diarization(audio, align_json, HF_TOKEN, device, num_speakers, infra_mod
         audio signal object
     align_json: json
         aligned whisper transcribed output
-    HF_TOKEN : str
-        The Hugging Face token for model authentication.
     device : str
         device type
-    num_speakers: int
-        Number of speaker
+    input_param : dict
+        A dictionary containing input parameters
     
     Returns:
     ...........
@@ -53,17 +51,23 @@ def get_diarization(audio, align_json, HF_TOKEN, device, num_speakers, infra_mod
     ------------------------------------------------------------------------------------------------------
     """
     # Assign speaker labels
-    if infra_model[0]:
-        diarize_model = whisperx.DiarizationPipeline(use_auth_token=HF_TOKEN, device=device)
-    
+    if input_param['infra_model'][0]:
+        diarize_model = whisperx.DiarizationPipeline(use_auth_token=input_param['hf_token'], device=device)
     else:
-        diarize_model = infra_model[2]
+        diarize_model = input_param['infra_model'][2]
 
-    if num_speakers == None:
+    if input_param['min_speakers'] == None and input_param['max_speakers'] == None:
         diarize_segments = diarize_model(audio)
     
+    elif input_param['min_speakers'] == None and input_param['max_speakers'] != None:
+        diarize_segments = diarize_model(audio, max_speakers = input_param['max_speakers'])
+    
+    elif input_param['min_speakers'] != None and input_param['max_speakers'] == None:
+        diarize_segments = diarize_model(audio, min_speakers= input_param['min_speakers'])
+        
     else:
-        diarize_segments = diarize_model(audio, min_speakers=num_speakers, max_speakers=num_speakers)
+        diarize_segments = diarize_model(audio, min_speakers=input_param['min_speakers'], max_speakers=input_param['max_speakers'])
+        
     json_response = whisperx.assign_word_speakers(diarize_segments, align_json)
     return json_response
 
@@ -126,7 +130,7 @@ def transcribe_whisper(filepath, model, device, compute_type, batch_size, infra_
     transcribe_json = model_whisp.transcribe(audio, batch_size=batch_size, language=language)
     return transcribe_json, audio
 
-def get_whisperx_diariazation(filepath, HF_TOKEN, del_model, num_speakers, infra_model, language):
+def get_whisperx_diariazation(filepath, input_param):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -136,16 +140,8 @@ def get_whisperx_diariazation(filepath, HF_TOKEN, del_model, num_speakers, infra
     ...........
     filepath : str
         The path to the audio file to be transcribed.
-    HF_TOKEN : str
-        The Hugging Face token for model authentication.
-    del_model: boolean
-        Boolean indicator to delete model if low on GPU resources 
-    num_speakers: int
-        Number of speaker
-    infra_model: list
-        whisper model artifacts (this is optional param: to optimize willisInfra) 
-    language: str
-        language code
+    input_param : dict
+        A dictionary containing input parameters
 
     Returns:
     ...........
@@ -158,11 +154,9 @@ def get_whisperx_diariazation(filepath, HF_TOKEN, del_model, num_speakers, infra
     """
     device = 'cpu'
     compute_type = "int16"
-    
-    model = 'large-v2'
     batch_size = 16
     
-    json_response = '{}'
+    json_response = json.dumps({})
     transcript = ''
     
     try:
@@ -170,16 +164,16 @@ def get_whisperx_diariazation(filepath, HF_TOKEN, del_model, num_speakers, infra
             device = 'cuda'
             compute_type = "float16"
     
-        transcribe_json, audio = transcribe_whisper(filepath, model, device, compute_type, batch_size, infra_model, language)
+        transcribe_json, audio = transcribe_whisper(filepath, input_param['model'], device, compute_type, batch_size, input_param['infra_model'], input_param['language'])
     
         # Align whisper output
-        model_a, metadata = whisperx.load_align_model(language_code=language, device=device)
+        model_a, metadata = whisperx.load_align_model(language_code=input_param['language'], device=device)
         align_json = whisperx.align(transcribe_json["segments"], model_a, metadata, audio, device, return_char_alignments=False)
     
-        if del_model:
+        if input_param['del_model']:
             delete_model(model_a)
             
-        json_response = get_diarization(audio, align_json, HF_TOKEN, device, num_speakers, infra_model)    
+        json_response = get_diarization(audio, align_json, device, input_param)    
         transcript = get_transcribe_summary(json_response)
     
     except Exception as e:

From d0e97165d81e0ac4004ea822d0a75b4fbe5fa252 Mon Sep 17 00:00:00 2001
From: Vijay Yadav <vijayyadav@vijays-mbp.myfiosgateway.com>
Date: Tue, 7 Nov 2023 14:17:03 -0500
Subject: [PATCH 16/21] aws support

---
 openwillis/__init__.py                        |  6 +--
 openwillis/measures/api.py                    |  4 +-
 openwillis/measures/audio/__init__.py         |  4 +-
 .../measures/audio/speech_transcribe_cloud.py | 40 ++++++++----------
 .../measures/audio/util/transcribe_util.py    | 41 ++++++++++---------
 5 files changed, 46 insertions(+), 49 deletions(-)

diff --git a/openwillis/__init__.py b/openwillis/__init__.py
index 61e0c35..724a34a 100644
--- a/openwillis/__init__.py
+++ b/openwillis/__init__.py
@@ -1,5 +1,5 @@
 # author:    Vijay Yadav
-# website:   http://www.bklynhlth.com
+# website:   http://www.brooklyn.health
 
 # import the required packages
 
@@ -12,9 +12,9 @@
     speech_characteristics,
     speaker_separation,
     speaker_separation_cloud,
-    speech_transcription_cloud,
+    speech_transcription_aws,
     speech_transcription_vosk,
     to_audio
 )
 
-__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "speech_transcription_vosk", "to_audio"]
+__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_aws", "speech_transcription_vosk", "to_audio"]
diff --git a/openwillis/measures/api.py b/openwillis/measures/api.py
index f76db67..026b153 100644
--- a/openwillis/measures/api.py
+++ b/openwillis/measures/api.py
@@ -1,5 +1,5 @@
 # author:    Vijay Yadav
-# website:   http://www.bklynhlth.com
+# website:   http://www.brooklyn.health
 
 # import the required packages
 from openwillis.measures.video import (
@@ -12,7 +12,7 @@
     speech_transcription_whisper,
     speaker_separation,
     speaker_separation_cloud,
-    speech_transcription_cloud,
+    speech_transcription_aws,
     speech_transcription_vosk
 )
 from openwillis.measures.text import (
diff --git a/openwillis/measures/audio/__init__.py b/openwillis/measures/audio/__init__.py
index f7448e6..84245fb 100644
--- a/openwillis/measures/audio/__init__.py
+++ b/openwillis/measures/audio/__init__.py
@@ -15,11 +15,11 @@
 )
 
 from openwillis.measures.audio.speech_transcribe_cloud import (
-    speech_transcription_cloud,
+    speech_transcription_aws,
 )
 
 from openwillis.measures.audio.speech_transcribe_vosk import (
     speech_transcription_vosk,
 )
 
-__all__ = ["vocal_acoustics", "speech_transcription_whisper", "speaker_separation", "speaker_separation_cloud", "speech_transcription_cloud", "speech_transcription_vosk"]
+__all__ = ["vocal_acoustics", "speech_transcription_whisper", "speaker_separation", "speaker_separation_cloud", "speech_transcription_aws", "speech_transcription_vosk"]
diff --git a/openwillis/measures/audio/speech_transcribe_cloud.py b/openwillis/measures/audio/speech_transcribe_cloud.py
index 513e7a5..6156262 100644
--- a/openwillis/measures/audio/speech_transcribe_cloud.py
+++ b/openwillis/measures/audio/speech_transcribe_cloud.py
@@ -1,5 +1,5 @@
 # author:    Vijay Yadav
-# website:   http://www.bklynhlth.com
+# website:   http://www.brooklyn.health
 
 # import the required packages
 import os
@@ -53,20 +53,19 @@ def read_kwargs(kwargs):
     ------------------------------------------------------------------------------------------------------
     """
     input_param = {}
-    input_param['model'] = kwargs.get('model', 'pyannote')
     input_param['language'] = kwargs.get('language', 'en-US')
     input_param['region'] = kwargs.get('region', 'us-east-1')
 
     input_param['job_name'] = kwargs.get('job_name', 'transcribe_job_01')
-    input_param['ShowSpeakerLabels'] = kwargs.get('ShowSpeakerLabels', True)
-    input_param['MaxSpeakerLabels'] = kwargs.get('MaxSpeakerLabels', 2)
+    input_param['speaker_labels'] = kwargs.get('speaker_labels', False)
+    input_param['max_speakers'] = kwargs.get('max_speakers', 2)
 
-    input_param['c_scale'] = kwargs.get('c_scale', '')
+    input_param['context'] = kwargs.get('context', '')
     input_param['access_key'] = kwargs.get('access_key', '')
     input_param['secret_key'] = kwargs.get('secret_key', '')
     return input_param
 
-def speech_transcription_cloud(filepath, **kwargs):
+def speech_transcription_aws(s3_uri, **kwargs):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -74,29 +73,26 @@ def speech_transcription_cloud(filepath, **kwargs):
 
     Parameters:
     ...........
-    filepath : str
+    s3_uri : str
         The S3 uri for the recording to be transcribed.
     kwargs: Object
-        model : str, optional
-            The transcription model to use ('aws'). Default is 'aws'.
         language : str, optional
             The language of the audio file (e.g. 'en-US', 'en-IN'). Default is 'en-US'.
         region : str, optional
             The AWS region to use (e.g. 'us-east-1'). Only applicable if model is 'aws'. Default is 'us-east-1'.
         job_name : str, optional
             The name of the transcription job. Only applicable if model is 'aws'. Default is 'transcribe_job_01'.
-        ShowSpeakerLabels : boolean, optional
-            Show speaker labels
-        MaxSpeakerLabels : int, optional
-            Max number of speakers
-        c_scale : str, optional
-            Clinical scale to use for slicing the separated audio files, if any.
         access_key : str, optional
             AWS access key
         secret_key : str, optional
             AWS secret key
-
-
+        speaker_labels : boolean, optional
+            Show speaker labels
+        max_speakers : int, optional
+            Max number of speakers
+        context : str, optional
+            Scale used to relabel speakers as clinician/participant, if any.
+            
     Returns:
     ...........
     json_response : JSON Object
@@ -108,10 +104,10 @@ def speech_transcription_cloud(filepath, **kwargs):
     """
     input_param = read_kwargs(kwargs)
     measures = get_config()
-    json_response, transcript = tutil.transcribe_audio(filepath, input_param)
-
-    if input_param['ShowSpeakerLabels'] == True and input_param['c_scale']:
+    json_response, transcript = tutil.transcribe_audio(s3_uri, input_param)
+    
+    if input_param['speaker_labels'] == True and input_param['context'].lower() in measures['scale'].split(','):
         content_dict = tutil.extract_content(json_response)
-        json_response = tutil.get_clinical_labels(input_param['c_scale'], measures, content_dict, json_response)
-
+        
+        json_response = tutil.get_clinical_labels(input_param['context'], measures, content_dict, json_response)
     return json_response, transcript
diff --git a/openwillis/measures/audio/util/transcribe_util.py b/openwillis/measures/audio/util/transcribe_util.py
index 5e4049c..c510a2b 100644
--- a/openwillis/measures/audio/util/transcribe_util.py
+++ b/openwillis/measures/audio/util/transcribe_util.py
@@ -1,5 +1,5 @@
 # author:    Vijay Yadav
-# website:   http://www.bklynhlth.com
+# website:   http://www.brooklyn.health
 
 # import the required packages
 
@@ -117,20 +117,19 @@ def get_clinical_labels(scale, measures, content_dict, json_response):
     ------------------------------------------------------------------------------------------------------
     """
     #Check if content is available for all the speaker
-    if content_dict and content_dict['speaker0'] and content_dict['speaker1']:
-        if scale.lower() not in measures['scale'].split(","):
-            return json_response
+    if len(content_dict) <2:
+        return json_response
         
-        score_string = scale.lower()+'_string'
-        spk1_score = sutil.match_transcript(measures[score_string], content_dict['speaker0'])
-        spk2_score = sutil.match_transcript(measures[score_string], content_dict['speaker1'])
-
-        if spk1_score > spk2_score:
-            json_response = replace_speaker_labels(json_response, ['speaker0', 'speaker1'], ['clinician', 'participant'])
+    score_string = scale.lower()+'_string'
+    spk1_score = sutil.match_transcript(measures[score_string], content_dict['speaker0'])
+    spk2_score = sutil.match_transcript(measures[score_string], content_dict['speaker1'])
 
-        else:
-            json_response = replace_speaker_labels(json_response, ['speaker0', 'speaker1'], ['participant', 'clinician'])
+    if spk1_score > spk2_score:
+        json_response = replace_speaker_labels(json_response, ['speaker0', 'speaker1'], ['clinician', 'participant'])
 
+    else:
+        json_response = replace_speaker_labels(json_response, ['speaker0', 'speaker1'], ['participant', 'clinician'])
+    
     return json_response
 
 def get_job_status(transcribe, input_param):
@@ -193,7 +192,7 @@ def filter_transcript_response(status, input_param):
     response = json.loads(read_data.read().decode('utf-8'))
 
     transcript = response['results']['transcripts'][0]['transcript']
-    if input_param['ShowSpeakerLabels'] == True:#replace speaker labels with standard names
+    if input_param['speaker_labels'] == True:#replace speaker labels with standard names
 
         response = replace_speaker_labels(response, ['spk_0', 'spk_1'], ['speaker0', 'speaker1'])
     return response, transcript
@@ -222,24 +221,26 @@ def transcribe_audio(s3uri, input_param):
 
     ------------------------------------------------------------------------------------------------------
     """
-    response = json.loads("{}")
+    response = json.dumps({})
+    settings = {}
     transcript = ""
 
     try:
         if input_param['access_key'] and input_param['secret_key']:
-            transcribe = boto3.client('transcribe', region_name = input_param['region'], aws_access_key_id = input_param['access_key'], aws_secret_access_key = input_param['secret_key'])
+            transcribe = boto3.client('transcribe', region_name = input_param['region'], 
+                                      aws_access_key_id = input_param['access_key'], 
+                                      aws_secret_access_key = input_param['secret_key'])
         else:
             transcribe = boto3.client('transcribe', region_name = input_param['region'])
 
-        settings = {'ShowSpeakerLabels': input_param['ShowSpeakerLabels'], 'MaxSpeakerLabels': input_param['MaxSpeakerLabels']}
+        if input_param['speaker_labels'] == True and input_param['max_speakers']>=2:
+            settings = {'ShowSpeakerLabels': input_param['speaker_labels'], 'MaxSpeakerLabels': input_param['max_speakers']}
+
         transcribe.start_transcription_job(
             TranscriptionJobName=input_param['job_name'],
             Media={'MediaFileUri': s3uri},
-
-            #IdentifyMultipleLanguages=True,
             LanguageCode=input_param['language'],
-            Settings=settings
-        )
+            Settings=settings)
 
         status = get_job_status(transcribe, input_param)
         if status['TranscriptionJob']['TranscriptionJobStatus'] == 'COMPLETED':

From c0427309d75283313fbb5fa319f45182c1364d07 Mon Sep 17 00:00:00 2001
From: Vijay Yadav <vijayyadav@vijays-mbp.myfiosgateway.com>
Date: Wed, 8 Nov 2023 12:52:34 -0500
Subject: [PATCH 17/21] speaker separation

---
 openwillis/__init__.py                        |  6 ++--
 openwillis/measures/api.py                    |  4 +--
 openwillis/measures/audio/__init__.py         |  4 +--
 ...ration.py => speech_separation_nlabels.py} | 35 +++++++------------
 openwillis/measures/commons/common.py         | 12 +++----
 5 files changed, 26 insertions(+), 35 deletions(-)
 rename openwillis/measures/audio/{speech_separation.py => speech_separation_nlabels.py} (83%)

diff --git a/openwillis/__init__.py b/openwillis/__init__.py
index 724a34a..157d244 100644
--- a/openwillis/__init__.py
+++ b/openwillis/__init__.py
@@ -1,5 +1,5 @@
 # author:    Vijay Yadav
-# website:   http://www.brooklyn.health
+# website:   http://www.bklynhlth.com
 
 # import the required packages
 
@@ -10,11 +10,11 @@
     vocal_acoustics,
     speech_transcription_whisper,
     speech_characteristics,
-    speaker_separation,
+    speaker_separation_nolabels,
     speaker_separation_cloud,
     speech_transcription_aws,
     speech_transcription_vosk,
     to_audio
 )
 
-__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation", "speaker_separation_cloud", "speech_transcription_aws", "speech_transcription_vosk", "to_audio"]
+__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation_nolabels", "speaker_separation_cloud", "speech_transcription_aws", "speech_transcription_vosk", "to_audio"]
diff --git a/openwillis/measures/api.py b/openwillis/measures/api.py
index 026b153..3cbea8f 100644
--- a/openwillis/measures/api.py
+++ b/openwillis/measures/api.py
@@ -1,5 +1,5 @@
 # author:    Vijay Yadav
-# website:   http://www.brooklyn.health
+# website:   http://www.bklynhlth.com
 
 # import the required packages
 from openwillis.measures.video import (
@@ -10,7 +10,7 @@
 from openwillis.measures.audio import (
     vocal_acoustics,
     speech_transcription_whisper,
-    speaker_separation,
+    speaker_separation_nolabels,
     speaker_separation_cloud,
     speech_transcription_aws,
     speech_transcription_vosk
diff --git a/openwillis/measures/audio/__init__.py b/openwillis/measures/audio/__init__.py
index 84245fb..dd3096c 100644
--- a/openwillis/measures/audio/__init__.py
+++ b/openwillis/measures/audio/__init__.py
@@ -6,8 +6,8 @@
     speech_transcription_whisper,
 )
 
-from openwillis.measures.audio.speech_separation import (
-    speaker_separation,
+from openwillis.measures.audio.speech_separation_nlabels import (
+    speaker_separation_nolabels,
 )
 
 from openwillis.measures.audio.speech_separation_cloud import (
diff --git a/openwillis/measures/audio/speech_separation.py b/openwillis/measures/audio/speech_separation_nlabels.py
similarity index 83%
rename from openwillis/measures/audio/speech_separation.py
rename to openwillis/measures/audio/speech_separation_nlabels.py
index 58d6777..62409eb 100644
--- a/openwillis/measures/audio/speech_separation.py
+++ b/openwillis/measures/audio/speech_separation_nlabels.py
@@ -3,13 +3,11 @@
 
 # import the required packages
 from pyannote.audio import Pipeline
-from openwillis.measures.audio.util import util as ut
 from openwillis.measures.audio.util import separation_util as sutil
 from pydub import AudioSegment
 
 import os
 import json
-import shutil
 import pandas as pd
 import logging
 
@@ -89,11 +87,10 @@ def read_kwargs(kwargs):
     ------------------------------------------------------------------------------------------------------
     """
     input_param = {}
-    input_param['model'] = kwargs.get('model', 'pyannote')
-
     input_param['hf_token'] = kwargs.get('hf_token', '')
-    input_param['json_response'] = kwargs.get('json_response', json.loads("{}"))
-    input_param['c_scale'] = kwargs.get('c_scale', '')
+    
+    input_param['transcript_json'] = kwargs.get('transcript_json', json.dumps({}))
+    input_param['context'] = kwargs.get('context', '')
     return input_param
 
 def get_pyannote(input_param, file_name, filepath):
@@ -122,12 +119,12 @@ def get_pyannote(input_param, file_name, filepath):
     """
     
     diart_df = run_pyannote(filepath, input_param['hf_token'])
-    transcribe_df = pd.DataFrame(input_param['json_response'])
+    transcribe_df = pd.DataFrame(input_param['transcript_json'])
 
     speaker_df, speaker_count = sutil.get_speaker_identification(diart_df, transcribe_df)
     return speaker_df, speaker_count
 
-def speaker_separation(filepath, **kwargs):
+def speaker_separation_nolabels(filepath, **kwargs):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -137,14 +134,12 @@ def speaker_separation(filepath, **kwargs):
     ...........
     filepath : str
         Path to the input audio file.
+    transcript_json : json
+        Speech transcription json response.
     hf_token : str
         Access token for HuggingFace to access pre-trained models.
-    json_response : json
-        Speech transcription json response.
-    model : str, optional
-        Model to use for speech diarization, default is 'pyannote'.
-    c_scale : str, optional
-        Clinical scale to use for slicing the separated audio files, if any.
+    context : str, optional
+        Scale to use for slicing the separated audio files, if any.
 
     Returns:
     ...........
@@ -160,18 +155,14 @@ def speaker_separation(filepath, **kwargs):
     measures = get_config()
 
     try:
-        if not os.path.exists(filepath) or 'json_response' not in kwargs:
+        if not os.path.exists(filepath) or 'transcript_json' not in kwargs:
             return signal_label
 
-        if input_param['model'] == 'whisperx': 
-            input_param['c_scale'] = ''
-            speaker_df, speaker_count = sutil.whisperx_to_dataframe(input_param['json_response'])
-        else:
-            speaker_df, speaker_count = get_pyannote(input_param, file_name, filepath)
-
+        speaker_df, speaker_count = get_pyannote(input_param, file_name, filepath)
         audio_signal = AudioSegment.from_file(file = filepath, format = "wav")
+
         if len(speaker_df)>0 and speaker_count>1:
-            signal_label = sutil.generate_audio_signal(speaker_df , audio_signal, input_param['c_scale'], measures)
+            signal_label = sutil.generate_audio_signal(speaker_df , audio_signal, input_param['context'], measures)
 
     except Exception as e:
         logger.error(f'Error in diard processing: {e} & File: {filepath}')
diff --git a/openwillis/measures/commons/common.py b/openwillis/measures/commons/common.py
index 9199a2e..d0c4951 100644
--- a/openwillis/measures/commons/common.py
+++ b/openwillis/measures/commons/common.py
@@ -26,7 +26,7 @@ def make_dir(dir_name):
     if not os.path.exists(dir_name):
         os.makedirs(dir_name)
 
-def to_audio(filepath, speaker_label, out_dir):
+def to_audio(filepath, speaker_dict, output_dir):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -36,22 +36,22 @@ def to_audio(filepath, speaker_label, out_dir):
     ----------
         filepath : str
             The path to the input audio file.
-        speaker_label : dict
+        speaker_dict : dict
             A dictionary containing speaker labels as keys and corresponding segments (NumPy arrays) as values.
-        out_dir : str
+        output_dir : str
             The directory where the output audio files will be saved.
 
     ------------------------------------------------------------------------------------------------------
     """
-    make_dir(out_dir)
-    for key, value in speaker_label.items():
+    make_dir(output_dir)
+    for key, value in speaker_dict.items():
         file_name, _ = os.path.splitext(os.path.basename(filepath))
 
         audio_signal = AudioSegment.from_file(file = filepath, format = "wav")
         spk_signal = AudioSegment(value.tobytes(), frame_rate=audio_signal.frame_rate,
                                   sample_width=audio_signal.sample_width, channels=audio_signal.channels)
 
-        output_file = os.path.join(out_dir, file_name + '_' + key + '.wav')
+        output_file = os.path.join(output_dir, file_name + '_' + key + '.wav')
         spk_signal.export(output_file, format="wav")
 
 def get_config(filepath, json_file):

From 755c744246ac09464263ebd466374ffb5c4c3e87 Mon Sep 17 00:00:00 2001
From: Vijay Yadav <vijayyadav@vijays-mbp.myfiosgateway.com>
Date: Wed, 8 Nov 2023 15:27:28 -0500
Subject: [PATCH 18/21] speaker separation update

---
 openwillis/__init__.py                        |  4 +--
 openwillis/measures/api.py                    |  2 +-
 openwillis/measures/audio/__init__.py         |  4 +--
 ...n_cloud.py => speech_separation_labels.py} | 27 ++++++++++++++++---
 .../measures/audio/util/separation_util.py    |  3 ++-
 5 files changed, 31 insertions(+), 9 deletions(-)
 rename openwillis/measures/audio/{speech_separation_cloud.py => speech_separation_labels.py} (68%)

diff --git a/openwillis/__init__.py b/openwillis/__init__.py
index 157d244..41d7d3b 100644
--- a/openwillis/__init__.py
+++ b/openwillis/__init__.py
@@ -11,10 +11,10 @@
     speech_transcription_whisper,
     speech_characteristics,
     speaker_separation_nolabels,
-    speaker_separation_cloud,
+    speaker_separation_labels,
     speech_transcription_aws,
     speech_transcription_vosk,
     to_audio
 )
 
-__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation_nolabels", "speaker_separation_cloud", "speech_transcription_aws", "speech_transcription_vosk", "to_audio"]
+__all__ = ["facial_expressivity", "vocal_acoustics", "emotional_expressivity", "eye_blink_rate", "speech_transcription_whisper", "speech_characteristics", "speaker_separation_nolabels", "speaker_separation_labels", "speech_transcription_aws", "speech_transcription_vosk", "to_audio"]
diff --git a/openwillis/measures/api.py b/openwillis/measures/api.py
index 3cbea8f..38dad3e 100644
--- a/openwillis/measures/api.py
+++ b/openwillis/measures/api.py
@@ -11,7 +11,7 @@
     vocal_acoustics,
     speech_transcription_whisper,
     speaker_separation_nolabels,
-    speaker_separation_cloud,
+    speaker_separation_labels,
     speech_transcription_aws,
     speech_transcription_vosk
 )
diff --git a/openwillis/measures/audio/__init__.py b/openwillis/measures/audio/__init__.py
index dd3096c..355a3bd 100644
--- a/openwillis/measures/audio/__init__.py
+++ b/openwillis/measures/audio/__init__.py
@@ -10,8 +10,8 @@
     speaker_separation_nolabels,
 )
 
-from openwillis.measures.audio.speech_separation_cloud import (
-    speaker_separation_cloud,
+from openwillis.measures.audio.speech_separation_labels import (
+    speaker_separation_labels,
 )
 
 from openwillis.measures.audio.speech_transcribe_cloud import (
diff --git a/openwillis/measures/audio/speech_separation_cloud.py b/openwillis/measures/audio/speech_separation_labels.py
similarity index 68%
rename from openwillis/measures/audio/speech_separation_cloud.py
rename to openwillis/measures/audio/speech_separation_labels.py
index f314c4d..6d82291 100644
--- a/openwillis/measures/audio/speech_separation_cloud.py
+++ b/openwillis/measures/audio/speech_separation_labels.py
@@ -38,7 +38,23 @@ def get_config():
     measures = json.load(file)
     return measures
 
-def speaker_separation_cloud(filepath, json_response):
+def is_amazon_transcribe(json_conf):
+    """
+    ------------------------------------------------------------------------------------------------------
+    Checks whether the JSON response object is from Amazon Transcribe, i.e. whether it has both a "jobName" and a "results" key.
+    Parameters:
+    ...........
+    json_conf: dict
+        JSON response object.
+    Returns:
+    ...........
+    bool: True if the JSON response object
+        is from Amazon Transcribe, False otherwise.
+    ------------------------------------------------------------------------------------------------------
+    """
+    return "jobName" in json_conf and "results" in json_conf
+
+def speaker_separation_labels(filepath, transcript_json):
     """
     ------------------------------------------------------------------------------------------------------
 
@@ -48,7 +64,7 @@ def speaker_separation_cloud(filepath, json_response):
     ...........
     filepath : str
         Path to the input audio file.
-    json_response : json
+    transcript_json : json
         Speech transcription json response.
 
     Returns:
@@ -66,8 +82,13 @@ def speaker_separation_cloud(filepath, json_response):
             return signal_label
 
         audio_signal = AudioSegment.from_file(file = filepath, format = "wav")
-        speaker_df, speaker_count = sutil.transcribe_response_to_dataframe(json_response)
+        if not is_amazon_transcribe(transcript_json):
 
+            speaker_df, speaker_count = sutil.whisperx_to_dataframe(transcript_json)
+        else:
+            speaker_df, speaker_count = sutil.transcribe_response_to_dataframe(transcript_json)
+        print(speaker_df)
+        print(speaker_count)
         if len(speaker_df)>0 and speaker_count>1:
             signal_label = sutil.generate_audio_signal(speaker_df , audio_signal, '', measures)
 
diff --git a/openwillis/measures/audio/util/separation_util.py b/openwillis/measures/audio/util/separation_util.py
index 0959a4c..9f623f7 100644
--- a/openwillis/measures/audio/util/separation_util.py
+++ b/openwillis/measures/audio/util/separation_util.py
@@ -322,7 +322,7 @@ def transcribe_response_to_dataframe(response):
     speakers = 0
     df = pd.DataFrame()
 
-    if 'segments' in response:
+    if 'results' in response:
         if 'speaker_labels' in response['results']:
 
             if 'speakers' in response['results']['speaker_labels']:
@@ -338,6 +338,7 @@ def transcribe_response_to_dataframe(response):
 
                 df = df[df["confidence"] > 0].reset_index(drop=True)
                 df = df[["start_time", "end_time", "confidence", "speaker_label", "content"]]
+                
     return df, speakers
 
 def extract_data(segment_info):

From 5a477b3887f752648e88fe33ac29a71bf1489a05 Mon Sep 17 00:00:00 2001
From: vjbytes102 <vy386@nyu.edu>
Date: Wed, 8 Nov 2023 15:36:33 -0500
Subject: [PATCH 19/21] Update speech_separation_labels

---
 openwillis/measures/audio/speech_separation_labels.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/openwillis/measures/audio/speech_separation_labels.py b/openwillis/measures/audio/speech_separation_labels.py
index 6d82291..d251e59 100644
--- a/openwillis/measures/audio/speech_separation_labels.py
+++ b/openwillis/measures/audio/speech_separation_labels.py
@@ -87,8 +87,7 @@ def speaker_separation_labels(filepath, transcript_json):
             speaker_df, speaker_count = sutil.whisperx_to_dataframe(transcript_json)
         else:
             speaker_df, speaker_count = sutil.transcribe_response_to_dataframe(transcript_json)
-        print(speaker_df)
-        print(speaker_count)
+            
         if len(speaker_df)>0 and speaker_count>1:
             signal_label = sutil.generate_audio_signal(speaker_df , audio_signal, '', measures)
 

From 2403e697eee908e0cc8f67e09f0571abcc6edddb Mon Sep 17 00:00:00 2001
From: Vijay Yadav <vijayyadav@vijays-mbp.myfiosgateway.com>
Date: Thu, 9 Nov 2023 12:48:32 -0500
Subject: [PATCH 20/21] whisper update

---
 .../measures/audio/util/separation_util.py    | 42 +++++++++----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/openwillis/measures/audio/util/separation_util.py b/openwillis/measures/audio/util/separation_util.py
index 9f623f7..fff47d4 100644
--- a/openwillis/measures/audio/util/separation_util.py
+++ b/openwillis/measures/audio/util/separation_util.py
@@ -359,12 +359,18 @@ def extract_data(segment_info):
 
     ------------------------------------------------------------------------------------------------------
     """
-    phrase = segment_info["text"]
-    start = segment_info["start"]
-    end = segment_info["end"]
+    phrase = segment_info.get("text", "")
+    start = segment_info.get("start", np.nan)
     
-    score = segment_info["words"][0]["score"] if segment_info["words"] and len(segment_info["words"]) > 0 else 0
-    speaker = segment_info["speaker"] if "speaker" in segment_info else "no_speaker"
+    end = segment_info.get("end", np.nan)
+    words = segment_info.get("words", None)
+
+    if words is not None and len(words) > 0:
+        score = words[0].get("score", 0)
+    else:
+        score = 0
+
+    speaker = segment_info.get("speaker", "no_speaker")
     return pd.Series([start, end, phrase, score, speaker], index=["start", "end", "phrase", "score", "speaker"])
 
 def whisperx_to_dataframe(json_response):
@@ -387,23 +393,17 @@ def whisperx_to_dataframe(json_response):
 
     ------------------------------------------------------------------------------------------------------
     """
-    # Initialize an empty DataFrame
-    df = pd.DataFrame(columns=["start", "end", "phrase", "score", "speaker"])
+    df = pd.DataFrame(columns=["start_time", "end_time", "content", "confidence", "speaker_label"])
     if 'segments' in json_response:
         
-        for segment_info in json_response["segments"]:
-            try:
-                
-                segment_df = extract_data(segment_info)
-                df = df.append(segment_df, ignore_index=True)
-                
-            except Exception as e:
-                logger.info("Some segments have no speaker labels.")
-    
-    df = df[df["score"] > 0].reset_index(drop=True)
-    df = df[df["speaker"] != "no_speaker"].reset_index(drop=True)
-    df = df.rename(columns={"start": "start_time", "end": "end_time", "score":"confidence", "speaker":"speaker_label", 
-                            "phrase":"content"})
-    
+        segment_infos = json_response["segments"]
+        df = pd.DataFrame(segment_infos).apply(extract_data, axis=1)
+
+        df = df[df["score"] > 0].reset_index(drop=True)
+        df = df.dropna(subset=["start", "end"]).reset_index(drop=True)
+        
+        df = df[df["speaker"] != "no_speaker"].reset_index(drop=True)
+        df = df.rename(columns={"start": "start_time", "end": "end_time", "score": "confidence", "speaker": "speaker_label", "phrase": "content"})
+
     speakers = df['speaker_label'].nunique()
     return df, speakers
\ No newline at end of file

From 6b3652695b7feba152925bbbbfef7906dd4abf27 Mon Sep 17 00:00:00 2001
From: Vijay Yadav <vijayyadav@vijays-mbp.myfiosgateway.com>
Date: Fri, 10 Nov 2023 15:38:55 -0500
Subject: [PATCH 21/21] transcription update

---
 .../audio/speech_transcribe_whisper.py        |  3 +-
 .../measures/audio/util/transcribe_util.py    | 85 ++++++++++++++++++-
 2 files changed, 85 insertions(+), 3 deletions(-)

diff --git a/openwillis/measures/audio/speech_transcribe_whisper.py b/openwillis/measures/audio/speech_transcribe_whisper.py
index ed434c8..5e5371d 100644
--- a/openwillis/measures/audio/speech_transcribe_whisper.py
+++ b/openwillis/measures/audio/speech_transcribe_whisper.py
@@ -106,8 +106,7 @@ def run_whisperx(filepath, input_param):
     json_response, transcript = wutil.get_whisperx_diariazation(filepath, input_param)
     
     if str(json_response) != '{}':
-        json_response = tutil.replace_whisperx_speaker_labels(json_response, ['SPEAKER_00', 'SPEAKER_01'], 
-                                                              ['speaker0', 'speaker1'])
+        json_response = tutil.filter_labels_whisper(json_response)
     return json_response, transcript
     
 
diff --git a/openwillis/measures/audio/util/transcribe_util.py b/openwillis/measures/audio/util/transcribe_util.py
index c510a2b..9b99542 100644
--- a/openwillis/measures/audio/util/transcribe_util.py
+++ b/openwillis/measures/audio/util/transcribe_util.py
@@ -58,6 +58,89 @@ def replace_speaker_labels(data, check_labels, speaker_labels):
 
     return data
 
+def filter_labels_aws(data):
+    """
+    ------------------------------------------------------------------------------------------------------
+
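+    Replaces the "spk_" prefix of speaker labels with "speaker" (e.g. "spk_0" -> "speaker0") in the segments, segment items and result items of an AWS Transcribe JSON.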
+
+    Parameters:
+    ...........
+    data : JSON
+        The JSON containing speaker labels.
+
+    Returns:
+    ...........
+    data : JSON
+        The modified JSON with replaced speaker labels.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    if 'results' in data:
+        speaker_labels = data['results'].get('speaker_labels', {})
+        segments = speaker_labels.get('segments', [])
+
+        for segment in segments:
+            seg_speaker_label = segment.get('speaker_label', '')
+            
+            if 'spk_' in seg_speaker_label:
+                segment['speaker_label'] = seg_speaker_label.replace("spk_", "speaker")
+            
+            seg_items = segment.get('items', [])
+            for seg_item in seg_items:
+                
+                seg_item_speaker_label = seg_item.get('speaker_label', '')
+                if 'spk_' in seg_item_speaker_label:
+                    
+                    seg_item['speaker_label'] = seg_item_speaker_label.replace("spk_", "speaker")
+        items = data['results'].get('items', [])
+        
+        for item in items:
+            item_speaker_label = item.get('speaker_label', '')
+            
+            if 'spk_' in item_speaker_label:
+                item['speaker_label'] = item_speaker_label.replace("spk_", "speaker")
+
+    return data
+
+def filter_labels_whisper(data):
+    """
+    ------------------------------------------------------------------------------------------------------
+
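+    Replaces the "SPEAKER_0" prefix of speaker labels with "speaker" (e.g. "SPEAKER_00" -> "speaker0") in the segments, their words and the word_segments of a WhisperX JSON.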
+
+    Parameters:
+    ...........
+    data : JSON
+        The JSON containing speaker labels.
+
+    Returns:
+    ...........
+    data : JSON
+        The modified JSON with replaced speaker labels.
+
+    ------------------------------------------------------------------------------------------------------
+    """
+    for segment in data.get('segments', []):
+        current_speaker = segment.get('speaker', '')
+
+        if 'SPEAKER_0' in current_speaker:
+            segment["speaker"] = current_speaker.replace("SPEAKER_0", "speaker")
+
+        for word in segment["words"]:
+            word_speaker = word.get('speaker', '')
+            
+            if 'SPEAKER_0' in word_speaker:
+                word["speaker"] = word_speaker.replace("SPEAKER_0", "speaker")
+
+    for word_segment in data.get('word_segments', []):
+        word_seg_speaker = word_segment.get('speaker', '')
+        
+        if 'SPEAKER_0' in word_seg_speaker: 
+            word_segment["speaker"] = word_seg_speaker.replace("SPEAKER_0", "speaker")
+
+    return data
+
 def extract_content(data):
     """
     ------------------------------------------------------------------------------------------------------
@@ -194,7 +277,7 @@ def filter_transcript_response(status, input_param):
     transcript = response['results']['transcripts'][0]['transcript']
     if input_param['speaker_labels'] == True:#replace speaker labels with standard names
 
-        response = replace_speaker_labels(response, ['spk_0', 'spk_1'], ['speaker0', 'speaker1'])
+        response = filter_labels_aws(response)
     return response, transcript
 
 def transcribe_audio(s3uri, input_param):