From 36633b4c72276a49a8ad85be16676dc4ed90bcaf Mon Sep 17 00:00:00 2001 From: shadowcz007 Date: Wed, 2 Oct 2024 14:09:08 +0800 Subject: [PATCH 01/11] update --- __init__.py | 15 +++- nodes/Audio.py | 92 ++++++++++++++++++++++++ nodes/SenseVoice.py | 18 +++-- nodes/Whisper.py | 171 ++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 1 + 5 files changed, 289 insertions(+), 8 deletions(-) create mode 100644 nodes/Whisper.py diff --git a/__init__.py b/__init__.py index aedea7b4..9007c2a6 100644 --- a/__init__.py +++ b/__init__.py @@ -1006,7 +1006,7 @@ def mix_status(request): # from .nodes.Vae import VAELoader,VAEDecode from .nodes.ScreenShareNode import ScreenShareNode,FloatingVideo -from .nodes.Audio import AudioPlayNode,SpeechRecognition,SpeechSynthesis +from .nodes.Audio import AudioPlayNode,SpeechRecognition,SpeechSynthesis,AnalyzeAudioNone from .nodes.Utils import CreateJsonNode,KeyInput,IncrementingListNode,ListSplit,CreateLoraNames,CreateSampler_names,CreateCkptNames,CreateSeedNode,TESTNODE_,TESTNODE_TOKEN,AppInfo,IntNumber,FloatSlider,TextInput,ColorInput,FontInput,TextToNumber,DynamicDelayProcessor,LimitNumber,SwitchByIndex,MultiplicationNode from .nodes.Mask import PreviewMask_,MaskListReplace,MaskListMerge,OutlineMask,FeatheredMask @@ -1103,6 +1103,7 @@ def mix_status(request): "SpeechRecognition":SpeechRecognition, "SpeechSynthesis":SpeechSynthesis, "AudioPlay":AudioPlayNode, + "AnalyzeAudio":AnalyzeAudioNone, # Text "TextToNumber":TextToNumber, @@ -1220,6 +1221,7 @@ def mix_status(request): "SpeechSynthesis":"SpeechSynthesis ♾️Mixlab", "SpeechRecognition":"SpeechRecognition ♾️Mixlab", "AudioPlay":"Preview Audio ♾️Mixlab", + "AnalyzeAudio":"Analyze Audio ♾️Mixlab", # Utils "DynamicDelayProcessor":"DynamicDelayByText ♾️Mixlab", @@ -1433,11 +1435,20 @@ def mix_status(request): from .nodes.SenseVoice import SenseVoiceNode logging.info('SenseVoice.available') NODE_CLASS_MAPPINGS['SenseVoiceNode']=SenseVoiceNode - NODE_DISPLAY_NAME_MAPPINGS["SenseVoiceNode"]= "Sense Voice" + NODE_DISPLAY_NAME_MAPPINGS["SenseVoiceNode"]= "Sense Voice ♾️Mixlab" except Exception as e: logging.info('SenseVoice.available False' ) +try: + from .nodes.Whisper import LoadWhisperModel,WhisperTranscribe + logging.info('Whisper.available') + NODE_CLASS_MAPPINGS['LoadWhisperModel_']=LoadWhisperModel + NODE_CLASS_MAPPINGS['WhisperTranscribe_']=WhisperTranscribe + NODE_DISPLAY_NAME_MAPPINGS["LoadWhisperModel_"]= "Load Whisper Model ♾️Mixlab" + NODE_DISPLAY_NAME_MAPPINGS["WhisperTranscribe_"]= "Whisper Transcribe ♾️Mixlab" +except Exception as e: + logging.info('Whisper.available False' ) logging.info('\033[93m -------------- \033[0m') diff --git a/nodes/Audio.py b/nodes/Audio.py index 4db98c62..1e44f34d 100644 --- a/nodes/Audio.py +++ b/nodes/Audio.py @@ -3,6 +3,98 @@ import folder_paths import torchaudio +class AnyType(str): + """A special class that is always equal in not equal comparisons. Credit to pythongosssss""" + + def __ne__(self, __value: object) -> bool: + return False + +any_type = AnyType("*") + + +def analyze_audio_data(audio_data): + total_duration = 0 + total_gap_duration = 0 + emotion_counts = {} + audio_types = set() + + for i, entry in enumerate(audio_data): + # Calculate the duration of each audio segment + start_time = entry['start_time'] + end_time = entry['end_time'] + duration = end_time - start_time + total_duration += duration + + # Count the emotions + if "emotion" in entry: + emotion = entry['emotion'] + if emotion in emotion_counts: + emotion_counts[emotion] += 1 + else: + emotion_counts[emotion] = 1 + + # Collect the audio types + if "audio_type" in entry: + audio_types.add(entry['audio_type']) + + # Calculate gap duration if not the last entry + if i < len(audio_data) - 1: + next_start_time = audio_data[i + 1]['start_time'] + gap_duration = next_start_time - end_time + if gap_duration > 0: + total_gap_duration += gap_duration + + # Get the most frequent emotion + if len(emotion_counts.keys())>0: + most_frequent_emotion = max(emotion_counts, key=emotion_counts.get) + else: + most_frequent_emotion=None + + # Convert audio_types set to list for better readability + audio_types = list(audio_types) + + # Print the results + print(f"Total Effective Duration: {total_duration:.2f} seconds") + print(f"Total Gap Duration: {total_gap_duration:.2f} seconds") + print(f"Emotion Changes: {emotion_counts}") + print(f"Most Frequent Emotion: {most_frequent_emotion}") + print(f"Audio Types: {audio_types}") + + + return { + "total_duration": total_duration, + "total_gap_duration": total_gap_duration, + "emotion_changes": emotion_counts, + "most_frequent_emotion": most_frequent_emotion, + "audio_types": audio_types + } + +# Example usage +audio_data = [{'language': 'zh', 'emotion': 'SAD', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:00:00,540 --> 00:00:10,550\n嘿,欢迎来到梦幻纷飞探索心灵迷雾的奇境之旅。在这里,你将踏上一段超现实的。\n', 'start_time': 0.54, 'end_time': 10.55, 'text': '嘿,欢迎来到梦幻纷飞探索心灵迷雾的奇境之旅。在这里,你将踏上一段超现实的。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:00:10,550 --> 00:00:20,559\n有奇幻之旅,想象一下啊,这个悬浮的棋盘和巨大的漂浮眼睛。哇这。\n', 'start_time': 10.55, 'end_time': 20.56, 'text': '有奇幻之旅,想象一下啊,这个悬浮的棋盘和巨大的漂浮眼睛。哇这。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:00:20,559 --> 00:00:25,960\n意象就会让你对其他世界为独,充满好奇。\n', 'start_time': 20.56, 'end_time': 25.96, 'text': '意象就会让你对其他世界为独,充满好奇。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:00:26,239 --> 00:00:36,250\n我们的艺术家梦颖就以其独特的杏人型眼睛和飘逸的黑发,穿着流动的礼服,完美。\n', 'start_time': 26.24, 'end_time': 36.25, 'text': '我们的艺术家梦颖就以其独特的杏人型眼睛和飘逸的黑发,穿着流动的礼服,完美。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:00:36,250 --> 00:00:41,820\n融入这个暗淡的背景啊,形成一个神秘的身影。\n', 'start_time': 36.25, 'end_time': 41.82, 'text': '融入这个暗淡的背景啊,形成一个神秘的身影。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:00:43,170 --> 00:00:53,179\n在这次展览中,我们将一起揭示人类心灵的秘密与欲望。那个梦影的作品通过细腻的绘画。\n', 'start_time': 43.17, 'end_time': 53.18, 'text': '在这次展览中,我们将一起揭示人类心灵的秘密与欲望。那个梦影的作品通过细腻的绘画。'}, {'language': 'zh', 'emotion': 'SAD', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:00:53,179 --> 00:01:03,189\n技巧和超越常规的图像构思,带你探索内心深处的情感和欲望。这种感觉真的就是。\n', 'start_time': 53.18, 'end_time': 63.19, 'text': '技巧和超越常规的图像构思,带你探索内心深处的情感和欲望。这种感觉真的就是。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:01:03,189 --> 00:01:09,019\n是准备好被激发想象力和困惑了吗?哇哦。\n', 'start_time': 63.19, 'end_time': 69.02, 'text': '是准备好被激发想象力和困惑了吗?哇哦。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:01:09,299 --> 00:01:19,310\n通过与艺术家的互动,你将进入一个超越现实的境界,挑战你对现实和逻辑的理解。\n', 'start_time': 69.3, 'end_time': 79.31, 'text': '通过与艺术家的互动,你将进入一个超越现实的境界,挑战你对现实和逻辑的理解。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:01:20,939 --> 00:01:30,950\n所以快来吧,然后让我们一起投身这场奇境之旅吧,挖掘人类的秘密与欲望。\n', 'start_time': 80.94, 'end_time': 90.95, 'text': '所以快来吧,然后让我们一起投身这场奇境之旅吧,挖掘人类的秘密与欲望。'}, {'language': 'zh', 'emotion': 'SAD', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:01:31,189 --> 00:01:41,200\n激发你的想象力与困惑,希望这段旅程能让你在艺术与灵魂的碰撞中找到珍贵的。\n', 'start_time': 91.19, 'end_time': 101.2, 'text': '激发你的想象力与困惑,希望这段旅程能让你在艺术与灵魂的碰撞中找到珍贵的。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:01:41,200 --> 00:01:47,170\n回忆并引发内心的转变与行动。那就是。\n', 'start_time': 101.2, 'end_time': 107.17, 'text': '回忆并引发内心的转变与行动。那就是。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:01:48,760 --> 00:01:51,790\n嗯,准备好了吗?走吧。\n', 'start_time': 108.76, 'end_time': 111.79, 'text': '嗯,准备好了吗?走吧。'}] + +print(analyze_audio_data(audio_data)) + +# 分析音频数据 +class AnalyzeAudioNone: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "json":(any_type,),}, + } + + RETURN_TYPES = (any_type,) + RETURN_NAMES = ("result",) + + FUNCTION = "run" + + CATEGORY = "♾️Mixlab/Audio" + + def run(self,json): + result=analyze_audio_data(json) + return (result,) + + + class SpeechRecognition: @classmethod def INPUT_TYPES(s): diff --git a/nodes/SenseVoice.py b/nodes/SenseVoice.py index 8c0d4de4..8c532dcf 100644 --- a/nodes/SenseVoice.py +++ b/nodes/SenseVoice.py @@ -168,8 +168,9 @@ def INPUT_TYPES(s): OUTPUT_NODE = True FUNCTION = "run" - RETURN_TYPES = (any_type,) - RETURN_NAMES = ("result",) + + RETURN_TYPES = (any_type,"STRING","FLOAT",) + RETURN_NAMES = ("result","text","total_seconds",) def run(self,audio,device,language,num_threads,use_int8,use_itn ): @@ -200,16 +201,21 @@ def run(self,audio,device,language,num_threads,use_int8,use_itn ): if 'waveform' in audio and 'sample_rate' in audio: waveform = audio['waveform'] + sample_rate = audio['sample_rate'] # print("Original shape:", waveform.shape) # 打印原始形状 if waveform.ndim == 3 and waveform.shape[0] == 1: # 检查是否为三维且 batch_size 为 1 waveform = waveform.squeeze(0) # 移除 batch_size 维度 - waveform_numpy = waveform.numpy().transpose(1, 0) # 转换为 (num_samples, num_channels) else: raise ValueError("Unexpected waveform dimensions") - _sample_rate = audio['sample_rate'] + print("waveform.shape:", waveform.shape) + total_length_seconds = waveform.shape[1] / sample_rate + + waveform_numpy = waveform.numpy().transpose(1, 0) # 转换为 (num_samples, num_channels) - results=self.processor.process_audio(waveform_numpy, _sample_rate, language, use_itn) + results=self.processor.process_audio(waveform_numpy, sample_rate, language, use_itn) + srt_content="\n".join([s['srt_content'] for s in results]) - return (results,) + return (results,srt_content,total_length_seconds,) + \ No newline at end of file diff --git a/nodes/Whisper.py b/nodes/Whisper.py new file mode 100644 index 00000000..b3d4f60e --- /dev/null +++ b/nodes/Whisper.py @@ -0,0 +1,171 @@ +import os,re +import sys,time +from pathlib import Path +import torchaudio +import hashlib +import torch +import folder_paths +import comfy.utils + +from faster_whisper import WhisperModel + +class AnyType(str): + """A special class that is always equal in not equal comparisons. Credit to pythongosssss""" + + def __ne__(self, __value: object) -> bool: + return False + +any_type = AnyType("*") + +def get_model_dir(m): + try: + return folder_paths.get_folder_paths(m)[0] + except: + return os.path.join(folder_paths.models_dir, m) + + + +whisper_model_path=get_model_dir('whisper') + +model_sizes=[ + d for d in os.listdir(whisper_model_path) if os.path.isdir( + os.path.join(whisper_model_path, d) + ) and os.path.isfile(os.path.join(os.path.join(whisper_model_path, d), "config.json")) + ] + + +class LoadWhisperModel: + def __init__(self): + self.model = None + self.device="cuda" if torch.cuda.is_available() else "cpu" + self.model_size=model_sizes[0] + self.compute_type='float16' + + @classmethod + def INPUT_TYPES(s): + return {"required": { + "model_size": (model_sizes,), + "device": (["auto","cpu"],), + "compute_type": (["float16","int8_float16","int8"],), + }, + } + + RETURN_TYPES = ("WHISPER",) + RETURN_NAMES = ("whisper_model",) + + FUNCTION = "run" + + CATEGORY = "♾️Mixlab/Audio/Whisper" + + INPUT_IS_LIST = False + OUTPUT_IS_LIST = (False,) + + def run(self,model_size,device,compute_type): + + if device=="auto" and self.device!='cuda': + self.device="cuda" if torch.cuda.is_available() else "cpu" + self.model=None + + if device=='cpu' and self.device!='cpu': + self.device="cpu" + self.model=None + + if model_size!= self.model_size: + self.model_size=model_size + self.model=None + + if compute_type!=self.compute_type: + self.compute_type=compute_type + self.model=None + + if self.model==None: + self.model = WhisperModel( + os.path.join(whisper_model_path, self.model_size), + device=self.device, + compute_type=self.compute_type + ) + + return (self.model,) + + +class WhisperTranscribe: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "whisper_model": ("WHISPER",), + "audio": ("AUDIO",), + }, + } + + RETURN_TYPES = (any_type,"STRING","FLOAT",) + RETURN_NAMES = ("result","text","total_seconds",) + + FUNCTION = "run" + + CATEGORY = "♾️Mixlab/Audio/Whisper" + + INPUT_IS_LIST = False + # OUTPUT_IS_LIST = (False,False,False,) + + def run(self,whisper_model,audio): + + if 'audio_path' in audio and (not 'waveform' in audio): + waveform, sample_rate = torchaudio.load(audio['audio_path']) + waveform=waveform.mean(0) + total_length_seconds = waveform.shape[0] / sample_rate + waveform=waveform.numpy() + + elif 'waveform' in audio and 'sample_rate' in audio: + print("Original shape:", audio["waveform"].shape, isinstance(audio["waveform"], torch.Tensor)) # 打印原始形状 + waveform = audio["waveform"].squeeze(0) # Remove the added batch dimension + sample_rate = audio["sample_rate"] + + # if audio_sf != sampling_rate: + # waveform = torchaudio.functional.resample( + # waveform, orig_freq=audio_sf, new_freq=sampling_rate + # ) + + waveform=waveform.mean(0) + + total_length_seconds = waveform.shape[0] / sample_rate + + waveform=waveform.numpy() #whisper_model.transcribe 旧版不支持直接传tensor,先用numpy + + segments, info = whisper_model.transcribe(waveform, beam_size=5) + + print("Detected language '%s' with probability %f" % (info.language, info.language_probability)) + + # Function to format time for SRT + def format_time(seconds): + millis = int((seconds - int(seconds)) * 1000) + hours, remainder = divmod(int(seconds), 3600) + minutes, seconds = divmod(remainder, 60) + return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}" + + # Prepare SRT content as a string + results = [] + for i, segment in enumerate(segments): + start_time = format_time(segment.start) + end_time = format_time(segment.end) + srt_content = f"{i + 1}\n" + srt_content += f"{start_time} --> {end_time}\n" + + text=segment.text.strip() + + srt_content += f"{text}\n\n" + + start_time=segment.start + end_time=segment.end + + + results.append({ + "srt_content":srt_content, + "start_time":start_time, + "end_time":end_time, + "text":text + }) + + srt_content="\n".join([s['srt_content'] for s in results]) + + return (results,srt_content,total_length_seconds,) + diff --git a/requirements.txt b/requirements.txt index c0326dc7..37cc568a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,3 +32,4 @@ natsort>=8.4.0 git+https://github.com/shadowcz007/SenseVoice-python.git +faster_whisper From 289f83675b7460e84e451b07ac085638b44eb083 Mon Sep 17 00:00:00 2001 From: shadowcz007 Date: Wed, 2 Oct 2024 16:41:44 +0800 Subject: [PATCH 02/11] Update SenseVoice.py --- nodes/SenseVoice.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nodes/SenseVoice.py b/nodes/SenseVoice.py index 8c532dcf..04736b13 100644 --- a/nodes/SenseVoice.py +++ b/nodes/SenseVoice.py @@ -169,8 +169,8 @@ def INPUT_TYPES(s): OUTPUT_NODE = True FUNCTION = "run" - RETURN_TYPES = (any_type,"STRING","FLOAT",) - RETURN_NAMES = ("result","text","total_seconds",) + RETURN_TYPES = (any_type,"STRING","STRING","FLOAT",) + RETURN_NAMES = ("result","srt","text","total_seconds",) def run(self,audio,device,language,num_threads,use_int8,use_itn ): @@ -216,6 +216,7 @@ def run(self,audio,device,language,num_threads,use_int8,use_itn ): results=self.processor.process_audio(waveform_numpy, sample_rate, language, use_itn) srt_content="\n".join([s['srt_content'] for s in results]) + text="\n".join([s['text'] for s in results]) - return (results,srt_content,total_length_seconds,) + return (results,srt_content,text,total_length_seconds,) \ No newline at end of file From 0f77f28a954773c9d8c211465ee1a4d447ce17e6 Mon Sep 17 00:00:00 2001 From: shadowcz007 Date: Wed, 2 Oct 2024 16:41:46 +0800 Subject: [PATCH 03/11] Update Whisper.py --- nodes/Whisper.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nodes/Whisper.py b/nodes/Whisper.py index b3d4f60e..cc669d72 100644 --- a/nodes/Whisper.py +++ b/nodes/Whisper.py @@ -97,8 +97,8 @@ def INPUT_TYPES(s): }, } - RETURN_TYPES = (any_type,"STRING","FLOAT",) - RETURN_NAMES = ("result","text","total_seconds",) + RETURN_TYPES = (any_type,"STRING","STRING","FLOAT",) + RETURN_NAMES = ("result","srt","text","total_seconds",) FUNCTION = "run" @@ -166,6 +166,7 @@ def format_time(seconds): }) srt_content="\n".join([s['srt_content'] for s in results]) + text="\n".join([s['text'] for s in results]) - return (results,srt_content,total_length_seconds,) + return (results,srt_content,text,total_length_seconds,) From b72e7dda088954c275bb395e93721f297cd11019 Mon Sep 17 00:00:00 2001 From: shadowcz007 Date: Wed, 2 Oct 2024 17:05:35 +0800 Subject: [PATCH 04/11] Update Audio.py --- nodes/Audio.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/nodes/Audio.py b/nodes/Audio.py index 1e44f34d..353c4d6b 100644 --- a/nodes/Audio.py +++ b/nodes/Audio.py @@ -17,6 +17,7 @@ def analyze_audio_data(audio_data): total_gap_duration = 0 emotion_counts = {} audio_types = set() + languages = set() for i, entry in enumerate(audio_data): # Calculate the duration of each audio segment @@ -37,6 +38,9 @@ def analyze_audio_data(audio_data): if "audio_type" in entry: audio_types.add(entry['audio_type']) + if "language" in entry: + languages.add(entry['language']) + # Calculate gap duration if not the last entry if i < len(audio_data) - 1: next_start_time = audio_data[i + 1]['start_time'] @@ -53,6 +57,8 @@ def analyze_audio_data(audio_data): # Convert audio_types set to list for better readability audio_types = list(audio_types) + languages=list(languages) + # Print the results print(f"Total Effective Duration: {total_duration:.2f} seconds") print(f"Total Gap Duration: {total_gap_duration:.2f} seconds") @@ -66,13 +72,10 @@ def analyze_audio_data(audio_data): "total_gap_duration": total_gap_duration, "emotion_changes": emotion_counts, "most_frequent_emotion": most_frequent_emotion, - "audio_types": audio_types + "audio_types": audio_types, + "languages":languages } -# Example usage -audio_data = [{'language': 'zh', 'emotion': 'SAD', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:00:00,540 --> 00:00:10,550\n嘿,欢迎来到梦幻纷飞探索心灵迷雾的奇境之旅。在这里,你将踏上一段超现实的。\n', 'start_time': 0.54, 'end_time': 10.55, 'text': '嘿,欢迎来到梦幻纷飞探索心灵迷雾的奇境之旅。在这里,你将踏上一段超现实的。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:00:10,550 --> 00:00:20,559\n有奇幻之旅,想象一下啊,这个悬浮的棋盘和巨大的漂浮眼睛。哇这。\n', 'start_time': 10.55, 'end_time': 20.56, 'text': '有奇幻之旅,想象一下啊,这个悬浮的棋盘和巨大的漂浮眼睛。哇这。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:00:20,559 --> 00:00:25,960\n意象就会让你对其他世界为独,充满好奇。\n', 'start_time': 20.56, 'end_time': 25.96, 'text': '意象就会让你对其他世界为独,充满好奇。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:00:26,239 --> 00:00:36,250\n我们的艺术家梦颖就以其独特的杏人型眼睛和飘逸的黑发,穿着流动的礼服,完美。\n', 'start_time': 26.24, 'end_time': 36.25, 'text': '我们的艺术家梦颖就以其独特的杏人型眼睛和飘逸的黑发,穿着流动的礼服,完美。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:00:36,250 --> 00:00:41,820\n融入这个暗淡的背景啊,形成一个神秘的身影。\n', 'start_time': 36.25, 'end_time': 41.82, 'text': '融入这个暗淡的背景啊,形成一个神秘的身影。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:00:43,170 --> 00:00:53,179\n在这次展览中,我们将一起揭示人类心灵的秘密与欲望。那个梦影的作品通过细腻的绘画。\n', 'start_time': 43.17, 'end_time': 53.18, 'text': '在这次展览中,我们将一起揭示人类心灵的秘密与欲望。那个梦影的作品通过细腻的绘画。'}, {'language': 'zh', 'emotion': 'SAD', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:00:53,179 --> 00:01:03,189\n技巧和超越常规的图像构思,带你探索内心深处的情感和欲望。这种感觉真的就是。\n', 'start_time': 53.18, 'end_time': 63.19, 'text': '技巧和超越常规的图像构思,带你探索内心深处的情感和欲望。这种感觉真的就是。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:01:03,189 --> 00:01:09,019\n是准备好被激发想象力和困惑了吗?哇哦。\n', 'start_time': 63.19, 'end_time': 69.02, 'text': '是准备好被激发想象力和困惑了吗?哇哦。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:01:09,299 --> 00:01:19,310\n通过与艺术家的互动,你将进入一个超越现实的境界,挑战你对现实和逻辑的理解。\n', 'start_time': 69.3, 'end_time': 79.31, 'text': '通过与艺术家的互动,你将进入一个超越现实的境界,挑战你对现实和逻辑的理解。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:01:20,939 --> 00:01:30,950\n所以快来吧,然后让我们一起投身这场奇境之旅吧,挖掘人类的秘密与欲望。\n', 'start_time': 80.94, 'end_time': 90.95, 'text': '所以快来吧,然后让我们一起投身这场奇境之旅吧,挖掘人类的秘密与欲望。'}, {'language': 'zh', 'emotion': 'SAD', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:01:31,189 --> 00:01:41,200\n激发你的想象力与困惑,希望这段旅程能让你在艺术与灵魂的碰撞中找到珍贵的。\n', 'start_time': 91.19, 'end_time': 101.2, 'text': '激发你的想象力与困惑,希望这段旅程能让你在艺术与灵魂的碰撞中找到珍贵的。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:01:41,200 --> 00:01:47,170\n回忆并引发内心的转变与行动。那就是。\n', 'start_time': 101.2, 'end_time': 107.17, 'text': '回忆并引发内心的转变与行动。那就是。'}, {'language': 'zh', 'emotion': 'NEUTRAL', 'audio_type': 'Speech', 'itn': 'withitn', 'srt_content': '1\n00:01:48,760 --> 00:01:51,790\n嗯,准备好了吗?走吧。\n', 'start_time': 108.76, 'end_time': 111.79, 'text': '嗯,准备好了吗?走吧。'}] - -print(analyze_audio_data(audio_data)) # 分析音频数据 class AnalyzeAudioNone: From e32a3675fcbc5219269c178b659c7e53ace58962 Mon Sep 17 00:00:00 2001 From: shadowcz007 Date: Wed, 2 Oct 2024 17:05:43 +0800 Subject: [PATCH 05/11] Update Whisper.py --- nodes/Whisper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nodes/Whisper.py b/nodes/Whisper.py index cc669d72..62c44e07 100644 --- a/nodes/Whisper.py +++ b/nodes/Whisper.py @@ -162,7 +162,8 @@ def format_time(seconds): "srt_content":srt_content, "start_time":start_time, "end_time":end_time, - "text":text + "text":text, + "language":[info.language] }) srt_content="\n".join([s['srt_content'] for s in results]) From d3aaa191489327fa55c2eafa8b65a3f0a8062c65 Mon Sep 17 00:00:00 2001 From: shadowcz007 Date: Wed, 2 Oct 2024 17:07:10 +0800 Subject: [PATCH 06/11] Update ChatGPT.py --- nodes/ChatGPT.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nodes/ChatGPT.py b/nodes/ChatGPT.py index ed797320..7ad339d6 100644 --- a/nodes/ChatGPT.py +++ b/nodes/ChatGPT.py @@ -802,6 +802,9 @@ def INPUT_TYPES(s): def run(self, json_string,key=""): + if not isinstance(json_string, str): + json_string=json.dumps(json_string) + json_string=extract_json_strings(json_string) # print(json_string) good_json_string = repair_json(json_string) From 2fbee59c3ec43d01a9432248bea71989d4905483 Mon Sep 17 00:00:00 2001 From: shadowcz007 Date: Thu, 3 Oct 2024 11:12:10 +0800 Subject: [PATCH 07/11] fixbug --- nodes/SenseVoice.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/nodes/SenseVoice.py b/nodes/SenseVoice.py index 04736b13..159a5201 100644 --- a/nodes/SenseVoice.py +++ b/nodes/SenseVoice.py @@ -48,6 +48,9 @@ def format_time(seconds): pattern = r"<\|(.+?)\|><\|(.+?)\|><\|(.+?)\|><\|(.+?)\|>(.+)" match = re.match(pattern,asr_result) + print('#format_to_srt',match,asr_result) + if match==None: + return None, None, None, None,None,start_time,end_time,text lang, emotion, audio_type, itn, text = match.groups() # 😊 表示高兴,😡 表示愤怒,😔 表示悲伤。对于音频事件,🎼 表示音乐,😀 表示笑声,👏 表示掌声 @@ -115,16 +118,17 @@ def process_audio(self, waveform, _sample_rate, language, use_itn): part[1], asr_result) - results.append({ - "language":lang, - "emotion":emotion, - "audio_type":audio_type, - "itn":itn, - "srt_content":srt_content, - "start_time":start_time, - "end_time":end_time, - "text":text - }) + if lang!=None: + results.append({ + "language":lang, + "emotion":emotion, + "audio_type":audio_type, + "itn":itn, + "srt_content":srt_content, + "start_time":start_time, + "end_time":end_time, + "text":text + }) self.vad.vad.all_reset_detection() pbar.update(1) # 更新进度条 From 6579ff20b4a1d04bc90415e8caf0ddf0a6cfa5c2 Mon Sep 17 00:00:00 2001 From: shadowcz007 Date: Thu, 3 Oct 2024 11:12:22 +0800 Subject: [PATCH 08/11] json_string2 --- nodes/ChatGPT.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/nodes/ChatGPT.py b/nodes/ChatGPT.py index 7ad339d6..940879bf 100644 --- a/nodes/ChatGPT.py +++ b/nodes/ChatGPT.py @@ -786,9 +786,12 @@ class JsonRepair: def INPUT_TYPES(s): return { "required": { - "json_string":("STRING", {"forceInput": True,}), - "key":("STRING", {"multiline": False,"dynamicPrompts": False,"default": ""}), - } + "json_string":("STRING", {"forceInput": True,}), + "key":("STRING", {"multiline": False,"dynamicPrompts": False,"default": ""}), + }, + "optional":{ + "json_string2":("STRING", {"forceInput": True,}) + }, } INPUT_IS_LIST = False @@ -800,7 +803,7 @@ def INPUT_TYPES(s): CATEGORY = "♾️Mixlab/GPT" - def run(self, json_string,key=""): + def run(self, json_string,key="",json_string2=None): if not isinstance(json_string, str): json_string=json.dumps(json_string) @@ -812,6 +815,20 @@ def run(self, json_string,key=""): # 将 JSON 字符串解析为 Python 对象 data = json.loads(good_json_string) + if json_string2!=None: + if not isinstance(json_string2, str): + json_string2=json.dumps(json_string2) + + json_string2=extract_json_strings(json_string2) + # print(json_string) + good_json_string2 = repair_json(json_string2) + + # 将 JSON 字符串解析为 Python 对象 + data2 = json.loads(good_json_string2) + + data={**data, **data2} + + v="" if key!="" and (key in data): v=data[key] From b766b8b65dfb8675088e261a8e5f5a5d53699d6f Mon Sep 17 00:00:00 2001 From: shadowcz007 Date: Thu, 3 Oct 2024 11:13:14 +0800 Subject: [PATCH 09/11] Update SenseVoice.py --- nodes/SenseVoice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nodes/SenseVoice.py b/nodes/SenseVoice.py index 159a5201..61f9a866 100644 --- a/nodes/SenseVoice.py +++ b/nodes/SenseVoice.py @@ -50,7 +50,7 @@ def format_time(seconds): match = re.match(pattern,asr_result) print('#format_to_srt',match,asr_result) if match==None: - return None, None, None, None,None,start_time,end_time,text + return None, None, None, None,None,start_time,end_time,None lang, emotion, audio_type, itn, text = match.groups() # 😊 表示高兴,😡 表示愤怒,😔 表示悲伤。对于音频事件,🎼 表示音乐,😀 表示笑声,👏 表示掌声 From 36ef7d25eff09e63f959cea1ba0d9679f5ab42f0 Mon Sep 17 00:00:00 2001 From: shadowcz007 Date: Sat, 5 Oct 2024 12:31:22 +0800 Subject: [PATCH 10/11] Update ui_mixlab.js --- web/javascript/ui_mixlab.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/javascript/ui_mixlab.js b/web/javascript/ui_mixlab.js index aff861ca..45026d96 100644 --- a/web/javascript/ui_mixlab.js +++ b/web/javascript/ui_mixlab.js @@ -544,7 +544,7 @@ async function getCustomnodeMappings () { const data = (await get_nodes_map()).data window._nodes_maps = data } - console.log('#getCustomnodeMappings', window._nodes_maps) + // console.log('#getCustomnodeMappings', window._nodes_maps) for (let url in window._nodes_maps) { let n = window._nodes_maps[url] for (let node of n[0]) { From edd7af986d86222f990bbf3e8b724c0d975c2735 Mon Sep 17 00:00:00 2001 From: shadowcz007 Date: Sat, 12 Oct 2024 10:44:39 +0800 Subject: [PATCH 11/11] update --- pyproject.toml | 2 +- web/javascript/checkVersion_mixlab.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 41924b13..d250c713 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "comfyui-mixlab-nodes" description = "3D, ScreenShareNode & FloatingVideoNode, SpeechRecognition & SpeechSynthesis, GPT, LoadImagesFromLocal, Layers, Other Nodes, ..." -version = "0.43.0" +version = "0.44.0" license = "MIT" dependencies = ["numpy", "pyOpenSSL", "watchdog", "opencv-python-headless", "matplotlib", "openai", "simple-lama-inpainting", "clip-interrogator==0.6.0", "transformers>=4.36.0", "lark-parser", "imageio-ffmpeg", "rembg[gpu]", "omegaconf==2.3.0", "Pillow>=9.5.0", "einops==0.7.0", "trimesh>=4.0.5", "huggingface-hub", "scikit-image"] diff --git a/web/javascript/checkVersion_mixlab.js b/web/javascript/checkVersion_mixlab.js index 111158e7..eaf9aa9c 100644 --- a/web/javascript/checkVersion_mixlab.js +++ b/web/javascript/checkVersion_mixlab.js @@ -3,7 +3,7 @@ import { app } from '../../../scripts/app.js' const repoOwner = 'shadowcz007' // 替换为仓库的所有者 const repoName = 'comfyui-mixlab-nodes' // 替换为仓库的名称 -const version = 'v0.43.0' +const version = 'v0.44.0' fetch(`https://api.github.com/repos/${repoOwner}/${repoName}/releases/latest`) .then(response => response.json())