# voiceover.py
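"""Generate a multi-speaker "podcast" voiceover from scripted dialogue.

Each dialogue line (and any interjecting "overlaps") is synthesized with the
ElevenLabs text-to-speech API, cached on disk by content hash, and then the
clips are stitched into a single WAV file with short crossfades.
"""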
import asyncio
import hashlib
import os
import queue
from typing import Any, Dict, List

import backoff
import httpx
from dotenv import load_dotenv
from pydub import AudioSegment
from tqdm.auto import tqdm

load_dotenv()

ELEVEN_LABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
ELEVEN_LABS_API_URL = "https://api.elevenlabs.io/v1/text-to-speech"
def get_clip_filename(speaker: str, text: str, output_dir: str) -> str:
    """Build a deterministic clip path from the speaker name and an MD5 hash
    of the text, so repeated runs can reuse clips that already exist.

    Note: the API returns MP3 data, but clips are saved with a .wav
    extension; pydub/ffmpeg sniffs the real format when loading.
    """
    text_hash = hashlib.md5(text.encode("utf-8")).hexdigest()
    return os.path.join(output_dir, f"{speaker}_{text_hash}.wav")
async def generate_voice_clips(dialogue: List[Dict[str, Any]],
                               output_dir: str = "audio_clips"):
    """Generate one audio clip per dialogue line (and per overlap line) with
    the ElevenLabs text-to-speech API, skipping clips that already exist."""
    os.makedirs(output_dir, exist_ok=True)
    # Flatten the dialogue into a work queue: overlap lines are synthesized
    # as standalone clips and mixed into their parent line later by
    # join_audio_clips().
    dialogue_queue = queue.Queue()
    for line in dialogue:
        dialogue_queue.put(line)
        for overlap in line.get("overlaps", []):
            dialogue_queue.put(overlap)
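    # Retry with exponential backoff, but only on HTTP 429 (rate limiting);
    # any other status code fails immediately.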
    @backoff.on_exception(backoff.expo,
                          httpx.HTTPStatusError,
                          max_tries=5,
                          giveup=lambda e: e.response.status_code != 429)
    async def generate_audio(line, previous_text=None):
        speaker = line.get("speaker")
        text = line.get("text")
        speaker_voice_id = line.get("speaker_voice_id")
        filename = get_clip_filename(speaker, text, output_dir)
        if os.path.exists(filename):
            return  # clip was already generated on a previous run
        headers = {
            "Accept": "audio/mpeg",
            "Content-Type": "application/json",
            "xi-api-key": ELEVEN_LABS_API_KEY,
        }
        # output_format and enable_logging are query parameters on this
        # endpoint, not request-body fields.
        params = {
            "output_format": "mp3_44100_128",
            "enable_logging": False,
        }
        data = {
            "text": text,
            "model_id": "eleven_monolingual_v1",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.5,
            },
            # previous_text plus a fixed seed improves prosodic continuity
            # and reproducibility across clips.
            "previous_text": previous_text,
            "seed": 42,
        }
        try:
            async with httpx.AsyncClient(timeout=600) as client:
                response = await client.post(
                    f"{ELEVEN_LABS_API_URL}/{speaker_voice_id}",
                    headers=headers,
                    params=params,
                    json=data)
                response.raise_for_status()
            with open(filename, "wb") as f:
                f.write(response.content)
        except httpx.HTTPStatusError as e:
            print(f"Failed to generate audio for {speaker}: {e}")
            raise  # re-raise so the backoff decorator can actually retry 429s
semaphore = asyncio.Semaphore(2)
tasks = []
previous_text = None
while not dialogue_queue.empty():
line = dialogue_queue.get()
await semaphore.acquire()
task = asyncio.create_task(generate_audio(line, previous_text))
task.add_done_callback(lambda _: semaphore.release())
tasks.append(task)
previous_text = line.get("text")
for task in tqdm.as_completed(tasks):
await task
def join_audio_clips(dialogue: List[Dict[str, Any]],
                     output_dir: str = "audio_clips",
                     output_file: str = "final_output_1.wav"):
    """Concatenate the generated clips in dialogue order, mixing overlap
    clips into the tail of their parent line, and export a single WAV."""
    output_audio = AudioSegment.silent(duration=0)
    for line in tqdm(dialogue, desc="Joining audio clips", leave=False):
        speaker = line.get("speaker")
        text = line.get("text")
        filename = get_clip_filename(speaker, text, output_dir)
        try:
            clip = AudioSegment.from_file(file=filename)
        except FileNotFoundError:
            print(f"Audio clip not found: {filename}")
            continue
if "overlaps" in line:
# print(f"Line: {line} has overlaps")
for overlap in line.get("overlaps"):
overlap_filename = get_clip_filename(overlap.get("speaker"),
overlap.get('text'),
output_dir)
try:
print(f"Overlap filename: {overlap_filename}")
overlap_clip = AudioSegment.from_file(
file=overlap_filename)
except FileNotFoundError:
print(f"Overlap audio clip not found: {overlap_filename}")
continue
overlap_start_time = max(0, len(clip) - 850)
overlap_start_time = max(0, len(clip) - 850)
clip = clip.overlay(overlap_clip, position=overlap_start_time)
if len(overlap_clip) > 850:
remaining_overlap = overlap_clip[850:]
clip = clip.append(remaining_overlap, crossfade=0)
        # Use a tiny crossfade between clips to avoid clicks at the joins,
        # falling back to a hard cut when either side is too short.
        if len(clip) < 10 or len(output_audio) < 10:
            crossfade_duration = 0
        else:
            crossfade_duration = min(10,
                                     len(clip) // 2,
                                     len(output_audio) // 2)
        output_audio = output_audio.append(clip, crossfade=crossfade_duration)
    output_audio.export(output_file, format="wav")
# Example usage
if __name__ == "__main__":
input_dialogue = [{
"speaker": "Jessica",
"text":
"Welcome, everyone, to today's discussion. We're diving into MemGPT and its innovative approach to memory management in large language models.",
"speaker_voice_id": "fNmfW5GlQ7PDakGkiTzs"
}, {
"speaker":
"Michael",
"text":
"Thanks, Jessica! I've always found the concept of limitations in LLMs quite fascinating. But how exactly does MemGPT tackle these constraints?",
"speaker_voice_id":
"iP95p4xoKVk53GoZ742B",
"overlaps": [{
"speaker": "Emily",
"text":
"Oh, I was wondering the same thing! Especially in terms of document analysis and conversations.",
"speaker_voice_id": "cgSgspJ2msm6clMCkdW9"
}]
}, {
"speaker":
"David",
"text":
"Essentially, MemGPT uses a method akin to virtual memory in computers. It handles hierarchical memory management, mimicking how an OS would deal with overflow by paging. This way, the system can seemingly manage more data than its context window allows.",
"speaker_voice_id":
"cjVigY5qzO86Huf0OWal",
"overlaps": [{
"speaker": "Jessica",
"text":
"That's interesting\u2014it sounds like a computer handling its RAM efficiently.",
"speaker_voice_id": "fNmfW5GlQ7PDakGkiTzs"
}]
}, {
"speaker": "Emily",
"text":
"So, it's like increasing the room size without actually adding more room? By using data movement techniques similar to those in a computer's memory system?",
"speaker_voice_id": "cgSgspJ2msm6clMCkdW9"
}, {
"speaker": "Michael",
"text":
"Exactly. And what's brilliant is MemGPT's ability to do this autonomously through function calls\u2014kind of like it has the keys to its own memory bank.",
"speaker_voice_id": "iP95p4xoKVk53GoZ742B"
}, {
"speaker": "Jessica",
"text":
"So, we're talking about independence from human intervention, which is pivotal in applications like virtual assistants, right?",
"speaker_voice_id": "fNmfW5GlQ7PDakGkiTzs"
}, {
"speaker": "David",
"text":
"Precisely, Jessica. This autonomy translates into better efficiency and user experience, where MemGPT personalizes and evolves based on extended interactions.",
"speaker_voice_id": "cjVigY5qzO86Huf0OWal"
}, {
"speaker": "Emily",
"text":
"I can imagine how this would significantly enhance interaction in conversational agents. But what about document analysis\u2014how does MemGPT improve there?",
"speaker_voice_id": "cgSgspJ2msm6clMCkdW9"
}, {
"speaker": "David",
"text":
"In document analysis, MemGPT gives the system the capacity to handle documents that exceed the typical context window, paging information in and out at need. It's like a librarian fetching a book based on what you're asking for.",
"speaker_voice_id": "cjVigY5qzO86Huf0OWal"
}, {
"speaker": "Michael",
"text":
"With this design, MemGPT effectively expands the usable memory without the exponentially higher costs associated with longer context models.",
"speaker_voice_id": "iP95p4xoKVk53GoZ742B"
}, {
"speaker": "Jessica",
"text":
"Wrapping it up, MemGPT seems to offer a bridge over the limitations of traditional LLMs, allowing for more complex, engaging, and extended usage in AI applications. Any last thoughts?",
"speaker_voice_id": "fNmfW5GlQ7PDakGkiTzs"
}, {
"speaker": "Emily",
"text":
"I think it's a game-changer for AI in handling extensive and dynamic content.",
"speaker_voice_id": "cgSgspJ2msm6clMCkdW9"
}, {
"speaker": "David",
"text":
"Absolutely. MemGPT has the potential to redefine how we approach LLM constraints\u2014we're definitely looking at a promising future for AI-powered solutions.",
"speaker_voice_id": "cjVigY5qzO86Huf0OWal"
}]
# Generate voice clips
asyncio.run(generate_voice_clips(input_dialogue, "audio_clips_1"))
# Join audio clips with overlaps
join_audio_clips(input_dialogue, "audio_clips_1",
"full_podcast_overlay_1500secs.wav")