forked from AdhamSamehA/Outbound-Phone-GPT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Worker.py
378 lines (314 loc) · 18.4 KB
/
Worker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
"""
This file is part of Outbound Phone GPT.
Outbound Phone GPT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Outbound Phone GPT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Outbound Phone GPT. If not, see <https://www.gnu.org/licenses/>
"""
import json
import uuid
import logging
import base64
import asyncio
import time
import os
from typing import Optional, AsyncGenerator
# Import utils
from __utils__ import text_chunker, get_cached_streaming_generator, LRUCache
# Import configs
from __config__ import AGENT_CACHE_FILE
# Import sales agent framework (SalesGPT)
from ConversationModel.agents import ConversationalModel
# Import filler prediction model (FillerPredictor)
from FillerPredictionModel import GPTPredictor
# Import FastAPI libraries
from fastapi import WebSocket, WebSocketDisconnect
# Import libraries to handle eleven labs websocket
from websockets.legacy.client import WebSocketClientProtocol
from websockets.exceptions import ConnectionClosed
# Set up logging configuration
from logger_config import setup_logger
logger = setup_logger("my_app_logger", level=logging.DEBUG)
# AI Agent class
class AIAgent():
    """
    AIAgent instances are workers specialised in processing streaming generators and audio streams.

    An agent streams text (freshly generated by the conversational model, or
    replayed from a cached response) to the Eleven Labs websocket, relays the
    audio that comes back to Twilio through the FastAPI websocket, and uses
    Twilio "mark" events to learn when playback of that audio has finished.
    """

    def __init__(self, call_sid=None):
        """
        Initializes a new AI agent instance.

        Args:
            call_sid (str): A string representing the Call SID (Session Identifier) that uniquely
                identifies the call session. A random UUID is generated when omitted.
        """
        self.gpt: ConversationalModel = ConversationalModel().init_agent()
        self.model: GPTPredictor = GPTPredictor()
        self.call_sid: str = call_sid or str(uuid.uuid4())
        self.ai_response: str = ""
        self.websocket_url: str = ""
        self.stream_sid: str = ""
        self.welcome_file_path: str = ""
        self.filler_file_path: str = ""
        self.user_transcribed_input: str = ""
        self.transcripts = []
        self.lru_cache = LRUCache(capacity=1000, cache_file_path=AGENT_CACHE_FILE)
        # A bare annotation inside a function does not create the attribute, so every
        # field is given an explicit initial value; methods can then test for None
        # instead of failing with AttributeError.
        self.streaming_generator_future: Optional[asyncio.Future] = None
        self.mark_event_future: Optional[asyncio.Future] = None
        self.post_audio_tasks = []
        self.responses = []
        self.streaming_generator: Optional[AsyncGenerator] = None
        self.audio_start_event = asyncio.Event()
        # Starts as True, so no latency is reported until reset_after_interaction()
        # has re-armed the flag.
        self.is_first_audio_chunk_sent: bool = True
        self.use_cache: bool = False
        self.stop_signal: bool = False
        # Timestamps (seconds, as returned by time.time()); None until measured.
        self.response_latency_start: Optional[float] = None
        self.response_latency_end: Optional[float] = None
        self.streaming_gen_retrieval_start_time: Optional[float] = None
        self.streaming_gen_retrieval_end_time: Optional[float] = None
        self.eleven_labs_websocket_connect_start_time: Optional[float] = None
        self.eleven_labs_websocket_connect_end_time: Optional[float] = None

    ################################################ TWILIO BI-DIRECTIONAL STREAM HANDLING #############################################
    async def connect_to_twilio_bidirectional_stream(self, websocket : WebSocket):
        """
        An async method responsible for connecting to Twilio's bidirectional stream in order to enable audio input/output streaming.

        Reads the first two JSON messages from the websocket, expecting a 'connected'
        event followed by a 'start' event, and stores the stream SID carried by the
        'start' message on `self.stream_sid`. Errors are logged, not raised.

        Args:
            websocket: Our FastAPI server's websocket instance which is responsible for handling twilio's media streams
        Returns:
            None: This method does not return anything
        """
        try:
            twilio_to_websocket_connection_time_start = time.time()

            # Wait for the 'connected' message from Twilio
            connected_message = await websocket.receive_json()
            if connected_message.get('event') != 'connected':
                logger.error("connect_to_twilio_bidirectional_stream: Expected 'connected' message, received something else.")
                return
            logger.info("connect_to_twilio_bidirectional_stream: Connection between custom websocket and Twilio is successful...")

            # Wait for the 'start' message from Twilio
            start_message = await websocket.receive_json()
            if start_message.get('event') != 'start':
                logger.error("connect_to_twilio_bidirectional_stream: Expected 'start' message, received something else.")
                return
            logger.info("Twilio bi-directional media streaming started...")

            self.stream_sid = start_message.get('streamSid')
            logger.info("Stream SID Received...")

            twilio_to_websocket_connection_time_end = time.time()
            logger.info(f"Time taken to connect Twilio to custom websocket: {twilio_to_websocket_connection_time_end - twilio_to_websocket_connection_time_start} seconds")
        except WebSocketDisconnect:
            logger.error("connect_to_twilio_bidirectional_stream: Error -> Websocket Disconnected")
        #TODO: Implement better exception handling
        except Exception as e:
            logger.error(f"connect_to_twilio_bidirectional_stream: Exception -> {e}")

    async def send_mark_message(self, websocket_server : WebSocket):
        """
        Called when the entire audio output stream has been pushed to Twilio through the `websocket_server`. This will send
        a mark message to Twilio and wait for a response to signal that the audio output stream finished playing on the call.

        Every message received while waiting that is neither the matching mark nor a
        'stop' event is discarded. A 'stop' event sets `self.stop_signal`.

        Args:
            websocket_server: Our FastAPI server's websocket instance which is responsible for handling twilio's media streams
        Returns:
            None: This method does not return anything
        """
        # A unique name lets us recognise the echo of this particular mark.
        mark_name = str(uuid.uuid4())
        logger.info("Sending mark message now...")
        await websocket_server.send_json({
            "event": "mark",
            "streamSid": self.stream_sid,
            "mark": {"name": mark_name}
        })
        logger.info("Waiting for mark message response...")
        while True:
            response = await websocket_server.receive_json()
            event = response.get('event')
            if event == 'mark' and response.get('mark', {}).get('name') == mark_name:
                logger.info(f"Mark event received for {mark_name}. Audio chunk has been transmitted successfully.")
                break
            if event == 'stop':
                logger.info("Call Ended...")
                self.stop_signal = True
                break

    ################################################ STREAM HANDLING ###################################################################
    async def send_text_stream(self, eleven_labs_websocket, cached_response=None):
        """
        Takes a new (`self.use_cache` is False) or old (`self.use_cache` is True) generated stream from the GPT model,
        accumulates its content into `self.ai_response`, and streams it asynchronously to the Eleven Labs API
        for converting it into audio.

        Args:
            eleven_labs_websocket (WebSocket): The WebSocket connection to Eleven Labs API.
            cached_response : This is a list of words from a pre-generated GPT response. It is stored as a list to enable compatibility with the
                `get_cached_streaming_generator` method, which converts the list of words into a streaming generator.
        Returns:
            None: This method does not return anything
        """
        async def output_iterator():
            # Cached chunks are indexed like dicts, live chunks expose attributes,
            # hence the two near-identical loops.
            if self.use_cache:
                logger.info("Sending pre-generated text stream to Eleven Labs API...")
                async for chunk in get_cached_streaming_generator(cached_response):
                    chunk_content = chunk["choices"][0]["delta"]["content"]
                    if chunk_content is not None:
                        self.ai_response += chunk_content
                        logger.info(f"Retrieving text chunk: {chunk_content}")
                        yield chunk_content
            else:
                logger.info("Sending newly generated text stream to Eleven Labs API...")
                async for chunk in self.streaming_generator:
                    chunk_content = chunk.choices[0].delta.content
                    if chunk_content is not None:
                        self.ai_response += chunk_content
                        logger.info(f"Retrieving text chunk: {chunk_content}")
                        yield chunk_content
                # NOTE: newly generated streams are deliberately not written to the LRU
                # cache here; the cache is updated manually through the API so that
                # what gets stored stays under control. To store them automatically,
                # collect each chunk in self.responses inside the loop and call
                # self.lru_cache.put(<user input>, self.responses) once it ends.

        connection_open = True
        try:
            async for text_chunk in text_chunker(output_iterator()):
                try:
                    logger.info("Sending text chunk...")
                    await eleven_labs_websocket.send(json.dumps({"text": text_chunk, "try_trigger_generation": True}))
                except ConnectionClosed as e:
                    logger.error("Eleven Labs WebSocket connection closed unexpectedly: {}".format(e))
                    connection_open = False
                    break  # Exit loop if connection is closed
                except Exception as e:
                    logger.error(f"Exception in sending text chunk: {e}")

            # Now that the text stream has ended, send the final WebSocket message to Eleven Labs.
            # Skipped when the connection is already known to be closed: sending would only
            # raise again and log a misleading second error.
            if connection_open:
                await eleven_labs_websocket.send(json.dumps({"text": ""}))
                logger.info("Final WebSocket message sent to Eleven Labs.")
        except Exception as e:
            logger.error(f"Exception in send_text_stream: {e}")

    async def post_audio_to_websocket(self, audio_data, websocket_server : WebSocket):
        """
        Posts one chunk of audio to the agent's WebSocket server.

        The chunk is base64-encoded and sent as a 'media' event for `self.stream_sid`.
        For the first chunk sent after `reset_after_interaction`, the response latency is logged.

        Args:
            audio_data (bytes): Raw audio bytes received from Eleven Labs.
            websocket_server: Our FastAPI server's websocket instance which is responsible for handling twilio's media streams
        Returns:
            None: This method streams audio data and does not return anything.
        """
        logger.info("Sending audio data now...")
        await websocket_server.send_json({
            "event": "media",
            "streamSid": self.stream_sid,
            "media": {"payload": base64.b64encode(audio_data).decode('utf-8')}
        })
        logger.info("Audio data sent...")

        if not self.is_first_audio_chunk_sent:
            self.response_latency_end = time.time()
            # The start timestamp is written from outside this class; skip the log
            # rather than crash the audio task when it was never recorded.
            latency_start = getattr(self, "response_latency_start", None)
            if latency_start is not None:
                logger.info(f"Agent Utterance Response Latency ~ {self.response_latency_end - latency_start} seconds")
            self.is_first_audio_chunk_sent = True

    async def get_streaming_generator(self):
        """
        Utilises `self.streaming_generator_future` which is an `asyncio.Future` instance. The method
        is responsible for handling the retrieval of the streaming generator from the instance.
        The future instance is used to minimise the time taken to wait for the completion of the streaming generator in order to reduce
        response latency.

        The resolved generator is stored on `self.streaming_generator`.

        Arguments:
            None: This method doesn't take in any arguments
        Returns:
            None: This method doesn't return anything
        """
        future = getattr(self, "streaming_generator_future", None)
        if not future:
            logger.warning("get_streaming_generator: No streaming generator future has been set.")
            return

        logger.info("Verified streaming generator future coroutine...")
        if future.done():
            # Get the result directly without awaiting
            self.streaming_generator = future.result()
        else:
            # If the future is not done, await it
            logger.info("Awaiting streaming generator...")
            await_streaming_gen_start = time.time()
            self.streaming_generator = await future
            await_streaming_gen_end = time.time()
            logger.info("Streaming generator awaited successfully...")
            logger.info(f"Streaming generator awaited for: {await_streaming_gen_end - await_streaming_gen_start} seconds")

        # Record the end time on both paths *before* logging, so the reported
        # duration never uses a missing or stale end timestamp.
        self.streaming_gen_retrieval_end_time = time.time()
        retrieval_start = getattr(self, "streaming_gen_retrieval_start_time", None)
        if retrieval_start is not None:
            logger.info(f"Streaming generator created and retrieved in: {self.streaming_gen_retrieval_end_time-retrieval_start} seconds")

    ################################################ INPUT PROCESSING ##################################################################
    async def process_input(self, cached_response, eleven_labs_websocket : WebSocketClientProtocol, websocket_server : WebSocket, user_input : str):
        """
        Processes the text and audio streams concurrently: text is sent to Eleven Labs
        while the audio coming back is forwarded to the websocket server.

        Args:
            cached_response: list of words from a pre-generated GPT stream, retrieved from the agent's cache
            eleven_labs_websocket (WebSocket): The WebSocket connection to Eleven Labs API.
            websocket_server (WebSocket): The active WebSocket connection for audio streaming.
            user_input (str): The user input text to be processed.
        Returns:
            None: This method does not return anything but sends data over a websocket.
        """
        try:
            if not eleven_labs_websocket:
                logger.error(f"Error with eleven labs websocket. Type: {type(eleven_labs_websocket)}")
                raise WebSocketDisconnect

            async def listen_eleven_labs():
                """
                This is the primary handler of the text-to-speech response stream generated by Eleven Labs API.
                It calls `post_audio_to_websocket` as soon as a valid audio chunk has been received, then adds that to the list of
                `self.post_audio_tasks`. This technique replaces the standard buffer to enable taking full advantage of asynchronous programming
                by pushing data to our websocket server as soon as it's available.
                Once the final message arrives it waits for all pending posts, then for the mark message
                from Twilio, to signal successful playback of the voice generated.
                """
                try:
                    while True:
                        message = await eleven_labs_websocket.recv()
                        data = json.loads(message)
                        if data.get("audio"):
                            audio_chunk = base64.b64decode(data["audio"])
                            # b64decode never returns None; test for emptiness instead.
                            if not audio_chunk:
                                logger.warning("listen_eleven_labs: Recieved empty audio chunk")
                                continue
                            logger.info(f"listen_eleven_labs: Audio data : {audio_chunk[:10]} - Length: {len(audio_chunk)} bytes")
                            post_audio_future = asyncio.ensure_future(self.post_audio_to_websocket(audio_chunk, websocket_server))
                            self.post_audio_tasks.append(post_audio_future)
                        elif data.get('isFinal'):
                            await asyncio.gather(*self.post_audio_tasks)
                            await self.send_mark_message(websocket_server)
                            break
                except Exception as e:
                    logger.error(f"listen_eleven_labs: Exception-> {e}")

            if self.use_cache:
                logger.info(f"Retrieved from cache for '{user_input}': {cached_response}")
            else:
                await self.get_streaming_generator()

            listen_task = asyncio.create_task(listen_eleven_labs())
            text_stream_task = asyncio.create_task(self.send_text_stream(eleven_labs_websocket=eleven_labs_websocket, cached_response=cached_response))
            await asyncio.gather(text_stream_task, listen_task)

            # NOTE(review): listen_eleven_labs() has already sent a mark and awaited its
            # echo; this schedules a second one in the background. Kept because callers
            # may rely on `mark_event_future` - confirm whether it is still needed.
            self.mark_event_future = asyncio.ensure_future(self.send_mark_message(websocket_server))
        except Exception as e:
            logger.error(f"Exception in process_input: {e}")

    ################################################ UPDATING AGENT FOR CONVERSATION FLOW ###############################################
    def update_agent_response(self):
        """
        Updates the agent's response after processing user input.

        Prefixes the accumulated response with the agent's name, makes sure it carries
        the "<END_OF_TURN>" marker, appends it to the model's conversation history and
        clears `self.ai_response` for the next turn.

        Args:
            None: This method doesn't take in any arguments
        Returns:
            None: This method updates internal state and does not return anything.
        """
        logger.info(f"Most recent response: {self.ai_response}")
        logger.info("Updating Agent...")

        agent_name = self.gpt.get_attribute('agent_name')
        self.ai_response = f"{agent_name}: {self.ai_response.strip()}"
        if "<END_OF_TURN>" not in self.ai_response:
            self.ai_response += " <END_OF_TURN>"
        self.gpt.conversation_history.append(self.ai_response)

        logger.info("Agent Update Complete...")
        logger.info(self.ai_response.replace("<END_OF_TURN>", ""))
        logger.info(f"Conversation History: {self.gpt.conversation_history}")
        self.ai_response = ""

    ################################################ RESETING AGENT FOR NEXT CONVERSATION ###############################################
    async def reset_after_interaction(self):
        """Resets the agent's state after each interaction round"""
        self.audio_start_event.clear()  # Reset the event for the next interaction
        self.is_first_audio_chunk_sent = False  # Re-arm latency logging for the next reply
        self.post_audio_tasks.clear()
        self.use_cache = False
        # Reset other state variables or events as necessary