-
Notifications
You must be signed in to change notification settings - Fork 0
/
stts.py
104 lines (79 loc) · 3.38 KB
/
stts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""DJ GPT CLI
Module to deal with all the speech-to-text and text-to-speech functionality.
"""
import time
from functools import cache
from typing import Optional
from speech_recognition import Recognizer, Microphone, UnknownValueError, RequestError
# All gross stuff to get the Siri voice on macOS going... extremely brittle
from AppKit import NSSpeechSynthesizer
from utils import CONSOLE, retry
# Single module-level speech synthesizer, created at import time.
# say() starts speech on it and _wait_for_siri_to_shutup() polls it.
TTS = NSSpeechSynthesizer.alloc().init()
@cache
def get_sr():
    """Return the shared ``(recognizer, microphone)`` pair.

    ``functools.cache`` memoises the result, so the recognizer and the
    microphone are created only once and the ambient-noise adjustment only
    runs on the first call.
    """
    sr_engine = Recognizer()
    audio_device = Microphone()
    # Open the microphone once so the recognizer can adjust for ambient noise.
    with audio_device as stream:
        sr_engine.adjust_for_ambient_noise(stream)
    return sr_engine, audio_device
def _wait_for_siri_to_shutup(poll_interval: float = 1.0):
    """Block until the speech synthesizer has stopped speaking.

    She does love to drone on, we don't want Whisper listening to her.

    Args:
        poll_interval: Seconds to sleep between checks of ``TTS.isSpeaking()``.
            Defaults to 1.0, the value that used to be hard-coded; callers
            that want a snappier hand-over can pass something smaller.
    """
    while TTS.isSpeaking():  # loop until it finishes speaking
        time.sleep(poll_interval)
def say(text: str, wait: bool = False):
    """Print ``text`` to the console and speak it through the synthesizer.

    Args:
        text: What to print and speak.
        wait: When true, do not return until the synthesizer has finished
            speaking; otherwise return as soon as speech has been started.
    """
    # Let any earlier utterance finish first: starting a new one would cancel it.
    _wait_for_siri_to_shutup()

    CONSOLE.print(text)
    TTS.startSpeakingString_(text)

    if not wait:
        # Speech carries on asynchronously at the OS level.
        return
    # Caller asked for a blocking call, so hold on until speech is done.
    _wait_for_siri_to_shutup()
@retry
def listen() -> Optional[str]:
    """Record a phrase from the microphone and transcribe it with a local Whisper model.

    Uses the ``base.en`` Whisper model through ``speech_recognition``.
    This needs pyaudio installed to deal with the mic input.
    The first time you ever use this it will download the model to
    ~/.cache/whisper.

    Returns:
        The recognised text, or ``None`` when Whisper could not understand
        the audio or the recognition request failed.
    """
    # Wait for Siri's voice otherwise we listen to ourselves!
    _wait_for_siri_to_shutup()

    with CONSOLE.status("[bold green]Listening...") as status:
        recognizer, mic = get_sr()
        with mic as source:
            audio = recognizer.listen(source, phrase_time_limit=5)

        status.update("[bold yellow]Recognizing...")
        # Received audio data, now transcribe it with Whisper.
        try:
            speech_text = recognizer.recognize_whisper(audio, language="english", model="base.en")
        except UnknownValueError:
            CONSOLE.log("[bold red]Whisper could not understand audio")
            speech_text = None
        except RequestError:
            CONSOLE.log("[bold red]Could not request results from Whisper")
            speech_text = None

    CONSOLE.log(f"[bold red]Heard: {speech_text}")
    return speech_text
def listen_command(recognizer, audio):
    """Dead code. 2023-05-04

    Callback whilst waiting and not actively listening but background idle.
    Hangover from background listening, keeping for later.

    Args:
        recognizer: Object whose ``recognize_whisper`` method transcribes ``audio``.
        audio: Captured audio handed to ``recognizer.recognize_whisper``.

    Returns:
        None. Returns early when the audio cannot be transcribed.
    """
    # How this callback used to be registered:
    # recognizer, mic = get_sr()
    # stop_listening = recognizer.listen_in_background(mic, listen_command)
    CONSOLE.log("[bold yellow]Recognizing...")
    try:
        speech_text = recognizer.recognize_whisper(audio, language="english", model="base.en")
    except UnknownValueError:
        CONSOLE.log("[bold red]Whisper could not understand audio")
        return
    except RequestError:
        CONSOLE.log("[bold red]Could not request results from Whisper")
        return

    CONSOLE.log(f"[bold red]Heard: {speech_text}")
    if "stop" in speech_text.lower():
        # NOTE(review): S_DEVICE_ID and get_spotify are not defined or imported
        # in the visible part of this file; unless they are provided elsewhere,
        # reaching this branch raises NameError. Confirm before reviving this.
        if S_DEVICE_ID:
            get_spotify().pause_playback(S_DEVICE_ID)