-
Notifications
You must be signed in to change notification settings - Fork 59
/
tts.py
67 lines (53 loc) · 2.58 KB
/
tts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import nltk
import torch
import warnings
import numpy as np
from transformers import AutoProcessor, BarkModel
warnings.filterwarnings(
"ignore",
message="torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.",
)
class TextToSpeechService:
def __init__(self, device: str = "cuda" if torch.cuda.is_available() else "cpu"):
"""
Initializes the TextToSpeechService class.
Args:
device (str, optional): The device to be used for the model, either "cuda" if a GPU is available or "cpu".
Defaults to "cuda" if available, otherwise "cpu".
"""
self.device = device
self.processor = AutoProcessor.from_pretrained("suno/bark-small")
self.model = BarkModel.from_pretrained("suno/bark-small")
self.model.to(self.device)
def synthesize(self, text: str, voice_preset: str = "v2/en_speaker_1"):
"""
Synthesizes audio from the given text using the specified voice preset.
Args:
text (str): The input text to be synthesized.
voice_preset (str, optional): The voice preset to be used for the synthesis. Defaults to "v2/en_speaker_1".
Returns:
tuple: A tuple containing the sample rate and the generated audio array.
"""
inputs = self.processor(text, voice_preset=voice_preset, return_tensors="pt")
inputs = {k: v.to(self.device) for k, v in inputs.items()}
with torch.no_grad():
audio_array = self.model.generate(**inputs, pad_token_id=10000)
audio_array = audio_array.cpu().numpy().squeeze()
sample_rate = self.model.generation_config.sample_rate
return sample_rate, audio_array
def long_form_synthesize(self, text: str, voice_preset: str = "v2/en_speaker_1"):
"""
Synthesizes audio from the given long-form text using the specified voice preset.
Args:
text (str): The input text to be synthesized.
voice_preset (str, optional): The voice preset to be used for the synthesis. Defaults to "v2/en_speaker_1".
Returns:
tuple: A tuple containing the sample rate and the generated audio array.
"""
pieces = []
sentences = nltk.sent_tokenize(text)
silence = np.zeros(int(0.25 * self.model.generation_config.sample_rate))
for sent in sentences:
sample_rate, audio_array = self.synthesize(sent, voice_preset)
pieces += [audio_array, silence.copy()]
return self.model.generation_config.sample_rate, np.concatenate(pieces)