-
Notifications
You must be signed in to change notification settings - Fork 4
/
PiperSpeaker.cs
129 lines (112 loc) · 4.22 KB
/
PiperSpeaker.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using UnityEngine;
namespace Abuksigun.Piper
{
/// <summary>
/// Feeds speech synthesized by a PiperVoice into a streaming Unity AudioClip.
/// Text is synthesized on a background task (Task.Run); the resulting 16-bit PCM chunks are
/// converted to float samples, stored in a list of buffers, and copied out by the clip's
/// PCM reader callback (<see cref="PCMRead"/>).
/// </summary>
/// <remarks>
/// All mutable state is guarded by a single private lock, so the public methods, the
/// synthesis task and the PCM reader callback may run on different threads.
/// Call <see cref="Dispose"/> (from the Unity main thread) when the speaker is no longer needed.
/// </remarks>
public class PiperSpeaker : IDisposable
{
    // Length, in samples, requested for the streaming clip in AudioClip.Create.
    const int ClipLengthSamples = 1024 * 24;

    // Guards every mutable field below. Monitor locks are re-entrant, which lets
    // Speak -> OverrideSpeech -> ContinueSpeach run as one atomic step.
    readonly object gate = new object();
    readonly AudioClip audioClip;
    readonly PiperVoice voice;
    // Synthesized audio, in arrival order; playback position is pcmBufferPointer samples
    // from the start of the first buffer.
    readonly List<float[]> pcmBuffers = new List<float[]>();
    int pcmBufferPointer = 0;
    // Incremented every time the buffers are reset, so that chunks still being produced
    // for text from before the reset can be recognised and dropped.
    int bufferGeneration = 0;
    // Text waiting for the running synthesis task; only the most recent request is kept.
    string queuedText = null;
    // True from the moment a synthesis task is started until that task has decided,
    // under the lock, that there is nothing left to do.
    bool workerActive = false;
    Task speachTask = null;

    /// <summary>The streaming clip that plays the synthesized speech.</summary>
    public AudioClip AudioClip => audioClip;

    /// <summary>
    /// Creates a speaker for <paramref name="voice"/>. The clip's channel count and sample rate
    /// are taken from the voice's synthesis config.
    /// </summary>
    public unsafe PiperSpeaker(PiperVoice voice)
    {
        this.voice = voice;
        PiperLib.SynthesisConfig synthesisConfig = PiperLib.getSynthesisConfig(voice.Voice);
        audioClip = AudioClip.Create("MyPCMClip", ClipLengthSamples, synthesisConfig.channels, synthesisConfig.sampleRate, true, PCMRead);
    }

    ~PiperSpeaker()
    {
        // NOTE(review): finalizers run on the GC finalizer thread, while Unity documents its
        // scripting API as main-thread only, so the engine may reject this call. It is kept
        // as a best-effort fallback for callers that never call Dispose(), and guarded so a
        // failure cannot escape the finalizer.
        try
        {
            if (audioClip)
                UnityEngine.Object.Destroy(audioClip);
        }
        catch (Exception)
        {
            // Intentionally ignored: nothing useful can be done from a finalizer.
        }
    }

    /// <summary>
    /// Destroys the underlying AudioClip. Intended to be called from the Unity main thread.
    /// A synthesis task that is still running is not cancelled.
    /// </summary>
    public void Dispose()
    {
        if (audioClip)
            UnityEngine.Object.Destroy(audioClip);
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Use when you want to interrupt the current speech and say a new line:
    /// rewinds playback to the start, discards buffered audio and speaks <paramref name="text"/>.
    /// </summary>
    /// <returns>The synthesis task that will process <paramref name="text"/>.</returns>
    public Task Speak(string text)
    {
        lock (gate)
        {
            // Rewind and reset in one step so the reader never sees the old buffers at position 0.
            pcmBufferPointer = 0;
            return OverrideSpeech(text);
        }
    }

    /// <summary>
    /// Use when text is being generated incrementally, so the audio stream can be replaced
    /// while it is playing. For example, an LLM produces "I'm going to" and playback starts;
    /// the stream is then overridden with "I'm going to school". The playback position is kept,
    /// which minimizes latency between generation and playback.
    /// </summary>
    /// <returns>The synthesis task that will process <paramref name="text"/>.</returns>
    public Task OverrideSpeech(string text)
    {
        lock (gate)
        {
            // Bumping the generation makes a synthesis pass that is still running for the
            // previous text drop its remaining chunks instead of appending them after the reset.
            bufferGeneration++;
            pcmBuffers.Clear();
            return ContinueSpeach(text);
        }
    }

    /// <summary>
    /// Use when you want to add more text to the current speech. If no synthesis task is
    /// running, one is started; otherwise <paramref name="text"/> is queued for the running task.
    /// Only the most recently queued text is kept.
    /// </summary>
    /// <returns>The synthesis task that will process <paramref name="text"/>.</returns>
    public unsafe Task ContinueSpeach(string text)
    {
        lock (gate)
        {
            if (workerActive)
            {
                // The running task is guaranteed to look at queuedText again before it
                // clears workerActive, so this request cannot be lost.
                queuedText = text;
            }
            else
            {
                workerActive = true;
                int startGeneration = bufferGeneration;
                speachTask = Task.Run(() => SynthesisLoop(text, startGeneration));
            }
            return speachTask;
        }
    }

    // Body of the synthesis task: synthesizes `text`, then keeps going with whatever text
    // was queued in the meantime, until nothing is queued.
    unsafe void SynthesisLoop(string text, int textGeneration)
    {
        try
        {
            do
            {
                int utteranceGeneration = textGeneration;
                voice.TextToAudioStream(text, (short* data, int length) => AppendIfCurrent(data, length, utteranceGeneration));
                lock (gate)
                {
                    // Take the queued text and decide whether to stop in one atomic step.
                    text = queuedText;
                    queuedText = null;
                    textGeneration = bufferGeneration;
                    if (text == null)
                        workerActive = false;
                }
            }
            while (text != null);
        }
        catch
        {
            // Synthesis failed: let the next request start a fresh task, and drop any queued
            // text so it is not replayed after that request. The exception faults the task.
            lock (gate)
            {
                queuedText = null;
                workerActive = false;
            }
            throw;
        }
    }

    // PCM reader callback passed to AudioClip.Create. Fills `data` with buffered samples
    // starting at the current playback position and pads the remainder with silence.
    // The playback position only advances by the number of real samples delivered.
    void PCMRead(float[] data)
    {
        int written = 0;
        lock (gate)
        {
            // Locate the buffer (and the offset inside it) that holds the playback position.
            int bufferIndex = 0;
            int bufferOffset = pcmBufferPointer;
            while (bufferIndex < pcmBuffers.Count && bufferOffset >= pcmBuffers[bufferIndex].Length)
            {
                bufferOffset -= pcmBuffers[bufferIndex].Length;
                bufferIndex++;
            }

            // Copy from consecutive buffers until `data` is full or the audio runs out.
            while (written < data.Length && bufferIndex < pcmBuffers.Count)
            {
                float[] currentBuffer = pcmBuffers[bufferIndex];
                int copyLength = Math.Min(currentBuffer.Length - bufferOffset, data.Length - written);
                Array.Copy(currentBuffer, bufferOffset, data, written, copyLength);
                written += copyLength;
                bufferOffset += copyLength;
                if (bufferOffset >= currentBuffer.Length)
                {
                    bufferIndex++;
                    bufferOffset = 0;
                }
            }

            pcmBufferPointer += written;
        }

        if (written < data.Length)
            Array.Fill(data, 0f, written, data.Length - written);
    }

    /// <summary>
    /// Converts <paramref name="length"/> signed 16-bit samples at <paramref name="pcmData"/>
    /// to floats (divided by 32768) and appends them to the playback buffers.
    /// A null pointer or a non-positive length is ignored.
    /// </summary>
    public unsafe void AddPCMData(short* pcmData, int length)
    {
        float[] floatData = ToFloatSamples(pcmData, length);
        if (floatData == null)
            return;
        lock (gate)
            pcmBuffers.Add(floatData);
    }

    // Same as AddPCMData, but drops the chunk if the buffers were reset after the text
    // that produced it was started.
    unsafe void AppendIfCurrent(short* pcmData, int length, int dataGeneration)
    {
        float[] floatData = ToFloatSamples(pcmData, length);
        if (floatData == null)
            return;
        lock (gate)
        {
            if (dataGeneration == bufferGeneration)
                pcmBuffers.Add(floatData);
        }
    }

    // Copies 16-bit PCM into a new float array scaled by 1/32768; returns null when there
    // is nothing to copy.
    static unsafe float[] ToFloatSamples(short* pcmData, int length)
    {
        if (pcmData == null || length <= 0)
            return null;
        float[] floatData = new float[length];
        for (int i = 0; i < length; i++)
            floatData[i] = pcmData[i] / 32768.0f;
        return floatData;
    }
}
}