-
Notifications
You must be signed in to change notification settings - Fork 1
/
audio.go
253 lines (238 loc) · 7.47 KB
/
audio.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
package groq
import (
"bytes"
"context"
"fmt"
"io"
"net/http"
"os"
"github.com/conneroisu/groq-go/pkg/builders"
"github.com/conneroisu/groq-go/pkg/models"
)
const (
	// TranscriptionTimestampGranularityWord selects word-level timestamps;
	// its string value is "word".
	TranscriptionTimestampGranularityWord TranscriptionTimestampGranularity = "word"
	// TranscriptionTimestampGranularitySegment selects segment-level
	// timestamps; its string value is "segment".
	TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
)
type (
	// TranscriptionTimestampGranularity is the timestamp granularity for
	// the transcription.
	//
	// It is a string type; the values declared alongside it are "word" and
	// "segment".
	TranscriptionTimestampGranularity string
	// AudioRequest represents a request structure for audio API.
	//
	// It is used for both the transcription and the translation endpoints
	// and is serialized as a multipart form rather than JSON.
	AudioRequest struct {
		// Model is the model to use for the transcription. It is always
		// written to the "model" form field.
		Model models.AudioModel
		// FilePath is either an existing file in your filesystem or a
		// filename representing the contents of Reader. When Reader is
		// nil the file at FilePath is opened and uploaded.
		FilePath string
		// Reader is an optional io.Reader when you do not want to use
		// an existing file. When non-nil it takes precedence over
		// opening FilePath.
		Reader io.Reader
		// Prompt is the prompt for the transcription. It is only sent
		// when non-empty.
		Prompt string
		// Temperature is the temperature for the transcription. It is
		// only sent when non-zero, formatted with two decimal places.
		Temperature float32
		// Language is the language for the transcription. Only for
		// transcription. It is only sent when non-empty.
		Language string
		// Format is the format for the response. It is sent as the
		// "response_format" form field when non-empty. An empty value,
		// FormatJSON and FormatVerboseJSON are decoded as JSON; any
		// other value is handled as a text response.
		Format Format
	}
	// AudioResponse represents a response structure for audio API.
	AudioResponse struct {
		// Task is the task of the response.
		Task string `json:"task"`
		// Language is the language of the response.
		Language string `json:"language"`
		// Duration is the duration of the response.
		Duration float64 `json:"duration"`
		// Segments is the segments of the response.
		Segments Segments `json:"segments"`
		// Words is the words of the response.
		Words Words `json:"words"`
		// Text is the text of the response.
		Text string `json:"text"`
		// Header is the header of the response. It is assigned through
		// SetHeader.
		Header http.Header
	}
	// Words is the words of the audio response.
	Words []struct {
		// Word is the textual representation of a word in the audio
		// response.
		Word string `json:"word"`
		// Start is the start of the words in seconds.
		Start float64 `json:"start"`
		// End is the end of the words in seconds.
		End float64 `json:"end"`
	}
	// Segments is the segments of the response.
	Segments []struct {
		// ID is the ID of the segment.
		ID int `json:"id"`
		// Seek is the seek of the segment.
		Seek int `json:"seek"`
		// Start is the start of the segment.
		Start float64 `json:"start"`
		// End is the end of the segment.
		End float64 `json:"end"`
		// Text is the text of the segment.
		Text string `json:"text"`
		// Tokens is the tokens of the segment.
		Tokens []int `json:"tokens"`
		// Temperature is the temperature of the segment.
		Temperature float64 `json:"temperature"`
		// AvgLogprob is the average log probability of the segment.
		AvgLogprob float64 `json:"avg_logprob"`
		// CompressionRatio is the compression ratio of the segment.
		CompressionRatio float64 `json:"compression_ratio"`
		// NoSpeechProb is the no-speech probability of the segment.
		NoSpeechProb float64 `json:"no_speech_prob"`
		// Transient is the transient of the segment.
		Transient bool `json:"transient"`
	}
	// audioTextResponse is the response structure for the audio API when the
	// response format is text. It is converted to an AudioResponse with
	// toAudioResponse before being handed back to callers.
	audioTextResponse struct {
		// Text is the text of the response.
		Text string `json:"text"`
		// header is the response header. It is unexported, excluded from
		// JSON via the "-" tag, and assigned through SetHeader.
		header http.Header `json:"-"`
	}
)
// SetHeader stores header as the HTTP header of the audio response,
// replacing any header recorded previously.
func (r *AudioResponse) SetHeader(header http.Header) {
	r.Header = header
}
// SetHeader stores header as the HTTP header of the audio text response,
// replacing any header recorded previously.
func (r *audioTextResponse) SetHeader(header http.Header) {
	r.header = header
}
// toAudioResponse builds an AudioResponse carrying the text and header of
// the audio text response; all other fields keep their zero value.
func (r *audioTextResponse) toAudioResponse() AudioResponse {
	var converted AudioResponse
	converted.Text = r.Text
	converted.Header = r.header
	return converted
}
// CreateTranscription sends request to the transcriptions endpoint.
//
// The transcribed text is returned in the response_format specified in the
// request.
func (c *Client) CreateTranscription(
	ctx context.Context,
	request AudioRequest,
) (AudioResponse, error) {
	transcription, err := c.callAudioAPI(ctx, request, transcriptionsSuffix)
	return transcription, err
}
// CreateTranslation sends request to the translations endpoint.
//
// The translated text is returned in the response_format specified in the
// request.
func (c *Client) CreateTranslation(
	ctx context.Context,
	request AudioRequest,
) (AudioResponse, error) {
	translation, err := c.callAudioAPI(ctx, request, translationsSuffix)
	return translation, err
}
// callAudioAPI sends request as a multipart form to the audio endpoint
// identified by endpointSuffix and decodes the reply.
//
// Currently supports both the transcription and translation APIs.
//
// The form builder is kept local to the call rather than stored on the
// client: a shared builder would let concurrent calls on one Client replace
// each other's builder between writing the body and reading its content
// type, producing a Content-Type boundary that does not match the body.
//
// When request.hasJSONResponse reports true the reply is decoded directly
// into an AudioResponse; otherwise it is decoded as an audioTextResponse and
// converted. On any error a zero AudioResponse is returned with that error.
func (c *Client) callAudioAPI(
	ctx context.Context,
	request AudioRequest,
	endpointSuffix Endpoint,
) (response AudioResponse, err error) {
	var formBody bytes.Buffer
	form := builders.NewFormBuilder(&formBody)
	if err = AudioMultipartForm(request, form); err != nil {
		return AudioResponse{}, err
	}
	req, err := builders.NewRequest(
		ctx,
		c.header,
		http.MethodPost,
		c.fullURL(endpointSuffix, withModel(request.Model)),
		builders.WithBody(&formBody),
		builders.WithContentType(form.FormDataContentType()),
	)
	if err != nil {
		return AudioResponse{}, err
	}
	if request.hasJSONResponse() {
		if err = c.sendRequest(req, &response); err != nil {
			return AudioResponse{}, err
		}
		return response, nil
	}
	// Non-JSON formats are decoded through the text response type and only
	// converted once the request is known to have succeeded.
	var textResponse audioTextResponse
	if err = c.sendRequest(req, &textResponse); err != nil {
		return AudioResponse{}, err
	}
	return textResponse.toAudioResponse(), nil
}
// hasJSONResponse reports whether the response for r is expected as JSON:
// true when Format is unset, FormatJSON, or FormatVerboseJSON.
func (r AudioRequest) hasJSONResponse() bool {
	switch r.Format {
	case "", FormatJSON, FormatVerboseJSON:
		return true
	default:
		return false
	}
}
// AudioMultipartForm fills b with the audio file contents, the name of the
// model to use for audio processing, and any optional request settings, then
// closes the form.
//
// Fields are written in this order: file, model, prompt, response_format,
// temperature, language. The last four are skipped when the corresponding
// request value is empty or zero. The first failing write is returned,
// wrapped with the name of the field being written; the error from closing
// the form is returned as is.
func AudioMultipartForm(request AudioRequest, b builders.FormBuilder) error {
	if err := createFileField(request, b); err != nil {
		return err
	}
	if err := b.WriteField("model", string(request.Model)); err != nil {
		return fmt.Errorf("writing model name: %w", err)
	}

	// Optional fields, listed in the order they must appear in the form.
	optional := []struct {
		provided bool
		field    string
		value    string
		label    string
	}{
		{request.Prompt != "", "prompt", request.Prompt, "prompt"},
		{request.Format != "", "response_format", string(request.Format), "format"},
		{
			request.Temperature != 0,
			"temperature",
			fmt.Sprintf("%.2f", request.Temperature),
			"temperature",
		},
		{request.Language != "", "language", request.Language, "language"},
	}
	for _, opt := range optional {
		if !opt.provided {
			continue
		}
		if err := b.WriteField(opt.field, opt.value); err != nil {
			return fmt.Errorf("writing %s: %w", opt.label, err)
		}
	}
	return b.Close()
}
// createFileField adds the "file" part of the audio form to b.
//
// When request.Reader is set its contents are attached under the name
// request.FilePath; otherwise the file at request.FilePath is opened,
// attached, and closed again before returning.
func createFileField(
	request AudioRequest,
	b builders.FormBuilder,
) (err error) {
	if src := request.Reader; src != nil {
		err = b.CreateFormFileReader("file", src, request.FilePath)
		if err != nil {
			return fmt.Errorf("creating form using reader: %w", err)
		}
		return nil
	}

	audio, err := os.Open(request.FilePath)
	if err != nil {
		return fmt.Errorf("opening audio file: %w", err)
	}
	defer audio.Close()

	if err = b.CreateFormFile("file", audio); err != nil {
		return fmt.Errorf("creating form file: %w", err)
	}
	return nil
}