-
Notifications
You must be signed in to change notification settings - Fork 0
/
align_manage.py
143 lines (104 loc) · 5.26 KB
/
align_manage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import pandas as pd
import numpy as np
import librosa
import librosa.display
import soundfile
import torch
import torch.utils.data as tud
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary
from torchaudio.transforms import MelSpectrogram, ComputeDeltas
from torch.optim.adamw import AdamW
import textgrid
import re
import json
import os
from os import listdir
from os.path import isfile, join
import math
import random
from pathlib import Path
from IPython.display import Audio
from tqdm import tqdm
tqdm.pandas()
import matplotlib.pyplot as plt
import pyaudio
import wave
from fastprogress import master_bar, progress_bar
from google.cloud import texttospeech
import warnings
warnings.simplefilter("ignore", UserWarning)
import sys
# Load the wake-word configuration and derive the label mapping used downstream.
with open('your_config.json', 'r') as archivo_json:
    config_datos = json.load(archivo_json)
wake_words = config_datos['wake_words']
# Each wake word is labelled with the string form of its list position.
wake_words_sequence = [str(position) for position in range(len(wake_words))]
wake_word_seq_map = dict(zip(wake_words, wake_words_sequence))
sr = 16000  # target sample rate (Hz) used for all audio loading below
path_to_dataset = 'dataset'
path_to_dataset_w = path_to_dataset + '/'  # dataset root with trailing slash
# ------------
# Load the six exported JSON splits.
# NOTE(review): these frames are re-read from CSV further down in this script,
# so this pass mostly serves as a sanity check that the JSON exports parse.
json_dir = path_to_dataset + '/json/'
positive_train_data = pd.read_json(json_dir + 'positive_train_data.json')
positive_dev_data = pd.read_json(json_dir + 'positive_dev_data.json')
positive_test_data = pd.read_json(json_dir + 'positive_test_data.json')
negative_train_data = pd.read_json(json_dir + 'negative_train_data.json')
negative_dev_data = pd.read_json(json_dir + 'negative_dev_data.json')
negative_test_data = pd.read_json(json_dir + 'negative_test_data.json')
wake_word_datapath = path_to_dataset
positive_data = "/positive/audio"
negative_data = "/negative/audio"
print(positive_train_data.head())
def get_timestamps(path):
    """Return per-word alignment timestamps for one audio file.

    Looks for the TextGrid whose base name matches the audio file under
    ``<dataset>/aligned_data/`` and collects every non-empty word mark on
    the first tier together with its interval bounds.

    Parameters
    ----------
    path : str
        Path to the audio file, e.g. ``dataset/positive/audio/foo.wav``.

    Returns
    -------
    dict
        ``{word: {'start': minTime, 'end': maxTime}}``; empty when no
        TextGrid exists for this file.  NOTE(review): a word occurring
        twice keeps only its last interval — confirm that is acceptable.
    """
    filename = path.split('/')[-1].split('.')[0]
    # BUG FIX: the original f-string contained the literal '(unknown)' and
    # never interpolated `filename`, so every call probed one constant,
    # nonexistent path and silently returned {} for all files.
    filepath = path_to_dataset_w + f'aligned_data/{filename}.TextGrid'
    words_timestamps = {}
    if os.path.exists(filepath):
        tg = textgrid.TextGrid.fromFile(filepath)
        # First tier holds the word intervals; empty marks are gaps/silence.
        for tg_intvl in range(len(tg[0])):
            word = tg[0][tg_intvl].mark
            if word:
                words_timestamps[word] = {'start': tg[0][tg_intvl].minTime,
                                          'end': tg[0][tg_intvl].maxTime}
    return words_timestamps
def get_duration(path):
    """Return the duration of the audio at *path* in milliseconds.

    The file is loaded mono and resampled to the module-level ``sr``
    before the sample count is converted to a duration.
    """
    samples, _ = librosa.core.load(path, sr=sr, mono=True)
    return samples.size / sr * 1000  # ms
# Re-load the positive splits from CSV and enrich each frame in place.
positive_train_data = pd.read_csv(path_to_dataset_w + 'positive/train.csv')
positive_dev_data = pd.read_csv(path_to_dataset_w + 'positive/dev.csv')
positive_test_data = pd.read_csv(path_to_dataset_w + 'positive/test.csv')
for split_frame in (positive_train_data, positive_dev_data, positive_test_data):
    # Rewrite bare file names into full .wav paths under positive/audio/.
    split_frame['path'] = split_frame['path'].apply(
        lambda name: path_to_dataset_w + 'positive/audio/' + name.split('.')[0] + '.wav')
    # Attach per-word alignment timestamps and clip duration (ms) per file.
    split_frame['timestamps'] = split_frame['path'].apply(get_timestamps)
    split_frame['duration'] = split_frame['path'].apply(get_duration)
print(positive_train_data['timestamps'][:5])
print(positive_train_data['duration'][:5])
# Re-load the negative splits from CSV and enrich each frame in place.
negative_train_data = pd.read_csv(path_to_dataset_w + 'negative/train.csv')
negative_dev_data = pd.read_csv(path_to_dataset_w + 'negative/dev.csv')
negative_test_data = pd.read_csv(path_to_dataset_w + 'negative/test.csv')
for split_frame in (negative_train_data, negative_dev_data, negative_test_data):
    # Rewrite bare file names into full .wav paths under negative/audio/.
    split_frame['path'] = split_frame['path'].apply(
        lambda name: path_to_dataset_w + 'negative/audio/' + name.split('.')[0] + '.wav')
    # Attach per-word alignment timestamps and clip duration (ms) per file.
    split_frame['timestamps'] = split_frame['path'].apply(get_timestamps)
    split_frame['duration'] = split_frame['path'].apply(get_duration)
print(negative_train_data['timestamps'][:5])
print(negative_train_data['duration'][:5])
# Persist the enriched splits back over the original CSVs (same write order).
splits_to_save = (
    (positive_train_data, "/positive/train.csv"),
    (positive_dev_data, "/positive/dev.csv"),
    (positive_test_data, "/positive/test.csv"),
    (negative_train_data, "/negative/train.csv"),
    (negative_dev_data, "/negative/dev.csv"),
    (negative_test_data, "/negative/test.csv"),
)
for frame, relative_path in splits_to_save:
    frame.to_csv(wake_word_datapath + relative_path, index=False)
print('finished')