-
Notifications
You must be signed in to change notification settings - Fork 0
/
srt2hypertranscript.js
167 lines (129 loc) · 4.37 KB
/
srt2hypertranscript.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
// By Chris Zubak-Skees
const fs = require("fs");
// Minimum gap, in seconds, between two consecutive words that starts a new
// paragraph. A value of 0 disables paragraph splitting.
const paraSplitTime = 2;
// When true, a paragraph break is only inserted if the word before the gap
// contains sentence punctuation (".", "?" or "!").
const paraPunct = false;

/**
 * Converts SubRip (SRT) subtitle text into hypertranscript HTML.
 *
 * Every word of every cue becomes `<span data-m="MS">word </span>` followed by
 * a newline, where MS is the estimated start time of the word in milliseconds.
 * A cue's duration is distributed over its words in proportion to their
 * character counts. A new `<p>` is started when the gap between two
 * consecutive words exceeds `paraSplitTime` seconds (subject to `paraPunct`).
 *
 * Malformed cues (missing or invalid timing line) are reported on stderr and
 * skipped; parsing continues with the next cue.
 *
 * @param {string} data - Full contents of an SRT file.
 * @returns {string} An `<article><section><p>…</p></section></article>` string.
 */
function parseSRT(data) {
  // Strips SSA/ASS style override tags such as {\an8}.
  // NOTE(review): this pattern has nested quantifiers and may backtrack
  // heavily on adversarial input; kept as-is to preserve matching behaviour.
  const ssaTagPattern = /\{(\\[\w]+\(?([\w\d]+,?)+\)?)+\}/gi;
  // Matches an *escaped* tag of a supported style (font, b, u, i, s) so it can
  // be turned back into real markup. Modified version of the regex from Phil
  // Haack's blog, later modified by kev:
  // http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx
  // http://kevin.deldycke.com/2007/03/ultimate-regular-expression-for-html-tag-parsing-with-php/
  const allowedTagPattern =
    /&lt;(\/?(font|b|u|i|s))((\s+(\w|\w[\w\-]*\w)(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)(\/?)&gt;/gi;
  const timingSeparator = /[\t ]*-->[\t ]*/;
  const sentencePunctuation = /[.?!]/;

  const isBlank = (line) => line.trim() === "";

  // Converts HH:MM:SS,MMM or HH:MM:SS.MMM (milliseconds optional) to seconds.
  // Returns 0 when the timestamp cannot be parsed.
  const toSeconds = (timestamp) => {
    const parts = String(timestamp).trim().split(":");
    if (parts.length < 3) {
      return 0;
    }
    const [hours, minutes, rest] = parts;
    // Accept both "," and "." as the decimal separator.
    const [wholeSeconds, millis = "0"] = rest.split(/[,.]/);
    const total =
      Number.parseFloat(hours) * 3600 +
      Number.parseFloat(minutes) * 60 +
      Number.parseFloat(wholeSeconds) +
      Number.parseFloat(millis) / 1000;
    return Number.isFinite(total) ? total : 0;
  };

  const source = data == null ? "" : String(data);
  const lines = source.split(/\r\n|\r|\n/);
  const lineCount = lines.length;

  let outputString = "<article><section><p>";
  let lastTime = 0; // start time (ms) of the previously emitted word
  let lastWord; // text of the previously emitted word; undefined before the first
  let i = 0;

  while (i < lineCount) {
    // Tolerate any number of blank lines between cues and at end of input.
    if (isBlank(lines[i])) {
      i++;
      continue;
    }

    // Skip the numeric cue id, unless the cue has none and starts directly
    // with its timing line.
    if (!lines[i].includes("-->")) {
      i++;
    }

    const timingLine = lines[i];
    const timing =
      timingLine === undefined ? [] : timingLine.trim().split(timingSeparator);
    if (timing.length < 2 || !timing[1]) {
      console.error(
        `Warning. Issue on line ${i + 1}: '${
          timingLine === undefined ? "" : timingLine
        }'.`
      );
      // Drop the remainder of the malformed cue and resume at the next one.
      while (i < lineCount && !isBlank(lines[i])) {
        i++;
      }
      continue;
    }
    i++;

    const start = toSeconds(timing[0]);
    // Anything after the first space/tab is positioning information.
    const end = toSeconds(timing[1].split(/[\t ]/)[0]);

    // Collect the (possibly multi-line) cue text up to the next blank line.
    const textLines = [];
    while (i < lineCount && !isBlank(lines[i])) {
      textLines.push(lines[i++]);
    }

    const text = textLines
      .join("\\N") // SSA-style line breaks
      .replace(ssaTagPattern, "")
      // Escape angle brackets so subtitle text cannot inject markup...
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      // ...then restore only the whitelisted styling tags.
      .replace(allowedTagPattern, "<$1$3$7>")
      .replace(/\\N/gi, " ");

    // NOTE(review): splitting on spaces also splits inside tags that carry
    // attributes (e.g. <font color="red">) and spreads a styled phrase over
    // several spans; this matches the previous behaviour.
    const words = text.split(" ").filter((word) => word !== "");
    if (words.length === 0) {
      continue; // nothing to emit, and avoids dividing by zero letters below
    }

    const duration = end - start;
    const totalLetters = words.reduce((sum, word) => sum + word.length, 0);
    const letterTime = duration / totalLetters;

    let offset = 0; // seconds elapsed since the start of the cue
    for (const word of words) {
      const stime = Math.round((offset + start) * 1000);
      offset += word.length * letterTime;

      // Never break before the very first word: that would leave an empty <p>.
      const gapExceeded =
        paraSplitTime > 0 && stime - lastTime > paraSplitTime * 1000;
      if (gapExceeded && lastWord !== undefined) {
        if (!paraPunct || sentencePunctuation.test(lastWord)) {
          outputString += "</p><p>";
        }
      }

      outputString += `<span data-m="${stime}">${word} </span>\n`;
      lastTime = stime;
      lastWord = word;
    }
  }

  return `${outputString}</p></section></article>`;
}
// Command-line entry point:
//   node srt2hypertranscript.js <input.srt> <output.html>
// Reads the SRT file, converts it and writes the resulting HTML.
const inputPath = process.argv[2];
const outputPath = process.argv[3];
if (!inputPath || !outputPath) {
  // Fail with a readable message instead of a TypeError from fs.
  console.error("Usage: node srt2hypertranscript.js <input.srt> <output.html>");
  process.exit(1);
}
const data = fs.readFileSync(inputPath, "utf8");
const result = parseSRT(data);
if (typeof result !== "string") {
  // Guard against the parser giving up without producing any output.
  console.error(`Error. Could not convert '${inputPath}'.`);
  process.exit(1);
}
fs.writeFileSync(outputPath, result, "utf8");