-
Notifications
You must be signed in to change notification settings - Fork 0
/
Exporter.py
52 lines (47 loc) · 1.68 KB
/
Exporter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# coding=UTF-8
import io
import unitok
import time
# Shared module-level configuration. It starts as None and must be replaced
# with a mapping before tokenize() or exportToStream() is called, because both
# subscript it. Keys read in this module: "unitokConfig" (object exposing
# ``re_list``), "outputStreamFull", "outputStreamOrphans" and "outputFormat"
# ("se" or "txt"). Presumably assigned by the importing code - TODO confirm.
context = None
def tokenize(text):
    """Tokenize *text* with unitok and return the printed token stream.

    The regular-expression list is taken from
    ``context["unitokConfig"].re_list``, so the module-level ``context``
    must be populated before this function is called.

    Args:
        text: The string to tokenize.

    Returns:
        Everything ``unitok.print_tokens`` wrote to an in-memory buffer,
        as one string.
    """
    re_list = context["unitokConfig"].re_list
    tokens = unitok.tokenize_recursively(text, re_list)
    # The context manager closes the buffer even when printing raises.
    with io.StringIO() as out:
        # The two trailing flags are passed through unchanged; their meaning
        # is defined by unitok.print_tokens.
        unitok.print_tokens(tokens, out, True, False)
        return out.getvalue()
def _wrapSentences(items):
    """Return every item that is not the empty string wrapped in <s>...</s>, concatenated."""
    return "".join("<s>%s</s>" % item for item in items if item != "")


def exportToStream(page):
    """Write one page to the full-text stream and to the orphans stream.

    The target streams and the output format are read from the module-level
    ``context`` (keys ``"outputStreamFull"``, ``"outputStreamOrphans"`` and
    ``"outputFormat"``).

    * ``"se"``: each stream receives one tokenized ``<doc>`` element whose
      non-empty lines are wrapped in ``<s>...</s>``. The full document also
      carries the latest revision's timestamp in its ``t`` attribute.
    * ``"txt"``: each stream receives a ``Page: <name>`` header, one entry
      per line, and a blank separator line.
    * Any other format writes nothing and flushes nothing.

    Args:
        page: Mapping with the keys ``"name"``, ``"revisions"`` and
            ``"errors"``. Only the last revision is used; it must provide
            ``"*"`` (iterated item by item - presumably a list of text
            lines, TODO confirm) and, for the "se" format, ``"timestamp"``.

    Raises:
        IndexError: If ``page["revisions"]`` is empty.
        KeyError: If a required key is missing from ``context`` or ``page``.
    """
    outputStreamFull = context["outputStreamFull"]
    outputStreamOrphans = context["outputStreamOrphans"]
    outputFormat = context["outputFormat"]

    if outputFormat not in ("se", "txt"):
        # Unknown formats are a silent no-op, so callers that rely on that
        # keep working.
        return

    pageName = page["name"]
    latestRevision = page["revisions"][-1]

    if outputFormat == "se":
        fullDoc = "<doc n=\"%s\" t=\"%s\">%s</doc>" % (
            pageName,
            latestRevision["timestamp"],
            _wrapSentences(latestRevision["*"]),
        )
        outputStreamFull.write(tokenize(fullDoc))

        # No <errors> element is opened, so the document is closed with
        # </doc> only; this keeps the markup balanced like the full document.
        orphansDoc = "<doc n=\"%s\">%s</doc>" % (
            pageName,
            _wrapSentences(page["errors"]),
        )
        outputStreamOrphans.write(tokenize(orphansDoc))
    else:
        header = "Page: %s\n" % (pageName,)
        outputStreamFull.write(header)
        outputStreamOrphans.write(header)
        for line in latestRevision["*"]:
            outputStreamFull.write("%s\n" % line)
        for error in page["errors"]:
            outputStreamOrphans.write("%s\n" % error)
        # A blank line separates consecutive pages in the plain-text output.
        outputStreamFull.write("\n")
        outputStreamOrphans.write("\n")

    # Flush after every page so the output on disk keeps up with processing.
    outputStreamFull.flush()
    outputStreamOrphans.flush()