-
Notifications
You must be signed in to change notification settings - Fork 0
/
mail2phrases.py
44 lines (37 loc) · 1.06 KB
/
mail2phrases.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/env python
"""
mail2phrases.py: extract sequences of n words from mail messages
usage: mail2phrases.py file1.xml [file2.xml ...]
20190627 erikt(at)xs4all.nl
"""
import sys
import xml.etree.ElementTree as ET
from collections import deque
# tag name of the child element that holds the text of a message
BODY = "Body"
# NOTE(review): not referenced in the visible code -- confirm before removing
CLIENT = "CLIENT"
# tag name of the elements that are searched for below the document root
MESSAGE = "Message"
# number of words in each printed phrase
N = 20
# tag name of the child element that must have text for a message to be used
SENDER = "Sender"
def text2phrases(text, n=None):
    """
    text2phrases: print every sequence of n consecutive words in text

    The text is split on whitespace and a window of n words is moved over
    it one word at a time. Every full window is printed on its own line
    with the words joined by single spaces. A text with fewer than n words
    prints nothing.

    text: string to process; None or an empty string prints nothing
    n: number of words per phrase; defaults to the module constant N
    raises ValueError when n is smaller than 1
    """
    if n is None:
        n = N
    if n < 1:
        raise ValueError("n must be at least 1")
    if not text:
        return
    # a bounded deque discards its oldest word on append once it is full,
    # so the window never needs an explicit pop from the front
    wordBuffer = deque(maxlen=n)
    for w in text.split():
        wordBuffer.append(w)
        if len(wordBuffer) == n:
            print(" ".join(wordBuffer))
def printPhrases(root):
    """
    printPhrases: print the word sequences of every usable message below root

    Every MESSAGE element found anywhere below root is examined. A message
    is used when its first SENDER child has text and its first BODY child
    has text; the body text is then passed to text2phrases. All other
    messages are skipped.

    Missing elements and missing text are tested for explicitly instead of
    being caught as exceptions, so unrelated errors (for example an output
    error while printing) reach the caller.

    root: element whose descendants are searched with findall
    """
    for message in root.findall(".//"+MESSAGE):
        sender = message.find("./"+SENDER)
        # test against None explicitly: an element without children is falsy
        if sender is None or sender.text is None:
            continue
        body = message.find("./"+BODY)
        if body is None or body.text is None:
            continue
        text2phrases(body.text)
def main(argv):
    """
    main: print the phrases of every XML file named in argv

    argv: argument list in sys.argv layout; argv[0] (the program name) is
          skipped and every remaining item is parsed as an XML file
    returns 0, suitable as the process exit status
    """
    # read the file names from the argument that was passed in rather than
    # from the global sys.argv, so the function can be called with any list
    for fileName in argv[1:]:
        # an explicit encoding overrides the one declared in the XML file
        parser = ET.XMLParser(encoding="utf-8")
        root = ET.parse(fileName, parser=parser).getroot()
        printPhrases(root)
    return 0
# run main only when the file is executed as a script, not when it is imported
if __name__ == "__main__":
    # the value returned by main becomes the exit status of the process
    raise SystemExit(main(sys.argv))