-
Notifications
You must be signed in to change notification settings - Fork 1
/
MyCorpusReader.py
127 lines (109 loc) · 4.09 KB
/
MyCorpusReader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# Yunzhe Liu
# Austin Lau
# CSI 4V96
import nltk
import os
from nltk.corpus import PlaintextCorpusReader
class MyCorpusReader:
# Root of path
root = ""
# stemmer
stemmer = None
# list of stop words
stop_words = None
# Constructor
# dname - Name of directory of corpus
# sfile - File with stopwords, default English stopwords in NLTK
# stemmer - porter, snowball, etc., default none
def __init__(self, dname, sfile=None, stemmer=None):
if not os.path.isdir(os.path.abspath(dname)):
raise FileExistsError('invalid directory!')
if sfile == "":
self.stop_words = ()
elif sfile is None:
self.stop_words = set(nltk.corpus.stopwords.words('english'))
else:
if not os.path.exists(sfile):
raise FileExistsError('invalid file!')
else:
reader = PlaintextCorpusReader(str(os.getcwd()), sfile)
self.stop_words = set(reader.words([sfile, ]))
self.root = os.path.abspath(dname)
if isinstance(stemmer, nltk.stem.porter.PorterStemmer) or \
isinstance(stemmer, nltk.stem.snowball.SnowballStemmer) or \
stemmer is None:
self.stemmer = stemmer
else:
raise Exception('invalid stemmer')
# the raw content of the specified files
def raw(self, fileidList=None):
content_list = ""
if fileidList is None:
fileidList = os.listdir(self.root)
else:
if isinstance(fileidList, str):
fileidList = [fileidList, ]
for file in fileidList:
with open(os.path.join(self.root, file)) as opened_file:
content_list = content_list + opened_file.read()
return content_list
# the words of the specified fileids
def words(self, fileidList=None):
if fileidList is None:
fileidList = os.listdir(self.root)
else:
if isinstance(fileidList, str):
fileidList = [fileidList, ]
return_words = []
for file in fileidList:
reader = PlaintextCorpusReader(self.root, file)
return_words.extend(reader.words())
return return_words
# the sentences of the specified fileids
def sents(self, fileidList=None):
if fileidList is None:
fileidList = os.listdir(self.root)
else:
if isinstance(fileidList, str):
fileidList = [fileidList, ]
return_sents = []
for file in fileidList:
reader = PlaintextCorpusReader(self.root, file)
return_sents.extend(reader.sents())
return return_sents
# the location of the given file on disk
def abspath(self, fileid):
path = os.path.join(self.root, fileid)
if not os.path.exists(path):
raise FileExistsError('invalid file!')
return path
# count the word after stemming of each file
def count(self):
stemmed_words = []
if self.stemmer is None:
stemmed_words = self.words(os.listdir(self.root))
else:
for word in self.words(os.listdir(self.root)):
stemmed_words.append(self.stemmer.stem(word))
word_freq_dict = {}
for word in stemmed_words:
word_freq_dict[word] = word_freq_dict.get(word, 0) + 1
return word_freq_dict.items()
# count the frequency of a given file before stemming in each file
def countWord(self, string):
string_freq = {}
for doc in os.listdir(self.root):
for word in self.words(doc):
if word == string:
string_freq[doc] = string_freq.get(doc, 0) + 1
return string_freq
# count the total words in each file, remove stop words
def totalWords(self):
word_count = {}
for doc in os.listdir(self.root):
for word in self.words(doc):
if word not in self.stop_words:
word_count[doc] = word_count.get(doc, 0) + 1
return word_count
def fileids(self):
return os.listdir(self.root)