-
Notifications
You must be signed in to change notification settings - Fork 0
/
ErrorClassifier.py
102 lines (87 loc) · 2.76 KB
/
ErrorClassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# coding=UTF-8
import Levenshtein
import Utils
# Module-wide configuration read by ErrorClassifier. It is None here, so it
# must be rebound to a mapping before any ErrorClassifier is constructed
# (the constructor classifies immediately and subscripts this object).
# Keys read in this file:
#   "errCorpConfig" - object whose ``reList`` mapping holds the
#                     "classifierpunctuation" and "punctSpace" patterns,
#                     each used through ``.sub('', text)``
#   "typoTreshold"  - exclusive upper bound on the Levenshtein distance
#                     for the "spelling" class
#   "wordTreshold"  - inclusive bound on word-bag differences for the
#                     "lexicosemantic" class (doubled for "style")
context = None
class ErrorClassifier(object):
    """Label one correction (erroneous span -> corrected span) with an error type.

    The instance is classified during construction. ``errorType`` ends up
    as one of "punct", "typographical", "spelling", "lexicosemantic",
    "style" or "unclassified".

    All thresholds and regular expressions come from the module-level
    ``context`` mapping, which must be set before an instance is created.
    """

    # Class-level default; replaced on the instance by classify().
    errorType = "other"

    def __init__(self, newSent, oldSent, diff, comment):
        """Store the sentence pair, cut out both spans and classify the edit.

        newSent -- corrected sentence
        oldSent -- original sentence
        diff    -- indexable of four offsets: diff[0]:diff[1] delimits the
                   corrected span in newSent, diff[2]:diff[3] the erroneous
                   span in oldSent
        comment -- stored as-is; not used by this class
        """
        self.oldSent = oldSent
        self.newSent = newSent
        self.diff = diff
        self.err = oldSent[diff[2]:diff[3]]
        self.corr = newSent[diff[0]:diff[1]]
        self.comment = comment
        self.classify()

    def classify(self):
        """Determine the error type, store it in ``errorType`` and return it.

        The checks run from cheapest/most specific to most general; the
        first one that matches wins.
        """
        if self.__isPunct():
            self.errorType = "punct"
        elif self.__isTypographical_1():
            self.errorType = "typographical"
        else:
            # The word bags are only needed (and only created) once the
            # purely character-level checks have failed.
            self.errBag = Utils.bagOfWords(self.err)
            self.corrBag = Utils.bagOfWords(self.corr)
            if self.__isSpelling():
                self.errorType = "spelling"
            elif self.__isTypographical_2():
                self.errorType = "typographical"
            elif self.__isLexicoSemantic():
                self.errorType = "lexicosemantic"
            elif self.__isStylistic():
                self.errorType = "style"
            else:
                self.errorType = "unclassified"
        return self.errorType

    def __strippedSpans(self, patternName):
        """Return (err, corr) with every match of the named pattern removed.

        patternName is a key of ``context["errCorpConfig"].reList``.
        """
        pattern = context["errCorpConfig"].reList[patternName]
        return pattern.sub('', self.err), pattern.sub('', self.corr)

    def __isPunct(self):
        """True when the spans are equal once "classifierpunctuation" matches are removed."""
        strippedErr, strippedCorr = self.__strippedSpans("classifierpunctuation")
        return strippedErr == strippedCorr

    def __isTypographical_1(self):
        """True when the spans are equal once "punctSpace" matches are removed."""
        strippedErr, strippedCorr = self.__strippedSpans("punctSpace")
        return strippedErr == strippedCorr

    def __isTypographical_2(self):
        """Like __isTypographical_1, but additionally ignoring letter case."""
        strippedErr, strippedCorr = self.__strippedSpans("punctSpace")
        return strippedErr.lower() == strippedCorr.lower()

    def __isSpelling(self):
        """True for case-only changes or small edits with an unchanged word count.

        "Small" means a Levenshtein distance strictly below
        ``context["typoTreshold"]`` while both word bags have the same size.
        """
        if self.err.lower() == self.corr.lower():
            return True
        return (len(self.errBag) == len(self.corrBag)
                and Levenshtein.distance(self.err, self.corr) < context["typoTreshold"])

    def __isLexicoSemantic(self):
        """True when at most ``context["wordTreshold"]`` words were exchanged.

        The difference is taken from the larger bag towards the smaller
        one (the corrected bag when both have the same size).
        """
        if len(self.corrBag) >= len(self.errBag):
            changedWords = self.corrBag - self.errBag
        else:
            changedWords = self.errBag - self.corrBag
        return len(changedWords) <= context["wordTreshold"]

    def __isStylistic(self):
        """True when the correction only reuses or lightly changes sentence words.

        Either every word of the corrected span already occurs in the old
        sentence, or the two whole sentences differ in at most
        ``2 * context["wordTreshold"]`` words (symmetric difference).
        """
        oldBag = Utils.bagOfWords(self.oldSent)
        newBag = Utils.bagOfWords(self.newSent)
        if len(self.corrBag - oldBag) == 0:
            return True
        return len(oldBag ^ newBag) <= 2 * context["wordTreshold"]

    def getStart(self):
        """Return diff[0], the start offset of the corrected span in newSent."""
        return self.diff[0]

    def getEnd(self):
        """Return diff[1], the end offset of the corrected span in newSent."""
        return self.diff[1]

    def getStartString(self):
        """Return the markup that precedes the corrected span.

        It consists of the complete <err> element followed by the opening
        <corr> tag. The error text and type are inserted verbatim; no XML
        escaping is performed.
        """
        # Plain concatenation is kept on purpose so that the result type
        # follows the operand types exactly as before.
        typeAttr = "type=\"" + self.errorType + "\">"
        return "<err " + typeAttr + self.err + "</err><corr " + typeAttr

    def getEndString(self):
        """Return the closing tag that follows the corrected span."""
        return "</corr>"

    def getErrorType(self):
        """Return the error type assigned by classify()."""
        return self.errorType