m2_to_json.py

import argparse
import os
import re
from collections import defaultdict
from typing import Dict, Iterator, List, Tuple, Union

from autocorrect import Speller
from cdifflib import CSequenceMatcher as SequenceMatcher
from tqdm.auto import tqdm

from src.utils import write_json, clean_text
TITLE = """
######################
# Process M2 to JSON #
######################
"""
ELLIPSIS_PATTERN = r"(\.\s){2,}" # Two or more sequence of ". "
PARENTHESIS_PATTERN = r"\((.*?)\)(\s)*" # From "This is ok ( I guess ) ." match "( I guess ) "
Edit = Tuple[int, int, str]
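
# For reference, an M2 entry looks like this (the sentence is illustrative; the
# field layout follows the standard CoNLL/ERRANT M2 convention that the parser
# below assumes):
#
#   S This are a sentence .
#   A 1 2|||R:VERB:SVA|||is|||REQUIRED|||-NONE-|||0
#
# "S " lines carry the tokenized source sentence; each "A " line carries one
# edit with "|||"-separated fields: token span, error type, correction, a
# required flag, a comment, and the annotator id. A blank line ends an entry.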


def m2_parser(data_path: str) -> Iterator[Tuple[str, Dict[str, List[Edit]]]]:
    """
    Yield each sentence and its per-annotator edits from the M2 file
    """
    orig_sent = None
    annotator_edits = defaultdict(list)
    with open(data_path, "r") as fp:
        for line in fp:
            line = line.strip()
            if line.startswith("S "):
                orig_sent = line[2:]
            elif line.startswith("A "):
                assert orig_sent is not None
                start_end, edit_type, edit_span, _, _, annotator_id = line[2:].split("|||")
                if edit_type != "noop":
                    start, end = start_end.split()
                    edit = (int(start), int(end), edit_span)
                    annotator_edits[annotator_id].append(edit)
            else:
                # A blank line marks the end of an entry
                if orig_sent is not None:
                    yield orig_sent, annotator_edits
                orig_sent = None
                annotator_edits = defaultdict(list)
    # Emit the final entry if the file does not end with a blank line
    if orig_sent is not None:
        yield orig_sent, annotator_edits


def filter_duplicate(annot_edits: Dict[str, List[Edit]]) -> List[List[Edit]]:
    """
    Remove duplicate annotator edit sequences, keeping first occurrences
    """
    all_edits = []
    for edits in annot_edits.values():
        if edits not in all_edits:
            all_edits.append(edits)
    return all_edits


def apply(tokens: List[str], edits: List[Edit]) -> List[str]:
    """
    Apply the annotator edits to the source tokens to generate new tokens
    """
    tokens = tokens.copy()
    # Apply edits right-to-left so earlier spans keep their original indices
    for edit in reversed(edits):
        i, j, span = edit
        if span == "":
            tokens[i:j] = []  # Deletion
        else:
            tokens[i:j] = [span]  # Replacement, or insertion when i == j
    return tokens
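
# Illustrative example (tokens and edit are made up): applying the edit
# (1, 2, "is") to ["This", "are", "a", "sentence", "."] replaces the token
# span [1, 2) and yields ["This", "is", "a", "sentence", "."].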


def gen_references(text: str, all_edits: List[List[Edit]]) -> List[str]:
    """
    Generate reference sentences from the given text and edits
    """
    if all_edits:
        tokens = text.split()
        references = []
        for edits in all_edits:
            ref_tokens = apply(tokens, edits)
            ref_text = clean_text(" ".join(ref_tokens), is_ref=True)
            ref_text = remove_parenthetical_text(ref_text)
            references.append(ref_text)
    else:
        references = [text]
    return references


def similar_ratio(text_a: str, text_b: str) -> float:
    """
    Calculate token-based similarity between two texts
    """
    tokens_a = text_a.split()
    tokens_b = text_b.split()
    return SequenceMatcher(None, tokens_a, tokens_b).ratio()
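
# Note: SequenceMatcher.ratio() returns 2 * M / T, where M is the number of
# matched elements and T the total number of elements in both sequences, so
# 1.0 means identical token lists and 0.0 means no tokens in common.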


def check_proper_sent(text: str) -> bool:
    """
    Check that the sentence starts and ends properly
    """
    tokens = text.split()
    if not tokens:
        return False
    # First token is capitalized and the last token ends with sentence-final punctuation
    return tokens[0].istitle() and tokens[-1][-1] in '.!?"'


def check_ellipsis(text: str) -> bool:
    """
    Check whether the sentence contains an ellipsis
    """
    return bool(re.search(ELLIPSIS_PATTERN, text))


def correct_spelling(checker: Speller, text: str) -> str:
    """
    Correct typos in the sentence
    """
    return checker(text)


def remove_parenthetical_text(text: str) -> str:
    """
    Remove parenthetical text, e.g. "I am fine (or not)." becomes "I am fine."
    """
    return re.sub(PARENTHESIS_PATTERN, "", text)
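
# Illustrative example: the pattern is non-greedy and also consumes trailing
# whitespace, so "This is ok ( I guess ) now ." becomes "This is ok now .".
# Nested parentheses are not handled.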


def process_sent(
    text: str,
    annot_edits: Dict[str, List[Edit]],
    checker: Speller,
    min_len: int,
    max_len: int,
    min_sim: float,
    only_proper_sent: bool,
    spell_check: bool = False,
    remove_ellipsis: bool = True,
) -> Union[str, Dict[str, Union[str, List[str]]]]:
    """
    Process a given sentence; return a dict on success or the rejection reason as a string
    """
    # Filter sentences containing an ellipsis
    if remove_ellipsis and check_ellipsis(text):
        return "Ellipsis"
    text = clean_text(text, is_ref=False)
    text = remove_parenthetical_text(text)
    # Filter sentences based on the number of tokens
    num_tokens = len(text.split())
    if num_tokens < min_len:
        return "Less Tokens"
    elif num_tokens > max_len:
        return "More Tokens"
    if spell_check:
        text = correct_spelling(checker, text)
    all_edits = filter_duplicate(annot_edits)
    references = gen_references(text, all_edits)
    # Filter sentences where any reference is not a proper sentence
    if only_proper_sent and any(not check_proper_sent(ref_sent) for ref_sent in references):
        return "Improper Sentence"
    if all_edits:
        # Filter sentences based on the mean similarity between the original and reference sentences
        mean_sim = sum(similar_ratio(text, ref_sent) for ref_sent in references) / len(references)
        if mean_sim < min_sim:
            return "Source-Reference Similarity"
    return {"text": text, "references": references}


def main(
    m2_path: str,
    json_path: str,
    min_len: int = 5,
    max_len: int = 50,
    min_sim: float = 0.8,
    only_proper_sent: bool = True,
    spell_check: bool = True,
    remove_ellipsis: bool = True,
):
    print(TITLE)
    assert json_path.lower().endswith(".json"), f"Not a JSON file; got '{json_path}'"
    json_data = []
    stats = defaultdict(int)
    checker = Speller(lang="en", fast=False, threshold=0)
    for orig_sent, annot_edits in tqdm(m2_parser(m2_path), desc="Processing"):
        result = process_sent(
            orig_sent, annot_edits, checker, min_len, max_len, min_sim, only_proper_sent, spell_check, remove_ellipsis
        )
        if isinstance(result, dict):
            json_data.append(result)
        else:
            stats[result] += 1
    print(f"Number of sentences: {len(json_data)}")
    print("Report of filtered sentences:")
    for key, value in stats.items():
        print(f"{key:>30}: {value}")
    json_dir = os.path.dirname(json_path)
    filename = os.path.basename(json_path)[:-5]  # Strip the ".json" extension
    if json_dir:
        os.makedirs(json_dir, exist_ok=True)
    params = {
        "min_len": min_len,
        "max_len": max_len,
        "min_sim": min_sim,
        "spell_check": spell_check,
        "remove_ellipsis": remove_ellipsis,
        "only_proper_sent": only_proper_sent,
    }
    write_json(os.path.join(json_dir, f"{filename}_params.json"), params)
    write_json(os.path.join(json_dir, f"{filename}_metadata.json"), stats)
    write_json(json_path, json_data)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--m2_path", help="Path to the input M2 file", required=True)
    parser.add_argument("--json_path", help="Path to the output JSON file", required=True)
    parser.add_argument("--min_len", type=int, help="Min number of tokens in the original sentence", default=5)
    parser.add_argument("--max_len", type=int, help="Max number of tokens in the original sentence", default=50)
    parser.add_argument("--min_sim", type=float, help="Min avg similarity between original and references", default=0.8)
    # BooleanOptionalAction (Python 3.9+) adds matching "--no-*" flags; the original
    # "store_true" flags combined with set_defaults(...=True) could never be disabled
    parser.add_argument("--only_proper_sent", help="Allow only proper reference sentences", action=argparse.BooleanOptionalAction, default=True)
    parser.add_argument("--spell_check", help="Check spelling errors in original and references", action=argparse.BooleanOptionalAction, default=True)
    parser.add_argument("--remove_ellipsis", help="Remove (source) sentences with an ellipsis", action=argparse.BooleanOptionalAction, default=True)
    # Convert parsed arguments into keyword arguments
    kwargs = vars(parser.parse_args())
    main(**kwargs)
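
# Example invocation (illustrative paths; run from the repository root so that
# "src.utils" is importable):
#   python m2_to_json.py --m2_path data/corpus.m2 --json_path output/corpus.json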