-
Notifications
You must be signed in to change notification settings - Fork 0
/
acrodet.py
executable file
·106 lines (90 loc) · 3.45 KB
/
acrodet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python3
import csv
import re
import fire
import pdfplumber
from rich.progress import Progress, TextColumn, SpinnerColumn, BarColumn, MofNCompleteColumn, TimeElapsedColumn, \
TimeRemainingColumn
# Characters deleted from page text before it is split into words: ASCII
# punctuation plus curly quotes and an ideographic full stop. The value is
# spliced between '[' and ']' to form a regex character class, so the
# backslash in front of ']' acts as the escape that keeps ']' a literal member.
# There is no '-' in the set, so hyphens in the text are left in place.
PUNC = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~“”’‘,。:?'
class AcroDet:
    """Collect acronym candidates from the text of a PDF file.

    A candidate is a whitespace-delimited word (after punctuation removal)
    that contains at least three uppercase characters and no digits. For each
    distinct candidate the page index of its first occurrence and a short
    window of surrounding words are stored in ``self.acronyms``.
    """

    def __init__(self, filename, pages=None):
        """
        :param filename: Path of the PDF file, handed to ``pdfplumber.open``
        :param pages: Iterable of 0-based page indexes to parse, or ``None``
            to parse every page of the document
        """
        self.filename = filename
        self.pages = pages
        # Maps acronym -> (page index of first occurrence, context string).
        self.acronyms = {}

    def extract_one_page(self, page):
        """Return the acronym candidates found on a single page.

        :param page: Object exposing ``extract_text()`` (a pdfplumber page)
        :return: List of ``(word, context)`` tuples in reading order, where
            ``context`` is up to five words before the candidate, the
            candidate itself and up to four words after it, space-joined
        """
        # Treat a None result like an empty page so re.sub does not raise.
        text = page.extract_text() or ""
        words = re.sub('[' + PUNC + ']', '', text).split()
        matches = []
        for i, word in enumerate(words):
            if sum(c.isupper() for c in word) < 3:
                continue
            if any(c.isdigit() for c in word):
                continue
            # Clamp the start at 0: a negative start would be counted from the
            # end of the list and give an empty or unrelated context for the
            # first five words of the page.
            context = " ".join(words[max(0, i - 5):i + 5])
            matches.append((word, context))
        return matches

    def extract(self):
        """Parse the selected pages and fill ``self.acronyms``.

        Only the first occurrence of each candidate is recorded. A progress
        bar is rendered while pages are processed.

        :return: None
        """
        with pdfplumber.open(self.filename) as pdf:
            # Compare against None explicitly so that an empty selection
            # (e.g. range(3, 3)) is not mistaken for "all pages".
            if self.pages is None:
                pages = list(range(len(pdf.pages)))
            else:
                pages = list(self.pages)
            with Progress(
                    TextColumn(f"[bold green]Parsing {len(pages)} pages"),
                    SpinnerColumn(),
                    BarColumn(bar_width=20),
                    "[progress.percentage]{task.percentage:>3.1f}%",
                    "•",
                    MofNCompleteColumn(),
                    "•",
                    TimeElapsedColumn(),
                    "• ETA",
                    TimeRemainingColumn(),
                    SpinnerColumn()
            ) as progress:
                task = progress.add_task("parse", total=len(pages))
                for parsed, i in enumerate(pages, start=1):
                    for word, context in self.extract_one_page(pdf.pages[i]):
                        # setdefault keeps the earliest page/context per word.
                        self.acronyms.setdefault(word, (i, context))
                    progress.update(task, completed=parsed)

    def save(self, output_file):
        """Write the collected acronyms to a CSV file.

        The file gets a header row ``Acronym, Page, Context`` followed by one
        row per acronym, and is encoded as UTF-8 so that non-ASCII characters
        in the extracted text do not depend on the platform default encoding.

        :param output_file: Path of the CSV file to create or overwrite
        :return: None
        """
        print(f"Saving to file {output_file}...")
        with open(output_file, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Acronym", "Page", "Context"])
            writer.writerows(
                (word, page, context)
                for word, (page, context) in self.acronyms.items()
            )

    def print(self):
        """Print one line per acronym: word, page index and context.

        :return: None
        """
        for word, (page, context) in self.acronyms.items():
            print(f"{word:40s}{page:-5d} {context}")
def acrodet(file_name: str,
            start_page: int = None,
            end_page: int = None,
            output: str = "acronyms.csv",
            print_results: bool = False):
    """
    AcroDet - Acronym Detector

    Echoes the settings, scans the PDF for acronym candidates, writes them to
    a CSV file and optionally prints them. The page range is applied only when
    both start_page and end_page are given; otherwise every page is parsed.

    :param file_name: Full path to the PDF file to be parsed
    :param start_page: First page index of the subsection to parse
    :param end_page: Page index at which the subsection stops (not parsed itself)
    :param output: Output CSV file for the result
    :param print_results: Print result to screen if set
    :return: None
    """
    banner = (
        "AcroDet - Acronym Detector",
        f"file_name = {file_name}",
        f"pages = {start_page} - {end_page}",
        f"output = {output}",
        f"print_results = {print_results}",
    )
    for line in banner:
        print(line)

    # A page range is built only when both bounds were supplied.
    has_both_bounds = start_page is not None and end_page is not None
    pages = range(start_page, end_page) if has_both_bounds else None

    detector = AcroDet(file_name, pages)
    detector.extract()
    detector.save(output)
    if print_results:
        detector.print()
# Run as a script: hand acrodet to python-fire, which builds the command-line
# interface from the function's signature.
if __name__ == "__main__":
    fire.Fire(acrodet)