-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
134 lines (111 loc) · 6.13 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import re
import docx
import win32com.client as win32
from pypandoc import convert_file
from utils import CFL, create_quiz, extract_format_text, split_options, is_option, is_question, process_formats
def format_file(file_path: str, del_list: list, selected_options: list) -> tuple:
    """
    Convert a Word document to a plain DOCX and collect its formatted answers.

    A ``.doc`` input is first saved as ``.docx`` through Word (COM automation).
    The DOCX is then flattened to plain text with pandoc and rebuilt as a new
    DOCX holding one paragraph per text line. The answers are read from the
    formatted DOCX (the original, or the Word-converted copy) by passing each
    paragraph to ``extract_format_text``.

    Args:
        file_path (str): The path to the document file (``.doc`` or ``.docx``,
            extension compared case-insensitively).
        del_list (list): A list collecting the paths of temporary files. It is
            extended in place and also returned.
        selected_options (list): A list of selected options for formatting.

    Returns:
        tuple: ``(plain_docx_path, highlights, del_list)``, or
        ``(None, None, None)`` when the extension is not supported.
    """
    def convert_to_docx(file_path_convert: str, name: str) -> str:
        """Flatten the document to text and rebuild it as a DOCX; return its absolute path."""
        temp_name = f'wteTemp{name}'
        txt_path = f'{temp_name}.txt'
        # Let pandoc write the plain-text rendering (no hard wrapping) to a scratch file.
        convert_file(file_path_convert, 'plain', extra_args=['--wrap=none'], outputfile=txt_path)
        with open(txt_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
        os.remove(txt_path)
        # One paragraph per text line (blank lines included), so paragraph
        # positions in the rebuilt document mirror the pandoc output.
        document = docx.Document()
        for line in lines:
            document.add_paragraph(line)
        document.save(f'{temp_name}.docx')
        return os.path.abspath(f'{temp_name}.docx')

    def extract_original_format(docx_path: str) -> list:
        """Collect the text returned by ``extract_format_text`` for every paragraph."""
        highlights = []
        document = docx.Document(docx_path)
        for paragraph in document.paragraphs:
            highlighted_text = extract_format_text(paragraph, selected_options)
            if highlighted_text is None:
                continue
            starts_with_label = re.match(r'^[a-dA-D]', highlighted_text)
            if "A,B,C,D" in selected_options and starts_with_label:
                # Labels are kept as they appear in the document.
                highlights.append(highlighted_text)
            else:
                # Drop a leading option label such as "A. " or "b) ". The
                # pattern is anchored so only a real label prefix is removed;
                # text without a label passes through unchanged.
                answer = re.sub(r'^[a-dA-D][.)]\s+', '', highlighted_text)
                highlights.append(CFL(answer))
        return highlights

    # Split the file path into name and extension
    name, ext = os.path.splitext(os.path.basename(file_path))
    ext = ext.lower()
    abs_file_path = os.path.abspath(file_path)

    if ext == ".doc":
        # Convert .doc to .docx through Word.
        temp_name = f"wteDocTemp{name}"
        temp_path = os.path.abspath(f"{temp_name}.docx")
        word = win32.gencache.EnsureDispatch('Word.Application')
        try:
            doc = word.Documents.Open(abs_file_path)
            try:
                doc.Activate()
                # Word resolves relative names against its own current folder,
                # so the absolute target path is passed explicitly.
                word.ActiveDocument.SaveAs(temp_path, FileFormat=win32.constants.wdFormatXMLDocument)
            finally:
                doc.Close(False)
        finally:
            # Always shut Word down, even when opening or saving failed.
            word.Quit()
        # Both the rebuilt plain DOCX and the Word-converted DOCX are temporary.
        plain_docx_path = convert_to_docx(temp_path, name)
        del_list.append(plain_docx_path)
        del_list.append(temp_path)
        highlights = extract_original_format(temp_path)
        return plain_docx_path, highlights, del_list

    if ext == ".docx":
        plain_docx_path = convert_to_docx(abs_file_path, name)
        del_list.append(plain_docx_path)
        highlights = extract_original_format(file_path)
        return plain_docx_path, highlights, del_list

    return None, None, None
# Function to process questions and options
def question_create(doc, current_question: str, current_options: list, highlights: list, data: list, platform: str, selected_options: list, question_numbers: int) -> int:
    """
    Process a document to create quiz questions and options based on specific formatting.
    Paragraphs are scanned in order: a paragraph recognised by ``is_question`` starts a new
    question, paragraphs recognised by ``is_option`` are split with ``split_options`` and
    collected, and other non-empty text is appended to the current question on a new line.
    Each completed question is passed through ``process_formats`` and handed to ``create_quiz``.
    Parameters:
    - doc: The document object to process; only its ``paragraphs`` (objects with ``.text``) are read.
    - current_question: The current question being processed.
    - current_options: The list of current options being processed.
    - highlights: The list of highlights, forwarded to ``create_quiz``.
    - data: The list of data, forwarded to ``create_quiz``.
    - platform: The platform for which the quiz is being created.
    - selected_options: The list of selected options being processed.
    - question_numbers: The current question number.
    Returns:
    - int: The updated question number, incremented once per quiz entry created.
    """
    # Read the paragraphs once; python-docx builds a new list on every
    # `doc.paragraphs` access, and the loop looks backwards repeatedly.
    paragraphs = list(doc.paragraphs)

    def two_back(index: int):
        """Return the stripped text two paragraphs earlier, or None when there is none."""
        # Without this guard a negative index would wrap around to the end of
        # the document (or raise IndexError on a one-paragraph document).
        if index < 2:
            return None
        return paragraphs[index - 2].text.strip()

    def emit(question: str, options: list, number: int) -> list:
        """Format one finished question, add it to the quiz and return its processed options."""
        question, options = process_formats(question, options, selected_options, number)
        create_quiz(data, question, options, highlights, platform, selected_options)
        return options

    for index, paragraph in enumerate(paragraphs):
        text = paragraph.text.strip()
        # The look-back handles multiple line questions: index - 2 is used
        # because of how the current question is updated.
        if is_question(text):
            earlier = two_back(index)
            starts_new_question = earlier is None or not is_question(earlier)
        else:
            starts_new_question = False

        if starts_new_question:
            if current_question and current_options:
                # NOTE(review): questions emitted here are formatted with the
                # number *before* the increment, while the final question below
                # is formatted *after* it — confirm the intended numbering base.
                current_options = emit(current_question, current_options, question_numbers)
                question_numbers += 1
                current_options.clear()  # Clear the options list for the new questions
            current_question = text
        elif current_question:
            if is_option(text):
                current_options.extend(split_options(text))
            elif text:
                earlier = two_back(index)
                if earlier is None or not is_option(earlier):
                    current_question += '\n' + text

    # Process the last question; the counter only moves when an entry is
    # actually created, so a document without questions leaves it untouched.
    if current_question and current_options:
        question_numbers += 1
        emit(current_question, current_options, question_numbers)
    return question_numbers