-
Notifications
You must be signed in to change notification settings - Fork 1
/
i_using_templates_newsgroup_generate_data.py
173 lines (146 loc) · 7.36 KB
/
i_using_templates_newsgroup_generate_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# -*- coding: utf-8 -*-
################### Arguments ################################################
# https://support.microsoft.com/en-us/help/827745/how-to-change-the-export-resolution-of-a-powerpoint-slide
# 72 as Decimal
###############################################################################
import win32com.client, sys
import os
import argparse
import i_utilities_ifpeb
import random
def main():
    """Drive the per-language data generation over every PowerPoint deck.

    Reads the target language from the command line, resolves the working
    folders through ``i_utilities_ifpeb.init_folder_hierarchy``, loads the
    replacement text lines from ``newsgroup.txt`` in the language folder and
    then, for every ``ppt``/``pptx`` file in the deck folder, opens it in
    PowerPoint and hands each slide to ``process_this_slide``.

    Transcriptions are written to ``transcription_<batch>.txt`` files in the
    language folder; a new file is selected whenever the number of processed
    decks is a multiple of ``i_utilities_ifpeb.BATCH`` (and on the first
    iteration).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("language", help="lang_ja,lang_ko,lang_es")
    args = parser.parse_args()
    curr_lang = args.language

    data_folder = os.path.join(os.getcwd(), 'data')
    lang_folder, images_folder, image_pool_folder, ppt_folder = (
        i_utilities_ifpeb.init_folder_hierarchy(data_folder, curr_lang))
    folder_for_ppt = ppt_folder
    batch_counter = -1

    # Decks already recorded for this language (presumably from an earlier,
    # interrupted run -- verify against populate_links_have).
    con_set = i_utilities_ifpeb.populate_links_have(lang_folder)
    # Forces a transcription file to be opened on the first deck even when
    # len(con_set) is not a multiple of BATCH (e.g. after a restart).
    is_first_call = True

    # Load the target-language newsgroup lines; the context manager makes
    # sure the handle is released once the list is in memory.
    with open(os.path.join(lang_folder, "newsgroup.txt"), 'r', encoding='utf-8') as nw_file:
        list_sample = nw_file.readlines()

    application = None
    transcription = None
    try:
        for each_ppt in os.listdir(folder_for_ppt):
            if not each_ppt.endswith(('ppt', 'pptx')):
                continue

            # Launch (or re-attach to) Microsoft PowerPoint for every deck.
            application = win32com.client.Dispatch("PowerPoint.Application")
            application.Visible = True

            print('PPTS processed = ', len(con_set))
            # Batching: switch transcription files to save intermediate results.
            if len(con_set) % i_utilities_ifpeb.BATCH == 0 or is_first_call:
                is_first_call = False
                if transcription:
                    transcription.close()
                batch_counter = len(con_set) // i_utilities_ifpeb.BATCH
                filename = os.path.join(lang_folder, 'transcription_' + str(batch_counter) + '.txt')
                # NOTE(review): the file is opened with the platform default
                # encoding; deck names with non-ASCII characters may fail to
                # encode on some systems -- confirm what downstream readers expect.
                try:
                    transcription = open(filename, 'a')
                except IOError:
                    transcription = open(filename, 'w')
                print("file to be used = ", filename)

            # Open the deck; a corrupt file is reported and skipped.
            try:
                presentation_object = application.Presentations.Open(os.path.join(folder_for_ppt, each_ppt))
            except Exception as e:
                print(each_ppt, 'could not open ', e)
                continue

            print("working for = ", each_ppt)
            con_set.add(each_ppt)
            # Open a section in the transcription for the current deck.
            trans = ["SlideName - " + each_ppt]
            transcription.write(trans[0] + '\n')

            try:
                for sl_index, each_slide_object in enumerate(presentation_object.Slides):
                    process_this_slide(sl_index, each_slide_object, con_set, trans, transcription, presentation_object,
                                       batch_counter, image_pool_folder, each_ppt, images_folder, list_sample)
                try:
                    presentation_object.Close()
                except Exception as e:
                    print('problem with closing the file', e)
            except Exception as e:
                # Best effort: one failing deck must not abort the whole run,
                # but the reason is reported instead of being silently dropped.
                print(each_ppt, 'failed while processing slides ', e)
    finally:
        # Either object may still be None when the folder held no decks.
        try:
            if application is not None:
                application.Quit()
        finally:
            if transcription is not None:
                transcription.close()
def save_results_for(elems, trans):
    """Append one transcription record per non-blank text element.

    Args:
        elems: iterable of objects exposing ``Text``, ``BoundLeft``,
            ``BoundTop``, ``BoundWidth`` and ``BoundHeight``.
        trans: list that receives the records; it is mutated in place.

    Returns:
        True when at least one record was appended, False otherwise.

    Each record has the form ``"<left> <top> <width> <height> <text>"`` where
    the four bounds are truncated to integers and ``<text>`` is whatever
    ``i_utilities_ifpeb.charwise_hex_string`` returns for the element text.
    """
    was_anything_found = False
    for elem in elems:
        # Read the property once; on a COM object every access is a call.
        text = elem.Text
        # Skip elements with nothing to transcribe: empty text or text made
        # only of whitespace (covers "\r", "\n", " " and similar).
        if not text or text.isspace():
            continue
        # Makes the slide eligible to be recorded as a sample.
        was_anything_found = True
        text_in_unicode = i_utilities_ifpeb.charwise_hex_string(text)
        bounds = (elem.BoundLeft, elem.BoundTop, elem.BoundWidth, elem.BoundHeight)
        trans.append(' '.join(str(int(value)) for value in bounds) + ' ' + text_in_unicode)
    return was_anything_found
def process_this_slide(sl_index, each_slide_object, con_set, trans, transcription, presentation_object, BATCH_COUNTER, image_pool_folder, each_ppt, images_folder, list_sample):
    """Replace the text of one slide, transcribe it and export it as a JPG.

    Args:
        sl_index: zero-based index of the slide within the presentation.
        each_slide_object: the PowerPoint slide being processed.
        con_set: collection of processed decks; only its size is printed.
        trans: ignored -- a fresh per-slide record list is built here.
        transcription: open, writable text file receiving the slide records.
        presentation_object: owning presentation; used for the slide count.
        BATCH_COUNTER: batch number embedded in the exported image name.
        image_pool_folder: forwarded to
            ``i_utilities_ifpeb.process_these_shapes``.
        each_ppt: deck file name, used as prefix of the image name.
        images_folder: folder the slide image is exported into.
        list_sample: pool of text lines used by ``replace_text``.

    The slide is exported and its records are written only when at least one
    text shape produced a transcription record.
    """
    import time

    print('============================ slide no ================================ ppt - ',len(con_set),'slide = ',str(sl_index+1),'/', len(presentation_object.Slides))
    # Flatten the grouped shapes first so every shape is reachable directly.
    print('BEFORE number of shapes in the current slide = ', len(each_slide_object.Shapes))
    in_group_limit_satisfied = i_utilities_ifpeb.ungroup_all_shapes(each_slide_object)
    print('REVISED number of shapes in the current slide = ', len(each_slide_object.Shapes))
    if not in_group_limit_satisfied:
        print("SKIPPING this slide.")
        return

    # Initialisations for the slide processing.
    trans = ["Slide " + str(sl_index)]
    was_anything_found = False
    to_be_processed_shapes = []
    st_time = time.time()
    print('Starting to loop through ungrouped shapes on the shapes')

    # Extract (and replace) the text of every text-bearing shape; all other
    # shapes are collected for process_these_shapes.
    for i in range(len(each_slide_object.Shapes)):
        try:
            each_shape = each_slide_object.Shapes[i]
            if each_shape.HasTextFrame and each_shape.TextFrame.HasText and not each_shape.HasSmartArt:
                elems = each_shape.TextFrame.TextRange.Lines()
                replace_text(each_shape, list_sample)
                # Accumulate instead of overwriting: a later shape without
                # usable text must not cancel records found in earlier shapes.
                if save_results_for(elems, trans):
                    was_anything_found = True
            else:
                to_be_processed_shapes.append(each_shape)
        except Exception as e:
            # A single misbehaving shape must not abort the slide.
            print('exception line 138', e)
            continue

    # The loop never breaks, so this section always runs after it.
    if was_anything_found:
        name = each_ppt + "_" + str(sl_index) + "_" + str(BATCH_COUNTER) + '.jpg'
        try:
            i_utilities_ifpeb.process_these_shapes(to_be_processed_shapes, each_slide_object, image_pool_folder)
        except Exception as e:
            print('exception during delete shape', e)
        print(' SAVING ======= ', name)
        try:
            # The records are written only when the image export succeeded,
            # so every transcription entry has a matching image.
            each_slide_object.export(os.path.join(images_folder, name), 'JPG')
            transcription.write('\n'.join(trans) + '\n')
        except Exception as e:
            print('error during export', e)
    print('Done looping through the shapes.', time.time() - st_time)
def replace_text(each_shape, list_sample):
    """Overwrite a shape's text with random sample text of the same length.

    Args:
        each_shape: object exposing ``TextFrame.TextRange.Text`` (read and
            written here).
        list_sample: sequence of text lines to draw from; trailing whitespace
            of each drawn line is stripped and a single space is appended.

    Raises:
        IndexError: when the shape has text to replace but ``list_sample``
            is empty.

    The new text is the concatenation of randomly chosen lines, cut to the
    length of the original text and followed by a newline.
    """
    # Resolve the attribute chain once; on COM objects each hop is a call.
    text_range = each_shape.TextFrame.TextRange
    target_len = len(text_range.Text)
    if target_len and not list_sample:
        raise IndexError("list_sample is empty; no replacement text available")

    pieces = []
    filled = 0
    # Every piece is at least one character (the appended space), so the
    # loop always terminates.
    while filled < target_len:
        piece = random.choice(list_sample).rstrip() + " "
        pieces.append(piece)
        filled += len(piece)
    text_range.Text = "".join(pieces)[:target_len] + '\n'
# Run the generator only when the file is executed as a script.
if "__main__" == __name__:
    main()