-
Notifications
You must be signed in to change notification settings - Fork 35
/
pdf2pdfocr.py
executable file
·1543 lines (1456 loc) · 76.9 KB
/
pdf2pdfocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
##############################################################################
# Copyright (c) 2023: Leonardo Cardoso
# https://github.com/LeoFCardoso/pdf2pdfocr
##############################################################################
# OCR a PDF and add a text "layer" to the original file (a so-called "PDF sandwich").
# Uses only open source tools.
# Unless requested, does not re-encode the images inside an unprotected PDF file.
# Leonardo Cardoso - inspired by ocrmypdf (https://github.com/jbarlow83/OCRmyPDF)
# and this post: https://github.com/jbarlow83/OCRmyPDF/issues/8
###############################################################################
import argparse
import configparser
import datetime
import errno
import glob
import io
import itertools
import math
import multiprocessing
import os
import random
import re
import shlex
import shutil
import signal
import string
import subprocess
import sys
import tempfile
import time
from collections import namedtuple
from concurrent import futures
from pathlib import Path
from xml.etree import ElementTree
import PyPDF2
import psutil
from PIL import Image, ImageChops
from PyPDF2.errors import PdfReadError
from PyPDF2.generic import ByteStringObject
from bs4 import BeautifulSoup
from packaging.version import parse as parse_version
from reportlab.lib.units import inch
from reportlab.pdfgen.canvas import Canvas
# Module metadata. VERSION is the string shown in the welcome message logged by Pdf2PdfOcr.ocr().
__author__ = 'Leonardo F. Cardoso'
VERSION = '1.12.3 marapurense '
def eprint(*args, **kwargs):
    """
    print() wrapper that always writes to stderr and flushes immediately.
    Any extra keyword argument (sep, end, ...) is forwarded to print().
    """
    stderr_options = {"file": sys.stderr, "flush": True}
    print(*args, **stderr_options, **kwargs)
def do_pdftoimage(param_path_pdftoppm, param_page_range, param_input_file, param_image_resolution, param_tmp_dir,
                  param_prefix, param_shell_mode):
    """
    Will be called from multiprocessing, so no global variables are allowed.
    Convert PDF to image file.

    :param param_path_pdftoppm: path of the pdftoppm executable
    :param param_page_range: None to convert every page, or a sequence whose first two items are
                             the first and last page passed to pdftoppm ('-f' / '-l')
    :param param_input_file: PDF file to convert
    :param param_image_resolution: value passed to pdftoppm '-r'
    :param param_tmp_dir: output directory; it is concatenated as-is, so it must end with a path separator
    :param param_prefix: prefix of the generated image files and of the error log
    :param param_shell_mode: value for the 'shell' argument of subprocess.Popen
    :return: exit code of the pdftoppm process
    """
    command_line_list = [param_path_pdftoppm]
    first_page = 0
    last_page = 0
    if param_page_range is not None:
        first_page = param_page_range[0]
        last_page = param_page_range[1]
        command_line_list += ['-f', str(first_page), '-l', str(last_page)]
    #
    command_line_list += ['-r', str(param_image_resolution), '-jpeg', param_input_file, param_tmp_dir + param_prefix]
    err_log_name = param_tmp_dir + "pdftoppm_err_{0}-{1}-{2}.log".format(param_prefix, first_page, last_page)
    # The log file is closed as soon as the child process ends (it used to be left open)
    with open(err_log_name, "wb") as err_log:
        pimage = subprocess.Popen(command_line_list, stdout=subprocess.DEVNULL, stderr=err_log, shell=param_shell_mode)
        pimage.wait()
    return pimage.returncode
def do_autorotate_info(param_image_file, param_shell_mode, param_temp_dir, param_tess_lang, param_path_tesseract, param_tesseract_version):
    """
    Will be called from multiprocessing, so no global variables are allowed.
    Do autorotate of images based on tesseract (execution with 'psm 0') information.

    Tesseract is asked to write its result using the base name "<param_temp_dir><image name without extension>".
    Its stdout/stderr go to "autorot_tess_out_<name>.log" / "autorot_tess_err_<name>.log" in param_temp_dir.

    :param param_image_file: image to analyze
    :param param_shell_mode: value for the 'shell' argument of subprocess.Popen
    :param param_temp_dir: working directory; it is concatenated as-is, so it must end with a path separator
    :param param_tess_lang: tesseract language(s); "osd+" is prepended
    :param param_path_tesseract: path of the tesseract executable
    :param param_tesseract_version: major version; version 3 uses '-psm', any other value uses '--psm'
    """
    param_image_no_ext = os.path.splitext(os.path.basename(param_image_file))[0]
    psm_parameter = "-psm" if (param_tesseract_version == 3) else "--psm"
    tess_command_line = [param_path_tesseract, '-l', "osd+" + param_tess_lang, psm_parameter, '0', param_image_file,
                         param_temp_dir + param_image_no_ext]
    out_log_name = param_temp_dir + "autorot_tess_out_{0}.log".format(param_image_no_ext)
    err_log_name = param_temp_dir + "autorot_tess_err_{0}.log".format(param_image_no_ext)
    # Both log files are closed when the child process ends (they used to be left open)
    with open(out_log_name, "wb") as out_log, open(err_log_name, "wb") as err_log:
        ptess1 = subprocess.Popen(tess_command_line, stdout=out_log, stderr=err_log, shell=param_shell_mode)
        ptess1.wait()
def do_deskew(param_image_file, param_threshold, param_shell_mode, param_path_mogrify):
    """
    Will be called from multiprocessing, so no global variables are allowed.
    Deskew the image by running mogrify with '-deskew <param_threshold>' on it.
    The exit status of mogrify is not inspected: the function always returns True.
    """
    mogrify_command = [param_path_mogrify, '-deskew', param_threshold, param_image_file]
    subprocess.Popen(mogrify_command, shell=param_shell_mode).wait()
    return True
def do_ocr_tesseract(param_image_file, param_extra_ocr_flag, param_tess_lang, param_tess_psm, param_temp_dir, param_shell_mode, param_path_tesseract,
                     param_text_generation_strategy, param_delete_temps, param_tess_can_textonly_pdf):
    """
    Will be called from multiprocessing, so no global variables are allowed.
    Do OCR of image with tesseract

    All outputs use the base name "<param_temp_dir><image name without extension>"
    (param_temp_dir is concatenated as-is, so it must end with a path separator).

    :param param_image_file: image to OCR
    :param param_extra_ocr_flag: extra command line flags for tesseract (whitespace separated string); ignored when not a str
    :param param_tess_lang: value for tesseract '-l'
    :param param_tess_psm: page segmentation mode, as a string
    :param param_temp_dir: working directory
    :param param_shell_mode: value for the 'shell' argument of subprocess.Popen
    :param param_path_tesseract: path of the tesseract executable
    :param param_text_generation_strategy: "tesseract" (tesseract creates the PDF) or "native" (PDF built from hOCR)
    :param param_delete_temps: when true, the intermediate ".tesspdf" file is removed
    :param param_tess_can_textonly_pdf: when true, tesseract is asked for a text-only PDF and no post-processing is done
    """
    param_image_no_ext = os.path.splitext(os.path.basename(param_image_file))[0]
    output_base = param_temp_dir + param_image_no_ext
    tess_command_line = [param_path_tesseract]
    if isinstance(param_extra_ocr_flag, str):
        # split() without separator ignores blank input and repeated spaces, so no empty argument reaches tesseract
        tess_command_line.extend(param_extra_ocr_flag.split())
    tess_command_line.extend(['-l', param_tess_lang])
    if param_text_generation_strategy == "tesseract":
        tess_command_line += ['-c', 'tessedit_create_pdf=1']
        if param_tess_can_textonly_pdf:
            tess_command_line += ['-c', 'textonly_pdf=1']
    #
    if param_text_generation_strategy == "native":
        tess_command_line += ['-c', 'tessedit_create_hocr=1']
    #
    tess_command_line += [
        '-c', 'tessedit_create_txt=1',
        '-c', 'tessedit_pageseg_mode=' + param_tess_psm,
        param_image_file, output_base]
    # The error log is closed when the child process ends (it used to be left open)
    with open(param_temp_dir + "tess_err_{0}.log".format(param_image_no_ext), "wb") as err_log:
        pocr = subprocess.Popen(tess_command_line, stdout=subprocess.DEVNULL, stderr=err_log, shell=param_shell_mode)
        pocr.wait()
    if param_text_generation_strategy == "tesseract" and (not param_tess_can_textonly_pdf):
        # Tesseract embedded the page image in its PDF: rewrite the PDF without the image, keeping only the text
        pdf_file = output_base + ".pdf"
        pdf_file_tmp = output_base + ".tesspdf"
        os.rename(pdf_file, pdf_file_tmp)
        output_pdf = PyPDF2.PdfWriter()
        # The reader needs the source file open until the writer has produced its output
        with open(pdf_file_tmp, 'rb') as desc_pdf_file_tmp:
            tess_pdf = PyPDF2.PdfReader(desc_pdf_file_tmp, strict=False)
            for imagepage in tess_pdf.pages:
                output_pdf.addPage(imagepage)
            #
            output_pdf.removeImages(ignoreByteStringObject=False)
            out_page = output_pdf.getPage(0)  # Tesseract PDF is always one page in this software
            # Hack to obtain smaller file (delete the image reference)
            out_page["/Resources"][PyPDF2.generic.createStringObject("/XObject")] = PyPDF2.generic.ArrayObject()
            out_page.compressContentStreams()
            with open(pdf_file, 'wb') as f:
                output_pdf.write(f)
        # Try to save some temp space as tesseract generate PDF with same size of image
        if param_delete_temps:
            os.remove(pdf_file_tmp)
    #
    if param_text_generation_strategy == "native":
        hocr = HocrTransform(output_base + ".hocr", 300)
        hocr.to_pdf(output_base + ".pdf", image_file_name=None, show_bounding_boxes=False,
                    invisible_text=True)
    # Track progress in all situations
    Path(output_base + ".tmp").touch()  # .tmp files are used to track overall progress
def do_ocr_cuneiform(param_image_file, param_extra_ocr_flag, param_cunei_lang, param_temp_dir, param_shell_mode, param_path_cunei):
    """
    Will be called from multiprocessing, so no global variables are allowed.
    Do OCR of image with cuneiform

    Produces, in param_temp_dir (concatenated as-is, so it must end with a path separator) and named after
    the image without extension: the ".hocr" file written by cuneiform, a ".fixed.hocr" file re-serialized
    through BeautifulSoup, the ".pdf" text layer and the ".tmp" progress marker.

    :param param_image_file: image to OCR
    :param param_extra_ocr_flag: extra command line flags for cuneiform (whitespace separated string); ignored when not a str
    :param param_cunei_lang: language for cuneiform '-l' (lower-cased before use)
    :param param_temp_dir: working directory
    :param param_shell_mode: value for the 'shell' argument of subprocess.Popen
    :param param_path_cunei: path of the cuneiform executable
    """
    param_image_no_ext = os.path.splitext(os.path.basename(param_image_file))[0]
    hocr_file = param_temp_dir + param_image_no_ext + ".hocr"
    requested_lang = param_cunei_lang.lower()
    _run_cuneiform(param_path_cunei, param_extra_ocr_flag, requested_lang, hocr_file, param_image_file,
                   param_temp_dir + "cuneif_out_{0}.log".format(param_image_no_ext),
                   param_temp_dir + "cuneif_err_{0}.log".format(param_image_no_ext),
                   param_shell_mode)
    # Sometimes, cuneiform fails to OCR and expected HOCR file is missing. Experiments show that English can be used to try a workaround.
    if not os.path.isfile(hocr_file) and requested_lang != "eng":
        eprint("Warning: fail to OCR file '{0}'. Trying again with English language.".format(param_image_no_ext))
        _run_cuneiform(param_path_cunei, param_extra_ocr_flag, "eng", hocr_file, param_image_file,
                       param_temp_dir + "cuneif_out_eng_{0}.log".format(param_image_no_ext),
                       param_temp_dir + "cuneif_err_eng_{0}.log".format(param_image_no_ext),
                       param_shell_mode)
    #
    bs_parser = "lxml"
    if os.path.isfile(hocr_file):
        # Try to fix unclosed meta tags, as cuneiform HOCR may be not well-formed
        with open(hocr_file, "r") as fpr:
            corrected_hocr = str(BeautifulSoup(fpr, bs_parser))
    else:
        eprint("Warning: fail to OCR file '{0}'. Page will not contain text.".format(param_image_no_ext))
        # TODO try to use the same size as original PDF page (bbox is hard coded by now to look like A4 page - portrait)
        corrected_hocr = str(BeautifulSoup('<div class="ocr_page" id="page_1" title="image x; bbox 0 0 1700 2400">', bs_parser))
    fixed_hocr_file = param_temp_dir + param_image_no_ext + ".fixed.hocr"
    with open(fixed_hocr_file, "w") as fpw:
        fpw.write(corrected_hocr)
    #
    hocr = HocrTransform(fixed_hocr_file, 300)
    hocr.to_pdf(param_temp_dir + param_image_no_ext + ".pdf", image_file_name=None, show_bounding_boxes=False, invisible_text=True)
    # Track progress
    Path(param_temp_dir + param_image_no_ext + ".tmp").touch()  # .tmp files are used to track overall progress


def _run_cuneiform(path_cunei, extra_ocr_flag, lang, hocr_file, image_file, out_log_name, err_log_name, shell_mode):
    """Run cuneiform once for the given language, sending stdout/stderr to the given log files, and wait for it."""
    cunei_command_line = [path_cunei]
    if isinstance(extra_ocr_flag, str):
        # split() without separator ignores blank input and repeated spaces, so no empty argument reaches cuneiform
        cunei_command_line.extend(extra_ocr_flag.split())
    cunei_command_line.extend(['-l', lang, "-f", "hocr", "-o", hocr_file, image_file])
    # Log files are closed when the child process ends (they used to be left open)
    with open(out_log_name, "wb") as out_log, open(err_log_name, "wb") as err_log:
        pocr = subprocess.Popen(cunei_command_line, stdout=out_log, stderr=err_log, shell=shell_mode)
        pocr.wait()
def do_rebuild(param_image_file, param_path_convert, param_convert_params, param_tmp_dir, param_shell_mode):
    """
    Will be called from multiprocessing, so no global variables are allowed.
    Create one PDF file from image file.

    The PDF is requested as "<param_tmp_dir>REBUILD_<image name without extension>.pdf"; stdout/stderr of the
    conversion go to "convert_log_<name>.log" / "convert_err_<name>.log" in param_tmp_dir.

    :param param_image_file: source image
    :param param_path_convert: path of the ImageMagick convert (or magick) executable
    :param param_convert_params: extra conversion parameters as one string; split with shell-like quoting rules
    :param param_tmp_dir: working directory; it is concatenated as-is, so it must end with a path separator
    :param param_shell_mode: value for the 'shell' argument of subprocess.Popen
    """
    param_image_no_ext = os.path.splitext(os.path.basename(param_image_file))[0]
    # http://stackoverflow.com/questions/79968/split-a-string-by-spaces-preserving-quoted-substrings-in-python
    convert_params_list = shlex.split(param_convert_params)
    command_rebuild = [param_path_convert, param_image_file] + convert_params_list + [param_tmp_dir + "REBUILD_" + param_image_no_ext + ".pdf"]
    out_log_name = param_tmp_dir + "convert_log_{0}.log".format(param_image_no_ext)
    err_log_name = param_tmp_dir + "convert_err_{0}.log".format(param_image_no_ext)
    # Both log files are closed when the child process ends (they used to be left open)
    with open(out_log_name, "wb") as out_log, open(err_log_name, "wb") as err_log:
        prebuild = subprocess.Popen(command_rebuild, stdout=out_log, stderr=err_log, shell=param_shell_mode)
        prebuild.wait()
def do_check_img_greyscale(param_image_file):
    """
    Inspired in code provided by karl-k:
    https://stackoverflow.com/questions/23660929/how-to-check-whether-a-jpeg-image-is-color-or-gray-scale-using-only-python-stdli
    Tell whether the image is monochrome (1 channel or 3 identical channels).
    The image is converted to RGB and the green and blue channels are compared with the red one:
    any non-zero difference means the image has color.
    """
    red, green, blue = Image.open(param_image_file).convert('RGB').split()
    for other_channel in (green, blue):
        # getextrema() gives (min, max) of the difference image; max != 0 means the channels differ somewhere
        if ImageChops.difference(red, other_channel).getextrema()[1] != 0:
            return False
    return True
def do_check_img_colors_size(param_image_file):
    """
    Inspired in code provided by Raffael:
    https://stackoverflow.com/questions/14041562/python-pil-detect-if-an-image-is-completely-black-or-white
    Collect the data needed to decide if an image is single color.
    Returns a pair: whatever PIL's getcolors() reports for the image, and its (width, height).
    """
    image = Image.open(param_image_file)
    color_usage = image.getcolors()
    return color_usage, (image.size[0], image.size[1])
def do_create_blank_pdf(param_filename_pdf, param_dimensions, param_image_resolution):
    """
    Write a PDF made of a single blank page.
    param_dimensions holds the (width, height) of the page image in pixels; each one is converted
    to PDF points as pixels / param_image_resolution * 72.
    """
    pdf_writer = PyPDF2.PdfWriter()
    width_px, height_px = param_dimensions[0], param_dimensions[1]
    page_width_pt = (width_px / param_image_resolution) * 72.0
    page_height_pt = (height_px / param_image_resolution) * 72.0
    pdf_writer.addBlankPage(page_width_pt, page_height_pt)
    # The context manager closes the file
    with open(param_filename_pdf, 'wb') as pdf_stream:
        pdf_writer.write(pdf_stream)
def percentual_float(x):
    """
    Convert x to float and validate that it is a fraction in the interval (0.0, 1.0].
    Presumably used as an argparse 'type=' converter, given the exception it raises.

    :param x: anything accepted by float()
    :return: the converted float
    :raises argparse.ArgumentTypeError: if the value is NaN or outside (0.0, 1.0]
    :raises ValueError: from float(), if x is not a number
    """
    x = float(x)
    # NaN compares false against any bound, so it must be rejected explicitly
    if math.isnan(x) or x <= 0.0 or x > 1.0:
        raise argparse.ArgumentTypeError("%r not in range (0.0, 1.0]" % (x,))
    return x
class HocrTransformError(Exception):
    """Raised by HocrTransform when the hOCR file cannot be used (e.g. page dimensions are missing)."""
class HocrTransform:
    """
    A class for converting documents from the hOCR format.
    For details of the hOCR format, see:
    http://docs.google.com/View?docid=dfxcv4vc_67g844kf
    Adapted from https://github.com/jbarlow83/OCRmyPDF/blob/master/ocrmypdf/hocrtransform.py
    """

    def __init__(self, hocr_file_name, dpi):
        """
        Parse the hOCR file and compute the page size from its first 'ocr_page' div.

        :param hocr_file_name: hOCR file, parsed with ElementTree (must be well-formed XML)
        :param dpi: resolution used to convert hOCR pixel coordinates to PDF points
        :raises HocrTransformError: if the file has no 'ocr_page' div
        """
        self.rect = namedtuple('Rect', ['x1', 'y1', 'x2', 'y2'])
        self.dpi = dpi
        self.boxPattern = re.compile(r'bbox((\s+\d+){4})')
        self.hocr = ElementTree.parse(hocr_file_name)
        # if the hOCR file has a namespace, ElementTree requires its use to
        # find elements
        matches = re.match(r'({.*})html', self.hocr.getroot().tag)
        self.xmlns = ''
        if matches:
            self.xmlns = matches.group(1)
        # get dimension in pt (not pixel!!!!) of the OCRed image
        self.width, self.height = None, None
        # there shouldn't be more than one page div, and if there is, only the first one is used
        page_div = self.hocr.find(".//%sdiv[@class='ocr_page']" % self.xmlns)
        if page_div is not None:
            pt_coords = self.pt_from_pixel(self.element_coordinates(page_div))
            self.width = pt_coords.x2 - pt_coords.x1
            self.height = pt_coords.y2 - pt_coords.y1
        if self.width is None or self.height is None:
            raise HocrTransformError("hocr file is missing page dimensions")

    def __str__(self):
        """
        Return the textual content of the HTML body
        """
        if self.hocr is None:
            return ''
        body = self.hocr.find(".//%sbody" % self.xmlns)
        # Compare with None: the truth value of an Element only says whether it has children,
        # so a body holding just text would wrongly be treated as missing
        if body is not None:
            return self._get_element_text(body)
        return ''

    def _get_element_text(self, element):
        """
        Return the textual content of the element and its children
        (the tail text of the element itself is included)
        """
        text = ''
        if element.text is not None:
            text += element.text
        for child in element:
            text += self._get_element_text(child)
        if element.tail is not None:
            text += element.tail
        return text

    def element_coordinates(self, element):
        """
        Returns a Rect (x1, y1, x2, y2) with the pixel coordinates of the bounding box
        found in the element 'title' attribute ("bbox x1 y1 x2 y2").
        All coordinates are zero when the element has no bounding box.
        """
        out = self.rect(0, 0, 0, 0)
        if 'title' in element.attrib:
            matches = self.boxPattern.search(element.attrib['title'])
            if matches:
                coords = matches.group(1).split()
                out = self.rect._make(int(coords[n]) for n in range(4))
        return out

    def pt_from_pixel(self, pxl):
        """
        Returns the quantity in PDF units (pt) given quantity in pixels
        (each of the four values is divided by the dpi and multiplied by reportlab's inch)
        """
        return self.rect._make(
            (c / self.dpi * inch) for c in pxl)

    def replace_unsupported_chars(self, s):
        """
        Given an input string, returns the corresponding string that:
        - is available in the helvetica facetype
        - does not contain any ligature (to allow easy search in the PDF file)
        """
        # Ligatures are written as escapes so the source survives Unicode normalization:
        # U+FB02 is the "fl" ligature and U+FB01 is the "fi" ligature
        s = s.replace("\ufb02", "fl")
        s = s.replace("\ufb01", "fi")
        return s

    def to_pdf(self, out_file_name, image_file_name=None, show_bounding_boxes=False, fontname="Helvetica",
               invisible_text=True):
        """
        Creates a PDF file with an image superimposed on top of the text.
        Text is positioned according to the bounding box of the lines in
        the hOCR file.
        The image need not be identical to the image used to create the hOCR
        file.
        It can have a lower resolution, different color mode, etc.

        :param out_file_name: PDF file to create
        :param image_file_name: optional image drawn over the whole page
        :param show_bounding_boxes: draw paragraph and word/line boxes (debug aid)
        :param fontname: font used for the text
        :param invisible_text: use text render mode 3 (invisible) for the text
        """
        # create the PDF file
        # page size in points (1/72 in.)
        pdf = Canvas(
            out_file_name, pagesize=(self.width, self.height), pageCompression=1)
        # draw bounding box for each paragraph
        # light blue for bounding box of paragraph
        pdf.setStrokeColorRGB(0, 1, 1)
        # light blue for bounding box of paragraph
        pdf.setFillColorRGB(0, 1, 1)
        pdf.setLineWidth(0)  # no line for bounding box
        for elem in self.hocr.findall(
                ".//%sp[@class='%s']" % (self.xmlns, "ocr_par")):
            elemtxt = self._get_element_text(elem).rstrip()
            if len(elemtxt) == 0:
                continue
            pxl_coords = self.element_coordinates(elem)
            pt = self.pt_from_pixel(pxl_coords)
            # draw the bbox border
            if show_bounding_boxes:
                pdf.rect(pt.x1, self.height - pt.y2, pt.x2 - pt.x1, pt.y2 - pt.y1, fill=1)
        # check if element with class 'ocrx_word' are available
        # otherwise use 'ocr_line' as fallback
        elemclass = "ocr_line"
        if self.hocr.find(".//%sspan[@class='ocrx_word']" % self.xmlns) is not None:
            elemclass = "ocrx_word"
        # iterate all text elements
        # red, dashed line for bounding box of word/line
        pdf.setStrokeColorRGB(1, 0, 0)
        pdf.setLineWidth(0.5)  # bounding box line width
        pdf.setDash(6, 3)  # bounding box is dashed
        pdf.setFillColorRGB(0, 0, 0)  # text in black
        for elem in self.hocr.findall(".//%sspan[@class='%s']" % (self.xmlns, elemclass)):
            elemtxt = self._get_element_text(elem).rstrip()
            elemtxt = self.replace_unsupported_chars(elemtxt)
            if len(elemtxt) == 0:
                continue
            pxl_coords = self.element_coordinates(elem)
            pt = self.pt_from_pixel(pxl_coords)
            # draw the bbox border
            if show_bounding_boxes:
                pdf.rect(pt.x1, self.height - pt.y2, pt.x2 - pt.x1, pt.y2 - pt.y1, fill=0)
            text = pdf.beginText()
            fontsize = pt.y2 - pt.y1
            text.setFont(fontname, fontsize)
            if invisible_text:
                text.setTextRenderMode(3)  # Invisible (indicates OCR text)
            # set cursor to bottom left corner of bbox (adjust for dpi)
            text.setTextOrigin(pt.x1, self.height - pt.y2)
            # scale the width of the text to fill the width of the bbox
            # (skipped when the text has no width, e.g. zero height bbox, to avoid a division by zero)
            text_width = pdf.stringWidth(elemtxt, fontname, fontsize)
            if text_width > 0:
                text.setHorizScale(100 * (pt.x2 - pt.x1) / text_width)
            # write the text to the page
            text.textLine(elemtxt)
            pdf.drawText(text)
        #
        # put the image on the page, scaled to fill the page
        if image_file_name is not None:
            pdf.drawImage(image_file_name, 0, 0, width=self.width, height=self.height)
        # finish up the page and save it
        pdf.showPage()
        pdf.save()
#
class Pdf2PdfOcrException(Exception):
    """Raised by Pdf2PdfOcr for invalid options, invalid input files and incompatible processing modes."""
class Pdf2PdfOcr:
# External tools command. If you can't edit your path, adjust here to match your system
# Each cmd_* is the executable name searched with shutil.which() in check_external_tools();
# the matching path_* starts empty and receives the result of that search.
cmd_cuneiform = "cuneiform"
path_cuneiform = ""
cmd_tesseract = "tesseract"
path_tesseract = ""
cmd_convert = "convert"
cmd_magick = "magick" # used on Windows with ImageMagick 7+ (to avoid conversion path problems)
# path_convert may end up pointing to either "convert" or "magick" (see check_external_tools)
path_convert = ""
cmd_mogrify = "mogrify"
path_mogrify = ""
cmd_file = "file"
path_file = ""
cmd_pdftoppm = "pdftoppm"
path_pdftoppm = ""
cmd_pdffonts = "pdffonts"
path_pdffonts = ""
cmd_ps2pdf = "ps2pdf"
path_ps2pdf = ""
cmd_pdf2ps = "pdf2ps"
path_pdf2ps = ""
cmd_gs = "gs"
path_gs = ""
cmd_qpdf = "qpdf"
path_qpdf = ""
# The two tesseract_* values below are defaults, overwritten by check_external_tools()
tesseract_can_textonly_pdf = False
"""Since Tesseract 3.05.01, new use case of tesseract - https://github.com/tesseract-ocr/tesseract/issues/660"""
tesseract_version = 3
"""Tesseract version installed on system"""
extension_images = "jpg"
"""Temp images will use this extension. Using jpg to avoid big temp files in pdf with a lot of pages"""
output_file = ""
"""The PDF output file"""
output_file_text = ""
"""The TXT output file"""
path_this_python = sys.executable
"""Path for python in this system"""
# True only on Windows; passed as the 'shell' argument when running external processes
shell_mode = (sys.platform == "win32")
"""How to run external process? In Windows use Shell=True
http://stackoverflow.com/questions/5658622/python-subprocess-popen-environment-path
"Also, on Windows with shell=False, it pays no attention to PATH at all,
and will only look in relative to the current working directory."
"""
def __init__(self, args, override_input_file=None):
    """
    Prepare one conversion: create the temp dir, locate the external tools, copy the
    options to instance attributes (applying defaults) and create the process pool.

    :param args: object exposing the command line options as attributes
                 (presumably an argparse.Namespace - confirm against the caller)
    :param override_input_file: when not None, used as input file instead of args.input_file
    :raises Pdf2PdfOcrException: for conflicting output options, invalid output directory,
                                 invalid text generation strategy or OCR engine, or missing input file
    """
    super().__init__()
    self.log_time_format = '%Y-%m-%d %H:%M:%S.%f'
    #
    # A random prefix to support multiple execution in parallel
    prefix_alphabet = string.ascii_uppercase + string.digits
    secure_random = random.SystemRandom()
    self.prefix = ''.join(secure_random.choice(prefix_alphabet) for _ in range(5))
    # The temp dir (the path always ends with a separator)
    self.tmp_dir = os.path.sep.join([tempfile.gettempdir(), "pdf2pdfocr_{0}".format(self.prefix), ""])
    os.mkdir(self.tmp_dir)
    #
    self.verbose_mode = args.verbose_mode
    self.check_external_tools()
    # Handle arguments from command line
    self.safe_mode = args.safe_mode
    self.check_text_mode = args.check_text_mode
    self.ignore_existing_text = args.ignore_existing_text
    self.blank_pages = []
    self.blank_pages_dimensions = []
    self.check_protection_mode = args.check_protection_mode
    self.avoid_high_pages_mode = args.max_pages is not None
    self.avoid_high_pages_pages = args.max_pages
    self.avoid_small_file_mode = args.min_kbytes is not None
    self.avoid_small_file_limit_kb = args.min_kbytes
    self.force_rebuild_mode = args.force_rebuild_mode
    self.rebuild_pdf_from_images = False
    # Default: no extra conversion parameters
    self.user_convert_params = "" if args.convert_params is None else args.convert_params
    self.deskew_threshold = args.deskew_percent
    self.use_deskew_mode = args.deskew_percent is not None
    self.use_autorotate = args.autorotate
    # Default: use all CPUs
    self.parallel_threshold = 1 if args.parallel_percent is None else args.parallel_percent
    self.create_text_mode = args.create_text_mode
    self.force_out_file_mode = args.output_file is not None
    self.force_out_file = args.output_file if self.force_out_file_mode else ""
    self.force_out_dir_mode = args.output_dir is not None
    self.force_out_dir = args.output_dir if self.force_out_dir_mode else ""
    both_outputs_forced = self.force_out_file != "" and self.force_out_dir != ""
    if both_outputs_forced:
        raise Pdf2PdfOcrException("It's not possible to force output name and dir at the same time. Please use '-o' OR '-O'")
    if self.force_out_dir_mode and (not os.path.isdir(self.force_out_dir)):
        raise Pdf2PdfOcrException("Invalid output directory: {0}".format(self.force_out_dir))
    # Default languages and page segmentation mode for tesseract
    self.tess_langs = "por+eng" if args.tess_langs is None else args.tess_langs
    self.tess_psm = "1" if args.tess_psm is None else args.tess_psm
    self.image_resolution = args.image_resolution
    self.text_generation_strategy = args.text_generation_strategy
    if self.text_generation_strategy not in ("tesseract", "native"):
        raise Pdf2PdfOcrException("{0} is not a valid text generation strategy. Exiting.".format(self.text_generation_strategy))
    self.ocr_ignored = False
    self.ocr_engine = args.ocr_engine
    if self.ocr_engine not in ("tesseract", "cuneiform", "no_ocr"):
        raise Pdf2PdfOcrException("{0} is not a valid ocr engine. Exiting.".format(self.ocr_engine))
    self.extra_ocr_flag = None if args.extra_ocr_flag is None else str(args.extra_ocr_flag.strip())
    self.delete_temps = not args.keep_temps
    self.input_file = args.input_file if override_input_file is None else override_input_file
    if not os.path.isfile(self.input_file):
        raise Pdf2PdfOcrException("{0} not found. Exiting.".format(self.input_file))
    self.input_file = os.path.abspath(self.input_file)
    self.input_file_type = ""
    #
    self.input_file_has_text = False
    self.input_file_is_encrypted = False
    self.input_file_metadata = dict()
    self.input_file_number_of_pages = None
    #
    self.debug("Temp dir is {0}".format(self.tmp_dir))
    self.debug("Prefix is {0}".format(self.prefix))
    # Where am I?
    self.script_dir = os.path.dirname(os.path.abspath(__file__)) + os.path.sep
    self.debug("Script dir is {0}".format(self.script_dir))
    #
    # Fraction of the available CPUs, but never less than one
    self.cpu_to_use = int(multiprocessing.cpu_count() * self.parallel_threshold) or 1
    self.debug("Parallel operations will use {0} CPUs".format(self.cpu_to_use))
    #
    self.main_pool = multiprocessing.Pool(self.cpu_to_use)
#
def check_external_tools(self):
    """
    Check if external tools are available, aborting or warning in case of any error.
    Every tool is searched with shutil.which() and the result is stored in the matching path_* attribute.
    Missing tesseract, convert/magick, mogrify, file, pdftoppm or pdffonts end the program (exit status 1);
    the other tools only produce a message.
    """

    def abort(message):
        # A mandatory tool is missing: report on stderr and end the program
        eprint(message)
        sys.exit(1)

    # tesseract (mandatory) and its capabilities
    self.path_tesseract = shutil.which(self.cmd_tesseract)
    if self.path_tesseract is None:
        abort("tesseract not found. Aborting...")
    self.tesseract_can_textonly_pdf = self.test_tesseract_textonly_pdf()
    self.tesseract_version = self.get_tesseract_version()

    # cuneiform (optional)
    self.path_cuneiform = shutil.which(self.cmd_cuneiform)
    if self.path_cuneiform is None:
        self.debug("cuneiform not available")

    # ImageMagick convert (mandatory)
    # Try to avoid errors on Windows with native OS "convert" command
    # http://savage.net.au/ImageMagick/html/install-convert.html
    # https://www.imagemagick.org/script/magick.php
    self.path_convert = shutil.which(self.cmd_convert)
    if not self.test_convert():
        self.path_convert = shutil.which(self.cmd_magick)
    if self.path_convert is None:
        abort("convert/magick from ImageMagick not found. Aborting...")

    # ImageMagick mogrify (mandatory)
    self.path_mogrify = shutil.which(self.cmd_mogrify)
    if self.path_mogrify is None:
        abort("mogrify from ImageMagick not found. Aborting...")

    # file (mandatory)
    self.path_file = shutil.which(self.cmd_file)
    if self.path_file is None:
        abort("file not found. Aborting...")

    # poppler tools (mandatory); an old pdftoppm only produces a log message
    self.path_pdftoppm = shutil.which(self.cmd_pdftoppm)
    if self.path_pdftoppm is None:
        abort("pdftoppm (poppler) not found. Aborting...")
    if self.get_pdftoppm_version() <= parse_version("0.70.0"):
        self.log("External tool 'pdftoppm' is outdated. Please upgrade poppler")
    self.path_pdffonts = shutil.which(self.cmd_pdffonts)
    if self.path_pdffonts is None:
        abort("pdffonts (poppler) not found. Aborting...")

    # ghostscript tools (optional)
    self.path_ps2pdf = shutil.which(self.cmd_ps2pdf)
    self.path_pdf2ps = shutil.which(self.cmd_pdf2ps)
    if self.path_ps2pdf is None or self.path_pdf2ps is None:
        eprint("ps2pdf or pdf2ps (ghostscript) not found. File repair will not work...")
    self.path_gs = shutil.which(self.cmd_gs)
    if self.path_gs is None:
        eprint("ghostscript not found. Param 'ignore-existing-text' will not work...")

    # qpdf (optional): an old version is handled as if qpdf was not installed
    self.path_qpdf = shutil.which(self.cmd_qpdf)
    if self.path_qpdf is None:
        self.log("External tool 'qpdf' not available. Merge can be slow")
        return
    minimum_version = "8.4.1"
    if self.get_qpdf_version() < parse_version(minimum_version):
        self.log("External tool 'qpdf' is not on minimum version ({0}). Merge can be slow".format(minimum_version))
        self.path_qpdf = None
#
def debug(self, param):
    """
    Print a timestamped "[DEBUG]" message on stdout, only when verbose mode is on.
    Logging is best effort: ordinary errors (closed stdout, attributes not set yet...) are ignored,
    but KeyboardInterrupt and SystemExit are not swallowed anymore.
    """
    try:
        if self.verbose_mode:
            tstamp = datetime.datetime.now().strftime(self.log_time_format)
            print("[{0}] [DEBUG] {1}".format(tstamp, param), flush=True)
    except Exception:
        pass  # By design: a failure to log must never stop the processing
def log(self, param):
    """
    Print a timestamped "[LOG]" message on stdout.
    Logging is best effort: ordinary errors (closed stdout, attributes not set yet...) are ignored,
    but KeyboardInterrupt and SystemExit are not swallowed anymore.
    """
    try:
        tstamp = datetime.datetime.now().strftime(self.log_time_format)
        print("[{0}] [LOG] {1}".format(tstamp, param), flush=True)
    except Exception:
        pass  # By design: a failure to log must never stop the processing
def cleanup(self):
    """
    Release everything used by the conversion: kill child processes still alive,
    stop the multiprocessing pool and remove the temp dir (unless temp files must be kept).
    """
    #
    # Try to kill all child process still alive (in timeout situation)
    process = psutil.Process(os.getpid())
    for proc in process.children(recursive=True):
        try:
            # The child may end at any moment, so reading its name can fail just like killing it
            proc_name = proc.name()
            if "python" not in proc_name.lower():  # Python process are from multiprocessing and will be handled below
                self.debug("Killing child process {0} with pid {1}".format(proc_name, proc.pid))
                proc.kill()
        except Exception:
            pass  # By design
    #
    # Cleanup the pool
    if self.main_pool:
        self.main_pool.close()
        self.main_pool.terminate()
        self.main_pool.join()
        self.main_pool = None  # Signal for pool to stop waiting in while loops
    #
    # Cleanup temp files
    if self.delete_temps:
        shutil.rmtree(self.tmp_dir, ignore_errors=True)
    else:
        eprint("Temporary files kept in {0}".format(self.tmp_dir))
def ocr(self):
    """
    Run the whole conversion for the input file, calling each processing step in order,
    then clean up and log a success message with the elapsed time.
    The steps are implemented by other methods of this class; see each one for details.
    """
    time_at_start = time.time()
    self.log("Welcome to pdf2pdfocr version {0} - https://github.com/LeoFCardoso/pdf2pdfocr".format(VERSION))
    self.check_avoid_file_by_size()
    self.detect_file_type()
    if self.input_file_type == "application/pdf":
        self.validate_pdf_input_file()
    self.check_rebuild_pdf()
    self.debug("User conversion params: {0}".format(self.user_convert_params))
    self.define_output_files()
    self.initial_cleanup()
    self.convert_input_to_images()
    # TODO - create param to user pass input page range for OCR
    image_file_list = sorted(glob.glob(self.tmp_dir + "{0}*.{1}".format(self.prefix, self.extension_images)))
    if self.input_file_number_of_pages is None:
        # Page count not known yet: use the number of generated images
        self.input_file_number_of_pages = len(image_file_list)
    self.check_avoid_high_pages()
    # TODO - create param to user pass image filters before OCR
    self.check_blank_pages(image_file_list)
    self.autorotate_info(image_file_list)
    self.deskew(image_file_list)
    self.external_ocr(image_file_list)
    if not self.ocr_ignored:
        self.join_ocred_pdf()
        self.create_text_output()
    self.build_final_output()
    self.autorotate_final_output()
    #
    # TODO - create directory watch mode (maybe using watchdog library)
    # Like a daemon
    #
    # TODO - create option for PDF/A files
    # gs -dPDFA=3 -dBATCH -dNOPAUSE -sProcessColorModel=DeviceCMYK -sDEVICE=pdfwrite
    # -sPDFACompatibilityPolicy=2 -sOutputFile=output_filename.pdf ./Test.pdf
    # As in
    # http://git.ghostscript.com/?p=ghostpdl.git;a=blob_plain;f=doc/VectorDevices.htm;hb=HEAD#PDFA
    #
    # Edit producer and build final PDF
    # Without edit producer is easy as "shutil.copyfile(tmp_dir + prefix + "-OUTPUT.pdf", output_file)"
    self.edit_producer()
    #
    self.debug("Output file created")
    #
    # Adjust the new file timestamp
    # TODO touch -r "$INPUT_FILE" "$OUTPUT_FILE"
    #
    self.cleanup()
    time_elapsed = time.time() - time_at_start
    #
    # The query string must contain a literal "&currency_code=" (it was corrupted into a currency sign character)
    paypal_donate_link = "https://www.paypal.com/cgi-bin/webscr?cmd=_donations&business=leonardo%2ef%2ecardoso%40gmail%2ecom&lc=US&item_name" \
                         "=pdf2pdfocr%20development&currency_code=USD&bn=PP%2dDonationsBF%3abtn_donateCC_LG%2egif%3aNonHosted"
    tippin_donate_link = "https://tippin.me/@LeoFCardoso"
    bitcoin_address = "173D1zQQyzvCCCek9b1SpDvh7JikBEdtRJ"
    dogecoin_address = "D94hD2qPnkxmZk8qa1b6F1d7NfUrPkmcrG"
    pix_key = "0726e8f2-7e59-488a-8abb-bda8f0d7d9ce"
    success_message = """Success in {5:.3f} seconds!
This software is free, but if you like it, please donate to support new features.
---> Paypal
{0}
---> Tippin.me
{1}
---> Bitcoin (BTC) address: {2}
---> Dogecoin (DOGE) address: {3}
---> PIX (Brazilian Instant Payments) key: {4}
---> Please contact for donations in other cryptocurrencies - https://github.com/LeoFCardoso/pdf2pdfocr""".format(
        paypal_donate_link, tippin_donate_link, bitcoin_address, dogecoin_address, pix_key, time_elapsed)
    self.log(success_message)
def check_rebuild_pdf(self):
    """Decide whether the output PDF has to be rebuilt from page images.

    The decision is stored in ``self.rebuild_pdf_from_images``. A rebuild is
    requested when the input is encrypted, when its type is not
    "application/pdf", or when deskew mode or forced-rebuild mode is on.

    Raises:
        Pdf2PdfOcrException: when a rebuild is required while
            ``self.ignore_existing_text`` is set. ``self.cleanup()`` is
            called before raising.
    """
    needs_rebuild = self.input_file_is_encrypted or self.input_file_type != "application/pdf"
    needs_rebuild = needs_rebuild or self.use_deskew_mode or self.force_rebuild_mode
    self.rebuild_pdf_from_images = needs_rebuild
    if not (needs_rebuild and self.ignore_existing_text):
        return
    # The two options cannot be combined: clean up and abort.
    self.cleanup()
    raise Pdf2PdfOcrException("Rebuild from images and ignore existing text won't work together")
def _merge_ocr(self, image_pdf_file_path, text_pdf_file_path, result_pdf_file_path, tag):
    """Merge the OCR background PDF into the main PDF, making a "PDF sandwich".

    When ``self.path_qpdf`` is set, qpdf does the merge; otherwise the
    ``pdf2pdfocr_multibackground.py`` script found in ``self.script_dir`` is
    run with ``self.path_this_python``. The tool's stderr is written to a log
    file inside ``self.tmp_dir``. The exit status of the tool is not
    inspected here.

    Args:
        image_pdf_file_path: path of the PDF holding the page images.
        text_pdf_file_path: path of the PDF holding the OCR text.
        result_pdf_file_path: path where the merged PDF is written.
        tag: short label used in the name of the stderr log file.
    """
    self.debug("Merging with OCR")
    if self.path_qpdf is not None:
        # NOTE(review): mediaBox / getWidth() / getHeight() are legacy PyPDF2 names;
        # confirm they are still available in the PyPDF2 version this project pins.
        try:
            with open(image_pdf_file_path, "rb") as img_f:
                img_data = PyPDF2.PdfReader(img_f, strict=False)
                first_page_img_rect = img_data.pages[0].mediaBox
                first_page_img_area = first_page_img_rect.getWidth() * first_page_img_rect.getHeight()
        except PdfReadError:
            eprint("Warning: could not read input file page geometry. Merge may fail, please check input file.")
            # With area 0 the comparison below always selects the overlay command.
            first_page_img_area = 0
        with open(text_pdf_file_path, "rb") as txt_f:
            txt_data = PyPDF2.PdfReader(txt_f, strict=False)
            first_page_txt_rect = txt_data.pages[0].mediaBox
            first_page_txt_area = first_page_txt_rect.getWidth() * first_page_txt_rect.getHeight()
        #
        # Define overlay / underlay based on biggest page
        if first_page_txt_area < first_page_img_area:
            merge_command = [self.path_qpdf, "--underlay", image_pdf_file_path, "--", text_pdf_file_path, result_pdf_file_path]
        else:
            merge_command = [self.path_qpdf, "--overlay", text_pdf_file_path, "--", image_pdf_file_path, result_pdf_file_path]
        err_log_name = "err_merge-qpdf-{0}-{1}.log".format(self.prefix, tag)
    else:
        merge_command = [self.path_this_python, self.script_dir + 'pdf2pdfocr_multibackground.py',
                         image_pdf_file_path, text_pdf_file_path, result_pdf_file_path]
        err_log_name = "err_merge-multiback-{0}-{1}.log".format(self.prefix, tag)
    #
    # Open the stderr log in a "with" block so the handle is closed once the
    # child process has finished (it used to be left open).
    with open(self.tmp_dir + err_log_name, "wb") as err_log:
        merge_process = subprocess.Popen(
            merge_command,
            stdout=subprocess.DEVNULL,
            stderr=err_log,
            shell=self.shell_mode)
        merge_process.wait()
def build_final_output(self):
    """Produce ``<tmp_dir><prefix>-OUTPUT.pdf``, the PDF that becomes the final output.

    Three paths exist:
      * rebuild requested -> delegate to ``rebuild_and_merge()``;
      * OCR ignored       -> copy the input file unchanged;
      * otherwise         -> merge the input with the OCR PDF and, if no
        output file appears, try once more via ``try_repair_input_and_merge()``.

    Raises:
        Pdf2PdfOcrException: when the output file does not exist afterwards.
            ``self.cleanup()`` is called before raising.
    """
    output_pdf = self.tmp_dir + self.prefix + "-OUTPUT.pdf"
    if self.rebuild_pdf_from_images:
        self.rebuild_and_merge()
    elif self.ocr_ignored:
        # OCR ignored: the source file is used as-is
        shutil.copyfile(self.input_file, output_pdf)
    else:
        ocr_pdf = self.tmp_dir + self.prefix + "-ocr.pdf"
        self._merge_ocr(self.input_file, ocr_pdf, output_pdf, "final-output")
        # Try to handle a failed merge by rewriting the source PDF and merging again.
        if not os.path.isfile(output_pdf):
            self.try_repair_input_and_merge()
    #
    if os.path.isfile(output_pdf):
        return
    self.cleanup()
    raise Pdf2PdfOcrException("Output file could not be created :( Exiting with error code.")
def rebuild_and_merge(self):
    """Rebuild the document from its page images and produce ``<prefix>-OUTPUT.pdf``.

    Steps:
      1. Resolve the conversion parameters from ``self.user_convert_params``
         (a preset name, "smart", or a literal parameter string; an empty
         string falls back to the "best" preset). "smart" checks all page
         images in the worker pool and then rewrites
         ``self.user_convert_params`` to "best" or "jpeg".
      2. Run ``do_rebuild`` for every page image in the worker pool.
      3. Join the resulting ``REBUILD_<prefix>*.pdf`` files into
         ``<prefix>-input_unprotected.pdf``.
      4. Merge that file with the OCR PDF, or copy it as the output when OCR
         was ignored.

    Raises:
        Pdf2PdfOcrException: when no rebuilt PDF file is found.
            ``self.cleanup()`` is called before raising.
    """
    eprint("Warning: metadata wiped from final PDF file (original file is not an unprotected PDF / "
           "forcing rebuild from extracted images / using deskew)")
    # Convert presets, keyed by the name accepted in user_convert_params
    # Please read http://www.imagemagick.org/Usage/quantize/#colors_two
    presets = {
        "fast": "-threshold 60% -compress Group4",
        "best": "-colors 2 -colorspace gray -normalize -threshold 60% -compress Group4",
        "grayscale": "-threshold 85% -morphology Dilate Diamond -compress Group4",
        "jpeg": "-strip -interlace Plane -gaussian-blur 0.05 -quality 50% -compress JPEG",
        "jpeg2000": "-quality 32% -compress JPEG2000",
    }
    #
    page_images = sorted(glob.glob(self.tmp_dir + self.prefix + "*." + self.extension_images))
    #
    if self.user_convert_params == "smart":
        # One single-element argument tuple per image, as starmap expects
        color_check = self.main_pool.starmap_async(do_check_img_greyscale, [(image,) for image in page_images])
        polls = 0
        while not (color_check.ready() or self.main_pool is None):
            polls += 1
            if polls % 10 == 0:
                self.log("Checking page colors...")
            time.sleep(0.5)
        if all(color_check.get()):
            self.log("No color pages detected. Smart mode will use 'best' preset.")
            self.user_convert_params = "best"
        else:
            self.log("Color pages detected. Smart mode will use 'jpeg' preset.")
            self.user_convert_params = "jpeg"
    #
    # Unknown names are taken as literal parameters supplied by the user
    convert_params = presets.get(self.user_convert_params, self.user_convert_params)
    # Handle default case
    if convert_params == "":
        convert_params = presets["best"]
    #
    self.log("Rebuilding PDF from images")
    shared_args = (self.path_convert, convert_params, self.tmp_dir, self.shell_mode)
    rebuild_job = self.main_pool.starmap_async(do_rebuild, [(image,) + shared_args for image in page_images])
    polls = 0
    while not (rebuild_job.ready() or self.main_pool is None):
        polls += 1
        pages_done = len(glob.glob(self.tmp_dir + "REBUILD_" + self.prefix + "*.pdf"))
        if polls % 10 == 0:
            self.log("Waiting for PDF rebuild to complete. {0}/{1} pages completed...".format(pages_done, self.input_file_number_of_pages))
        time.sleep(0.5)
    #
    rebuilt_pages = sorted(glob.glob(self.tmp_dir + "REBUILD_{0}*.pdf".format(self.prefix)))
    self.debug("We have {0} rebuilt PDF files".format(len(rebuilt_pages)))
    if not rebuilt_pages:
        self.cleanup()
        raise Pdf2PdfOcrException("No PDF files generated after image rebuilding. This is not expected. Aborting.")
    rebuilt_input = self.tmp_dir + self.prefix + "-input_unprotected.pdf"
    merger = PyPDF2.PdfMerger()
    for page_pdf in rebuilt_pages:
        merger.append(PyPDF2.PdfReader(page_pdf, strict=False))
    merger.write(rebuilt_input)
    merger.close()
    self.debug("PDF rebuilding completed")
    #
    output_pdf = self.tmp_dir + self.prefix + "-OUTPUT.pdf"
    if self.ocr_ignored:
        shutil.copyfile(rebuilt_input, output_pdf)
    else:
        self._merge_ocr(rebuilt_input, self.tmp_dir + self.prefix + "-ocr.pdf", output_pdf, "rebuild-merge")
def try_repair_input_and_merge(self):
    """Rewrite the source PDF through PostScript and retry the OCR merge.

    Runs ``self.path_pdf2ps`` on the input file, then ``self.path_ps2pdf`` on
    the result, producing ``<prefix>-fixPDF.pdf`` in ``self.tmp_dir``. That
    file is then merged with ``<prefix>-ocr.pdf`` into ``<prefix>-OUTPUT.pdf``.
    Each tool's stderr goes to its own log file in ``self.tmp_dir``; exit
    statuses are not inspected here.
    """
    self.debug("Fail to merge source PDF with extracted OCR text. Trying to fix source PDF to build final file...")
    postscript_file = self.tmp_dir + self.prefix + "-fixPDF.ps"
    repaired_pdf = self.tmp_dir + self.prefix + "-fixPDF.pdf"
    #
    # The stderr logs are opened in "with" blocks so the handles are closed as
    # soon as each tool finishes (they used to be left open).
    with open(self.tmp_dir + "err_pdf2ps-{0}.log".format(self.prefix), "wb") as err_log:
        prepair1 = subprocess.Popen(
            [self.path_pdf2ps, self.input_file, postscript_file],
            stdout=subprocess.DEVNULL,
            stderr=err_log,
            shell=self.shell_mode)
        prepair1.wait()
    with open(self.tmp_dir + "err_ps2pdf-{0}.log".format(self.prefix), "wb") as err_log:
        prepair2 = subprocess.Popen(
            [self.path_ps2pdf, postscript_file, repaired_pdf],
            stdout=subprocess.DEVNULL,
            stderr=err_log,
            shell=self.shell_mode)
        prepair2.wait()
    #
    self._merge_ocr(repaired_pdf,
                    (self.tmp_dir + self.prefix + "-ocr.pdf"),
                    (self.tmp_dir + self.prefix + "-OUTPUT.pdf"), "repair_input")
def create_text_output(self):
    """Concatenate the generated text files into the final text output.

    Does nothing unless ``self.create_text_mode`` is set. Otherwise every
    file matching ``<tmp_dir><prefix>*.txt`` is appended, in sorted name
    order and as raw bytes, to ``self.output_file_text`` (which is
    overwritten), and a message is logged.
    """
    if not self.create_text_mode:
        return
    text_files = sorted(glob.glob(self.tmp_dir + self.prefix + "*.txt"))
    # The "with" statement owns the output handle, so no separate close() is needed.
    with open(self.output_file_text, 'wb') as outfile:
        for fname in text_files:
            with open(fname, 'rb') as infile:
                # Stream in chunks instead of loading each file fully into memory
                shutil.copyfileobj(infile, outfile)
    #
    self.log("Created final text file")
def join_ocred_pdf(self):
    """Join the OCR'ed PDF files into one ``<prefix>-ocr.pdf``.

    Every file matching ``<tmp_dir><prefix>*.pdf`` is appended, in sorted
    name order, to a single PDF that contains all OCR "backgrounds".

    Raises:
        Pdf2PdfOcrException: when no matching PDF file exists.
            ``self.cleanup()`` is called before raising.
    """
    ocr_page_pdfs = sorted(glob.glob(self.tmp_dir + "{0}*.{1}".format(self.prefix, "pdf")))
    self.debug("We have {0} ocr'ed files".format(len(ocr_page_pdfs)))
    if not ocr_page_pdfs:
        self.cleanup()
        raise Pdf2PdfOcrException("No PDF files generated after OCR. This is not expected. Aborting.")
    #
    merger = PyPDF2.PdfMerger()
    for page_pdf in ocr_page_pdfs:
        merger.append(PyPDF2.PdfReader(page_pdf, strict=False))
    merger.write(self.tmp_dir + self.prefix + "-ocr.pdf")
    merger.close()
    #
    self.debug("Joined ocr'ed PDF files")
def external_ocr(self, image_file_list):
if self.ocr_engine in ["cuneiform", "tesseract"]:
self.log("Starting OCR with {0}...".format(self.ocr_engine))
image_list_for_external_ocr = [x for x in image_file_list if x not in self.blank_pages]
if self.ocr_engine == "cuneiform":
ocr_pool_map = self.main_pool.starmap_async(do_ocr_cuneiform,
zip(image_list_for_external_ocr,
itertools.repeat(self.extra_ocr_flag),
itertools.repeat(self.tess_langs),
itertools.repeat(self.tmp_dir),
itertools.repeat(self.shell_mode),
itertools.repeat(self.path_cuneiform)))
elif self.ocr_engine == "tesseract":
ocr_pool_map = self.main_pool.starmap_async(do_ocr_tesseract,
zip(image_list_for_external_ocr,
itertools.repeat(self.extra_ocr_flag),
itertools.repeat(self.tess_langs),
itertools.repeat(self.tess_psm),
itertools.repeat(self.tmp_dir),
itertools.repeat(self.shell_mode),
itertools.repeat(self.path_tesseract),
itertools.repeat(self.text_generation_strategy),
itertools.repeat(self.delete_temps),
itertools.repeat(self.tesseract_can_textonly_pdf)))
else:
ocr_pool_map = None # Should never happen
#
ocr_rounds = 0
while not ocr_pool_map.ready() and (self.main_pool is not None):
ocr_rounds += 1
pages_processed = len(glob.glob(self.tmp_dir + self.prefix + "*.tmp"))
if ocr_rounds % 10 == 0:
self.log("Waiting for OCR to complete. {0}/{1} pages completed...".format(pages_processed, self.input_file_number_of_pages))
time.sleep(0.5)
#
if len(self.blank_pages) > 0:
for idx, blank_page_img in enumerate(self.blank_pages):
blank_page_dimension = self.blank_pages_dimensions[idx]
pdf_file_img = blank_page_img.replace("." + self.extension_images, ".pdf")
do_create_blank_pdf(pdf_file_img, blank_page_dimension, self.image_resolution)
#
self.log("OCR completed")
self.ocr_ignored = False