From acac11b8c71c47021f7b20e31d4f25629d2da757 Mon Sep 17 00:00:00 2001 From: Sivakumar Mahalingam Date: Sun, 7 Apr 2024 11:41:00 +0400 Subject: [PATCH] updated repo --- README.md | 6 +-- scripts/{Laghima.py => fastmrz.py} | 85 ++++++++++++++++-------------- scripts/main.py | 10 ++-- tests/test.py | 65 +++++++++++++++++++++++ 4 files changed, 118 insertions(+), 48 deletions(-) rename scripts/{Laghima.py => fastmrz.py} (71%) diff --git a/README.md b/README.md index 531ef8e..45ac6ce 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ -# LAGHIMA +# Fast MRZ ![License](https://img.shields.io/badge/license-AGPL%203.0-green) ![Python](https://img.shields.io/badge/python-3.11.8-blue) [![CodeQL](https://github.com/sivakumar-mahalingam/Laghima/actions/workflows/codeql.yml/badge.svg)](https://github.com/sivakumar-mahalingam/Laghima/actions/workflows/codeql.yml) - - + + This repository extracts the Machine Readable Zone (MRZ) from passport images. The MRZ typically contains important information such as the passport holder's name, nationality, passport number, and date of birth. diff --git a/scripts/Laghima.py b/scripts/fastmrz.py similarity index 71% rename from scripts/Laghima.py rename to scripts/fastmrz.py index cd4f7a8..ec6a52e 100644 --- a/scripts/Laghima.py +++ b/scripts/fastmrz.py @@ -6,15 +6,17 @@ import os # Set the Tesseract path -pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/Cellar/tesseract/5.3.4_1/bin/tesseract' +# pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/Cellar/tesseract/5.3.4_1/bin/tesseract' # pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe' -class Laghima: - def __init__(self): + +class FastMRZ: + def __init__(self, tesseract_path=''): self.interpreter = tensorflow.lite.Interpreter(model_path=os.path.abspath('../models/mrz_seg.tflite')) self.interpreter.allocate_tensors() self.input_details = self.interpreter.get_input_details() self.output_details = self.interpreter.get_output_details() + self.tesseract_path = tesseract_path def _process_image(self, image_path): image = cv2.imread(image_path, cv2.IMREAD_COLOR) if isinstance(image_path, str) else image_path @@ -29,6 +31,8 @@ def _process_image(self, image_path): return image def _get_roi(self, output_data, image_path): + if self.tesseract_path != '': + pytesseract.pytesseract.tesseract_cmd = self.tesseract_path image = cv2.imread(image_path, cv2.IMREAD_COLOR) if isinstance(image_path, str) else image_path output_data = (output_data[0, :, :, 0] > 0.35) * 1 @@ -57,13 +61,15 @@ def _cleanse_roi(self, raw_text): selection_length = None for item in input_list: - if '<' in item and len(item) in (30, 36, 44): + if '<' in item and len(item) in (30, 36, 44): selection_length = len(item) break - output_list = [item for item in input_list if len(item) >= selection_length] + new_list = [item for item in input_list if len(item) >= selection_length] + + output_text = '\n'.join(new_list) - return output_list + return output_text def _get_final_check_digit(self, input_string, input_type): if input_type == 'TD3': @@ -71,7 +77,8 @@ def _get_final_check_digit(self, input_string, input_type): elif input_type == 'TD2': return self._get_check_digit(input_string[0:10] + input_string[13:20] + input_string[21:35]) else: - return self._get_check_digit(input_string[0][5:] + input_string[1][:7] + input_string[1][8:15] + input_string[1][18:29]) + return self._get_check_digit( + input_string[0][5:] + input_string[1][:7] + input_string[1][8:15] + input_string[1][18:29]) def _get_check_digit(self, input_string): weights_pattern = [7, 3, 1] @@ -109,36 +116,37 @@ def read_mrz(self, image_path): return self._parse_mrz(mrz_text) def _parse_mrz(self, mrz_text): - if len(mrz_text) not in [2, 3]: + mrz_lines = mrz_text.strip().split('\n') + if len(mrz_lines) not in [2, 3]: return {'status': 'FAILURE', 'message': 'Invalid MRZ format'} mrz_code_dict = {} - if len(mrz_text) == 2: + if len(mrz_lines) == 2: # add optional data field - mrz_code_dict['mrz_type'] = 'TD2' if len(mrz_text[0]) == 36 else 'TD3' + mrz_code_dict['mrz_type'] = 'TD2' if len(mrz_lines[0]) == 36 else 'TD3' # Line 1 - mrz_code_dict['document_type'] = mrz_text[0][:1] - mrz_code_dict['country_code'] = mrz_text[0][2:5] - names = mrz_text[0][5:].split('<<') + mrz_code_dict['document_type'] = mrz_lines[0][:1] + mrz_code_dict['country_code'] = mrz_lines[0][2:5] + names = mrz_lines[0][5:].split('<<') mrz_code_dict['surname'] = names[0].replace('<', ' ') mrz_code_dict['given_name'] = names[1].replace('<', ' ') # Line 2 - mrz_code_dict['document_number'] = mrz_text[1][0:9].replace('<', '') - if self._get_check_digit(mrz_code_dict['document_number']) != mrz_text[1][9]: + mrz_code_dict['document_number'] = mrz_lines[1][0:9].replace('<', '') + if self._get_check_digit(mrz_code_dict['document_number']) != mrz_lines[1][9]: return {'status': 'FAILURE', 'message': 'document number checksum is not matching'} - mrz_code_dict['nationality'] = mrz_text[1][10:13] - mrz_code_dict['date_of_birth'] = mrz_text[1][13:19] - if self._get_check_digit(mrz_code_dict['date_of_birth']) != mrz_text[1][19]: + mrz_code_dict['nationality'] = mrz_lines[1][10:13] + mrz_code_dict['date_of_birth'] = mrz_lines[1][13:19] + if self._get_check_digit(mrz_code_dict['date_of_birth']) != mrz_lines[1][19]: return {'status': 'FAILURE', 'message': 'date of birth checksum is not matching'} mrz_code_dict['date_of_birth'] = self._format_date(mrz_code_dict['date_of_birth']) - mrz_code_dict['sex'] = mrz_text[1][20] - mrz_code_dict['date_of_expiry'] = mrz_text[1][21:27] - if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_text[1][27]: + mrz_code_dict['sex'] = mrz_lines[1][20] + mrz_code_dict['date_of_expiry'] = mrz_lines[1][21:27] + if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_lines[1][27]: return {'status': 'FAILURE', 'message': 'date of expiry checksum is not matching'} mrz_code_dict['date_of_expiry'] = self._format_date(mrz_code_dict['date_of_expiry']) - if mrz_text[1][-1] != self._get_final_check_digit(mrz_text[1], mrz_code_dict['mrz_type']): + if mrz_lines[1][-1] != self._get_final_check_digit(mrz_lines[1], mrz_code_dict['mrz_type']): return {'status': 'FAILURE', 'message': 'final checksum is not matching'} # Final status @@ -147,30 +155,30 @@ def _parse_mrz(self, mrz_text): mrz_code_dict['mrz_type'] = 'TD1' # Line 1 - mrz_code_dict['document_type'] = mrz_text[0][:2].replace('<', ' ') - mrz_code_dict['country_code'] = mrz_text[0][2:5] - mrz_code_dict['document_number'] = mrz_text[0][5:14] - if self._get_check_digit(mrz_code_dict['document_number']) != mrz_text[0][14]: + mrz_code_dict['document_type'] = mrz_lines[0][:2].replace('<', ' ') + mrz_code_dict['country_code'] = mrz_lines[0][2:5] + mrz_code_dict['document_number'] = mrz_lines[0][5:14] + if self._get_check_digit(mrz_code_dict['document_number']) != mrz_lines[0][14]: return {'status': 'FAILURE', 'message': 'document number checksum is not matching'} - mrz_code_dict['optional_data_1'] = mrz_text[0][15:].strip('<') + mrz_code_dict['optional_data_1'] = mrz_lines[0][15:].strip('<') # Line 2 - mrz_code_dict['date_of_birth'] = mrz_text[1][:6] - if self._get_check_digit(mrz_code_dict['date_of_birth']) != mrz_text[1][6]: + mrz_code_dict['date_of_birth'] = mrz_lines[1][:6] + if self._get_check_digit(mrz_code_dict['date_of_birth']) != mrz_lines[1][6]: return {'status': 'FAILURE', 'message': 'date of birth checksum is not matching'} mrz_code_dict['date_of_birth'] = self._format_date(mrz_code_dict['date_of_birth']) - mrz_code_dict['sex'] = mrz_text[1][7] - mrz_code_dict['date_of_expiry'] = mrz_text[1][8:14] - if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_text[1][14]: + mrz_code_dict['sex'] = mrz_lines[1][7] + mrz_code_dict['date_of_expiry'] = mrz_lines[1][8:14] + if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_lines[1][14]: return {'status': 'FAILURE', 'message': 'date of expiry checksum is not matching'} mrz_code_dict['date_of_expiry'] = self._format_date(mrz_code_dict['date_of_expiry']) - mrz_code_dict['nationality'] = mrz_text[1][15:18] - mrz_code_dict['optional_data_2'] = mrz_text[0][18:29].strip('<') - if mrz_text[1][-1] != self._get_final_check_digit(mrz_text, mrz_code_dict['mrz_type']): + mrz_code_dict['nationality'] = mrz_lines[1][15:18] + mrz_code_dict['optional_data_2'] = mrz_lines[0][18:29].strip('<') + if mrz_lines[1][-1] != self._get_final_check_digit(mrz_lines, mrz_code_dict['mrz_type']): return {'status': 'FAILURE', 'message': 'final checksum is not matching'} # Line 3 - names = mrz_text[2].split('<<') + names = mrz_lines[2].split('<<') mrz_code_dict['surname'] = names[0].replace('<', ' ') mrz_code_dict['given_name'] = names[1].replace('<', ' ') @@ -178,8 +186,3 @@ def _parse_mrz(self, mrz_text): mrz_code_dict['status'] = 'SUCCESS' return mrz_code_dict - - - - - diff --git a/scripts/main.py b/scripts/main.py index 74760bc..ce67c74 100644 --- a/scripts/main.py +++ b/scripts/main.py @@ -1,9 +1,11 @@ -from Laghima import Laghima +from fastmrz import FastMRZ import os -laghima = Laghima() +# fast_mrz = FastMRZ(tesseract_path=r'/opt/homebrew/Cellar/tesseract/5.3.4_1/bin/tesseract') +# fast_mrz = FastMRZ(tesseract_path=r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe') -# Need to add other type of documents in /data -passport_mrz = laghima.read_mrz(os.path.abspath('../data/passport_uk.jpg')) +fast_mrz = FastMRZ() +passport_mrz = fast_mrz.read_mrz(os.path.abspath('../data/passport_uk.jpg')) print(passport_mrz) +# Add README testing badge. Ref, https://github.com/mingrammer/diagrams/blob/master/README.md?plain=1 diff --git a/tests/test.py b/tests/test.py index e69de29..ef10fa7 100644 --- a/tests/test.py +++ b/tests/test.py @@ -0,0 +1,65 @@ +import numpy as np +import os +from fastmrz import FastMRZ + +fast_mrz = FastMRZ() + + +# Test cases for _process_image function +def test_process_image(): + image_path = os.path.abspath('../data/td3.jpg') + processed_image = fast_mrz._process_image(image_path) + assert isinstance(processed_image, np.ndarray) + assert processed_image.shape == (1, 256, 256, 3) + + +# Test cases for _get_roi function +def test_get_roi(): + output_data = np.random.rand(1, 256, 256, 1) + image_path = os.path.abspath('../data/td3.jpg') + roi = fast_mrz._get_roi(output_data, image_path) + assert isinstance(roi, str) + + +# Test cases for _cleanse_roi function +def test_cleanse_roi(): + raw_text = "P