updated repo

sivakumar-mahalingam · Apr 7, 2024 · acac11b · acac11b
1 parent 19ff25d
commit acac11b
Show file tree

Hide file tree

Showing 4 changed files with 118 additions and 48 deletions.
diff --git a/README.md b/README.md
@@ -1,13 +1,13 @@
-# LAGHIMA
+# Fast MRZ
 
 ![License](https://img.shields.io/badge/license-AGPL%203.0-green)
 ![Python](https://img.shields.io/badge/python-3.11.8-blue)
 [![CodeQL](https://github.com/sivakumar-mahalingam/Laghima/actions/workflows/codeql.yml/badge.svg)](https://github.com/sivakumar-mahalingam/Laghima/actions/workflows/codeql.yml)
 
 <a href="https://github.com/sivakumar-mahalingam/passport-mrz-reader/" target="_blank">
-        <img src="docs/LAGHIMA.png" target="_blank" />
-</a>
+        <img src="docs/FastMRZ.png" target="_blank" />
 
+</a>
 
 This repository extracts the Machine Readable Zone (MRZ) from passport images. The MRZ typically contains important information such as the passport holder's name, nationality, passport number, and date of birth.
 

diff --git a/scripts/Laghima.py → scripts/fastmrz.py b/scripts/Laghima.py → scripts/fastmrz.py
@@ -6,15 +6,17 @@
 import os
 
 # Set the Tesseract path
-pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/Cellar/tesseract/5.3.4_1/bin/tesseract'
+# pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/Cellar/tesseract/5.3.4_1/bin/tesseract'
 # pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
 
-class Laghima:
-    def __init__(self):
+
+class FastMRZ:
+    def __init__(self, tesseract_path=''):
         self.interpreter = tensorflow.lite.Interpreter(model_path=os.path.abspath('../models/mrz_seg.tflite'))
         self.interpreter.allocate_tensors()
         self.input_details = self.interpreter.get_input_details()
         self.output_details = self.interpreter.get_output_details()
+        self.tesseract_path = tesseract_path
 
     def _process_image(self, image_path):
         image = cv2.imread(image_path, cv2.IMREAD_COLOR) if isinstance(image_path, str) else image_path
@@ -29,6 +31,8 @@ def _process_image(self, image_path):
         return image
 
     def _get_roi(self, output_data, image_path):
+        if self.tesseract_path != '':
+            pytesseract.pytesseract.tesseract_cmd = self.tesseract_path
         image = cv2.imread(image_path, cv2.IMREAD_COLOR) if isinstance(image_path, str) else image_path
 
         output_data = (output_data[0, :, :, 0] > 0.35) * 1
@@ -57,21 +61,24 @@ def _cleanse_roi(self, raw_text):
 
         selection_length = None
         for item in input_list:
-            if '<' in item  and len(item) in (30, 36, 44):
+            if '<' in item and len(item) in (30, 36, 44):
                 selection_length = len(item)
                 break
 
-        output_list = [item for item in input_list if len(item) >= selection_length]
+        new_list = [item for item in input_list if len(item) >= selection_length]
+
+        output_text = '\n'.join(new_list)
 
-        return output_list
+        return output_text
 
     def _get_final_check_digit(self, input_string, input_type):
         if input_type == 'TD3':
             return self._get_check_digit(input_string[0:10] + input_string[13:20] + input_string[21:43])
         elif input_type == 'TD2':
             return self._get_check_digit(input_string[0:10] + input_string[13:20] + input_string[21:35])
         else:
-            return self._get_check_digit(input_string[0][5:] + input_string[1][:7] + input_string[1][8:15] + input_string[1][18:29])
+            return self._get_check_digit(
+                input_string[0][5:] + input_string[1][:7] + input_string[1][8:15] + input_string[1][18:29])
 
     def _get_check_digit(self, input_string):
         weights_pattern = [7, 3, 1]
@@ -109,36 +116,37 @@ def read_mrz(self, image_path):
         return self._parse_mrz(mrz_text)
 
     def _parse_mrz(self, mrz_text):
-        if len(mrz_text) not in [2, 3]:
+        mrz_lines = mrz_text.strip().split('\n')
+        if len(mrz_lines) not in [2, 3]:
             return {'status': 'FAILURE', 'message': 'Invalid MRZ format'}
 
         mrz_code_dict = {}
-        if len(mrz_text) == 2:
+        if len(mrz_lines) == 2:
             # add optional data field
-            mrz_code_dict['mrz_type'] = 'TD2' if len(mrz_text[0]) == 36 else 'TD3'
+            mrz_code_dict['mrz_type'] = 'TD2' if len(mrz_lines[0]) == 36 else 'TD3'
 
             # Line 1
-            mrz_code_dict['document_type'] = mrz_text[0][:1]
-            mrz_code_dict['country_code'] = mrz_text[0][2:5]
-            names = mrz_text[0][5:].split('<<')
+            mrz_code_dict['document_type'] = mrz_lines[0][:1]
+            mrz_code_dict['country_code'] = mrz_lines[0][2:5]
+            names = mrz_lines[0][5:].split('<<')
             mrz_code_dict['surname'] = names[0].replace('<', ' ')
             mrz_code_dict['given_name'] = names[1].replace('<', ' ')
 
             # Line 2
-            mrz_code_dict['document_number'] = mrz_text[1][0:9].replace('<', '')
-            if self._get_check_digit(mrz_code_dict['document_number']) != mrz_text[1][9]:
+            mrz_code_dict['document_number'] = mrz_lines[1][0:9].replace('<', '')
+            if self._get_check_digit(mrz_code_dict['document_number']) != mrz_lines[1][9]:
                 return {'status': 'FAILURE', 'message': 'document number checksum is not matching'}
-            mrz_code_dict['nationality'] = mrz_text[1][10:13]
-            mrz_code_dict['date_of_birth'] = mrz_text[1][13:19]
-            if self._get_check_digit(mrz_code_dict['date_of_birth']) != mrz_text[1][19]:
+            mrz_code_dict['nationality'] = mrz_lines[1][10:13]
+            mrz_code_dict['date_of_birth'] = mrz_lines[1][13:19]
+            if self._get_check_digit(mrz_code_dict['date_of_birth']) != mrz_lines[1][19]:
                 return {'status': 'FAILURE', 'message': 'date of birth checksum is not matching'}
             mrz_code_dict['date_of_birth'] = self._format_date(mrz_code_dict['date_of_birth'])
-            mrz_code_dict['sex'] = mrz_text[1][20]
-            mrz_code_dict['date_of_expiry'] = mrz_text[1][21:27]
-            if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_text[1][27]:
+            mrz_code_dict['sex'] = mrz_lines[1][20]
+            mrz_code_dict['date_of_expiry'] = mrz_lines[1][21:27]
+            if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_lines[1][27]:
                 return {'status': 'FAILURE', 'message': 'date of expiry checksum is not matching'}
             mrz_code_dict['date_of_expiry'] = self._format_date(mrz_code_dict['date_of_expiry'])
-            if mrz_text[1][-1] != self._get_final_check_digit(mrz_text[1], mrz_code_dict['mrz_type']):
+            if mrz_lines[1][-1] != self._get_final_check_digit(mrz_lines[1], mrz_code_dict['mrz_type']):
                 return {'status': 'FAILURE', 'message': 'final checksum is not matching'}
 
             # Final status
@@ -147,39 +155,34 @@ def _parse_mrz(self, mrz_text):
             mrz_code_dict['mrz_type'] = 'TD1'
 
             # Line 1
-            mrz_code_dict['document_type'] = mrz_text[0][:2].replace('<', ' ')
-            mrz_code_dict['country_code'] = mrz_text[0][2:5]
-            mrz_code_dict['document_number'] = mrz_text[0][5:14]
-            if self._get_check_digit(mrz_code_dict['document_number']) != mrz_text[0][14]:
+            mrz_code_dict['document_type'] = mrz_lines[0][:2].replace('<', ' ')
+            mrz_code_dict['country_code'] = mrz_lines[0][2:5]
+            mrz_code_dict['document_number'] = mrz_lines[0][5:14]
+            if self._get_check_digit(mrz_code_dict['document_number']) != mrz_lines[0][14]:
                 return {'status': 'FAILURE', 'message': 'document number checksum is not matching'}
-            mrz_code_dict['optional_data_1'] = mrz_text[0][15:].strip('<')
+            mrz_code_dict['optional_data_1'] = mrz_lines[0][15:].strip('<')
 
             # Line 2
-            mrz_code_dict['date_of_birth'] = mrz_text[1][:6]
-            if self._get_check_digit(mrz_code_dict['date_of_birth']) != mrz_text[1][6]:
+            mrz_code_dict['date_of_birth'] = mrz_lines[1][:6]
+            if self._get_check_digit(mrz_code_dict['date_of_birth']) != mrz_lines[1][6]:
                 return {'status': 'FAILURE', 'message': 'date of birth checksum is not matching'}
             mrz_code_dict['date_of_birth'] = self._format_date(mrz_code_dict['date_of_birth'])
-            mrz_code_dict['sex'] = mrz_text[1][7]
-            mrz_code_dict['date_of_expiry'] = mrz_text[1][8:14]
-            if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_text[1][14]:
+            mrz_code_dict['sex'] = mrz_lines[1][7]
+            mrz_code_dict['date_of_expiry'] = mrz_lines[1][8:14]
+            if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_lines[1][14]:
                 return {'status': 'FAILURE', 'message': 'date of expiry checksum is not matching'}
             mrz_code_dict['date_of_expiry'] = self._format_date(mrz_code_dict['date_of_expiry'])
-            mrz_code_dict['nationality'] = mrz_text[1][15:18]
-            mrz_code_dict['optional_data_2'] = mrz_text[0][18:29].strip('<')
-            if mrz_text[1][-1] != self._get_final_check_digit(mrz_text, mrz_code_dict['mrz_type']):
+            mrz_code_dict['nationality'] = mrz_lines[1][15:18]
+            mrz_code_dict['optional_data_2'] = mrz_lines[0][18:29].strip('<')
+            if mrz_lines[1][-1] != self._get_final_check_digit(mrz_lines, mrz_code_dict['mrz_type']):
                 return {'status': 'FAILURE', 'message': 'final checksum is not matching'}
 
             # Line 3
-            names = mrz_text[2].split('<<')
+            names = mrz_lines[2].split('<<')
             mrz_code_dict['surname'] = names[0].replace('<', ' ')
             mrz_code_dict['given_name'] = names[1].replace('<', ' ')
 
             # Final status
             mrz_code_dict['status'] = 'SUCCESS'
 
         return mrz_code_dict
-
-
-
-
-
diff --git a/scripts/main.py b/scripts/main.py
@@ -1,9 +1,11 @@
-from Laghima import Laghima
+from fastmrz import FastMRZ
 import os
 
-laghima = Laghima()
+# fast_mrz = FastMRZ(tesseract_path=r'/opt/homebrew/Cellar/tesseract/5.3.4_1/bin/tesseract')
+# fast_mrz = FastMRZ(tesseract_path=r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe')
 
-# Need to add other type of documents in /data
-passport_mrz = laghima.read_mrz(os.path.abspath('../data/passport_uk.jpg'))
+fast_mrz = FastMRZ()
+passport_mrz = fast_mrz.read_mrz(os.path.abspath('../data/passport_uk.jpg'))
 print(passport_mrz)
 
+# TODO: add a testing badge to the README; for an example, see https://github.com/mingrammer/diagrams/blob/master/README.md?plain=1
diff --git a/tests/test.py b/tests/test.py
@@ -0,0 +1,65 @@
+import numpy as np
+import os
+from fastmrz import FastMRZ
+
+fast_mrz = FastMRZ()
+
+
+# Test cases for _process_image function
+def test_process_image():
+    image_path = os.path.abspath('../data/td3.jpg')
+    processed_image = fast_mrz._process_image(image_path)
+    assert isinstance(processed_image, np.ndarray)
+    assert processed_image.shape == (1, 256, 256, 3)
+
+
+# Test cases for _get_roi function
+def test_get_roi():
+    output_data = np.random.rand(1, 256, 256, 1)
+    image_path = os.path.abspath('../data/td3.jpg')
+    roi = fast_mrz._get_roi(output_data, image_path)
+    assert isinstance(roi, str)
+
+
+# Test cases for _cleanse_roi function
+def test_cleanse_roi():
+    raw_text = "P<UTOERIKSSON<<ANNA<MARIA<<< <<<<<<<<<  <<<<<<<\n\nL898902C36UTO7408122F1204159ZE184226B<<<<<10\n"
+    cleansed_text = fast_mrz._cleanse_roi(raw_text)
+    assert isinstance(cleansed_text, str)
+
+
+# Test cases for _get_final_check_digit function
+def test_get_final_check_digit():
+    input_string = "'I<UTOERIKSSON<<ANNA<MARIA<<<<<<<<<<<\nD231458907UTO7408122F1204159<<<<<<<6"
+    input_type = "TD2"
+    final_check_digit = fast_mrz._get_final_check_digit(input_string, input_type)
+    assert isinstance(final_check_digit, str)
+
+
+# Test cases for _get_check_digit function
+def test_get_check_digit():
+    input_string = "'I<UTOERIKSSON<<ANNA< MARIA<<<<< <<<<<<\nD231458907UTO7408122F1204159<<<<<<<6\n\n"
+    check_digit = fast_mrz._get_check_digit(input_string)
+    assert isinstance(check_digit, str)
+
+
+# Test cases for _format_date function
+def test_format_date():
+    input_date = "220101"
+    formatted_date = fast_mrz._format_date(input_date)
+    assert isinstance(formatted_date, str)
+
+
+# Test cases for read_raw_mrz function
+def test_read_raw_mrz():
+    image_path = os.path.abspath('../data/td2.jpg')
+    raw_mrz = fast_mrz.read_raw_mrz(image_path)
+    assert isinstance(raw_mrz, str)
+
+
+# Test cases for read_mrz function
+def test_read_mrz():
+    image_path = os.path.abspath('../data/td3.jpg')
+    mrz_data = fast_mrz.read_mrz(image_path)
+    assert isinstance(mrz_data, dict)
+    assert 'status' in mrz_data.keys()