From acac11b8c71c47021f7b20e31d4f25629d2da757 Mon Sep 17 00:00:00 2001
From: Sivakumar Mahalingam <sivasai2006@gmail.com>
Date: Sun, 7 Apr 2024 11:41:00 +0400
Subject: [PATCH] Rename Laghima to FastMRZ, make Tesseract path configurable, add tests

---
 README.md                          |  6 +--
 scripts/{Laghima.py => fastmrz.py} | 85 ++++++++++++++++--------------
 scripts/main.py                    | 10 ++--
 tests/test.py                      | 65 +++++++++++++++++++++++
 4 files changed, 118 insertions(+), 48 deletions(-)
 rename scripts/{Laghima.py => fastmrz.py} (71%)
diff --git a/README.md b/README.md
index 531ef8e..45ac6ce 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,13 @@
-# LAGHIMA
+# Fast MRZ
 
 ![License](https://img.shields.io/badge/license-AGPL%203.0-green)
 ![Python](https://img.shields.io/badge/python-3.11.8-blue)
 [![CodeQL](https://github.com/sivakumar-mahalingam/Laghima/actions/workflows/codeql.yml/badge.svg)](https://github.com/sivakumar-mahalingam/Laghima/actions/workflows/codeql.yml)
 
 <a href="https://github.com/sivakumar-mahalingam/passport-mrz-reader/" target="_blank">
-        <img src="docs/LAGHIMA.png" target="_blank" />
-</a>
+        <img src="docs/FastMRZ.png" target="_blank" />
 
+</a>
 
 This repository extracts the Machine Readable Zone (MRZ) from passport images. The MRZ typically contains important information such as the passport holder's name, nationality, passport number, and date of birth.
 
diff --git a/scripts/Laghima.py b/scripts/fastmrz.py
similarity index 71%
rename from scripts/Laghima.py
rename to scripts/fastmrz.py
index cd4f7a8..ec6a52e 100644
--- a/scripts/Laghima.py
+++ b/scripts/fastmrz.py
@@ -6,15 +6,17 @@
 import os
 
 # Set the Tesseract path
-pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/Cellar/tesseract/5.3.4_1/bin/tesseract'
+# pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/Cellar/tesseract/5.3.4_1/bin/tesseract'
 # pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
 
-class Laghima:
-    def __init__(self):
+
+class FastMRZ:
+    def __init__(self, tesseract_path=''):
         self.interpreter = tensorflow.lite.Interpreter(model_path=os.path.abspath('../models/mrz_seg.tflite'))
         self.interpreter.allocate_tensors()
         self.input_details = self.interpreter.get_input_details()
         self.output_details = self.interpreter.get_output_details()
+        self.tesseract_path = tesseract_path
 
     def _process_image(self, image_path):
         image = cv2.imread(image_path, cv2.IMREAD_COLOR) if isinstance(image_path, str) else image_path
@@ -29,6 +31,8 @@ def _process_image(self, image_path):
         return image
 
     def _get_roi(self, output_data, image_path):
+        if self.tesseract_path != '':
+            pytesseract.pytesseract.tesseract_cmd = self.tesseract_path
         image = cv2.imread(image_path, cv2.IMREAD_COLOR) if isinstance(image_path, str) else image_path
 
         output_data = (output_data[0, :, :, 0] > 0.35) * 1
@@ -57,13 +61,15 @@ def _cleanse_roi(self, raw_text):
 
         selection_length = None
         for item in input_list:
-            if '<' in item  and len(item) in (30, 36, 44):
+            if '<' in item and len(item) in (30, 36, 44):
                 selection_length = len(item)
                 break
 
-        output_list = [item for item in input_list if len(item) >= selection_length]
+        new_list = [item for item in input_list if len(item) >= selection_length]
+
+        output_text = '\n'.join(new_list)
 
-        return output_list
+        return output_text
 
     def _get_final_check_digit(self, input_string, input_type):
         if input_type == 'TD3':
@@ -71,7 +77,8 @@ def _get_final_check_digit(self, input_string, input_type):
         elif input_type == 'TD2':
             return self._get_check_digit(input_string[0:10] + input_string[13:20] + input_string[21:35])
         else:
-            return self._get_check_digit(input_string[0][5:] + input_string[1][:7] + input_string[1][8:15] + input_string[1][18:29])
+            return self._get_check_digit(
+                input_string[0][5:] + input_string[1][:7] + input_string[1][8:15] + input_string[1][18:29])
 
     def _get_check_digit(self, input_string):
         weights_pattern = [7, 3, 1]
@@ -109,36 +116,37 @@ def read_mrz(self, image_path):
         return self._parse_mrz(mrz_text)
 
     def _parse_mrz(self, mrz_text):
-        if len(mrz_text) not in [2, 3]:
+        mrz_lines = mrz_text.strip().split('\n')
+        if len(mrz_lines) not in [2, 3]:
             return {'status': 'FAILURE', 'message': 'Invalid MRZ format'}
 
         mrz_code_dict = {}
-        if len(mrz_text) == 2:
+        if len(mrz_lines) == 2:
             # add optional data field
-            mrz_code_dict['mrz_type'] = 'TD2' if len(mrz_text[0]) == 36 else 'TD3'
+            mrz_code_dict['mrz_type'] = 'TD2' if len(mrz_lines[0]) == 36 else 'TD3'
 
             # Line 1
-            mrz_code_dict['document_type'] = mrz_text[0][:1]
-            mrz_code_dict['country_code'] = mrz_text[0][2:5]
-            names = mrz_text[0][5:].split('<<')
+            mrz_code_dict['document_type'] = mrz_lines[0][:1]
+            mrz_code_dict['country_code'] = mrz_lines[0][2:5]
+            names = mrz_lines[0][5:].split('<<')
             mrz_code_dict['surname'] = names[0].replace('<', ' ')
             mrz_code_dict['given_name'] = names[1].replace('<', ' ')
 
             # Line 2
-            mrz_code_dict['document_number'] = mrz_text[1][0:9].replace('<', '')
-            if self._get_check_digit(mrz_code_dict['document_number']) != mrz_text[1][9]:
+            mrz_code_dict['document_number'] = mrz_lines[1][0:9].replace('<', '')
+            if self._get_check_digit(mrz_code_dict['document_number']) != mrz_lines[1][9]:
                 return {'status': 'FAILURE', 'message': 'document number checksum is not matching'}
-            mrz_code_dict['nationality'] = mrz_text[1][10:13]
-            mrz_code_dict['date_of_birth'] = mrz_text[1][13:19]
-            if self._get_check_digit(mrz_code_dict['date_of_birth']) != mrz_text[1][19]:
+            mrz_code_dict['nationality'] = mrz_lines[1][10:13]
+            mrz_code_dict['date_of_birth'] = mrz_lines[1][13:19]
+            if self._get_check_digit(mrz_code_dict['date_of_birth']) != mrz_lines[1][19]:
                 return {'status': 'FAILURE', 'message': 'date of birth checksum is not matching'}
             mrz_code_dict['date_of_birth'] = self._format_date(mrz_code_dict['date_of_birth'])
-            mrz_code_dict['sex'] = mrz_text[1][20]
-            mrz_code_dict['date_of_expiry'] = mrz_text[1][21:27]
-            if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_text[1][27]:
+            mrz_code_dict['sex'] = mrz_lines[1][20]
+            mrz_code_dict['date_of_expiry'] = mrz_lines[1][21:27]
+            if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_lines[1][27]:
                 return {'status': 'FAILURE', 'message': 'date of expiry checksum is not matching'}
             mrz_code_dict['date_of_expiry'] = self._format_date(mrz_code_dict['date_of_expiry'])
-            if mrz_text[1][-1] != self._get_final_check_digit(mrz_text[1], mrz_code_dict['mrz_type']):
+            if mrz_lines[1][-1] != self._get_final_check_digit(mrz_lines[1], mrz_code_dict['mrz_type']):
                 return {'status': 'FAILURE', 'message': 'final checksum is not matching'}
 
             # Final status
@@ -147,30 +155,30 @@ def _parse_mrz(self, mrz_text):
             mrz_code_dict['mrz_type'] = 'TD1'
 
             # Line 1
-            mrz_code_dict['document_type'] = mrz_text[0][:2].replace('<', ' ')
-            mrz_code_dict['country_code'] = mrz_text[0][2:5]
-            mrz_code_dict['document_number'] = mrz_text[0][5:14]
-            if self._get_check_digit(mrz_code_dict['document_number']) != mrz_text[0][14]:
+            mrz_code_dict['document_type'] = mrz_lines[0][:2].replace('<', ' ')
+            mrz_code_dict['country_code'] = mrz_lines[0][2:5]
+            mrz_code_dict['document_number'] = mrz_lines[0][5:14]
+            if self._get_check_digit(mrz_code_dict['document_number']) != mrz_lines[0][14]:
                 return {'status': 'FAILURE', 'message': 'document number checksum is not matching'}
-            mrz_code_dict['optional_data_1'] = mrz_text[0][15:].strip('<')
+            mrz_code_dict['optional_data_1'] = mrz_lines[0][15:].strip('<')
 
             # Line 2
-            mrz_code_dict['date_of_birth'] = mrz_text[1][:6]
-            if self._get_check_digit(mrz_code_dict['date_of_birth']) != mrz_text[1][6]:
+            mrz_code_dict['date_of_birth'] = mrz_lines[1][:6]
+            if self._get_check_digit(mrz_code_dict['date_of_birth']) != mrz_lines[1][6]:
                 return {'status': 'FAILURE', 'message': 'date of birth checksum is not matching'}
             mrz_code_dict['date_of_birth'] = self._format_date(mrz_code_dict['date_of_birth'])
-            mrz_code_dict['sex'] = mrz_text[1][7]
-            mrz_code_dict['date_of_expiry'] = mrz_text[1][8:14]
-            if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_text[1][14]:
+            mrz_code_dict['sex'] = mrz_lines[1][7]
+            mrz_code_dict['date_of_expiry'] = mrz_lines[1][8:14]
+            if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_lines[1][14]:
                 return {'status': 'FAILURE', 'message': 'date of expiry checksum is not matching'}
             mrz_code_dict['date_of_expiry'] = self._format_date(mrz_code_dict['date_of_expiry'])
-            mrz_code_dict['nationality'] = mrz_text[1][15:18]
-            mrz_code_dict['optional_data_2'] = mrz_text[0][18:29].strip('<')
-            if mrz_text[1][-1] != self._get_final_check_digit(mrz_text, mrz_code_dict['mrz_type']):
+            mrz_code_dict['nationality'] = mrz_lines[1][15:18]
+            mrz_code_dict['optional_data_2'] = mrz_lines[0][18:29].strip('<')
+            if mrz_lines[1][-1] != self._get_final_check_digit(mrz_lines, mrz_code_dict['mrz_type']):
                 return {'status': 'FAILURE', 'message': 'final checksum is not matching'}
 
             # Line 3
-            names = mrz_text[2].split('<<')
+            names = mrz_lines[2].split('<<')
             mrz_code_dict['surname'] = names[0].replace('<', ' ')
             mrz_code_dict['given_name'] = names[1].replace('<', ' ')
 
@@ -178,8 +186,3 @@ def _parse_mrz(self, mrz_text):
             mrz_code_dict['status'] = 'SUCCESS'
 
         return mrz_code_dict
-
-
-
-
-
diff --git a/scripts/main.py b/scripts/main.py
index 74760bc..ce67c74 100644
--- a/scripts/main.py
+++ b/scripts/main.py
@@ -1,9 +1,11 @@
-from Laghima import Laghima
+from fastmrz import FastMRZ
 import os
 
-laghima = Laghima()
+# fast_mrz = FastMRZ(tesseract_path=r'/opt/homebrew/Cellar/tesseract/5.3.4_1/bin/tesseract')
+# fast_mrz = FastMRZ(tesseract_path=r'C:\Program Files\Tesseract-OCR\tesseract.exe')
 
-# Need to add other type of documents in /data
-passport_mrz = laghima.read_mrz(os.path.abspath('../data/passport_uk.jpg'))
+fast_mrz = FastMRZ()
+passport_mrz = fast_mrz.read_mrz(os.path.abspath('../data/passport_uk.jpg'))
 print(passport_mrz)
 
+# TODO: add a testing badge to the README; for an example, see https://github.com/mingrammer/diagrams/blob/master/README.md?plain=1
diff --git a/tests/test.py b/tests/test.py
index e69de29..ef10fa7 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -0,0 +1,65 @@
+import numpy as np
+import os
+from fastmrz import FastMRZ
+
+fast_mrz = FastMRZ()
+
+
+# Test cases for _process_image function
+def test_process_image():
+    image_path = os.path.abspath('../data/td3.jpg')
+    processed_image = fast_mrz._process_image(image_path)
+    assert isinstance(processed_image, np.ndarray)
+    assert processed_image.shape == (1, 256, 256, 3)
+
+
+# Test cases for _get_roi function
+def test_get_roi():
+    output_data = np.random.rand(1, 256, 256, 1)
+    image_path = os.path.abspath('../data/td3.jpg')
+    roi = fast_mrz._get_roi(output_data, image_path)
+    assert isinstance(roi, str)
+
+
+# Test cases for _cleanse_roi function
+def test_cleanse_roi():
+    raw_text = "P<UTOERIKSSON<<ANNA<MARIA<<< <<<<<<<<<  <<<<<<<\n\nL898902C36UTO7408122F1204159ZE184226B<<<<<10\n"
+    cleansed_text = fast_mrz._cleanse_roi(raw_text)
+    assert isinstance(cleansed_text, str)
+
+
+# Test cases for _get_final_check_digit function
+def test_get_final_check_digit():
+    input_string = "'I<UTOERIKSSON<<ANNA<MARIA<<<<<<<<<<<\nD231458907UTO7408122F1204159<<<<<<<6"
+    input_type = "TD2"
+    final_check_digit = fast_mrz._get_final_check_digit(input_string, input_type)
+    assert isinstance(final_check_digit, str)
+
+
+# Test cases for _get_check_digit function
+def test_get_check_digit():
+    input_string = "'I<UTOERIKSSON<<ANNA< MARIA<<<<< <<<<<<\nD231458907UTO7408122F1204159<<<<<<<6\n\n"
+    check_digit = fast_mrz._get_check_digit(input_string)
+    assert isinstance(check_digit, str)
+
+
+# Test cases for _format_date function
+def test_format_date():
+    input_date = "220101"
+    formatted_date = fast_mrz._format_date(input_date)
+    assert isinstance(formatted_date, str)
+
+
+# Test cases for read_raw_mrz function
+def test_read_raw_mrz():
+    image_path = os.path.abspath('../data/td2.jpg')
+    raw_mrz = fast_mrz.read_raw_mrz(image_path)
+    assert isinstance(raw_mrz, str)
+
+
+# Test cases for read_mrz function
+def test_read_mrz():
+    image_path = os.path.abspath('../data/td3.jpg')
+    mrz_data = fast_mrz.read_mrz(image_path)
+    assert isinstance(mrz_data, dict)
+    assert 'status' in mrz_data.keys()