Skip to content

Commit

Permalink
updated repo
Browse files Browse the repository at this point in the history
  • Loading branch information
sivakumar-mahalingam committed Apr 7, 2024
1 parent 19ff25d commit acac11b
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 48 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# LAGHIMA
# Fast MRZ

![License](https://img.shields.io/badge/license-AGPL%203.0-green)
![Python](https://img.shields.io/badge/python-3.11.8-blue)
[![CodeQL](https://github.com/sivakumar-mahalingam/Laghima/actions/workflows/codeql.yml/badge.svg)](https://github.com/sivakumar-mahalingam/Laghima/actions/workflows/codeql.yml)

<a href="https://github.com/sivakumar-mahalingam/passport-mrz-reader/" target="_blank">
<img src="docs/LAGHIMA.png" target="_blank" />
</a>
<img src="docs/FastMRZ.png" target="_blank" />

</a>

This repository extracts the Machine Readable Zone (MRZ) from passport images. The MRZ typically contains important information such as the passport holder's name, nationality, passport number, and date of birth.

Expand Down
85 changes: 44 additions & 41 deletions scripts/Laghima.py → scripts/fastmrz.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,17 @@
import os

# Set the Tesseract path
pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/Cellar/tesseract/5.3.4_1/bin/tesseract'
# pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/Cellar/tesseract/5.3.4_1/bin/tesseract'
# pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

class Laghima:
def __init__(self):

class FastMRZ:
def __init__(self, tesseract_path=''):
    """Load the MRZ segmentation model and store the Tesseract path.

    Args:
        tesseract_path: Optional path to the Tesseract executable. The
            default empty string means no override is applied when the
            path is later consulted in `_get_roi`.
    """
    # Resolve the model relative to this module instead of the current
    # working directory, so construction does not depend on where the
    # interpreter was started from.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    model_path = os.path.abspath(os.path.join(module_dir, '..', 'models', 'mrz_seg.tflite'))

    self.interpreter = tensorflow.lite.Interpreter(model_path=model_path)
    self.interpreter.allocate_tensors()
    self.input_details = self.interpreter.get_input_details()
    self.output_details = self.interpreter.get_output_details()
    self.tesseract_path = tesseract_path

def _process_image(self, image_path):
image = cv2.imread(image_path, cv2.IMREAD_COLOR) if isinstance(image_path, str) else image_path
Expand All @@ -29,6 +31,8 @@ def _process_image(self, image_path):
return image

def _get_roi(self, output_data, image_path):
if self.tesseract_path != '':
pytesseract.pytesseract.tesseract_cmd = self.tesseract_path
image = cv2.imread(image_path, cv2.IMREAD_COLOR) if isinstance(image_path, str) else image_path

output_data = (output_data[0, :, :, 0] > 0.35) * 1
Expand Down Expand Up @@ -57,21 +61,24 @@ def _cleanse_roi(self, raw_text):

selection_length = None
for item in input_list:
if '<' in item and len(item) in (30, 36, 44):
if '<' in item and len(item) in (30, 36, 44):
selection_length = len(item)
break

output_list = [item for item in input_list if len(item) >= selection_length]
new_list = [item for item in input_list if len(item) >= selection_length]

output_text = '\n'.join(new_list)

return output_list
return output_text

def _get_final_check_digit(self, input_string, input_type):
    """Return the check digit computed over the composite MRZ fields.

    Args:
        input_string: For 'TD3' and 'TD2' this is sliced directly as one
            sequence; for any other type it is indexed as a sequence of
            lines, of which elements 0 and 1 are used.
        input_type: MRZ type label ('TD3', 'TD2', or anything else).

    Returns:
        Whatever `_get_check_digit` returns for the concatenated fields.
    """
    if input_type == 'TD3':
        composite = input_string[0:10] + input_string[13:20] + input_string[21:43]
    elif input_type == 'TD2':
        composite = input_string[0:10] + input_string[13:20] + input_string[21:35]
    else:
        # Multi-line layout: the tail of the first line plus three spans of
        # the second line.
        upper = input_string[0]
        middle = input_string[1]
        composite = upper[5:] + middle[:7] + middle[8:15] + middle[18:29]

    return self._get_check_digit(composite)

def _get_check_digit(self, input_string):
weights_pattern = [7, 3, 1]
Expand Down Expand Up @@ -109,36 +116,37 @@ def read_mrz(self, image_path):
return self._parse_mrz(mrz_text)

def _parse_mrz(self, mrz_text):
if len(mrz_text) not in [2, 3]:
mrz_lines = mrz_text.strip().split('\n')
if len(mrz_lines) not in [2, 3]:
return {'status': 'FAILURE', 'message': 'Invalid MRZ format'}

mrz_code_dict = {}
if len(mrz_text) == 2:
if len(mrz_lines) == 2:
# add optional data field
mrz_code_dict['mrz_type'] = 'TD2' if len(mrz_text[0]) == 36 else 'TD3'
mrz_code_dict['mrz_type'] = 'TD2' if len(mrz_lines[0]) == 36 else 'TD3'

# Line 1
mrz_code_dict['document_type'] = mrz_text[0][:1]
mrz_code_dict['country_code'] = mrz_text[0][2:5]
names = mrz_text[0][5:].split('<<')
mrz_code_dict['document_type'] = mrz_lines[0][:1]
mrz_code_dict['country_code'] = mrz_lines[0][2:5]
names = mrz_lines[0][5:].split('<<')
mrz_code_dict['surname'] = names[0].replace('<', ' ')
mrz_code_dict['given_name'] = names[1].replace('<', ' ')

# Line 2
mrz_code_dict['document_number'] = mrz_text[1][0:9].replace('<', '')
if self._get_check_digit(mrz_code_dict['document_number']) != mrz_text[1][9]:
mrz_code_dict['document_number'] = mrz_lines[1][0:9].replace('<', '')
if self._get_check_digit(mrz_code_dict['document_number']) != mrz_lines[1][9]:
return {'status': 'FAILURE', 'message': 'document number checksum is not matching'}
mrz_code_dict['nationality'] = mrz_text[1][10:13]
mrz_code_dict['date_of_birth'] = mrz_text[1][13:19]
if self._get_check_digit(mrz_code_dict['date_of_birth']) != mrz_text[1][19]:
mrz_code_dict['nationality'] = mrz_lines[1][10:13]
mrz_code_dict['date_of_birth'] = mrz_lines[1][13:19]
if self._get_check_digit(mrz_code_dict['date_of_birth']) != mrz_lines[1][19]:
return {'status': 'FAILURE', 'message': 'date of birth checksum is not matching'}
mrz_code_dict['date_of_birth'] = self._format_date(mrz_code_dict['date_of_birth'])
mrz_code_dict['sex'] = mrz_text[1][20]
mrz_code_dict['date_of_expiry'] = mrz_text[1][21:27]
if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_text[1][27]:
mrz_code_dict['sex'] = mrz_lines[1][20]
mrz_code_dict['date_of_expiry'] = mrz_lines[1][21:27]
if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_lines[1][27]:
return {'status': 'FAILURE', 'message': 'date of expiry checksum is not matching'}
mrz_code_dict['date_of_expiry'] = self._format_date(mrz_code_dict['date_of_expiry'])
if mrz_text[1][-1] != self._get_final_check_digit(mrz_text[1], mrz_code_dict['mrz_type']):
if mrz_lines[1][-1] != self._get_final_check_digit(mrz_lines[1], mrz_code_dict['mrz_type']):
return {'status': 'FAILURE', 'message': 'final checksum is not matching'}

# Final status
Expand All @@ -147,39 +155,34 @@ def _parse_mrz(self, mrz_text):
mrz_code_dict['mrz_type'] = 'TD1'

# Line 1
mrz_code_dict['document_type'] = mrz_text[0][:2].replace('<', ' ')
mrz_code_dict['country_code'] = mrz_text[0][2:5]
mrz_code_dict['document_number'] = mrz_text[0][5:14]
if self._get_check_digit(mrz_code_dict['document_number']) != mrz_text[0][14]:
mrz_code_dict['document_type'] = mrz_lines[0][:2].replace('<', ' ')
mrz_code_dict['country_code'] = mrz_lines[0][2:5]
mrz_code_dict['document_number'] = mrz_lines[0][5:14]
if self._get_check_digit(mrz_code_dict['document_number']) != mrz_lines[0][14]:
return {'status': 'FAILURE', 'message': 'document number checksum is not matching'}
mrz_code_dict['optional_data_1'] = mrz_text[0][15:].strip('<')
mrz_code_dict['optional_data_1'] = mrz_lines[0][15:].strip('<')

# Line 2
mrz_code_dict['date_of_birth'] = mrz_text[1][:6]
if self._get_check_digit(mrz_code_dict['date_of_birth']) != mrz_text[1][6]:
mrz_code_dict['date_of_birth'] = mrz_lines[1][:6]
if self._get_check_digit(mrz_code_dict['date_of_birth']) != mrz_lines[1][6]:
return {'status': 'FAILURE', 'message': 'date of birth checksum is not matching'}
mrz_code_dict['date_of_birth'] = self._format_date(mrz_code_dict['date_of_birth'])
mrz_code_dict['sex'] = mrz_text[1][7]
mrz_code_dict['date_of_expiry'] = mrz_text[1][8:14]
if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_text[1][14]:
mrz_code_dict['sex'] = mrz_lines[1][7]
mrz_code_dict['date_of_expiry'] = mrz_lines[1][8:14]
if self._get_check_digit(mrz_code_dict['date_of_expiry']) != mrz_lines[1][14]:
return {'status': 'FAILURE', 'message': 'date of expiry checksum is not matching'}
mrz_code_dict['date_of_expiry'] = self._format_date(mrz_code_dict['date_of_expiry'])
mrz_code_dict['nationality'] = mrz_text[1][15:18]
mrz_code_dict['optional_data_2'] = mrz_text[0][18:29].strip('<')
if mrz_text[1][-1] != self._get_final_check_digit(mrz_text, mrz_code_dict['mrz_type']):
mrz_code_dict['nationality'] = mrz_lines[1][15:18]
mrz_code_dict['optional_data_2'] = mrz_lines[0][18:29].strip('<')
if mrz_lines[1][-1] != self._get_final_check_digit(mrz_lines, mrz_code_dict['mrz_type']):
return {'status': 'FAILURE', 'message': 'final checksum is not matching'}

# Line 3
names = mrz_text[2].split('<<')
names = mrz_lines[2].split('<<')
mrz_code_dict['surname'] = names[0].replace('<', ' ')
mrz_code_dict['given_name'] = names[1].replace('<', ' ')

# Final status
mrz_code_dict['status'] = 'SUCCESS'

return mrz_code_dict





10 changes: 6 additions & 4 deletions scripts/main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from Laghima import Laghima
from fastmrz import FastMRZ
import os

laghima = Laghima()
# fast_mrz = FastMRZ(tesseract_path=r'/opt/homebrew/Cellar/tesseract/5.3.4_1/bin/tesseract')
# fast_mrz = FastMRZ(tesseract_path=r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe')

# Need to add other types of documents in /data
passport_mrz = laghima.read_mrz(os.path.abspath('../data/passport_uk.jpg'))
fast_mrz = FastMRZ()
passport_mrz = fast_mrz.read_mrz(os.path.abspath('../data/passport_uk.jpg'))
print(passport_mrz)

# Add README testing badge. Ref, https://github.com/mingrammer/diagrams/blob/master/README.md?plain=1
65 changes: 65 additions & 0 deletions tests/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import numpy as np
import os
from fastmrz import FastMRZ

fast_mrz = FastMRZ()


# Test cases for _process_image function
def test_process_image():
    """_process_image must yield a NumPy array of shape (1, 256, 256, 3)."""
    sample_path = os.path.abspath('../data/td3.jpg')
    result = fast_mrz._process_image(sample_path)

    assert isinstance(result, np.ndarray)
    assert result.shape == (1, 256, 256, 3)


# Test cases for _get_roi function
def test_get_roi():
    """_get_roi must return a str for a synthetic segmentation output."""
    # Use a seeded generator so the synthetic mask is identical on every
    # run; an unseeded mask makes failures impossible to reproduce.
    rng = np.random.default_rng(0)
    output_data = rng.random((1, 256, 256, 1))
    image_path = os.path.abspath('../data/td3.jpg')

    roi = fast_mrz._get_roi(output_data, image_path)

    assert isinstance(roi, str)


# Test cases for _cleanse_roi function
def test_cleanse_roi():
    """_cleanse_roi must return a str when given noisy two-line OCR text."""
    # Raw OCR-style input: stray spaces inside the filler run and a blank
    # line between the two MRZ rows.
    noisy_ocr = "P<UTOERIKSSON<<ANNA<MARIA<<< <<<<<<<<< <<<<<<<\n\nL898902C36UTO7408122F1204159ZE184226B<<<<<10\n"

    result = fast_mrz._cleanse_roi(noisy_ocr)

    assert isinstance(result, str)


# Test cases for _get_final_check_digit function
def test_get_final_check_digit():
    """The TD2 composite check digit must match the last character of line 2."""
    # For 'TD2' the method slices a single string, and _parse_mrz calls it
    # with the second MRZ line only, so pass just that line (no stray quote,
    # no first line). The specimen line ends in its own composite digit.
    input_string = "D231458907UTO7408122F1204159<<<<<<<6"
    input_type = "TD2"

    final_check_digit = fast_mrz._get_final_check_digit(input_string, input_type)

    assert isinstance(final_check_digit, str)
    assert final_check_digit == input_string[-1]


# Test cases for _get_check_digit function
def test_get_check_digit():
    """_get_check_digit must reproduce the check digits of known MRZ fields."""
    # Fields and their printed check digits taken from the specimen line
    # "D231458907UTO7408122F1204159<<<<<<<6": document number, date of
    # birth and date of expiry, each followed by its check digit.
    expected_digits = {
        "D23145890": "7",
        "740812": "2",
        "120415": "9",
    }

    for field, expected in expected_digits.items():
        check_digit = fast_mrz._get_check_digit(field)
        assert isinstance(check_digit, str)
        assert check_digit == expected


# Test cases for _format_date function
def test_format_date():
    """_format_date must return a str for a six-digit date string."""
    result = fast_mrz._format_date("220101")

    assert isinstance(result, str)


# Test cases for read_raw_mrz function
def test_read_raw_mrz():
    """read_raw_mrz must return a str for the TD2 sample image."""
    sample_path = os.path.abspath('../data/td2.jpg')

    result = fast_mrz.read_raw_mrz(sample_path)

    assert isinstance(result, str)


# Test cases for read_mrz function
def test_read_mrz():
    """read_mrz must return a dict that carries a 'status' entry."""
    sample_path = os.path.abspath('../data/td3.jpg')

    parsed = fast_mrz.read_mrz(sample_path)

    assert isinstance(parsed, dict)
    assert 'status' in parsed

0 comments on commit acac11b

Please sign in to comment.