Recognize dates from documents using Sliding Window Algorithm & Python OCR.

Hey there,

Today, let's solve a text-processing problem: finding any date present in text extracted from an image.

We are using easyocr, a Python OCR library, to extract the text from the images. Let's move on to the code.

Extracting text from images | Setting up easyocr

  1. We start by creating a data-extraction.py module.

  2. Create a DataExtraction class and initiate the easyocr model.

from datetime import datetime
import easyocr
import re
class DataExtraction:
def __init__ (self) -> None:
self.months = {
"JAN": "01",
"FEB": "02",
"MAR": "03",
"APR": "04",
"MAY": "05",
"JUN": "06",
"JUL": "07",
"AUG": "08",
"SEP": "09",
"OCT": "10",
"NOV": "11",
"DEC": "12",
}
self.reader = easyocr.Reader(["en"])
from datetime import datetime  
import easyocr  
import re  

class DataExtraction:
    """Hold a month-abbreviation lookup table and an easyocr reader."""

    def __init__(self) -> None:
        """Build the month lookup table and create the OCR reader."""
        # Upper-case three-letter month abbreviations mapped to zero-padded
        # month numbers; keys are matched against upper-cased text.
        self.months = {
            "JAN": "01",
            "FEB": "02",
            "MAR": "03",
            "APR": "04",
            "MAY": "05",
            "JUN": "06",
            "JUL": "07",
            "AUG": "08",
            "SEP": "09",
            "OCT": "10",
            "NOV": "11",
            "DEC": "12",
        }
        # Reader created with the language list ["en"].
        self.reader = easyocr.Reader(["en"])
from datetime import datetime import easyocr import re class DataExtraction: def __init__ (self) -> None: self.months = { "JAN": "01", "FEB": "02", "MAR": "03", "APR": "04", "MAY": "05", "JUN": "06", "JUL": "07", "AUG": "08", "SEP": "09", "OCT": "10", "NOV": "11", "DEC": "12", } self.reader = easyocr.Reader(["en"])

Enter fullscreen mode Exit fullscreen mode

Converting date strings to DateTime objects

There are countless date formats, and parsing every one of them would take an unbounded amount of time and work. So in this example, we'll consider only a few well-known forms.

We'll try to identify dates in the dd mmm yyyy format within a string.

For example, if the given text is "15 sd f may 2019", then the output should be "15052019".

We are going to use a sliding window to detect whether any month is present between two groups of digits.

The string can include digits, letters, and other characters. For example, consider "gs 15 mai may 2019 sgf s". The date should be 15th May 2019.

  1. The first step is to implement a sliding window that converts a three-letter month (MMM) to its number, for example may to 05.

  2. We create a function that takes in a string and finds if it contains any month from the above dictionary, months.

def month_to_num(self, s: str) -> str:
res = ""
start = 0
try:
for end in range(len(s)):
rightChar = s[end]
res += rightChar
if len(res) == 3:
if res.upper() in self.months.keys():
numeric_date = self.months[res.upper()]
return numeric_date
start += 1
res = res[1:]
except Exception as e:
pass
return ""
def month_to_num(self, s: str) -> str:
    """Return the month number for the first month abbreviation found in ``s``.

    Every run of three consecutive characters in ``s`` is upper-cased and
    looked up in ``self.months``, scanning left to right; the value stored
    for the first hit is returned.

    Args:
        s: Text that may contain a three-letter month abbreviation in any
            letter case, for example ``"-mai-may-"``.

    Returns:
        The ``self.months`` value for the first matching three-character
        window, or ``""`` when no window matches or ``s`` is not a string.
    """
    # Non-string input yields "" (the result the old blanket ``except``
    # produced) without also hiding unrelated errors.
    if not isinstance(s, str):
        return ""

    for begin in range(len(s) - 2):
        # Upper-case each window on its own: upper-casing the whole string
        # first could shift positions for characters whose upper-case form
        # is longer than one character.
        number = self.months.get(s[begin:begin + 3].upper())
        if number is not None:
            return number
    return ""
def month_to_num(self, s: str) -> str: res = "" start = 0 try: for end in range(len(s)): rightChar = s[end] res += rightChar if len(res) == 3: if res.upper() in self.months.keys(): numeric_date = self.months[res.upper()] return numeric_date start += 1 res = res[1:] except Exception as e: pass return ""

Enter fullscreen mode Exit fullscreen mode

  1. Next, we create a function that takes in a string and gives us the desired format.
def find_date_string(self, s: str) -> list: # s = "𝗴𝘀 𝟭𝟱 𝗺𝗮𝗶 𝗺𝗮𝘆 𝟮𝟬𝟭𝟵 𝘀𝗴𝗳 "
s1 = " ".join(re.split(r"([a-zA-Z])([0-9]+)", s))
s2 = " ".join(re.split(r"([0-9]+)([a-zA-Z]+)", s1))
text = "-" + "-".join(re.split(r"[-;,.\s]\s*", s2)) + "-" # "gs-15-mai-may-2019-sgf"
dates_type_1 = re.findall(r"-[0-9][0-9]-.*?-[0-9][0-9][0-9][0-9]-", text) # "-15-mai-may-2019"
date_objects = []
if len(dates_type_1) > 0:
date_objs = self.get_date_object(dates_type_1)
for date_obj in date_objs:
date_objects.append(date_obj)
return date_objects
def get_date_object(self, date_type_1_list: list):
dates = []
for date_str in date_type_1_list:
day_str = date_str[1:3]
month_str = date_str[3:-4]
year_str = date_str[-5:-1]
month_number = self.month_to_num(month_str)
if month_number == "":
return ""
result_date_str = f"{day_str}-{month_number}-{year_str}"
date_object = datetime.strptime(result_date_str, "%d-%m-%Y")
dates.append(date_object)
return dates
def find_date_string(self, s: str) -> list:
    """Find ``dd <month text> yyyy`` dates in ``s``.

    Example: for ``s = "gs 15 mai may 2019 sgf "`` the normalized text is
    ``"-gs-15-mai-may-2019-sgf--"`` and the candidate handed to
    ``self.get_date_object`` is ``"-15-mai-may-2019-"``.

    Args:
        s: One text string, e.g. a line returned by the OCR reader.

    Returns:
        A list with the items produced by ``self.get_date_object`` for the
        candidates found; an empty list when there is no candidate.
    """
    # Insert spaces where a run of letters touches a run of digits. Both
    # groups need ``+``: capturing a single letter would cut the word in two
    # ("may2019" -> "ma y 2019") and the month could no longer be recognized.
    spaced = " ".join(re.split(r"([a-zA-Z]+)([0-9]+)", s))
    spaced = " ".join(re.split(r"([0-9]+)([a-zA-Z]+)", spaced))

    # Turn every separator into "-" and wrap the text in "-" so that each
    # token, including the first and last, is delimited on both sides.
    text = "-" + "-".join(re.split(r"[-;,.\s]\s*", spaced)) + "-"

    # Two-digit token, anything (lazy), four-digit token.
    dates_type_1 = re.findall(r"-[0-9][0-9]-.*?-[0-9][0-9][0-9][0-9]-", text)
    if not dates_type_1:
        return []
    return list(self.get_date_object(dates_type_1))

def get_date_object(self, date_type_1_list: list) -> list:
    """Convert ``-dd-<month text>-yyyy-`` candidates into ``datetime`` objects.

    Args:
        date_type_1_list: Candidate strings shaped like
            ``"-15-mai-may-2019-"``: a leading "-", a two-digit day, free
            text containing a month abbreviation, a four-digit year and a
            trailing "-".

    Returns:
        A list of ``datetime`` objects, one per candidate that could be
        parsed, in input order. Candidates without a recognizable month or
        with an impossible day/month/year combination are skipped.
    """
    dates = []
    for date_str in date_type_1_list:
        day_str = date_str[1:3]
        # Text strictly between "-dd-" and "-yyyy-".
        month_str = date_str[4:-6]
        year_str = date_str[-5:-1]

        month_number = self.month_to_num(month_str)
        if not month_number:
            # Skip only this candidate; returning here would throw away the
            # dates already collected from earlier candidates.
            continue

        result_date_str = f"{day_str}-{month_number}-{year_str}"
        try:
            date_object = datetime.strptime(result_date_str, "%d-%m-%Y")
        except ValueError:
            # Digits that do not form a real calendar date (e.g. day 99).
            continue
        dates.append(date_object)

    return dates
def find_date_string(self, s: str) -> list: # s = "𝗴𝘀 𝟭𝟱 𝗺𝗮𝗶 𝗺𝗮𝘆 𝟮𝟬𝟭𝟵 𝘀𝗴𝗳 " s1 = " ".join(re.split(r"([a-zA-Z])([0-9]+)", s)) s2 = " ".join(re.split(r"([0-9]+)([a-zA-Z]+)", s1)) text = "-" + "-".join(re.split(r"[-;,.\s]\s*", s2)) + "-" # "gs-15-mai-may-2019-sgf" dates_type_1 = re.findall(r"-[0-9][0-9]-.*?-[0-9][0-9][0-9][0-9]-", text) # "-15-mai-may-2019" date_objects = [] if len(dates_type_1) > 0: date_objs = self.get_date_object(dates_type_1) for date_obj in date_objs: date_objects.append(date_obj) return date_objects def get_date_object(self, date_type_1_list: list): dates = [] for date_str in date_type_1_list: day_str = date_str[1:3] month_str = date_str[3:-4] year_str = date_str[-5:-1] month_number = self.month_to_num(month_str) if month_number == "": return "" result_date_str = f"{day_str}-{month_number}-{year_str}" date_object = datetime.strptime(result_date_str, "%d-%m-%Y") dates.append(date_object) return dates

Enter fullscreen mode Exit fullscreen mode

  1. Now we just have to pass the extracted strings into the above functions.
def get_date_from_img(self, img_path: str):
result = []
# extract the texts from the img
text_strings = self.reader.readtext(img_path, detail=0)
# check every string for dates
for s in text_strings:
date_obj_list = self.find_date_string(s)
if len(date_obj_list) > 0:
result.append(date_obj_list)
return result
def get_date_from_img(self, img_path: str) -> list:
    """Collect the dates found in the text of an image.

    Args:
        img_path: Path of the image, passed to ``self.reader.readtext``.

    Returns:
        A list with one entry per extracted string in which
        ``self.find_date_string`` found something; each entry is the
        non-empty list returned for that string.
    """
    result = []

    # Extract the texts from the image. NOTE(review): ``detail=0`` is
    # expected to make easyocr return plain strings only - confirm against
    # the easyocr documentation.
    text_strings = self.reader.readtext(img_path, detail=0)

    # Check every string for dates and keep the non-empty results.
    for s in text_strings:
        date_obj_list = self.find_date_string(s)
        if date_obj_list:
            result.append(date_obj_list)
    return result
def get_date_from_img(self, img_path: str): result = [] # extract the texts from the img text_strings = self.reader.readtext(img_path, detail=0) # check every string for dates for s in text_strings: date_obj_list = self.find_date_string(s) if len(date_obj_list) > 0: result.append(date_obj_list) return result

Enter fullscreen mode Exit fullscreen mode

  1. That's it. We now have all the DateTime objects present in a document image.

This method can be used on any kind of document, provided the date format matches the defined type. There are many kinds of date formats used throughout the world. Different countries have different formats. Parsing each one of them will require some more effort but it is definitely achievable.

Here are some of the other formats to be used for different date types.

"""
1. 1 mai/may 2019
2. 1 mai/may 19
3. 12 09 2016
4. 2 09 2016
5. 12 09 16
6. 2 09 16
"""
dates_type_2 = re.findall(r"-[0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]-", text)
dates_type_3 = re.findall(r"-[0-9][0-9]-[0-9][0-9]-[0-9][0-9]-", text)
dates_type_4 = re.findall(r"-[0-9][0-9]-.*?-[0-9][0-9]-", text)
dates_type_5 = re.findall(r"-[0-9]-.*?-[0-9][0-9]-", text)
dates_type_6 = re.findall(r"-[0-9]-.*?-[0-9][0-9][0-9][0-9]-", text)
dates_type_7 = re.findall(r"-[0-9]-[0-9][0-9]-[0-9][0-9]-", text)
dates_type_8 = re.findall(r"-[0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]-", text)
"""
1. 1 mai/may 2019
2. 1 mai/may 19
3. 12 09 2016
4. 2 09 2016
5. 12 09 16
6. 2 09 16  
"""  
# Additional candidate patterns, all searched in the hyphen-delimited ``text``.
# Every token must be enclosed by "-" on both sides; ".*?" lazily matches
# whatever lies between the first and the last token.

# Two digits, two digits, four digits - e.g. "12 09 2016".
dates_type_2 = re.findall(r"-[0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]-", text)
# Two digits, two digits, two digits - e.g. "12 09 16".
dates_type_3 = re.findall(r"-[0-9][0-9]-[0-9][0-9]-[0-9][0-9]-", text)
# Two digits, anything, two digits.
dates_type_4 = re.findall(r"-[0-9][0-9]-.*?-[0-9][0-9]-", text)
# One digit, anything, two digits - e.g. "1 mai/may 19".
dates_type_5 = re.findall(r"-[0-9]-.*?-[0-9][0-9]-", text)
# One digit, anything, four digits - e.g. "1 mai/may 2019".
dates_type_6 = re.findall(r"-[0-9]-.*?-[0-9][0-9][0-9][0-9]-", text)
# One digit, two digits, two digits - e.g. "2 09 16".
dates_type_7 = re.findall(r"-[0-9]-[0-9][0-9]-[0-9][0-9]-", text)
# One digit, two digits, four digits - e.g. "2 09 2016".
dates_type_8 = re.findall(r"-[0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]-", text)
""" 1. 1 mai/may 2019 2. 1 mai/may 19 3. 12 09 2016 4. 2 09 2016 5. 12 09 16 6. 2 09 16 """ dates_type_2 = re.findall(r"-[0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]-", text) dates_type_3 = re.findall(r"-[0-9][0-9]-[0-9][0-9]-[0-9][0-9]-", text) dates_type_4 = re.findall(r"-[0-9][0-9]-.*?-[0-9][0-9]-", text) dates_type_5 = re.findall(r"-[0-9]-.*?-[0-9][0-9]-", text) dates_type_6 = re.findall(r"-[0-9]-.*?-[0-9][0-9][0-9][0-9]-", text) dates_type_7 = re.findall(r"-[0-9]-[0-9][0-9]-[0-9][0-9]-", text) dates_type_8 = re.findall(r"-[0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]-", text)

Enter fullscreen mode Exit fullscreen mode

That's all, folks! See you soon.

Happy Coding 🤟

原文链接:Recognize dates from documents using Sliding Window Algorithm & Python OCR.

© 版权声明
THE END
喜欢就支持一下吧
点赞6 分享
Little compliments mean so much to me sometimes.
有时候,一点微不足道的肯定,对我却意义非凡
评论 抢沙发

请登录后发表评论

    暂无评论内容