import re
from enum import Enum
from typing import Dict, List, Optional, Tuple, Union


class Entities(Enum):
    """Entity labels that can be attached to spans of a transaction string."""

    PAYER = "PAYER"
    PAYER_BANK_ACCOUNT = "PAYER_ACCOUNT"
    VPA = "VPA"
    MESSAGE = "MESSAGE"
    IFSCCODE = "IFSCCODE"
    UTR = "UTR"
    TXNMETHOD = "TXNMETHOD"
    BANK = "BANK"


class StringUtils:
    """String helpers for locating entity values inside transaction text."""

    # Only entities of these types are emitted by get_spacy_dataset.
    _DATASET_ENTITY_TYPES = (Entities.PAYER, Entities.UTR)

    @classmethod
    def replace_multiple_spaces_with_single_space(cls, text: str) -> str:
        """Collapse every whitespace run in *text* to one space and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    @classmethod
    def find_word_indices(cls, text: str, word: str) -> Optional[Tuple[int, int]]:
        """Locate the first occurrence of *word* in *text*.

        *word* is whitespace-normalised before searching; *text* is searched
        as given.

        Returns:
            ``(start, end)`` with *end* inclusive, or ``None`` when the
            normalised word is empty or does not occur in *text*.
        """
        word = cls.replace_multiple_spaces_with_single_space(word)
        if not word:
            # str.find("") would "match" at 0 and yield the bogus pair (0, -1).
            return None
        start_index = text.find(word)
        if start_index == -1:
            return None  # Word not found
        return start_index, start_index + len(word) - 1

    @classmethod
    def get_spacy_ref_for_word(
        cls, text: str, word: str, type: str
    ) -> List[Union[int, str]]:
        """Build a ``[start, end, type]`` reference for *word* within *text*.

        *end* is exclusive (one past the last matched character).

        Raises:
            ValueError: if *word* cannot be located in *text*.
        """
        # NOTE: the parameter name ``type`` shadows the builtin but is kept
        # because callers pass it by keyword.
        indices = cls.find_word_indices(text, word)
        if indices is None:
            raise ValueError(f"{word!r} not found in text {text!r}")
        start_index, end_index = indices
        return [start_index, end_index + 1, type]

    @classmethod
    def get_spacy_dataset(
        cls, transaction: str, entities_name_to_type_map: Dict[str, Entities]
    ) -> List[List[Union[int, str]]]:
        """Return span references for the PAYER and UTR entities of a transaction.

        Args:
            transaction: The text in which the entity values are searched.
            entities_name_to_type_map: Maps each entity value (the literal
                text) to its entity type. Entries of any type other than
                ``Entities.PAYER`` / ``Entities.UTR`` are ignored.

        Returns:
            One ``[start, end_exclusive, label]`` list per selected entity,
            in the mapping's iteration order.

        Raises:
            ValueError: if a selected entity value is not present in
                *transaction*.
        """
        return [
            cls.get_spacy_ref_for_word(
                text=transaction,
                word=entity_value,
                type=entity_type.value,
            )
            for entity_value, entity_type in entities_name_to_type_map.items()
            if entity_type in cls._DATASET_ENTITY_TYPES
        ]