From 3f2bb3820849115dfeab72ca1d1a65899662406e Mon Sep 17 00:00:00 2001 From: Blade He Date: Wed, 16 Oct 2024 16:55:32 -0500 Subject: [PATCH] Resolve issue first records only with share class name but without fund name (in previous page text). --- core/data_extraction.py | 20 +++++++++++++++++++- main.py | 40 ++++++++++++++++++++-------------------- 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/core/data_extraction.py b/core/data_extraction.py index 77fd8a1..439fedf 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -130,12 +130,25 @@ class DataExtraction: data_list = [] pdf_page_count = len(self.page_text_dict.keys()) handled_page_num_list = [] + + previous_page_num = -1 + previous_page_datapoints = [] + previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): if page_num in handled_page_num_list: continue page_datapoints = self.get_datapoints_by_page_num(page_num) if len(page_datapoints) == 0: continue + if previous_page_num == page_num - 1 and \ + previous_page_datapoints == page_datapoints and \ + previous_page_fund_name is not None: + # Transfer previous page fund name to be the pre-fix of page text + # The purpose is to get fund name if the first records without fund name + # example document: 431073795, page index 1727 to 1728 + logger.info(f"Transfer previous page fund name: {previous_page_fund_name} to be the pre-fix of page text") + page_text = f"\n{previous_page_fund_name}\n{page_text}" + extract_data = self.extract_data_by_page( page_num, page_text, @@ -144,8 +157,13 @@ class DataExtraction: exclude_data=None, ) data_list.append(extract_data) - + + page_data_list = extract_data.get("extract_data", {}).get("data", []) + if len(page_data_list) > 0: + previous_page_num = page_num + previous_page_fund_name = page_data_list[-1].get("fund_name", "") + previous_page_datapoints = page_datapoints current_page_data_count = len(page_data_list) if current_page_data_count > 0: diff --git a/main.py b/main.py index 1e34bbb..c627a51 100644 --- a/main.py +++ b/main.py @@ -786,33 +786,33 @@ if __name__ == "__main__": # ] # Documents in EMEA Case 1.docx check_db_mapping_doc_id_list = [ - "435128656", - "425480144", - "466528487", - "434902020", - "440029306", - "431073795", - "430240853", - "427637151", - "434924914", - "467595142", - "466859621", - "429564034", "424976833", - "466860852", - "466371135", - "470515549", - "434851173", - "434710819", + "425480144", + "427637151", + "429564034", "429950833", - "467788879" + "430240853", + "431073795", + "434710819", + "434851173", + "434902020", + "434924914", + "435128656", + "440029306", + "466371135", + "466528487", + "466859621", + "466860852", + "467595142", + "467788879", + "470515549" ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list - special_doc_id_list = ["514213638"] + special_doc_id_list = ["431073795"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" - re_run_extract_data = False + re_run_extract_data = True re_run_mapping_data = True force_save_total_data = False calculate_metrics = False