Resolve the issue where the first records on a page contain only a share class name but no fund name (the fund name appears in the previous page's text).

This commit is contained in:
Blade He 2024-10-16 16:55:32 -05:00
parent f166e73362
commit 3f2bb38208
2 changed files with 39 additions and 21 deletions

View File

@ -130,12 +130,25 @@ class DataExtraction:
data_list = []
pdf_page_count = len(self.page_text_dict.keys())
handled_page_num_list = []
previous_page_num = -1
previous_page_datapoints = []
previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items():
if page_num in handled_page_num_list:
continue
page_datapoints = self.get_datapoints_by_page_num(page_num)
if len(page_datapoints) == 0:
continue
if previous_page_num == page_num - 1 and \
previous_page_datapoints == page_datapoints and \
previous_page_fund_name is not None:
# Prepend the previous page's fund name to the current page text.
# The purpose is to recover the fund name when the first records on this page lack one.
# Example document: 431073795, page index 1727 to 1728
logger.info(f"Transfer previous page fund name: {previous_page_fund_name} to be the pre-fix of page text")
page_text = f"\n{previous_page_fund_name}\n{page_text}"
extract_data = self.extract_data_by_page(
page_num,
page_text,
@ -144,8 +157,13 @@ class DataExtraction:
exclude_data=None,
)
data_list.append(extract_data)
page_data_list = extract_data.get("extract_data", {}).get("data", [])
if len(page_data_list) > 0:
previous_page_num = page_num
previous_page_fund_name = page_data_list[-1].get("fund_name", "")
previous_page_datapoints = page_datapoints
current_page_data_count = len(page_data_list)
if current_page_data_count > 0:

40
main.py
View File

@ -786,33 +786,33 @@ if __name__ == "__main__":
# ]
# Documents in EMEA Case 1.docx
check_db_mapping_doc_id_list = [
"435128656",
"425480144",
"466528487",
"434902020",
"440029306",
"431073795",
"430240853",
"427637151",
"434924914",
"467595142",
"466859621",
"429564034",
"424976833",
"466860852",
"466371135",
"470515549",
"434851173",
"434710819",
"425480144",
"427637151",
"429564034",
"429950833",
"467788879"
"430240853",
"431073795",
"434710819",
"434851173",
"434902020",
"434924914",
"435128656",
"440029306",
"466371135",
"466528487",
"466859621",
"466860852",
"467595142",
"467788879",
"470515549"
]
# special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list
special_doc_id_list = ["514213638"]
special_doc_id_list = ["431073795"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False
re_run_extract_data = True
re_run_mapping_data = True
force_save_total_data = False
calculate_metrics = False