From 95c386911c203b9b547163f82c9944f18b597c73 Mon Sep 17 00:00:00 2001 From: Blade He Date: Wed, 4 Dec 2024 22:08:09 -0600 Subject: [PATCH] Clean fund name after getting response from ChatGPT --- core/data_extraction.py | 11 ++++++++++- main.py | 4 ++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/core/data_extraction.py b/core/data_extraction.py index 7b34530..380affe 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -523,16 +523,25 @@ class DataExtraction: fund_name = data.get("fund name", "").strip() if fund_name == "": remove_list.append(data) + + # Clean fund name start if previous_page_last_fund is not None and len(previous_page_last_fund) > 0: previous_page_last_fund = previous_page_last_fund.strip() if fund_name.startswith(previous_page_last_fund) and fund_name != previous_page_last_fund: modified_fund_name = fund_name.replace(previous_page_last_fund, "").strip() if len(modified_fund_name.split()) > 1: fund_name = modified_fund_name - + remove_list = ["Market Specific Equity Sub-Funds", + "International and Regional Equity Sub-Funds", + "Equity Sub-Funds"] + for remove_item in remove_list: + if fund_name.startswith(remove_item): + fund_name = fund_name.replace(remove_item, "").strip() fund_name = self.get_fund_name(fund_name, "Fund") fund_name = self.get_fund_name(fund_name, "Bond") data["fund name"] = fund_name + # Clean fund name end + keys = list(data.keys()) for key in keys: if self.datapoint_level_config.get(key, "") == "share_level": diff --git a/main.py b/main.py index f847aec..c605065 100644 --- a/main.py +++ b/main.py @@ -1151,7 +1151,7 @@ def batch_run_documents(): "534535767" ] special_doc_id_list = check_db_mapping_doc_id_list - special_doc_id_list = ["534535767"] + # special_doc_id_list = ["481482392"] pdf_folder = r"/data/emea_ar/pdf/" page_filter_ground_truth_file = ( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" @@ -1162,7 +1162,7 @@ def batch_run_documents(): output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = True re_run_mapping_data = True - force_save_total_data = False + force_save_total_data = True calculate_metrics = False extract_ways = ["text"]