Clean fund name after getting response from ChatGPT

This commit is contained in:
Blade He 2024-12-04 22:08:09 -06:00
parent 70362b554f
commit 95c386911c
2 changed files with 12 additions and 3 deletions

View File

@@ -523,16 +523,25 @@ class DataExtraction:
fund_name = data.get("fund name", "").strip() fund_name = data.get("fund name", "").strip()
if fund_name == "": if fund_name == "":
remove_list.append(data) remove_list.append(data)
# Clean fund name start
if previous_page_last_fund is not None and len(previous_page_last_fund) > 0: if previous_page_last_fund is not None and len(previous_page_last_fund) > 0:
previous_page_last_fund = previous_page_last_fund.strip() previous_page_last_fund = previous_page_last_fund.strip()
if fund_name.startswith(previous_page_last_fund) and fund_name != previous_page_last_fund: if fund_name.startswith(previous_page_last_fund) and fund_name != previous_page_last_fund:
modified_fund_name = fund_name.replace(previous_page_last_fund, "").strip() modified_fund_name = fund_name.replace(previous_page_last_fund, "").strip()
if len(modified_fund_name.split()) > 1: if len(modified_fund_name.split()) > 1:
fund_name = modified_fund_name fund_name = modified_fund_name
remove_list = ["Market Specific Equity Sub-Funds",
"International and Regional Equity Sub-Funds",
"Equity Sub-Funds"]
for remove_item in remove_list:
if fund_name.startswith(remove_item):
fund_name = fund_name.replace(remove_item, "").strip()
fund_name = self.get_fund_name(fund_name, "Fund") fund_name = self.get_fund_name(fund_name, "Fund")
fund_name = self.get_fund_name(fund_name, "Bond") fund_name = self.get_fund_name(fund_name, "Bond")
data["fund name"] = fund_name data["fund name"] = fund_name
# Clean fund name end
keys = list(data.keys()) keys = list(data.keys())
for key in keys: for key in keys:
if self.datapoint_level_config.get(key, "") == "share_level": if self.datapoint_level_config.get(key, "") == "share_level":

View File

@@ -1151,7 +1151,7 @@ def batch_run_documents():
"534535767" "534535767"
] ]
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
special_doc_id_list = ["534535767"] # special_doc_id_list = ["481482392"]
pdf_folder = r"/data/emea_ar/pdf/" pdf_folder = r"/data/emea_ar/pdf/"
page_filter_ground_truth_file = ( page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
@@ -1162,7 +1162,7 @@ def batch_run_documents():
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = True re_run_extract_data = True
re_run_mapping_data = True re_run_mapping_data = True
force_save_total_data = False force_save_total_data = True
calculate_metrics = False calculate_metrics = False
extract_ways = ["text"] extract_ways = ["text"]