Clean fund name after getting the response from ChatGPT
parent 70362b554f
commit 95c386911c
@@ -523,16 +523,25 @@ class DataExtraction:
 fund_name = data.get("fund name", "").strip()
 if fund_name == "":
     remove_list.append(data)

+# Clean fund name start
 if previous_page_last_fund is not None and len(previous_page_last_fund) > 0:
     previous_page_last_fund = previous_page_last_fund.strip()
     if fund_name.startswith(previous_page_last_fund) and fund_name != previous_page_last_fund:
         modified_fund_name = fund_name.replace(previous_page_last_fund, "").strip()
         if len(modified_fund_name.split()) > 1:
             fund_name = modified_fund_name
+remove_list = ["Market Specific Equity Sub-Funds",
+               "International and Regional Equity Sub-Funds",
+               "Equity Sub-Funds"]
+for remove_item in remove_list:
+    if fund_name.startswith(remove_item):
+        fund_name = fund_name.replace(remove_item, "").strip()
 fund_name = self.get_fund_name(fund_name, "Fund")
 fund_name = self.get_fund_name(fund_name, "Bond")
 data["fund name"] = fund_name
+# Clean fund name end
+
 keys = list(data.keys())
 for key in keys:
     if self.datapoint_level_config.get(key, "") == "share_level":
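As a reading aid, here is a self-contained sketch of the full cleaning block bracketed by the new start/end comments: the pre-existing prefix removal plus the section-heading stripping this commit adds. It is illustrative only; the real code runs inside DataExtraction and afterwards also normalizes the name via self.get_fund_name(...), whose behavior is not shown in this diff, so that call is omitted. The function name and the sample input in the usage comment are hypothetical.

from typing import Optional

def clean_fund_name(fund_name: str, previous_page_last_fund: Optional[str]) -> str:
    """Illustrative stand-alone version of the cleaning block in this hunk."""
    fund_name = fund_name.strip()

    # Pre-existing step: drop a prefix carried over from the previous page's
    # last fund name, but only if more than one word would remain.
    if previous_page_last_fund:
        previous_page_last_fund = previous_page_last_fund.strip()
        if fund_name.startswith(previous_page_last_fund) and fund_name != previous_page_last_fund:
            modified_fund_name = fund_name.replace(previous_page_last_fund, "").strip()
            if len(modified_fund_name.split()) > 1:
                fund_name = modified_fund_name

    # New in this commit: strip section headings that can appear prepended
    # to the fund name returned by ChatGPT.
    headings = [
        "Market Specific Equity Sub-Funds",
        "International and Regional Equity Sub-Funds",
        "Equity Sub-Funds",
    ]
    for heading in headings:
        if fund_name.startswith(heading):
            fund_name = fund_name.replace(heading, "").strip()

    return fund_name

# Hypothetical usage:
# clean_fund_name("Equity Sub-Funds Global Technology Fund", None)
# -> "Global Technology Fund"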
main.py | 4 ++--
@@ -1151,7 +1151,7 @@ def batch_run_documents():
         "534535767"
     ]
     special_doc_id_list = check_db_mapping_doc_id_list
-    special_doc_id_list = ["534535767"]
+    # special_doc_id_list = ["481482392"]
     pdf_folder = r"/data/emea_ar/pdf/"
     page_filter_ground_truth_file = (
         r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
@@ -1162,7 +1162,7 @@ def batch_run_documents():
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
     re_run_extract_data = True
     re_run_mapping_data = True
-    force_save_total_data = False
+    force_save_total_data = True
     calculate_metrics = False

     extract_ways = ["text"]
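For orientation, a minimal runnable sketch of how these batch-run switches could drive the loop. The flag names and the document id come from the diff; batch_run_documents_sketch and the three helper stubs are hypothetical placeholders, not the project's real functions.

def run_extraction(doc_id: str) -> None:
    # Hypothetical stand-in for the project's extraction step.
    print(f"extracting {doc_id}")

def run_mapping(doc_id: str) -> None:
    # Hypothetical stand-in for the mapping step.
    print(f"mapping {doc_id}")

def save_total_data() -> None:
    # Hypothetical stand-in for writing the accumulated output.
    print("saving total data")

def batch_run_documents_sketch(doc_id_list: list) -> None:
    re_run_extract_data = True
    re_run_mapping_data = True
    force_save_total_data = True   # flipped from False in this commit
    calculate_metrics = False      # metrics stay off for this run

    for doc_id in doc_id_list:
        if re_run_extract_data:
            run_extraction(doc_id)
        if re_run_mapping_data:
            run_mapping(doc_id)

    if force_save_total_data:
        save_total_data()
    if calculate_metrics:
        pass  # metric calculation would be triggered here

batch_run_documents_sketch(["534535767"])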