From df66489c5f8b1c2ba8b078a4677bde0224c32a5c Mon Sep 17 00:00:00 2001 From: Blade He Date: Fri, 11 Oct 2024 13:14:04 -0500 Subject: [PATCH] support this scenario: fund and share are with same name. --- configuration/datapoint_keyword.json | 3 ++- core/data_extraction.py | 31 +++++++++++++++++++++++++--- main.py | 6 +++--- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/configuration/datapoint_keyword.json b/configuration/datapoint_keyword.json index 3bfadda..50fcac3 100644 --- a/configuration/datapoint_keyword.json +++ b/configuration/datapoint_keyword.json @@ -139,7 +139,8 @@ "Total des frais sur encours", "TER", "Ratio des dépenses totales", - "Ratio de dépenses totales" + "Ratio de dépenses totales", + "Total des commissions et frais imputés à" ], "finnish": [ "palkkiot yhteensä", diff --git a/core/data_extraction.py b/core/data_extraction.py index 2d002a1..9fa0541 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -550,14 +550,19 @@ class DataExtraction: return extract_data_info remove_list = [] for data in data_list: - if data.get("fund name", "") == "": + fund_name = data.get("fund name", "") + if fund_name == "": remove_list.append(data) keys = list(data.keys()) for key in keys: if self.datapoint_level_config.get(key, "") == "share_level": if data.get("share name", "") == "": - remove_list.append(data) - break + is_share_name = self.check_fund_name_as_share(fund_name) + if not is_share_name: + remove_list.append(data) + break + else: + data["share name"] = fund_name if data.get(key, "") == "": data.pop(key) for remove_data in remove_list: @@ -608,6 +613,26 @@ class DataExtraction: extract_data_info["data"] = new_data_list return extract_data_info + + def check_fund_name_as_share(self, fund_name: str) -> bool: + """ + Return True when the fund name closely matches a document share class name (best similarity from get_most_similar_name against the ShareClassName values >= 0.8); return False for an empty fund name or an empty share name list + """ + if not fund_name: + return False + share_name_list = self.document_mapping_info_df["ShareClassName"].unique().tolist() + if 
len(share_name_list) == 0: + return False + max_similarity_name, max_similarity = get_most_similar_name( + text=fund_name, + name_list=share_name_list, + share_name=None, + fund_name=None, + matching_type="share", + process_cache=None) + if max_similarity >= 0.8: + return True + return False def get_datapoints_by_page_num(self, page_num: int) -> list: diff --git a/main.py b/main.py index 870d11e..1b54381 100644 --- a/main.py +++ b/main.py @@ -809,12 +809,12 @@ if __name__ == "__main__": ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list - # special_doc_id_list = ["404712928"] + special_doc_id_list = ["422760156"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" - re_run_extract_data = True + re_run_extract_data = False re_run_mapping_data = True - force_save_total_data = True + force_save_total_data = False calculate_metrics = False extract_ways = ["text"]