Support the scenario where the fund and the share have the same name.

This commit is contained in:
Blade He 2024-10-11 13:14:04 -05:00
parent 92a26cd262
commit df66489c5f
3 changed files with 33 additions and 7 deletions

View File

@ -139,7 +139,8 @@
"Total des frais sur encours", "Total des frais sur encours",
"TER", "TER",
"Ratio des dépenses totales", "Ratio des dépenses totales",
"Ratio de dépenses totales" "Ratio de dépenses totales",
"Total des commissions et frais imputés à"
], ],
"finnish": [ "finnish": [
"palkkiot yhteensä", "palkkiot yhteensä",

View File

@ -550,14 +550,19 @@ class DataExtraction:
return extract_data_info return extract_data_info
remove_list = [] remove_list = []
for data in data_list: for data in data_list:
if data.get("fund name", "") == "": fund_name = data.get("fund name", "")
if fund_name == "":
remove_list.append(data) remove_list.append(data)
keys = list(data.keys()) keys = list(data.keys())
for key in keys: for key in keys:
if self.datapoint_level_config.get(key, "") == "share_level": if self.datapoint_level_config.get(key, "") == "share_level":
if data.get("share name", "") == "": if data.get("share name", "") == "":
is_share_name = self.check_fund_name_as_share(fund_name)
if not is_share_name:
remove_list.append(data) remove_list.append(data)
break break
else:
data["share name"] = fund_name
if data.get(key, "") == "": if data.get(key, "") == "":
data.pop(key) data.pop(key)
for remove_data in remove_list: for remove_data in remove_list:
@ -609,6 +614,26 @@ class DataExtraction:
extract_data_info["data"] = new_data_list extract_data_info["data"] = new_data_list
return extract_data_info return extract_data_info
def check_fund_name_as_share(self, fund_name: str, similarity_threshold: float = 0.8) -> bool:
    """
    Check whether a fund name can also be treated as a share name.

    The fund name is compared against the unique values of the
    "ShareClassName" column of ``self.document_mapping_info_df`` through
    ``get_most_similar_name`` (called with ``matching_type="share"``).

    Args:
        fund_name: The fund name to test. Empty or ``None`` values never match.
        similarity_threshold: Minimum best-match similarity required to
            accept the fund name as a share name. Defaults to 0.8.

    Returns:
        True when the best similarity reported by ``get_most_similar_name``
        is greater than or equal to ``similarity_threshold``; False otherwise,
        including when there is no fund name or no known share names.
    """
    # Truthiness covers both "" and None, so a missing name cannot raise
    # a TypeError the way len(None) would.
    if not fund_name:
        return False
    share_name_list = self.document_mapping_info_df["ShareClassName"].unique().tolist()
    # Nothing to compare against: the fund name cannot be a share name.
    if not share_name_list:
        return False
    # Only the similarity score is needed; the matched name is discarded.
    # NOTE(review): the score is presumably in the range 0..1 - confirm
    # against get_most_similar_name.
    _, max_similarity = get_most_similar_name(
        text=fund_name,
        name_list=share_name_list,
        share_name=None,
        fund_name=None,
        matching_type="share",
        process_cache=None,
    )
    return bool(max_similarity >= similarity_threshold)
def get_datapoints_by_page_num(self, page_num: int) -> list: def get_datapoints_by_page_num(self, page_num: int) -> list:
datapoints = [] datapoints = []

View File

@ -809,12 +809,12 @@ if __name__ == "__main__":
] ]
# special_doc_id_list = check_mapping_doc_id_list # special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
# special_doc_id_list = ["404712928"] special_doc_id_list = ["422760156"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = True re_run_extract_data = False
re_run_mapping_data = True re_run_mapping_data = True
force_save_total_data = True force_save_total_data = False
calculate_metrics = False calculate_metrics = False
extract_ways = ["text"] extract_ways = ["text"]