Support the scenario where the fund and the share class have the same name.

This commit is contained in:
Blade He 2024-10-11 13:14:04 -05:00
parent 92a26cd262
commit df66489c5f
3 changed files with 33 additions and 7 deletions

View File

@ -139,7 +139,8 @@
"Total des frais sur encours",
"TER",
"Ratio des dépenses totales",
"Ratio de dépenses totales"
"Ratio de dépenses totales",
"Total des commissions et frais imputés à"
],
"finnish": [
"palkkiot yhteensä",

View File

@ -550,14 +550,19 @@ class DataExtraction:
return extract_data_info
remove_list = []
for data in data_list:
if data.get("fund name", "") == "":
fund_name = data.get("fund name", "")
if fund_name == "":
remove_list.append(data)
keys = list(data.keys())
for key in keys:
if self.datapoint_level_config.get(key, "") == "share_level":
if data.get("share name", "") == "":
remove_list.append(data)
break
is_share_name = self.check_fund_name_as_share(fund_name)
if not is_share_name:
remove_list.append(data)
break
else:
data["share name"] = fund_name
if data.get(key, "") == "":
data.pop(key)
for remove_data in remove_list:
@ -609,6 +614,26 @@ class DataExtraction:
extract_data_info["data"] = new_data_list
return extract_data_info
def check_fund_name_as_share(self, fund_name: str) -> bool:
    """
    Check whether an extracted fund name actually matches a known share class name.

    The name is compared against the unique values of the "ShareClassName"
    column of ``self.document_mapping_info_df`` via ``get_most_similar_name``
    (called with ``matching_type="share"``). It is treated as a share name
    when the best similarity score is at least 0.8.

    Args:
        fund_name: The fund name extracted for a data record.

    Returns:
        True if the best similarity against the known share class names is
        >= 0.8; False if the name is empty/None, no share class names are
        available, or the similarity is below the threshold.
    """
    # Guard against both "" and None: a record may carry the key with a null
    # value, and len(None) would raise TypeError.
    if not fund_name:
        return False
    share_name_list = self.document_mapping_info_df["ShareClassName"].unique().tolist()
    if not share_name_list:
        return False
    # Only the similarity score is needed; the best-matching name is discarded.
    _, max_similarity = get_most_similar_name(
        text=fund_name,
        name_list=share_name_list,
        share_name=None,
        fund_name=None,
        matching_type="share",
        process_cache=None,
    )
    return bool(max_similarity >= 0.8)
def get_datapoints_by_page_num(self, page_num: int) -> list:
datapoints = []

View File

@ -809,12 +809,12 @@ if __name__ == "__main__":
]
# special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list
# special_doc_id_list = ["404712928"]
special_doc_id_list = ["422760156"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = True
re_run_extract_data = False
re_run_mapping_data = True
force_save_total_data = True
force_save_total_data = False
calculate_metrics = False
extract_ways = ["text"]