Support this scenario: the fund and the share class have the same name.
This commit is contained in:
parent
92a26cd262
commit
df66489c5f
|
|
@ -139,7 +139,8 @@
|
||||||
"Total des frais sur encours",
|
"Total des frais sur encours",
|
||||||
"TER",
|
"TER",
|
||||||
"Ratio des dépenses totales",
|
"Ratio des dépenses totales",
|
||||||
"Ratio de dépenses totales"
|
"Ratio de dépenses totales",
|
||||||
|
"Total des commissions et frais imputés à"
|
||||||
],
|
],
|
||||||
"finnish": [
|
"finnish": [
|
||||||
"palkkiot yhteensä",
|
"palkkiot yhteensä",
|
||||||
|
|
|
||||||
|
|
@ -550,14 +550,19 @@ class DataExtraction:
|
||||||
return extract_data_info
|
return extract_data_info
|
||||||
remove_list = []
|
remove_list = []
|
||||||
for data in data_list:
|
for data in data_list:
|
||||||
if data.get("fund name", "") == "":
|
fund_name = data.get("fund name", "")
|
||||||
|
if fund_name == "":
|
||||||
remove_list.append(data)
|
remove_list.append(data)
|
||||||
keys = list(data.keys())
|
keys = list(data.keys())
|
||||||
for key in keys:
|
for key in keys:
|
||||||
if self.datapoint_level_config.get(key, "") == "share_level":
|
if self.datapoint_level_config.get(key, "") == "share_level":
|
||||||
if data.get("share name", "") == "":
|
if data.get("share name", "") == "":
|
||||||
remove_list.append(data)
|
is_share_name = self.check_fund_name_as_share(fund_name)
|
||||||
break
|
if not is_share_name:
|
||||||
|
remove_list.append(data)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
data["share name"] = fund_name
|
||||||
if data.get(key, "") == "":
|
if data.get(key, "") == "":
|
||||||
data.pop(key)
|
data.pop(key)
|
||||||
for remove_data in remove_list:
|
for remove_data in remove_list:
|
||||||
|
|
@ -608,6 +613,26 @@ class DataExtraction:
|
||||||
|
|
||||||
extract_data_info["data"] = new_data_list
|
extract_data_info["data"] = new_data_list
|
||||||
return extract_data_info
|
return extract_data_info
|
||||||
|
|
||||||
|
def check_fund_name_as_share(self, fund_name: str) -> bool:
    """
    Check whether the given fund name matches a known share class name.

    The fund name is compared against the unique ``ShareClassName`` values
    of ``self.document_mapping_info_df`` via ``get_most_similar_name``
    (called with ``matching_type="share"``).

    Args:
        fund_name: Fund name taken from an extracted data record. ``None``
            and the empty string are both treated as "no name".

    Returns:
        True when the best similarity score is at least 0.8; False when the
        fund name is empty, when no share class names are available, or
        when the best score is below 0.8.
    """
    # Falsy check covers both "" and None: the record may carry an explicit
    # None under "fund name", for which len() would raise TypeError.
    if not fund_name:
        return False

    share_name_list = (
        self.document_mapping_info_df["ShareClassName"].unique().tolist()
    )
    if not share_name_list:
        return False

    # Only the score is needed; the matched name itself is not used here.
    _, max_similarity = get_most_similar_name(
        text=fund_name,
        name_list=share_name_list,
        share_name=None,
        fund_name=None,
        matching_type="share",
        process_cache=None,
    )
    if max_similarity >= 0.8:
        return True
    return False
|
||||||
|
|
||||||
|
|
||||||
def get_datapoints_by_page_num(self, page_num: int) -> list:
|
def get_datapoints_by_page_num(self, page_num: int) -> list:
|
||||||
|
|
|
||||||
6
main.py
6
main.py
|
|
@ -809,12 +809,12 @@ if __name__ == "__main__":
|
||||||
]
|
]
|
||||||
# special_doc_id_list = check_mapping_doc_id_list
|
# special_doc_id_list = check_mapping_doc_id_list
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
# special_doc_id_list = ["404712928"]
|
special_doc_id_list = ["422760156"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = True
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = True
|
||||||
force_save_total_data = True
|
force_save_total_data = False
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue