Support this scenario: the fund and the share have the same name.
This commit is contained in:
parent
92a26cd262
commit
df66489c5f
|
|
@ -139,7 +139,8 @@
|
|||
"Total des frais sur encours",
|
||||
"TER",
|
||||
"Ratio des dépenses totales",
|
||||
"Ratio de dépenses totales"
|
||||
"Ratio de dépenses totales",
|
||||
"Total des commissions et frais imputés à"
|
||||
],
|
||||
"finnish": [
|
||||
"palkkiot yhteensä",
|
||||
|
|
|
|||
|
|
@ -550,14 +550,19 @@ class DataExtraction:
|
|||
return extract_data_info
|
||||
remove_list = []
|
||||
for data in data_list:
|
||||
if data.get("fund name", "") == "":
|
||||
fund_name = data.get("fund name", "")
|
||||
if fund_name == "":
|
||||
remove_list.append(data)
|
||||
keys = list(data.keys())
|
||||
for key in keys:
|
||||
if self.datapoint_level_config.get(key, "") == "share_level":
|
||||
if data.get("share name", "") == "":
|
||||
remove_list.append(data)
|
||||
break
|
||||
is_share_name = self.check_fund_name_as_share(fund_name)
|
||||
if not is_share_name:
|
||||
remove_list.append(data)
|
||||
break
|
||||
else:
|
||||
data["share name"] = fund_name
|
||||
if data.get(key, "") == "":
|
||||
data.pop(key)
|
||||
for remove_data in remove_list:
|
||||
|
|
@ -609,6 +614,26 @@ class DataExtraction:
|
|||
extract_data_info["data"] = new_data_list
|
||||
return extract_data_info
|
||||
|
||||
def check_fund_name_as_share(self, fund_name: str) -> bool:
    """
    Check whether the given fund name also matches a known share name.

    The name is compared against the unique values of the
    "ShareClassName" column of ``self.document_mapping_info_df`` by
    calling ``get_most_similar_name`` with ``matching_type="share"``.

    Args:
        fund_name: The name that was extracted as the fund name.

    Returns:
        True when the best similarity reported by
        ``get_most_similar_name`` is at least 0.8. False when
        ``fund_name`` is empty (or None), when there are no share names
        to compare against, or when the best similarity is below 0.8.
    """
    # Truthiness covers both "" and None; len() would raise on None.
    if not fund_name:
        return False

    share_name_list = (
        self.document_mapping_info_df["ShareClassName"].unique().tolist()
    )
    if not share_name_list:
        return False

    # Only the score is needed; the matched name itself is not used here.
    # NOTE(review): assumes get_most_similar_name returns a
    # (name, similarity) pair with a numeric similarity - confirm.
    _, max_similarity = get_most_similar_name(
        text=fund_name,
        name_list=share_name_list,
        share_name=None,
        fund_name=None,
        matching_type="share",
        process_cache=None,
    )
    return bool(max_similarity >= 0.8)
|
||||
|
||||
|
||||
def get_datapoints_by_page_num(self, page_num: int) -> list:
|
||||
datapoints = []
|
||||
|
|
|
|||
6
main.py
6
main.py
|
|
@ -809,12 +809,12 @@ if __name__ == "__main__":
|
|||
]
|
||||
# special_doc_id_list = check_mapping_doc_id_list
|
||||
special_doc_id_list = check_db_mapping_doc_id_list
|
||||
# special_doc_id_list = ["404712928"]
|
||||
special_doc_id_list = ["422760156"]
|
||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||
re_run_extract_data = True
|
||||
re_run_extract_data = False
|
||||
re_run_mapping_data = True
|
||||
force_save_total_data = True
|
||||
force_save_total_data = False
|
||||
calculate_metrics = False
|
||||
|
||||
extract_ways = ["text"]
|
||||
|
|
|
|||
Loading…
Reference in New Issue