Compare commits

No commits in common. "f7d53acddec2e4f8dd66ada024f9e8e40eac6770" and "b15d260a58a2e7cda18d872da2dccfe5c96d67b8" have entirely different histories.

f7d53acdde ... b15d260a58
@@ -44,8 +44,6 @@ def emea_ar_data_extract():
     output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/"
     output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/"
     drilldown_folder = r"./data/emea_ar/output/drilldown/"
-    db_mapping_document_folder = r"./data/emea_ar/output/db_mapping/document/"
-    db_mapping_provider_folder = r"./data/emea_ar/output/db_mapping/provider/"
     extract_way = "text"
 
     os.makedirs(pdf_folder, exist_ok=True)
@@ -53,16 +51,12 @@ def emea_ar_data_extract():
     os.makedirs(output_extract_data_folder, exist_ok=True)
     os.makedirs(output_mapping_data_folder, exist_ok=True)
     os.makedirs(drilldown_folder, exist_ok=True)
-    os.makedirs(db_mapping_document_folder, exist_ok=True)
-    os.makedirs(db_mapping_provider_folder, exist_ok=True)
 
     clean_folder(pdf_folder)
     clean_folder(output_pdf_text_folder)
     clean_folder(output_extract_data_folder)
     clean_folder(output_mapping_data_folder)
     clean_folder(drilldown_folder)
-    clean_folder(db_mapping_document_folder)
-    clean_folder(db_mapping_provider_folder)
 
     re_run_extract_data = False
     re_run_mapping_data = False
@@ -75,8 +69,7 @@ def emea_ar_data_extract():
         output_extract_data_folder=output_extract_data_folder,
         output_mapping_data_folder=output_mapping_data_folder,
         extract_way=extract_way,
-        drilldown_folder=drilldown_folder,
-        compare_with_provider=False)
+        drilldown_folder=drilldown_folder)
     doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
     doc_mapping_data = emea_ar_parsing.mapping_data(
         data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
@@ -1,6 +1,6 @@
 {
     "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
-    "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs"]},
+    "management_fee_and_costs": {"english": ["management fees and cost", "Plus other investment fees and costs"]},
    "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]},
    "performance_fee": {"english": ["performance fee", "performance fees"]},
    "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
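Since this hunk edits the reported-name synonym config, a minimal sketch of how such a map is typically consulted may help; find_datapoint and the file name are illustrative assumptions, not code from this repo:

import json

# Hypothetical helper: map a reported table header back to its datapoint key
# using a synonym config shaped like the JSON above. The lookup is
# order-sensitive: "management fees and cost" appears under both
# management_fee_and_costs and management_fee, so the first key wins.
def find_datapoint(reported_name, synonym_config):
    needle = reported_name.strip().lower()
    for datapoint, languages in synonym_config.items():
        for names in languages.values():
            if needle in (name.lower() for name in names):
                return datapoint
    return None

with open("reported_name_config.json", "r") as f:  # assumed file name
    config = json.load(f)
print(find_datapoint("Performance Fees", config))  # -> "performance_fee"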
File diff suppressed because it is too large
@@ -32,24 +32,22 @@ from openai import AzureOpenAI
 
 ABB_JSON = dict()
 
-def get_abb_json(doc_source: str = "aus_prospectus"):
+def get_abb_json():
     global ABB_JSON
-    if len(ABB_JSON.keys()) == 0:
-        with open(f"./configuration/{doc_source}/abbreviation_records.json", "r") as file:
-            # Load the JSON and convert keys to lowercase
-            ABB_JSON = {key.lower(): value for key, value in json.load(file).items()}
+    with open("abbreviation_records.json", "r") as file:
+        # Load the JSON and convert keys to lowercase
+        ABB_JSON = {key.lower(): value for key, value in json.load(file).items()}
 
-def get_abbre_format_str(fundname, doc_source: str = "aus_prospectus"):
+def get_abbre_format_str(fundname):
     """Replaces abbreviations in a fund name with their expanded forms."""
     # Convert fund name to lowercase while matching
     f_list = fundname.lower().split()
-    get_abb_json(doc_source)
     updated_doc_fname_words = [ABB_JSON.get(word, word).lower() for word in f_list]
     return " ".join(updated_doc_fname_words)
 
-def replace_abbrevs_in_fundnames(fund_names_list, doc_source: str = "aus_prospectus"):
+def replace_abbrevs_in_fundnames(fund_names_list):
     """Replaces abbreviations in a list of fund names."""
-    return [get_abbre_format_str(fund_name, doc_source) for fund_name in fund_names_list]
+    return [get_abbre_format_str(fund_name) for fund_name in fund_names_list]
 
 
 ### STEP 2 - Remove Stopwords
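As a quick illustration of the abbreviation helpers in this hunk, here is the behavior on invented data (the real mapping comes from abbreviation_records.json; the "intl"/"eq"/"fd" entries below are made up):

# Invented stand-in for abbreviation_records.json, keys already lowercased.
ABB_JSON = {"intl": "international", "eq": "equity", "fd": "fund"}

def get_abbre_format_str(fundname):
    """Replaces abbreviations in a fund name with their expanded forms."""
    f_list = fundname.lower().split()
    return " ".join(ABB_JSON.get(word, word).lower() for word in f_list)

print(get_abbre_format_str("Vanguard Intl Eq Fd"))
# -> "vanguard international equity fund"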
@@ -440,7 +438,7 @@ def format_response(doc_id, pred_fund, db_fund, clean_pred_name, clean_db_name,
     return dt
 
 
-def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_source: str = "aus_prospectus"):
+def final_function_to_match(doc_id, pred_list, db_list, provider_name):
     final_result = {}
     df_data = []
     unmatched_pred_list = pred_list.copy()
@@ -458,16 +456,12 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
             step0_matched_db_name_cosine= all_matched_fund_names_[0], step0_matched_db_name_jacc= all_matched_fund_names_[1], step0_matched_db_name_leven= all_matched_fund_names_[2],
             step0_cosine=all_scores_[0], step0_jaccard=all_scores_[1], step0_levenshtein=all_scores_[2],
             llm_flag=False))
-        if db_list[matched_index] in unmatched_db_list:
-            unmatched_db_list.remove(db_list[matched_index])
-        # unmatched_db_list.remove(db_list[matched_index])
-        if pred_list[index] in unmatched_pred_list:
-            unmatched_pred_list.remove(pred_list[index])
-        # unmatched_pred_list.remove(pred_list[index])
+        unmatched_db_list.remove(db_list[matched_index])
+        unmatched_pred_list.remove(pred_list[index])
     else:
         ### STEP-1 Abbreviation replacement
-        cleaned_pred_name1 = replace_abbrevs_in_fundnames([pred_fund], doc_source)[0]
-        cleaned_db_list1 = replace_abbrevs_in_fundnames(db_list, doc_source)
+        cleaned_pred_name1 = replace_abbrevs_in_fundnames([pred_fund])[0]
+        cleaned_db_list1 = replace_abbrevs_in_fundnames(db_list)
         # print("--> ",cleaned_db_list1, cleaned_pred_name1)
         step1_result, matched_index, all_scores1_, all_matched_fund_names1_ = get_fund_match_final_score(cleaned_db_list1, cleaned_pred_name1)
         # print(f"\nStep 1 - Abbreviation Replacement Result: {step1_result}")
@@ -483,12 +477,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
                 step1_pred_name=cleaned_pred_name1, step1_db_name=cleaned_db_list1,
                 step1_matched_db_name_cosine= all_matched_fund_names1_[0], step1_matched_db_name_jacc= all_matched_fund_names1_[1], step1_matched_db_name_leven= all_matched_fund_names1_[2],
                 step1_cosine=all_scores1_[0], step1_jaccard=all_scores1_[1], step1_levenshtein=all_scores1_[2], llm_flag=False))
-            if db_list[matched_index] in unmatched_db_list:
-                unmatched_db_list.remove(db_list[matched_index])
-            # unmatched_db_list.remove(db_list[matched_index])
-            if pred_list[index] in unmatched_pred_list:
-                unmatched_pred_list.remove(pred_list[index])
-            # unmatched_pred_list.remove(pred_list[index])
+            unmatched_db_list.remove(db_list[matched_index])
+            unmatched_pred_list.remove(pred_list[index])
         else:
             ### STEP-2 Remove Stopwords
             cleaned_pred_name2 = remove_stopwords_nltk([cleaned_pred_name1])[0]
@@ -511,12 +501,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
                 step2_pred_name=cleaned_pred_name2, step2_db_name=cleaned_db_list2,
                 step2_matched_db_name_cosine= all_matched_fund_names2_[0], step2_matched_db_name_jacc= all_matched_fund_names2_[1], step2_matched_db_name_leven= all_matched_fund_names2_[2],
                 step2_cosine=all_scores2_[0], step2_jaccard=all_scores2_[1], step2_levenshtein=all_scores2_[2],llm_flag=False))
-            if db_list[matched_index] in unmatched_db_list:
-                unmatched_db_list.remove(db_list[matched_index])
-            # unmatched_db_list.remove(db_list[matched_index])
-            if pred_list[index] in unmatched_pred_list:
-                unmatched_pred_list.remove(pred_list[index])
-            # unmatched_pred_list.remove(pred_list[index])
+            unmatched_db_list.remove(db_list[matched_index])
+            unmatched_pred_list.remove(pred_list[index])
         else:
             ### STEP-3 Special Character Removal
             cleaned_pred_name3 = remove_special_characters([cleaned_pred_name2])[0]
@@ -541,12 +527,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
                 step3_pred_name=cleaned_pred_name3, step3_db_name=cleaned_db_list3,
                 step3_matched_db_name_cosine= all_matched_fund_names3_[0], step3_matched_db_name_jacc= all_matched_fund_names3_[1], step3_matched_db_name_leven= all_matched_fund_names3_[2],
                 step3_cosine=all_scores3_[0], step3_jaccard=all_scores3_[1], step3_levenshtein=all_scores3_[2],llm_flag=False))
-            if db_list[matched_index] in unmatched_db_list:
-                unmatched_db_list.remove(db_list[matched_index])
-            # unmatched_db_list.remove(db_list[matched_index])
-            if pred_list[index] in unmatched_pred_list:
-                unmatched_pred_list.remove(pred_list[index])
-            # unmatched_pred_list.remove(pred_list[index])
+            unmatched_db_list.remove(db_list[matched_index])
+            unmatched_pred_list.remove(pred_list[index])
         else:
             ### STEP-4 Common Words Removal
             cleaned_db_list4, _ = remove_common_words(cleaned_db_list3)
@@ -583,12 +565,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
                 # print("unmatched_pred_list: ",unmatched_pred_list)
                 # print("db_list[matched_index]: ",db_list[matched_index])
                 # print("pred_list[index]: ",pred_list[index])
-                if db_list[matched_index] in unmatched_db_list:
-                    unmatched_db_list.remove(db_list[matched_index])
-                # unmatched_db_list.remove(db_list[matched_index])
-                if pred_list[index] in unmatched_pred_list:
-                    unmatched_pred_list.remove(pred_list[index])
-                # unmatched_pred_list.remove(pred_list[index])
+                unmatched_db_list.remove(db_list[matched_index])
+                unmatched_pred_list.remove(pred_list[index])
             else:
                 df_data.append(format_response(doc_id, pred_list[index], db_list[matched_index], cleaned_pred_name4,
                     db_list[matched_index],
@@ -617,11 +595,11 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
     # print("==>>> DB LIST: ",unmatched_db_list)
     # print("==>>> PRED LIST: ",unmatched_pred_list)
     if len(unmatched_pred_list)!=0:
-        cleaned_unmatched_pred_list = replace_abbrevs_in_fundnames(unmatched_pred_list, doc_source)
+        cleaned_unmatched_pred_list = replace_abbrevs_in_fundnames(unmatched_pred_list)
         cleaned_unmatched_pred_list = remove_stopwords_nltk(cleaned_unmatched_pred_list)
         cleaned_unmatched_pred_list = remove_special_characters(cleaned_unmatched_pred_list)
 
-        cleaned_unmatched_db_list = replace_abbrevs_in_fundnames(unmatched_db_list, doc_source)
+        cleaned_unmatched_db_list = replace_abbrevs_in_fundnames(unmatched_db_list)
         cleaned_unmatched_db_list = remove_stopwords_nltk(cleaned_unmatched_db_list)
         cleaned_unmatched_db_list = remove_special_characters(cleaned_unmatched_db_list)
         prompt_context = f"""
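Taken together, the hunks above implement a staged matcher: score the prediction against the DB names at the current normalization stage (raw, abbreviations expanded, stopwords removed, special characters removed, common words removed), claim the pair on success, and leave the remainder for the LLM prompt built here. A minimal sketch of that cascade, with normalizers and score_fn standing in for the repo's cleanup and scoring helpers (the real code also records cosine/Jaccard/Levenshtein scores per stage and appends a format_response row for each match):

def cascade_match(pred_name, db_list, normalizers, score_fn, threshold=0.9):
    # Assumes each normalizer maps a list of names to a same-length list,
    # so indices into cand_db still line up with db_list.
    cand_pred, cand_db = pred_name, list(db_list)
    for normalize in normalizers:
        cand_pred = normalize([cand_pred])[0]
        cand_db = normalize(cand_db)
        if not cand_db:
            break
        best = max(range(len(cand_db)), key=lambda i: score_fn(cand_pred, cand_db[i]))
        if score_fn(cand_pred, cand_db[best]) >= threshold:
            return db_list[best]  # matched at this stage; drop from unmatched lists
    return None  # still unmatched: goes into the LLM fallback prompt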
@@ -969,9 +969,7 @@ class DataExtraction:
                 if datapoint_name == "performance_fee":
                     datapoint_name = "performance fees"
                 else:
-                    datapoint_name = self.datapoint_name_config.get(datapoint_name, "")
-                    if len(datapoint_name) == 0:
-                        datapoint_name = datapoint.upper()
+                    datapoint_name = datapoint_name.upper()
                 reported_name = f"The {datapoint_name} reported name could be:\n{joined_reported_name}"
 
                 instructions.append(reported_name)
@@ -1,7 +1,6 @@
 import os
 import json
 import pandas as pd
-from copy import deepcopy
 from utils.biz_utils import get_most_similar_name, remove_common_word
 from utils.sql_query_util import (
     query_document_fund_mapping,
@@ -19,18 +18,14 @@ class DataMapping:
         raw_document_data_list: list,
         document_mapping_info_df: pd.DataFrame,
         output_data_folder: str,
-        doc_source: str = "emea_ar",
-        compare_with_provider: bool = True
+        doc_source: str = "emea_ar"
     ):
         self.doc_id = doc_id
         self.datapoints = datapoints
         self.doc_source = doc_source
-        self.compare_with_provider = compare_with_provider
         self.raw_document_data_list = raw_document_data_list
         if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
-            self.document_mapping_info_df = query_document_fund_mapping(
-                doc_id, rerun=False
-            )
+            self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
         else:
             self.document_mapping_info_df = document_mapping_info_df
@@ -49,9 +44,7 @@ class DataMapping:
     def set_mapping_data_by_db(self, document_mapping_info_df: pd.DataFrame):
         logger.info("Setting document mapping data")
         if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
-            self.document_mapping_info_df = query_document_fund_mapping(
-                self.doc_id, rerun=False
-            )
+            self.document_mapping_info_df = query_document_fund_mapping(self.doc_id, rerun=False)
         else:
             self.document_mapping_info_df = document_mapping_info_df
         if len(self.document_mapping_info_df) == 0:
@@ -99,27 +92,26 @@ class DataMapping:
     def get_provider_mapping(self):
         if len(self.document_mapping_info_df) == 0:
             return pd.DataFrame()
-        provider_id_list = self.document_mapping_info_df["ProviderId"].unique().tolist()
+        provider_id_list = (
+            self.document_mapping_info_df["ProviderId"].unique().tolist()
+        )
         provider_mapping_list = []
         for provider_id in provider_id_list:
-            provider_mapping_list.append(
-                query_investment_by_provider(provider_id, rerun=False)
-            )
+            provider_mapping_list.append(query_investment_by_provider(provider_id, rerun=False))
         provider_mapping_df = pd.concat(provider_mapping_list)
         provider_mapping_df = provider_mapping_df.drop_duplicates()
         provider_mapping_df.reset_index(drop=True, inplace=True)
         return provider_mapping_df
 
     def mapping_raw_data_entrance(self):
-        if self.doc_source == "emea_ar":
+        if self.doc_source == "emear_ar":
             return self.mapping_raw_data()
         elif self.doc_source == "aus_prospectus":
-            return self.mapping_raw_data_generic()
+            return self.mapping_raw_data_aus()
         else:
             return self.mapping_raw_data()
-            # return self.mapping_raw_data_generic()
 
-    def mapping_raw_data_generic(self):
+    def mapping_raw_data_aus(self):
         logger.info(f"Mapping raw data for AUS Prospectus document {self.doc_id}")
         mapped_data_list = []
         # Generate raw name based on fund name and share name by integrate_share_name
@@ -136,9 +128,7 @@ class DataMapping:
             raw_share_name = raw_data.get("share_name", "")
             raw_data_keys = list(raw_data.keys())
             if len(raw_share_name) > 0:
-                integrated_share_name = self.integrate_share_name(
-                    raw_fund_name, raw_share_name
-                )
+                integrated_share_name = self.integrate_share_name(raw_fund_name, raw_share_name)
                 if integrated_share_name not in share_raw_name_list:
                     share_raw_name_list.append(integrated_share_name)
                 for datapoint in self.datapoints:
@@ -154,7 +144,7 @@ class DataMapping:
                         "investment_type": 1,
                         "investment_id": "",
                         "investment_name": "",
-                        "similarity": 0,
+                        "similarity": 0
                     }
                     mapped_data_list.append(mapped_data)
                 else:
@@ -172,23 +162,19 @@ class DataMapping:
                         "value": raw_data[datapoint],
                         "investment_type": 33,
                         "investment_id": "",
-                        "investment_name": "",
+                        "investment_name": ""
                     }
                     mapped_data_list.append(mapped_data)
         # Mapping raw data with database
-        iter_count = 60
+        iter_count = 30
         fund_match_result = {}
         if len(fund_raw_name_list) > 0:
-            fund_match_result = self.get_raw_name_db_match_result(
-                fund_raw_name_list, "fund", iter_count
-            )
-            # logger.info(f"Fund match result: \n{fund_match_result}")
+            fund_match_result = self.get_raw_name_db_match_result(fund_raw_name_list, "fund", iter_count)
+            logger.info(f"Fund match result: \n{fund_match_result}")
         share_match_result = {}
         if len(share_raw_name_list) > 0:
-            share_match_result = self.get_raw_name_db_match_result(
-                share_raw_name_list, "share", iter_count
-            )
-            # logger.info(f"Share match result: \n{share_match_result}")
+            share_match_result = self.get_raw_name_db_match_result(share_raw_name_list, "share", iter_count)
+            logger.info(f"Share match result: \n{share_match_result}")
 
         for mapped_data in mapped_data_list:
             investment_type = mapped_data["investment_type"]
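For orientation, this is the record shape the two branches above append to mapped_data_list; the values below are invented, and only the keys visible in this diff are included:

# Hypothetical example record, following the keys shown in the hunk.
mapped_data = {
    "value": 0.77,          # raw_data[datapoint]
    "investment_type": 33,  # 33 = fund-level row, 1 = share-class row
    "investment_id": "",    # populated later from fund/share match results
    "investment_name": "",
}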
@@ -196,14 +182,9 @@ class DataMapping:
             if investment_type == 33:
                 if fund_match_result.get(raw_name) is not None:
                     matched_db_fund_name = fund_match_result[raw_name]
-                    if (
-                        matched_db_fund_name is not None
-                        and len(matched_db_fund_name) > 0
-                    ):
+                    if matched_db_fund_name is not None and len(matched_db_fund_name) > 0:
                         # get FundId from self.doc_fund_mapping
-                        find_fund_df = self.doc_fund_mapping[
-                            self.doc_fund_mapping["FundName"] == matched_db_fund_name
-                        ]
+                        find_fund_df = self.doc_fund_mapping[self.doc_fund_mapping["FundName"] == matched_db_fund_name]
                         if find_fund_df is not None and len(find_fund_df) > 0:
                             fund_id = find_fund_df["FundId"].values[0]
                             mapped_data["investment_id"] = fund_id
@@ -212,15 +193,9 @@ class DataMapping:
             if investment_type == 1:
                 if share_match_result.get(raw_name) is not None:
                     matched_db_share_name = share_match_result[raw_name]
-                    if (
-                        matched_db_share_name is not None
-                        and len(matched_db_share_name) > 0
-                    ):
+                    if matched_db_share_name is not None and len(matched_db_share_name) > 0:
                         # get SecId from self.doc_fund_class_mapping
-                        find_share_df = self.doc_fund_class_mapping[
-                            self.doc_fund_class_mapping["ShareClassName"]
-                            == matched_db_share_name
-                        ]
+                        find_share_df = self.doc_fund_class_mapping[self.doc_fund_class_mapping["ShareClassName"] == matched_db_share_name]
                         if find_share_df is not None and len(find_share_df) > 0:
                             share_id = find_share_df["SecId"].values[0]
                             mapped_data["investment_id"] = share_id
@@ -230,64 +205,26 @@ class DataMapping:
         self.output_mapping_file(mapped_data_list)
         return mapped_data_list
 
-    def get_raw_name_db_match_result(
-        self, raw_name_list, investment_type: str, iter_count: int = 30
-    ):
+    def get_raw_name_db_match_result(self, raw_name_list, investment_type: str, iter_count: int = 30):
         # split raw_name_list into several parts which each part is with 30 elements
         # The reason to split is to avoid invoke token limitation issues from CahtGPT
-        raw_name_list_parts = [
-            raw_name_list[i : i + iter_count]
-            for i in range(0, len(raw_name_list), iter_count)
-        ]
+        raw_name_list_parts = [raw_name_list[i:i + iter_count]
+                               for i in range(0, len(raw_name_list), iter_count)]
         all_match_result = {}
-        doc_fund_name_list = deepcopy(self.doc_fund_name_list)
-        doc_share_name_list = deepcopy(self.doc_share_name_list)
         for raw_name_list in raw_name_list_parts:
             if investment_type == "fund":
-                match_result, doc_fund_name_list = self.get_final_function_to_match(
-                    raw_name_list, doc_fund_name_list
-                )
+                match_result = final_function_to_match(doc_id=self.doc_id,
+                                                       pred_list=raw_name_list,
+                                                       db_list=self.doc_fund_name_list,
+                                                       provider_name=self.provider_name)
             else:
-                match_result, doc_share_name_list = self.get_final_function_to_match(
-                    raw_name_list, doc_share_name_list
-                )
+                match_result = final_function_to_match(doc_id=self.doc_id,
+                                                       pred_list=raw_name_list,
+                                                       db_list=self.doc_share_name_list,
+                                                       provider_name=self.provider_name)
             all_match_result.update(match_result)
         return all_match_result
 
-    def get_final_function_to_match(self, raw_name_list, db_name_list):
-        if len(db_name_list) == 0:
-            match_result = {}
-            for raw_name in raw_name_list:
-                match_result[raw_name] = ""
-        else:
-            match_result = final_function_to_match(
-                doc_id=self.doc_id,
-                pred_list=raw_name_list,
-                db_list=db_name_list,
-                provider_name=self.provider_name,
-                doc_source=self.doc_source
-            )
-            matched_name_list = list(match_result.values())
-            db_name_list = self.remove_matched_names(db_name_list, matched_name_list)
-        return match_result, db_name_list
-
-    def remove_matched_names(self, target_name_list: list, matched_name_list: list):
-        if len(matched_name_list) == 0:
-            return target_name_list
-
-        matched_name_list = list(set(matched_name_list))
-        matched_name_list = [
-            value for value in matched_name_list if value is not None and len(value) > 0
-        ]
-        for matched_name in matched_name_list:
-            if (
-                matched_name is not None
-                and len(matched_name) > 0
-                and matched_name in target_name_list
-            ):
-                target_name_list.remove(matched_name)
-        return target_name_list
-
     def mapping_raw_data(self):
         """
         doc_id, page_index, datapoint, value,
|
||||||
if raw_fund_name is None or len(raw_fund_name) == 0:
|
if raw_fund_name is None or len(raw_fund_name) == 0:
|
||||||
continue
|
continue
|
||||||
raw_share_name = raw_data.get("share_name", "")
|
raw_share_name = raw_data.get("share_name", "")
|
||||||
if (
|
if len(self.doc_fund_name_list) == 0 and len(self.provider_fund_name_list) == 0:
|
||||||
len(self.doc_fund_name_list) == 0
|
|
||||||
and len(self.provider_fund_name_list) == 0
|
|
||||||
):
|
|
||||||
if len(raw_share_name) > 0:
|
if len(raw_share_name) > 0:
|
||||||
integrated_share_name = self.integrate_share_name(
|
integrated_share_name = self.integrate_share_name(raw_fund_name, raw_share_name)
|
||||||
raw_fund_name, raw_share_name
|
|
||||||
)
|
|
||||||
raw_data_keys = list(raw_data.keys())
|
raw_data_keys = list(raw_data.keys())
|
||||||
for datapoint in self.datapoints:
|
for datapoint in self.datapoints:
|
||||||
if datapoint in raw_data_keys:
|
if datapoint in raw_data_keys:
|
||||||
|
|
@@ -330,7 +262,7 @@ class DataMapping:
                         "investment_type": 1,
                         "investment_id": "",
                         "investment_name": "",
-                        "similarity": 0,
+                        "similarity": 0
                     }
                     mapped_data_list.append(mapped_data)
             else:
@@ -347,15 +279,13 @@ class DataMapping:
                         "value": raw_data[datapoint],
                         "investment_type": 33,
                         "investment_id": "",
-                        "investment_name": "",
+                        "investment_name": ""
                     }
                     mapped_data_list.append(mapped_data)
             else:
                 raw_name = ""
                 if raw_share_name is not None and len(raw_share_name) > 0:
-                    raw_name = self.integrate_share_name(
-                        raw_fund_name, raw_share_name
-                    )
+                    raw_name = self.integrate_share_name(raw_fund_name, raw_share_name)
                     if mapped_share_cache.get(raw_name) is not None:
                         investment_info = mapped_share_cache[raw_name]
                     else:
@@ -368,20 +298,14 @@ class DataMapping:
                         )
                         fund_id = fund_info["id"]
                         mapped_fund_cache[raw_fund_name] = fund_info
-                        investment_info = {}
-                        if len(fund_id) > 0:
-                            investment_info = self.mapping_unique_raw_data(fund_id=fund_id,
-                                                                           raw_fund_name=raw_fund_name,
-                                                                           raw_data_list=raw_data_list)
-                        if investment_info.get("id", None) is None or len(investment_info.get("id", "")) == 0:
-                            investment_info = self.matching_with_database(
-                                raw_name=raw_name,
-                                raw_share_name=raw_share_name,
-                                raw_fund_name=raw_fund_name,
-                                parent_id=fund_id,
-                                matching_type="share",
-                                process_cache=process_cache,
-                            )
+                        investment_info = self.matching_with_database(
+                            raw_name=raw_name,
+                            raw_share_name=raw_share_name,
+                            raw_fund_name=raw_fund_name,
+                            parent_id=fund_id,
+                            matching_type="share",
+                            process_cache=process_cache
+                        )
                         mapped_share_cache[raw_name] = investment_info
                 elif raw_fund_name is not None and len(raw_fund_name) > 0:
                     raw_name = raw_fund_name
@@ -398,7 +322,7 @@ class DataMapping:
                             "id": "",
                             "legal_name": "",
                             "investment_type": -1,
-                            "similarity": 0,
+                            "similarity": 0
                         }
 
                 raw_data_keys = list(raw_data.keys())
@@ -415,35 +339,13 @@ class DataMapping:
                     "investment_type": investment_info["investment_type"],
                     "investment_id": investment_info["id"],
                     "investment_name": investment_info["legal_name"],
-                    "similarity": investment_info["similarity"],
+                    "similarity": investment_info["similarity"]
                 }
                 mapped_data_list.append(mapped_data)
 
         self.output_mapping_file(mapped_data_list)
         return mapped_data_list
 
-    def mapping_unique_raw_data(self, fund_id: str, raw_fund_name: str, raw_data_list: list):
-        share_count = 0
-        for raw_data in raw_data_list:
-            fund_name = raw_data.get("fund_name", "")
-            share_name = raw_data.get("share_name", "")
-            if fund_name == raw_fund_name and share_name is not None and len(share_name) > 0:
-                share_count += 1
-                if share_count > 1:
-                    break
-        data_info = {}
-        if share_count == 1:
-            doc_compare_mapping = self.doc_fund_class_mapping[
-                self.doc_fund_class_mapping["FundId"] == fund_id
-            ]
-            if len(doc_compare_mapping) == 1:
-                data_info["id"] = doc_compare_mapping["SecId"].values[0]
-                data_info["legal_name"] = doc_compare_mapping["ShareClassName"].values[0]
-                data_info["investment_type"] = 1
-                data_info["similarity"] = 1
-        return data_info
-
-
     def output_mapping_file(self, mapped_data_list: list):
         json_data_file = os.path.join(
             self.output_data_json_folder, f"{self.doc_id}.json"
@@ -488,7 +390,7 @@ class DataMapping:
         raw_fund_name: str = None,
         parent_id: str = None,
         matching_type: str = "fund",
-        process_cache: dict = {},
+        process_cache: dict = {}
     ):
         if len(self.doc_fund_name_list) == 0 and len(self.provider_fund_name_list) == 0:
             data_info["id"] = ""
|
|
@ -515,9 +417,8 @@ class DataMapping:
|
||||||
doc_compare_mapping = self.doc_fund_class_mapping[
|
doc_compare_mapping = self.doc_fund_class_mapping[
|
||||||
self.doc_fund_class_mapping["FundId"] == parent_id
|
self.doc_fund_class_mapping["FundId"] == parent_id
|
||||||
]
|
]
|
||||||
provider_compare_mapping = self.provider_fund_class_mapping[
|
provider_compare_mapping = self.provider_fund_class_mapping\
|
||||||
self.provider_fund_class_mapping["FundId"] == parent_id
|
[self.provider_fund_class_mapping["FundId"] == parent_id]
|
||||||
]
|
|
||||||
if len(doc_compare_mapping) == 0:
|
if len(doc_compare_mapping) == 0:
|
||||||
if len(provider_compare_mapping) == 0:
|
if len(provider_compare_mapping) == 0:
|
||||||
doc_compare_name_list = self.doc_share_name_list
|
doc_compare_name_list = self.doc_share_name_list
|
||||||
|
|
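Both sides of this hunk compute the same pandas boolean-mask filter; only the line wrapping differs. The same operation on toy data:

import pandas as pd

df = pd.DataFrame({"FundId": ["F1", "F2", "F1"],        # invented rows
                   "ShareClassName": ["A", "B", "C"]})
parent_id = "F1"
print(df[df["FundId"] == parent_id])
#   FundId ShareClassName
# 0     F1              A
# 2     F1              C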
@@ -535,9 +436,8 @@ class DataMapping:
                     doc_compare_mapping["ShareClassName"].unique().tolist()
                 )
 
-            if len(provider_compare_mapping) == 0 or len(
-                provider_compare_mapping
-            ) < len(doc_compare_mapping):
+            if len(provider_compare_mapping) == 0 or \
+                    len(provider_compare_mapping) < len(doc_compare_mapping):
                 provider_compare_name_list = doc_compare_name_list
                 provider_compare_mapping = doc_compare_mapping
             else:
@@ -564,15 +464,11 @@ class DataMapping:
                 share_name=raw_share_name,
                 fund_name=raw_fund_name,
                 matching_type=matching_type,
-                process_cache=process_cache,
-            )
+                process_cache=process_cache)
             if matching_type == "fund":
                 threshold = 0.7
             else:
-                if self.compare_with_provider:
-                    threshold = 0.9
-                else:
-                    threshold = 0.6
+                threshold = 0.9
             if max_similarity is not None and max_similarity >= threshold:
                 data_info["id"] = doc_compare_mapping[
                     doc_compare_mapping[compare_name_dp] == max_similarity_name
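The change above collapses the share-side threshold to a single value: 0.7 for fund matching, 0.9 for share matching (the compare_with_provider branch that allowed 0.6 is gone). A sketch of the resulting gate, using only the values in this hunk:

# Post-change behavior of the similarity gate.
def accept(best_name, max_similarity, matching_type):
    threshold = 0.7 if matching_type == "fund" else 0.9
    if max_similarity is not None and max_similarity >= threshold:
        return best_name
    return None  # falls through to the provider-level comparison below

print(accept("Global Equity Fund", 0.82, "fund"))  # -> 'Global Equity Fund'
print(accept("Class A", 0.82, "share"))            # -> None (0.82 < 0.9)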
@@ -583,44 +479,38 @@ class DataMapping:
         if data_info.get("id", None) is None or len(data_info.get("id", "")) == 0:
             # set pre_common_word_list, reason: the document mapping for same fund maybe different with provider mapping
             # the purpose is to get the most common word list, to improve the similarity.
-            if self.compare_with_provider:
-                max_similarity_name, max_similarity = get_most_similar_name(
-                    raw_name,
-                    provider_compare_name_list,
-                    share_name=raw_share_name,
-                    fund_name=raw_fund_name,
-                    matching_type=matching_type,
-                    pre_common_word_list=pre_common_word_list,
-                    process_cache=process_cache,
-                )
-                threshold = 0.7
-                if matching_type == "share":
-                    threshold = 0.5
-                round_similarity = 0
-                if max_similarity is not None and isinstance(max_similarity, float):
-                    round_similarity = round(max_similarity, 1)
-                if round_similarity is not None and round_similarity >= threshold:
-                    data_info["id"] = provider_compare_mapping[
-                        provider_compare_mapping[compare_name_dp] == max_similarity_name
-                    ][compare_id_dp].values[0]
-                    data_info["legal_name"] = max_similarity_name
-                    data_info["similarity"] = max_similarity
-                else:
-                    if len(doc_compare_name_list) == 1:
-                        data_info["id"] = doc_compare_mapping[
-                            doc_compare_mapping[compare_name_dp]
-                            == doc_compare_name_list[0]
-                        ][compare_id_dp].values[0]
-                        data_info["legal_name"] = doc_compare_name_list[0]
-                        data_info["similarity"] = 1
-                    else:
-                        data_info["id"] = ""
-                        data_info["legal_name"] = ""
-                        data_info["similarity"] = 0
+            max_similarity_name, max_similarity = get_most_similar_name(
+                raw_name,
+                provider_compare_name_list,
+                share_name=raw_share_name,
+                fund_name=raw_fund_name,
+                matching_type=matching_type,
+                pre_common_word_list=pre_common_word_list,
+                process_cache=process_cache
+            )
+            threshold = 0.7
+            if matching_type == "share":
+                threshold = 0.5
+            round_similarity = 0
+            if max_similarity is not None and isinstance(max_similarity, float):
+                round_similarity = round(max_similarity, 1)
+            if round_similarity is not None and round_similarity >= threshold:
+                data_info["id"] = provider_compare_mapping[
+                    provider_compare_mapping[compare_name_dp] == max_similarity_name
+                ][compare_id_dp].values[0]
+                data_info["legal_name"] = max_similarity_name
+                data_info["similarity"] = max_similarity
             else:
-                data_info["id"] = ""
-                data_info["legal_name"] = ""
-                data_info["similarity"] = 0
+                if len(doc_compare_name_list) == 1:
+                    data_info["id"] = doc_compare_mapping[
+                        doc_compare_mapping[compare_name_dp] == doc_compare_name_list[0]
+                    ][compare_id_dp].values[0]
+                    data_info["legal_name"] = doc_compare_name_list[0]
+                    data_info["similarity"] = 1
+                else:
+                    data_info["id"] = ""
+                    data_info["legal_name"] = ""
+                    data_info["similarity"] = 0
             data_info["investment_type"] = investment_type
         else:
             data_info["id"] = ""
@@ -61,6 +61,21 @@
     "---Example End---",
     "The output should be:",
     "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]",
+    "- 5. Reverse order of data columns from table text in PDF:",
+    "For this case, 1. the columns order is reversed, \n2. The fund name is in the end of row with number value in front of fund name.",
+    "---Example 1 Start---",
+    "Transaction\ncosts\n(gross)1\nBuy-sell\nspreads\nTransaction\ncosts (net)\nEquals\ninvestment fees and\ncosts\nThe investment fees and\ncosts are made up of\nPlus\nother\ninvestment\nfees and\ncosts\nPerformance\nfee\n% pa\nEntry %/\nExit %\n% pa\n% pa\n% pa\nReady-made portfolios\nSimple choice\n0.04\n0.10/0.10\n0.00\n0.62\n0.55\n0.07\nMLC Stable\n0.05\n0.10/0.10\n0.02\n0.80\n0.65\n0.15\nMLC Conservative Balanced",
+    "---Example 1 End---",
+    "For this case, Management fees and costs = Management fees with same reported name: Plus\nother\ninvestment\nfees and\ncosts",
+    "The output should be: ",
+    "{\"data\": [{\"fund name\": \"MLC Stable\", \"share name\": \"MLC Stable\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.07}, {\"fund name\": \"MLC Conservative Balanced\", \"share name\": \"MLC Conservative Balanced\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.65, \"management_fee\": 0.65, \"performance_fee\": 0.15}]",
+    "\n",
+    "---Example 2 Start---",
+    "\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n",
+    "---Example 2 End---",
+    "For this case, Management fees and costs = Management fees + Indirect Fee.",
+    "The output should be:",
+    "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
     "- 6. Please ignore these words as fund names, it means never extract these words as fund names. They are:",
     "\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"."
 ],
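The new Example 2 teaches the model a derived value rather than a copied one: management_fee_and_costs is the sum of two columns. The arithmetic behind the expected output, checked explicitly:

# Example 2 above: Management fees and costs = Management fees + Indirect Fee.
management_fee = 1.35
indirect_costs = 0.07
management_fee_and_costs = round(management_fee + indirect_costs, 2)
assert management_fee_and_costs == 1.42  # matches the expected JSON output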
@@ -121,7 +136,7 @@
 "special_rule": {
     "management_fee_and_costs": [
         "If there are multiple Management fee and costs reported names, here is the priority rule:",
-        "A. With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
+        "- With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
         "---Example Start---",
         "\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n",
         "---Example End---",
@@ -129,24 +144,19 @@
         "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]",
         "\n",
         "If there are multiple Management fee and costs sub-columns, here is the rule:",
-        "B. With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
+        "- With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
         "---Example Start---",
-        "\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n",
+        "\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n",
         "---Example End---",
         "The output should be:",
         "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
         "\n",
-        "C. If only find \"Management fees and costs\", please output the relevant as data point key: \"management_fee_and_costs\", instead of \"management_fee\".",
-        "---Example 1 Start---",
-        "The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
-        "---Example 1 End---",
+        "- With \"Management fees\" and \"Administration fee\", sum the values from these two columns: \"Management fees\" + \"Administration fee\".",
+        "---Example Start---",
+        "\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nAdministration Fee\nManagement fees\nMLC diversified investment\noption\n1.62% p.a.\n0.02% p.a.\n0.03% p.a.\n0.09% p.a.\n1.58% p.a.\nMLC Horizon 4 Balanced\nPortfolio\n",
+        "---Example End---",
         "The output should be:",
-        "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}]",
-        "---Example 2 Start---",
-        "Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n",
-        "---Example 2 End---",
-        "The output should be:",
-        "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21}]"
+        "{\"data\": [{\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.67, \"management_fee\": 1.58, \"administration_fees\": 0.09, \"performance_fee\": 0.03}]"
     ],
     "buy_spread": [
         "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
|
||||||
"date_of_last_hwm_reset_value": ["29 March 2023", "18 April 2024", "19 October 2021"],
|
"date_of_last_hwm_reset_value": ["29 March 2023", "18 April 2024", "19 October 2021"],
|
||||||
"date_of_last_performance_fee_restructure_value": ["12 August 2022", "15 March 2024", "11 November 2023"],
|
"date_of_last_performance_fee_restructure_value": ["12 August 2022", "15 March 2024", "11 November 2023"],
|
||||||
"high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"],
|
"high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"],
|
||||||
"minimum_initial_investment_value": [0, 5000, 10000],
|
"minimum_initial_investment_value": [0, 5, 12],
|
||||||
"recoverable_expenses_value": [0.12, 0.05, 0.06],
|
"recoverable_expenses_value": [0.12, 0.05, 0.06],
|
||||||
"indirect_costs_value": [0.12, 0.16, 0.02]
|
"indirect_costs_value": [0.12, 0.16, 0.02]
|
||||||
},
|
},
|
||||||
|
|
|
||||||
main.py (152 changed lines)

@@ -31,14 +31,11 @@ class EMEA_AR_Parsing:
         output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
         extract_way: str = "text",
         drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
-        compare_with_provider: bool = True
     ) -> None:
         self.doc_id = doc_id
         self.doc_source = doc_source
         self.pdf_folder = pdf_folder
         os.makedirs(self.pdf_folder, exist_ok=True)
-        self.compare_with_provider = compare_with_provider
-
 
         self.pdf_file = self.download_pdf()
         self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
@@ -79,7 +76,7 @@ class EMEA_AR_Parsing:
             self.pdf_file,
             self.document_mapping_info_df,
             self.doc_source,
-            output_pdf_text_folder,
+            output_pdf_text_folder
         )
         self.page_text_dict = self.filter_pages.page_text_dict
 
@@ -90,9 +87,7 @@ class EMEA_AR_Parsing:
         drilldown_folder = r"/data/emea_ar/output/drilldown/"
         os.makedirs(drilldown_folder, exist_ok=True)
         self.drilldown_folder = drilldown_folder
-        misc_config_file = os.path.join(
-            f"./configuration/{doc_source}/", "misc_config.json"
-        )
+        misc_config_file = os.path.join(f"./configuration/{doc_source}/", "misc_config.json")
         if os.path.exists(misc_config_file):
             with open(misc_config_file, "r", encoding="utf-8") as f:
                 misc_config = json.load(f)
@@ -254,14 +249,6 @@ class EMEA_AR_Parsing:
         except Exception as e:
             logger.error(f"Error: {e}")
         annotation_list = annotation_list_df.to_dict(orient="records")
-        try:
-            drilldown_json_file = os.path.join(
-                drilldown_data_folder, f"{doc_id}_drilldown.json"
-            )
-            with open(drilldown_json_file, "w", encoding="utf-8") as f:
-                json.dump(annotation_list, f, ensure_ascii=False, indent=4)
-        except Exception as e:
-            logger.error(f"Error: {e}")
         return annotation_list
 
     def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
@@ -291,8 +278,7 @@ class EMEA_AR_Parsing:
             data_from_gpt,
             self.document_mapping_info_df,
             self.output_mapping_data_folder,
-            self.doc_source,
-            compare_with_provider=self.compare_with_provider
+            self.doc_source
         )
         return data_mapping.mapping_raw_data_entrance()
 
@@ -348,7 +334,6 @@ def mapping_data(
         output_mapping_data_folder=output_mapping_folder,
         extract_way=extract_way,
         drilldown_folder=drilldown_folder,
-        compare_with_provider=False
     )
     doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(
         re_run=re_run_extract_data
@@ -517,29 +502,18 @@ def batch_start_job(
                     writer, index=False, sheet_name="extract_data"
                 )
 
-        if (
-            doc_source == "aus_prospectus"
-            and document_mapping_file is not None
-            and len(document_mapping_file) > 0
-            and os.path.exists(document_mapping_file)
-        ):
+        if document_mapping_file is not None and len(document_mapping_file) > 0 and os.path.exists(document_mapping_file):
             try:
-                merged_total_data_folder = os.path.join(
-                    output_mapping_total_folder, "merged/"
-                )
+                merged_total_data_folder = os.path.join(output_mapping_total_folder, "merged/")
                 os.makedirs(merged_total_data_folder, exist_ok=True)
                 data_file_base_name = os.path.basename(output_file)
-                output_merged_data_file_path = os.path.join(
-                    merged_total_data_folder, "merged_" + data_file_base_name
-                )
-                merge_output_data_aus_prospectus(
-                    output_file, document_mapping_file, output_merged_data_file_path
-                )
+                output_merged_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name)
+                merge_output_data_aus_prospectus(output_file, document_mapping_file, output_merged_data_file_path)
             except Exception as e:
                 logger.error(f"Error: {e}")
 
     if calculate_metrics:
-        prediction_sheet_name = "data_in_doc_mapping"
+        prediction_sheet_name = "total_mapping_data"
         ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
         ground_truth_sheet_name = "mapping_data"
         metrics_output_folder = r"/data/emea_ar/output/metrics/"
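Note: a runnable sketch of the merged-output path construction kept by this hunk; the folder and file names are placeholders.

    import os

    output_mapping_total_folder = r"./tmp/output/mapping_data/total/"  # placeholder
    output_file = os.path.join(output_mapping_total_folder, "mapping_data_info_11_documents.xlsx")  # placeholder

    merged_total_data_folder = os.path.join(output_mapping_total_folder, "merged/")
    os.makedirs(merged_total_data_folder, exist_ok=True)
    data_file_base_name = os.path.basename(output_file)
    output_merged_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name)
    print(output_merged_data_file_path)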
@@ -796,11 +770,11 @@ def test_auto_generate_instructions():
 
 
 def test_data_extraction_metrics():
-    data_type = "document_mapping_in_db"
+    data_type = "data_extraction"
     # prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_image_20240920033929.xlsx"
-    prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_51_documents_by_text_20250127104008.xlsx"
+    prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_text_20240922152517.xlsx"
     # prediction_file = r"/data/emea_ar/output/mapping_data/docs/by_text/excel/481475385.xlsx"
-    prediction_sheet_name = "data_in_doc_mapping"
+    prediction_sheet_name = "mapping_data"
     ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
     ground_truth_sheet_name = "mapping_data"
     metrics_output_folder = r"/data/emea_ar/output/metrics/"
@@ -1041,9 +1015,9 @@ def batch_run_documents(
     page_filter_ground_truth_file = (
         r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
     )
-    re_run_extract_data = True
-    re_run_mapping_data = True
-    force_save_total_data = False
+    re_run_extract_data = False
+    re_run_mapping_data = False
+    force_save_total_data = True
     calculate_metrics = False
 
     extract_way = "text"
@@ -1220,17 +1194,13 @@ def merge_output_data_aus_prospectus(
 ):
     # TODO: merge output data for aus prospectus, plan to realize it on 2025-01-16
     data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data")
-    document_mapping_df = pd.read_excel(
-        document_mapping_file, sheet_name="document_mapping"
-    )
+    document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="document_mapping")
     # set doc_id to be string type
     data_df["doc_id"] = data_df["doc_id"].astype(str)
     document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str)
 
     doc_id_list = data_df["doc_id"].unique().tolist()
-    datapoint_keyword_config_file = (
-        r"./configuration/aus_prospectus/datapoint_name.json"
-    )
+    datapoint_keyword_config_file = r"./configuration/aus_prospectus/datapoint_name.json"
     with open(datapoint_keyword_config_file, "r", encoding="utf-8") as f:
         datapoint_keyword_config = json.load(f)
     datapoint_name_list = list(datapoint_keyword_config.keys())
@@ -1242,9 +1212,7 @@ def merge_output_data_aus_prospectus(
                 "EffectiveDate"
             ].values[0]
         )[0:10]
-        share_doc_data_df = data_df[
-            (data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1)
-        ]
+        share_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1)]
         exist_raw_name_list = []
         for index, row in share_doc_data_df.iterrows():
             doc_id = str(row["doc_id"])
@@ -1260,9 +1228,7 @@ def merge_output_data_aus_prospectus(
             fund_id = ""
             fund_legal_name = ""
             if share_class_id != "":
-                record_row = document_mapping_df[
-                    document_mapping_df["FundClassId"] == share_class_id
-                ]
+                record_row = document_mapping_df[document_mapping_df["FundClassId"] == share_class_id]
                 if len(record_row) > 0:
                     fund_id = record_row["FundId"].values[0]
                     fund_legal_name = record_row["FundLegalName"].values[0]
@@ -1299,16 +1265,16 @@ def merge_output_data_aus_prospectus(
                 doc_data_list.append(data)
             # find data from total_data_list by raw_name
             for data in doc_data_list:
-                if data["raw_name"] == raw_name:
+                if (
+                    data["raw_name"] == raw_name
+                ):
                     update_key = datapoint
                     data[update_key] = value
                     if page_index not in data["page_index"]:
                         data["page_index"].append(page_index)
                     break
 
-        fund_doc_data_df = data_df[
-            (data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)
-        ]
+        fund_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)]
         for index, row in fund_doc_data_df.iterrows():
             doc_id = str(row["doc_id"])
             page_index = int(row["page_index"])
|
||||||
exist = False
|
exist = False
|
||||||
if fund_id != "":
|
if fund_id != "":
|
||||||
for data in doc_data_list:
|
for data in doc_data_list:
|
||||||
if (fund_id != "" and data["fund_id"] == fund_id) or (
|
if (fund_id != "" and data["fund_id"] == fund_id) or \
|
||||||
data["raw_fund_name"] == raw_fund_name
|
(data["raw_fund_name"] == raw_fund_name):
|
||||||
):
|
|
||||||
update_key = datapoint
|
update_key = datapoint
|
||||||
data[update_key] = value
|
data[update_key] = value
|
||||||
if page_index not in data["page_index"]:
|
if page_index not in data["page_index"]:
|
||||||
|
|
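Note: a self-contained sketch of the match rule this hunk reformats, with made-up records: a row updates an existing record when fund_id matches, or, failing that, when the raw fund name matches.

    doc_data_list = [
        {"fund_id": "F000001", "raw_fund_name": "Alpha Fund", "page_index": [3]},
        {"fund_id": "", "raw_fund_name": "Beta Fund", "page_index": [5]},
    ]

    def find_record(fund_id, raw_fund_name):
        for data in doc_data_list:
            if (fund_id != "" and data["fund_id"] == fund_id) or \
                (data["raw_fund_name"] == raw_fund_name):
                return data
        return None

    print(find_record("", "Beta Fund"))  # matched by raw fund name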
@@ -1358,7 +1323,6 @@ def merge_output_data_aus_prospectus(
 
 
 if __name__ == "__main__":
-    # test_data_extraction_metrics()
     # data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx"
     # document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
     # merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
@@ -1386,16 +1350,10 @@ if __name__ == "__main__":
 
     doc_source = "aus_prospectus"
     if doc_source == "aus_prospectus":
-        # document_sample_file = (
-        #     r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
-        # )
-        document_sample_file = (
-            r"./sample_documents/aus_prospectus_17_documents_sample.txt"
-        )
+        document_sample_file = r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
         with open(document_sample_file, "r", encoding="utf-8") as f:
             special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
-        # document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
-        document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
+        document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
         # special_doc_id_list: list = [
         #     "539790009",
         #     "542300403",
@@ -1409,7 +1367,7 @@ if __name__ == "__main__":
         #     "555377021",
         #     "555654388",
         # ]
-        special_doc_id_list: list = ["377377369"]
+        # special_doc_id_list: list = ["534287518"]
         pdf_folder: str = r"/data/aus_prospectus/pdf/"
         output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
         output_extract_data_child_folder: str = (
@@ -1439,61 +1397,7 @@ if __name__ == "__main__":
             drilldown_folder=drilldown_folder,
         )
     elif doc_source == "emea_ar":
-        special_doc_id_list = [
-            "292989214",
-            "316237292",
-            "321733631",
-            "323390570",
-            "327956364",
-            "333207452",
-            "334718372",
-            "344636875",
-            "362246081",
-            "366179419",
-            "380945052",
-            "382366116",
-            "387202452",
-            "389171486",
-            "391456740",
-            "391736837",
-            "394778487",
-            "401684600",
-            "402113224",
-            "402181770",
-            "402397014",
-            "405803396",
-            "445102363",
-            "445256897",
-            "448265376",
-            "449555622",
-            "449623976",
-            "458291624",
-            "458359181",
-            "463081566",
-            "469138353",
-            "471641628",
-            "476492237",
-            "478585901",
-            "478586066",
-            "479042264",
-            "479793787",
-            "481475385",
-            "483617247",
-            "486378555",
-            "486383912",
-            "492121213",
-            "497497599",
-            "502693599",
-            "502821436",
-            "503194284",
-            "506559375",
-            "507967525",
-            "508854243",
-            "509845549",
-            "520879048",
-            "529925114",
-        ]
-        # special_doc_id_list = ["532438210"]
+        special_doc_id_list = ["553242408"]
         batch_run_documents(
             doc_source=doc_source, special_doc_id_list=special_doc_id_list
         )

@@ -1,17 +0,0 @@
-377377369
-397107472
-401212184
-409723592
-411062815
-412778803
-414751292
-462770987
-471206458
-391080133
-391080140
-410899007
-420339794
-441280757
-446324179
-454036250
-384508026

@@ -543,7 +543,7 @@ class PDFUtil:
                 matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
                 if len(matching_val_area) == 0:
                     matching_val_area = page.search_for(text_block.replace('-\n', ''))
-                if len(matching_val_area) > 0 and len(text_block.strip().split()) < 3:
+                if len(matching_val_area) > 0 and len(text_block.strip().split()) == 1:
                     new_matching_val_area = []
                     for area in matching_val_area:
                         # get text by text_bbox
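Note: a hedged sketch of the tightened guard; PyMuPDF's page.search_for returns candidate rectangles, and after this change only single-word needles trigger the re-verification pass, instead of any needle shorter than three words. The PDF path and needle are placeholders.

    import fitz  # PyMuPDF

    doc = fitz.open("example.pdf")  # placeholder path
    page = doc[0]
    text_block = "fee"  # single-word needles over-match most often
    matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
    if len(matching_val_area) == 0:
        matching_val_area = page.search_for(text_block.replace('-\n', ''))
    if len(matching_val_area) > 0 and len(text_block.strip().split()) == 1:
        print(f"{len(matching_val_area)} candidate rectangles to re-verify")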
@@ -8,7 +8,7 @@ import dotenv
 dotenv.load_dotenv()
 
 
-def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"./data/emea_ar/output/db_mapping/document/"):
+def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_ar/output/mapping/document/"):
     count = 1
     while True:
         try:
@@ -27,13 +27,10 @@ def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"./data/emea_
                 by=["FundName", "ShareClassName"]
             ).reset_index(drop=True)
             if output_folder is not None and len(output_folder) > 0:
-                try:
-                    os.makedirs(output_folder, exist_ok=True)
-                    output_file = os.path.join(output_folder, f"{doc_id}.xlsx")
-                    with pd.ExcelWriter(output_file) as writer:
-                        document_mapping_info_df.to_excel(writer, index=False)
-                except:
-                    pass
+                os.makedirs(output_folder, exist_ok=True)
+                output_file = os.path.join(output_folder, f"{doc_id}.xlsx")
+                with pd.ExcelWriter(output_file) as writer:
+                    document_mapping_info_df.to_excel(writer, index=False)
             return document_mapping_info_df
         except Exception as e:
             print(e)
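Note: with the inner try/except gone, a failed Excel write now propagates to the surrounding retry loop instead of being silently swallowed. A standalone sketch of the remaining caching step; the DataFrame, doc_id, and folder are placeholders (writing .xlsx needs openpyxl installed).

    import os

    import pandas as pd

    def cache_mapping_to_excel(df: pd.DataFrame, doc_id: str, output_folder: str) -> str:
        os.makedirs(output_folder, exist_ok=True)
        output_file = os.path.join(output_folder, f"{doc_id}.xlsx")
        with pd.ExcelWriter(output_file) as writer:
            df.to_excel(writer, index=False)
        return output_file

    print(cache_mapping_to_excel(pd.DataFrame({"FundName": ["Alpha"]}), "123456", "./tmp_mapping/"))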
@@ -43,7 +40,7 @@ def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"./data/emea_
             count += 1
 
 
-def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"./data/emea_ar/output/db_mapping/provider/"):
+def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"/data/emea_ar/output/mapping/provider/"):
     count = 1
     while True:
         try:
@@ -62,13 +59,10 @@ def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"./
                 .sort_values(by=['FundName', 'ShareClassName']) \
                 .reset_index(drop=True)
             if output_folder is not None and len(output_folder) > 0:
-                try:
-                    os.makedirs(output_folder, exist_ok=True)
-                    output_file = os.path.join(output_folder, f"{company_id}.xlsx")
-                    with pd.ExcelWriter(output_file) as writer:
-                        investment_by_provider_df.to_excel(writer, index=False)
-                except:
-                    pass
+                os.makedirs(output_folder, exist_ok=True)
+                output_file = os.path.join(output_folder, f"{company_id}.xlsx")
+                with pd.ExcelWriter(output_file) as writer:
+                    investment_by_provider_df.to_excel(writer, index=False)
             return investment_by_provider_df
         except Exception as e:
             print(e)
@@ -79,7 +73,7 @@ def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"./
 
 
 def query_data_by_biz_type(biztype: str, para, return_df: bool):
-    sqlpass_url = os.getenv("SQL_PASS_URL")
+    sqlpass_url = "https://api.morningstar.com/sqlpassapi/v1/sql"
     url = sqlpass_url + "?sqlName={0}&params={1}".format(biztype, str(para))
     headers = {"ApiKey": os.getenv("SQL_PASS_KEY")}
     if return_df:
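Note: a hedged sketch of the request this function builds, with the endpoint hard-coded as on the new side of the diff; the sqlName and params values are placeholders, and the GET verb is an assumption since the actual call sits outside this hunk.

    import os

    import requests

    sqlpass_url = "https://api.morningstar.com/sqlpassapi/v1/sql"
    biztype = "example_sql_name"  # placeholder sqlName
    para = {"doc_id": "123"}  # placeholder params
    url = sqlpass_url + "?sqlName={0}&params={1}".format(biztype, str(para))
    headers = {"ApiKey": os.getenv("SQL_PASS_KEY")}
    response = requests.get(url, headers=headers)  # assumed GET; the verb is not shown here
    print(response.status_code)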