Compare commits


10 Commits

Author SHA1 Message Date
Blade He f7d53acdde support get sqlpass api by configuration 2025-02-19 14:37:21 -06:00
Blade He a8810519f8 optimize instructions configuration; optimize drilldown part logic 2025-02-04 15:29:24 -06:00
Blade He f9ef4cec96 update sql_query cache file store location (at most cache 5 days, then clean from local disk) 2025-01-31 10:59:54 -06:00
Blade He 7f37f3532f switch example document 2025-01-27 14:59:26 -06:00
Blade He 6f831e241c Merge branch 'aus_prospectus_ravi' 2025-01-27 12:32:42 -06:00
Blade He 41f8c307ff a little change 2025-01-27 12:32:36 -06:00
Blade He 47c41e492f 1. only get name mapping data from document mapping; 2. compare name mapping metrics between Ravi's and mine 2025-01-27 12:29:49 -06:00
Blade He d9b0bed39a a little change 2025-01-22 09:57:42 -06:00
Blade He 350550d1b0 fix issue for removing item from list 2025-01-21 17:24:05 -06:00
Blade He e2b9bcbdbc initial abbreviation configurations 2025-01-21 17:09:45 -06:00
11 changed files with 1518 additions and 202 deletions


@@ -44,6 +44,8 @@ def emea_ar_data_extract():
     output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/"
     output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/"
     drilldown_folder = r"./data/emea_ar/output/drilldown/"
+    db_mapping_document_folder = r"./data/emea_ar/output/db_mapping/document/"
+    db_mapping_provider_folder = r"./data/emea_ar/output/db_mapping/provider/"
     extract_way = "text"
     os.makedirs(pdf_folder, exist_ok=True)
@@ -51,12 +53,16 @@ def emea_ar_data_extract():
     os.makedirs(output_extract_data_folder, exist_ok=True)
     os.makedirs(output_mapping_data_folder, exist_ok=True)
     os.makedirs(drilldown_folder, exist_ok=True)
+    os.makedirs(db_mapping_document_folder, exist_ok=True)
+    os.makedirs(db_mapping_provider_folder, exist_ok=True)
     clean_folder(pdf_folder)
     clean_folder(output_pdf_text_folder)
     clean_folder(output_extract_data_folder)
     clean_folder(output_mapping_data_folder)
     clean_folder(drilldown_folder)
+    clean_folder(db_mapping_document_folder)
+    clean_folder(db_mapping_provider_folder)
     re_run_extract_data = False
     re_run_mapping_data = False
@@ -69,7 +75,8 @@ def emea_ar_data_extract():
         output_extract_data_folder=output_extract_data_folder,
         output_mapping_data_folder=output_mapping_data_folder,
         extract_way=extract_way,
-        drilldown_folder=drilldown_folder)
+        drilldown_folder=drilldown_folder,
+        compare_with_provider=False)
     doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
     doc_mapping_data = emea_ar_parsing.mapping_data(
         data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
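The driver now provisions and resets two extra db_mapping output folders. For readers outside the repo, the create-then-clean pattern used here is roughly equivalent to the sketch below; clean_folder is a real helper in this codebase, but its exact behavior is an assumption on my part:

    import os
    import shutil

    def clean_folder(folder: str) -> None:
        # Assumed behavior: drop whatever the previous run left behind,
        # then recreate the (now empty) folder.
        shutil.rmtree(folder, ignore_errors=True)
        os.makedirs(folder, exist_ok=True)

    db_mapping_document_folder = r"./data/emea_ar/output/db_mapping/document/"
    os.makedirs(db_mapping_document_folder, exist_ok=True)
    clean_folder(db_mapping_document_folder)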


@@ -1,6 +1,6 @@
 {
 "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
-"management_fee_and_costs": {"english": ["management fees and cost", "Plus other investment fees and costs"]},
+"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs"]},
 "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]},
 "performance_fee": {"english": ["performance fee", "performance fees"]},
 "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},

File diff suppressed because it is too large.


@@ -32,22 +32,24 @@ from openai import AzureOpenAI
 ABB_JSON = dict()
 
-def get_abb_json():
+def get_abb_json(doc_source: str = "aus_prospectus"):
     global ABB_JSON
-    with open("abbreviation_records.json", "r") as file:
-        # Load the JSON and convert keys to lowercase
-        ABB_JSON = {key.lower(): value for key, value in json.load(file).items()}
+    if len(ABB_JSON.keys()) == 0:
+        with open(f"./configuration/{doc_source}/abbreviation_records.json", "r") as file:
+            # Load the JSON and convert keys to lowercase
+            ABB_JSON = {key.lower(): value for key, value in json.load(file).items()}
 
-def get_abbre_format_str(fundname):
+def get_abbre_format_str(fundname, doc_source: str = "aus_prospectus"):
     """Replaces abbreviations in a fund name with their expanded forms."""
     # Convert fund name to lowercase while matching
     f_list = fundname.lower().split()
+    get_abb_json(doc_source)
     updated_doc_fname_words = [ABB_JSON.get(word, word).lower() for word in f_list]
     return " ".join(updated_doc_fname_words)
 
-def replace_abbrevs_in_fundnames(fund_names_list):
+def replace_abbrevs_in_fundnames(fund_names_list, doc_source: str = "aus_prospectus"):
     """Replaces abbreviations in a list of fund names."""
-    return [get_abbre_format_str(fund_name) for fund_name in fund_names_list]
+    return [get_abbre_format_str(fund_name, doc_source) for fund_name in fund_names_list]
 
 ### STEP 2 - Remove Stopwords
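The loader is now lazy and doc-source aware: the abbreviation table is read once per process and cached in the module-level ABB_JSON. A minimal self-contained sketch of the same pattern (the path comes from the diff; the example record is invented for illustration):

    import json

    ABB_JSON = {}

    def get_abb_json(doc_source: str = "aus_prospectus"):
        global ABB_JSON
        if not ABB_JSON:  # load once, then reuse the cached table
            with open(f"./configuration/{doc_source}/abbreviation_records.json") as f:
                ABB_JSON = {k.lower(): v for k, v in json.load(f).items()}
        # Note: the first doc_source wins; later calls with a different source
        # reuse the already-cached table rather than reloading.

    # With a record such as {"intl": "international", "eq": "equity"},
    # "AGA Intl Eq Fund" would normalize to "aga international equity fund".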
@@ -438,7 +440,7 @@ def format_response(doc_id, pred_fund, db_fund, clean_pred_name, clean_db_name,
     return dt
 
-def final_function_to_match(doc_id, pred_list, db_list, provider_name):
+def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_source: str = "aus_prospectus"):
     final_result = {}
     df_data = []
     unmatched_pred_list = pred_list.copy()
@@ -456,12 +458,16 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
                 step0_matched_db_name_cosine= all_matched_fund_names_[0], step0_matched_db_name_jacc= all_matched_fund_names_[1], step0_matched_db_name_leven= all_matched_fund_names_[2],
                 step0_cosine=all_scores_[0], step0_jaccard=all_scores_[1], step0_levenshtein=all_scores_[2],
                 llm_flag=False))
-            unmatched_db_list.remove(db_list[matched_index])
-            unmatched_pred_list.remove(pred_list[index])
+            if db_list[matched_index] in unmatched_db_list:
+                unmatched_db_list.remove(db_list[matched_index])
+            # unmatched_db_list.remove(db_list[matched_index])
+            if pred_list[index] in unmatched_pred_list:
+                unmatched_pred_list.remove(pred_list[index])
+            # unmatched_pred_list.remove(pred_list[index])
         else:
             ### STEP-1 Abbreviation replacement
-            cleaned_pred_name1 = replace_abbrevs_in_fundnames([pred_fund])[0]
-            cleaned_db_list1 = replace_abbrevs_in_fundnames(db_list)
+            cleaned_pred_name1 = replace_abbrevs_in_fundnames([pred_fund], doc_source)[0]
+            cleaned_db_list1 = replace_abbrevs_in_fundnames(db_list, doc_source)
             # print("--> ",cleaned_db_list1, cleaned_pred_name1)
             step1_result, matched_index, all_scores1_, all_matched_fund_names1_ = get_fund_match_final_score(cleaned_db_list1, cleaned_pred_name1)
             # print(f"\nStep 1 - Abbreviation Replacement Result: {step1_result}")
@@ -477,8 +483,12 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
                     step1_pred_name=cleaned_pred_name1, step1_db_name=cleaned_db_list1,
                     step1_matched_db_name_cosine= all_matched_fund_names1_[0], step1_matched_db_name_jacc= all_matched_fund_names1_[1], step1_matched_db_name_leven= all_matched_fund_names1_[2],
                     step1_cosine=all_scores1_[0], step1_jaccard=all_scores1_[1], step1_levenshtein=all_scores1_[2], llm_flag=False))
-                unmatched_db_list.remove(db_list[matched_index])
-                unmatched_pred_list.remove(pred_list[index])
+                if db_list[matched_index] in unmatched_db_list:
+                    unmatched_db_list.remove(db_list[matched_index])
+                # unmatched_db_list.remove(db_list[matched_index])
+                if pred_list[index] in unmatched_pred_list:
+                    unmatched_pred_list.remove(pred_list[index])
+                # unmatched_pred_list.remove(pred_list[index])
             else:
                 ### STEP-2 Remove Stopwords
                 cleaned_pred_name2 = remove_stopwords_nltk([cleaned_pred_name1])[0]
@@ -501,8 +511,12 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
                         step2_pred_name=cleaned_pred_name2, step2_db_name=cleaned_db_list2,
                         step2_matched_db_name_cosine= all_matched_fund_names2_[0], step2_matched_db_name_jacc= all_matched_fund_names2_[1], step2_matched_db_name_leven= all_matched_fund_names2_[2],
                         step2_cosine=all_scores2_[0], step2_jaccard=all_scores2_[1], step2_levenshtein=all_scores2_[2],llm_flag=False))
-                    unmatched_db_list.remove(db_list[matched_index])
-                    unmatched_pred_list.remove(pred_list[index])
+                    if db_list[matched_index] in unmatched_db_list:
+                        unmatched_db_list.remove(db_list[matched_index])
+                    # unmatched_db_list.remove(db_list[matched_index])
+                    if pred_list[index] in unmatched_pred_list:
+                        unmatched_pred_list.remove(pred_list[index])
+                    # unmatched_pred_list.remove(pred_list[index])
                 else:
                     ### STEP-3 Special Character Removal
                     cleaned_pred_name3 = remove_special_characters([cleaned_pred_name2])[0]
@@ -527,8 +541,12 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
                         step3_pred_name=cleaned_pred_name3, step3_db_name=cleaned_db_list3,
                         step3_matched_db_name_cosine= all_matched_fund_names3_[0], step3_matched_db_name_jacc= all_matched_fund_names3_[1], step3_matched_db_name_leven= all_matched_fund_names3_[2],
                         step3_cosine=all_scores3_[0], step3_jaccard=all_scores3_[1], step3_levenshtein=all_scores3_[2],llm_flag=False))
-                    unmatched_db_list.remove(db_list[matched_index])
-                    unmatched_pred_list.remove(pred_list[index])
+                    if db_list[matched_index] in unmatched_db_list:
+                        unmatched_db_list.remove(db_list[matched_index])
+                    # unmatched_db_list.remove(db_list[matched_index])
+                    if pred_list[index] in unmatched_pred_list:
+                        unmatched_pred_list.remove(pred_list[index])
+                    # unmatched_pred_list.remove(pred_list[index])
                 else:
                     ### STEP-4 Common Words Removal
                     cleaned_db_list4, _ = remove_common_words(cleaned_db_list3)
@@ -565,8 +583,12 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
                         # print("unmatched_pred_list: ",unmatched_pred_list)
                         # print("db_list[matched_index]: ",db_list[matched_index])
                         # print("pred_list[index]: ",pred_list[index])
-                        unmatched_db_list.remove(db_list[matched_index])
-                        unmatched_pred_list.remove(pred_list[index])
+                        if db_list[matched_index] in unmatched_db_list:
+                            unmatched_db_list.remove(db_list[matched_index])
+                        # unmatched_db_list.remove(db_list[matched_index])
+                        if pred_list[index] in unmatched_pred_list:
+                            unmatched_pred_list.remove(pred_list[index])
+                        # unmatched_pred_list.remove(pred_list[index])
                     else:
                         df_data.append(format_response(doc_id, pred_list[index], db_list[matched_index], cleaned_pred_name4,
                                                        db_list[matched_index],
@@ -595,11 +617,11 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
     # print("==>>> DB LIST: ",unmatched_db_list)
     # print("==>>> PRED LIST: ",unmatched_pred_list)
     if len(unmatched_pred_list)!=0:
-        cleaned_unmatched_pred_list = replace_abbrevs_in_fundnames(unmatched_pred_list)
+        cleaned_unmatched_pred_list = replace_abbrevs_in_fundnames(unmatched_pred_list, doc_source)
         cleaned_unmatched_pred_list = remove_stopwords_nltk(cleaned_unmatched_pred_list)
         cleaned_unmatched_pred_list = remove_special_characters(cleaned_unmatched_pred_list)
-        cleaned_unmatched_db_list = replace_abbrevs_in_fundnames(unmatched_db_list)
+        cleaned_unmatched_db_list = replace_abbrevs_in_fundnames(unmatched_db_list, doc_source)
         cleaned_unmatched_db_list = remove_stopwords_nltk(cleaned_unmatched_db_list)
         cleaned_unmatched_db_list = remove_special_characters(cleaned_unmatched_db_list)
     prompt_context = f"""
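A pattern repeated throughout this commit (and matching the "fix issue for removing item from list" commit message) replaces bare list.remove() calls with a membership-guarded version. The point, as a standalone sketch: remove() raises ValueError when the element is absent, which can happen here when two predicted names resolve to the same database name:

    unmatched = ["Fund A", "Fund B"]

    def safe_remove(items: list, value) -> None:
        # Guarding with `in` makes a second removal of the same match a
        # no-op instead of a ValueError.
        if value in items:
            items.remove(value)

    safe_remove(unmatched, "Fund A")
    safe_remove(unmatched, "Fund A")  # silently ignored on the second call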


@@ -969,7 +969,9 @@ class DataExtraction:
             if datapoint_name == "performance_fee":
                 datapoint_name = "performance fees"
             else:
-                datapoint_name = datapoint_name.upper()
+                datapoint_name = self.datapoint_name_config.get(datapoint_name, "")
+                if len(datapoint_name) == 0:
+                    datapoint_name = datapoint.upper()
             reported_name = f"The {datapoint_name} reported name could be:\n{joined_reported_name}"
             instructions.append(reported_name)
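Instead of upper-casing the raw datapoint key, the instruction builder now prefers a display name from datapoint_name_config and falls back to upper case only when the key is unmapped. The same lookup-then-fallback in isolation (the config contents here are illustrative):

    datapoint_name_config = {"management_fee_and_costs": "Management fees and costs"}

    def display_name(datapoint: str) -> str:
        # Prefer the configured human-readable name; fall back to the raw key.
        name = datapoint_name_config.get(datapoint, "")
        return name if name else datapoint.upper()

    print(display_name("management_fee_and_costs"))  # Management fees and costs
    print(display_name("buy_spread"))                # BUY_SPREAD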


@@ -1,6 +1,7 @@
 import os
 import json
 import pandas as pd
+from copy import deepcopy
 from utils.biz_utils import get_most_similar_name, remove_common_word
 from utils.sql_query_util import (
     query_document_fund_mapping,
@@ -18,14 +19,18 @@ class DataMapping:
         raw_document_data_list: list,
         document_mapping_info_df: pd.DataFrame,
         output_data_folder: str,
-        doc_source: str = "emea_ar"
+        doc_source: str = "emea_ar",
+        compare_with_provider: bool = True
     ):
         self.doc_id = doc_id
         self.datapoints = datapoints
         self.doc_source = doc_source
+        self.compare_with_provider = compare_with_provider
         self.raw_document_data_list = raw_document_data_list
         if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
-            self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
+            self.document_mapping_info_df = query_document_fund_mapping(
+                doc_id, rerun=False
+            )
         else:
             self.document_mapping_info_df = document_mapping_info_df
@@ -44,7 +49,9 @@ class DataMapping:
     def set_mapping_data_by_db(self, document_mapping_info_df: pd.DataFrame):
         logger.info("Setting document mapping data")
         if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
-            self.document_mapping_info_df = query_document_fund_mapping(self.doc_id, rerun=False)
+            self.document_mapping_info_df = query_document_fund_mapping(
+                self.doc_id, rerun=False
+            )
         else:
             self.document_mapping_info_df = document_mapping_info_df
         if len(self.document_mapping_info_df) == 0:
@@ -92,26 +99,27 @@ class DataMapping:
     def get_provider_mapping(self):
         if len(self.document_mapping_info_df) == 0:
             return pd.DataFrame()
-        provider_id_list = (
-            self.document_mapping_info_df["ProviderId"].unique().tolist()
-        )
+        provider_id_list = self.document_mapping_info_df["ProviderId"].unique().tolist()
         provider_mapping_list = []
         for provider_id in provider_id_list:
-            provider_mapping_list.append(query_investment_by_provider(provider_id, rerun=False))
+            provider_mapping_list.append(
+                query_investment_by_provider(provider_id, rerun=False)
+            )
         provider_mapping_df = pd.concat(provider_mapping_list)
         provider_mapping_df = provider_mapping_df.drop_duplicates()
         provider_mapping_df.reset_index(drop=True, inplace=True)
         return provider_mapping_df
 
     def mapping_raw_data_entrance(self):
-        if self.doc_source == "emear_ar":
+        if self.doc_source == "emea_ar":
             return self.mapping_raw_data()
         elif self.doc_source == "aus_prospectus":
-            return self.mapping_raw_data_aus()
+            return self.mapping_raw_data_generic()
         else:
             return self.mapping_raw_data()
+            # return self.mapping_raw_data_generic()
 
-    def mapping_raw_data_aus(self):
+    def mapping_raw_data_generic(self):
         logger.info(f"Mapping raw data for AUS Prospectus document {self.doc_id}")
         mapped_data_list = []
         # Generate raw name based on fund name and share name by integrate_share_name
@@ -128,7 +136,9 @@ class DataMapping:
             raw_share_name = raw_data.get("share_name", "")
             raw_data_keys = list(raw_data.keys())
             if len(raw_share_name) > 0:
-                integrated_share_name = self.integrate_share_name(raw_fund_name, raw_share_name)
+                integrated_share_name = self.integrate_share_name(
+                    raw_fund_name, raw_share_name
+                )
                 if integrated_share_name not in share_raw_name_list:
                     share_raw_name_list.append(integrated_share_name)
                 for datapoint in self.datapoints:
@@ -144,7 +154,7 @@ class DataMapping:
                             "investment_type": 1,
                             "investment_id": "",
                             "investment_name": "",
-                            "similarity": 0
+                            "similarity": 0,
                         }
                         mapped_data_list.append(mapped_data)
             else:
@@ -162,29 +172,38 @@ class DataMapping:
                             "value": raw_data[datapoint],
                             "investment_type": 33,
                             "investment_id": "",
-                            "investment_name": ""
+                            "investment_name": "",
                         }
                         mapped_data_list.append(mapped_data)
         # Mapping raw data with database
-        iter_count = 30
+        iter_count = 60
         fund_match_result = {}
         if len(fund_raw_name_list) > 0:
-            fund_match_result = self.get_raw_name_db_match_result(fund_raw_name_list, "fund", iter_count)
-            logger.info(f"Fund match result: \n{fund_match_result}")
+            fund_match_result = self.get_raw_name_db_match_result(
+                fund_raw_name_list, "fund", iter_count
+            )
+            # logger.info(f"Fund match result: \n{fund_match_result}")
         share_match_result = {}
         if len(share_raw_name_list) > 0:
-            share_match_result = self.get_raw_name_db_match_result(share_raw_name_list, "share", iter_count)
-            logger.info(f"Share match result: \n{share_match_result}")
+            share_match_result = self.get_raw_name_db_match_result(
+                share_raw_name_list, "share", iter_count
+            )
+            # logger.info(f"Share match result: \n{share_match_result}")
         for mapped_data in mapped_data_list:
             investment_type = mapped_data["investment_type"]
             raw_name = mapped_data["raw_name"]
             if investment_type == 33:
                 if fund_match_result.get(raw_name) is not None:
                     matched_db_fund_name = fund_match_result[raw_name]
-                    if matched_db_fund_name is not None and len(matched_db_fund_name) > 0:
+                    if (
+                        matched_db_fund_name is not None
+                        and len(matched_db_fund_name) > 0
+                    ):
                         # get FundId from self.doc_fund_mapping
-                        find_fund_df = self.doc_fund_mapping[self.doc_fund_mapping["FundName"] == matched_db_fund_name]
+                        find_fund_df = self.doc_fund_mapping[
+                            self.doc_fund_mapping["FundName"] == matched_db_fund_name
+                        ]
                         if find_fund_df is not None and len(find_fund_df) > 0:
                             fund_id = find_fund_df["FundId"].values[0]
                             mapped_data["investment_id"] = fund_id
@@ -193,38 +212,82 @@ class DataMapping:
             if investment_type == 1:
                 if share_match_result.get(raw_name) is not None:
                     matched_db_share_name = share_match_result[raw_name]
-                    if matched_db_share_name is not None and len(matched_db_share_name) > 0:
+                    if (
+                        matched_db_share_name is not None
+                        and len(matched_db_share_name) > 0
+                    ):
                         # get SecId from self.doc_fund_class_mapping
-                        find_share_df = self.doc_fund_class_mapping[self.doc_fund_class_mapping["ShareClassName"] == matched_db_share_name]
+                        find_share_df = self.doc_fund_class_mapping[
+                            self.doc_fund_class_mapping["ShareClassName"]
+                            == matched_db_share_name
+                        ]
                         if find_share_df is not None and len(find_share_df) > 0:
                             share_id = find_share_df["SecId"].values[0]
                             mapped_data["investment_id"] = share_id
                             mapped_data["investment_name"] = matched_db_share_name
                             mapped_data["similarity"] = 1
         self.output_mapping_file(mapped_data_list)
         return mapped_data_list
 
-    def get_raw_name_db_match_result(self, raw_name_list, investment_type: str, iter_count: int = 30):
+    def get_raw_name_db_match_result(
+        self, raw_name_list, investment_type: str, iter_count: int = 30
+    ):
         # split raw_name_list into several parts which each part is with 30 elements
         # The reason to split is to avoid invoke token limitation issues from CahtGPT
-        raw_name_list_parts = [raw_name_list[i:i + iter_count]
-                               for i in range(0, len(raw_name_list), iter_count)]
+        raw_name_list_parts = [
+            raw_name_list[i : i + iter_count]
+            for i in range(0, len(raw_name_list), iter_count)
+        ]
         all_match_result = {}
+        doc_fund_name_list = deepcopy(self.doc_fund_name_list)
+        doc_share_name_list = deepcopy(self.doc_share_name_list)
         for raw_name_list in raw_name_list_parts:
             if investment_type == "fund":
-                match_result = final_function_to_match(doc_id=self.doc_id,
-                                                       pred_list=raw_name_list,
-                                                       db_list=self.doc_fund_name_list,
-                                                       provider_name=self.provider_name)
+                match_result, doc_fund_name_list = self.get_final_function_to_match(
+                    raw_name_list, doc_fund_name_list
+                )
             else:
-                match_result = final_function_to_match(doc_id=self.doc_id,
-                                                       pred_list=raw_name_list,
-                                                       db_list=self.doc_share_name_list,
-                                                       provider_name=self.provider_name)
+                match_result, doc_share_name_list = self.get_final_function_to_match(
+                    raw_name_list, doc_share_name_list
+                )
             all_match_result.update(match_result)
         return all_match_result
 
+    def get_final_function_to_match(self, raw_name_list, db_name_list):
+        if len(db_name_list) == 0:
+            match_result = {}
+            for raw_name in raw_name_list:
+                match_result[raw_name] = ""
+        else:
+            match_result = final_function_to_match(
+                doc_id=self.doc_id,
+                pred_list=raw_name_list,
+                db_list=db_name_list,
+                provider_name=self.provider_name,
+                doc_source=self.doc_source
+            )
+            matched_name_list = list(match_result.values())
+            db_name_list = self.remove_matched_names(db_name_list, matched_name_list)
+        return match_result, db_name_list
 
+    def remove_matched_names(self, target_name_list: list, matched_name_list: list):
+        if len(matched_name_list) == 0:
+            return target_name_list
+        matched_name_list = list(set(matched_name_list))
+        matched_name_list = [
+            value for value in matched_name_list if value is not None and len(value) > 0
+        ]
+        for matched_name in matched_name_list:
+            if (
+                matched_name is not None
+                and len(matched_name) > 0
+                and matched_name in target_name_list
+            ):
+                target_name_list.remove(matched_name)
+        return target_name_list
 
     def mapping_raw_data(self):
         """
         doc_id, page_index, datapoint, value,
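get_raw_name_db_match_result batches candidate names before calling the LLM-backed matcher and, new in this commit, shrinks the database-name pool after each batch (via remove_matched_names on a deepcopy) so an already-claimed name cannot be matched twice. The batching itself is ordinary list chunking; per the code comment, each part is kept small (30, now 60, names) so a single matching prompt stays under the model's token limit:

    def chunk(items: list, size: int) -> list:
        # e.g. chunk(list(range(7)), 3) -> [[0, 1, 2], [3, 4, 5], [6]]
        return [items[i : i + size] for i in range(0, len(items), size)]

    parts = chunk(["fund a", "fund b", "fund c", "fund d"], 2)
    print(parts)  # [['fund a', 'fund b'], ['fund c', 'fund d']]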
@@ -245,9 +308,14 @@ class DataMapping:
             if raw_fund_name is None or len(raw_fund_name) == 0:
                 continue
             raw_share_name = raw_data.get("share_name", "")
-            if len(self.doc_fund_name_list) == 0 and len(self.provider_fund_name_list) == 0:
+            if (
+                len(self.doc_fund_name_list) == 0
+                and len(self.provider_fund_name_list) == 0
+            ):
                 if len(raw_share_name) > 0:
-                    integrated_share_name = self.integrate_share_name(raw_fund_name, raw_share_name)
+                    integrated_share_name = self.integrate_share_name(
+                        raw_fund_name, raw_share_name
+                    )
                     raw_data_keys = list(raw_data.keys())
                     for datapoint in self.datapoints:
                         if datapoint in raw_data_keys:
@@ -262,7 +330,7 @@ class DataMapping:
                                 "investment_type": 1,
                                 "investment_id": "",
                                 "investment_name": "",
-                                "similarity": 0
+                                "similarity": 0,
                             }
                             mapped_data_list.append(mapped_data)
                 else:
@@ -279,13 +347,15 @@ class DataMapping:
                                 "value": raw_data[datapoint],
                                 "investment_type": 33,
                                 "investment_id": "",
-                                "investment_name": ""
+                                "investment_name": "",
                             }
                             mapped_data_list.append(mapped_data)
             else:
                 raw_name = ""
                 if raw_share_name is not None and len(raw_share_name) > 0:
-                    raw_name = self.integrate_share_name(raw_fund_name, raw_share_name)
+                    raw_name = self.integrate_share_name(
+                        raw_fund_name, raw_share_name
+                    )
                     if mapped_share_cache.get(raw_name) is not None:
                         investment_info = mapped_share_cache[raw_name]
                     else:
@@ -298,14 +368,20 @@ class DataMapping:
                         )
                         fund_id = fund_info["id"]
                         mapped_fund_cache[raw_fund_name] = fund_info
-                        investment_info = self.matching_with_database(
-                            raw_name=raw_name,
-                            raw_share_name=raw_share_name,
-                            raw_fund_name=raw_fund_name,
-                            parent_id=fund_id,
-                            matching_type="share",
-                            process_cache=process_cache
-                        )
+                        investment_info = {}
+                        if len(fund_id) > 0:
+                            investment_info = self.mapping_unique_raw_data(fund_id=fund_id,
+                                                                           raw_fund_name=raw_fund_name,
+                                                                           raw_data_list=raw_data_list)
+                        if investment_info.get("id", None) is None or len(investment_info.get("id", "")) == 0:
+                            investment_info = self.matching_with_database(
+                                raw_name=raw_name,
+                                raw_share_name=raw_share_name,
+                                raw_fund_name=raw_fund_name,
+                                parent_id=fund_id,
+                                matching_type="share",
+                                process_cache=process_cache,
+                            )
                         mapped_share_cache[raw_name] = investment_info
                 elif raw_fund_name is not None and len(raw_fund_name) > 0:
                     raw_name = raw_fund_name
@@ -322,7 +398,7 @@ class DataMapping:
                         "id": "",
                         "legal_name": "",
                         "investment_type": -1,
-                        "similarity": 0
+                        "similarity": 0,
                     }
                 raw_data_keys = list(raw_data.keys())
@@ -339,13 +415,35 @@ class DataMapping:
                             "investment_type": investment_info["investment_type"],
                             "investment_id": investment_info["id"],
                             "investment_name": investment_info["legal_name"],
-                            "similarity": investment_info["similarity"]
+                            "similarity": investment_info["similarity"],
                         }
                         mapped_data_list.append(mapped_data)
         self.output_mapping_file(mapped_data_list)
         return mapped_data_list
 
+    def mapping_unique_raw_data(self, fund_id: str, raw_fund_name: str, raw_data_list: list):
+        share_count = 0
+        for raw_data in raw_data_list:
+            fund_name = raw_data.get("fund_name", "")
+            share_name = raw_data.get("share_name", "")
+            if fund_name == raw_fund_name and share_name is not None and len(share_name) > 0:
+                share_count += 1
+                if share_count > 1:
+                    break
+        data_info = {}
+        if share_count == 1:
+            doc_compare_mapping = self.doc_fund_class_mapping[
+                self.doc_fund_class_mapping["FundId"] == fund_id
+            ]
+            if len(doc_compare_mapping) == 1:
+                data_info["id"] = doc_compare_mapping["SecId"].values[0]
+                data_info["legal_name"] = doc_compare_mapping["ShareClassName"].values[0]
+                data_info["investment_type"] = 1
+                data_info["similarity"] = 1
+        return data_info
 
     def output_mapping_file(self, mapped_data_list: list):
         json_data_file = os.path.join(
             self.output_data_json_folder, f"{self.doc_id}.json"
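The new mapping_unique_raw_data short-circuits the fuzzy matcher: when the document reports exactly one share class for a fund and the database also holds exactly one share class for that FundId, the pair is mapped directly with similarity 1. The decision logic in isolation (toy frames; only the column names are taken from the diff):

    import pandas as pd

    doc_fund_class_mapping = pd.DataFrame(
        {"FundId": ["F1"], "SecId": ["S1"], "ShareClassName": ["Class A"]}
    )
    raw_data_list = [{"fund_name": "My Fund", "share_name": "A"}]

    shares_in_doc = sum(
        1 for r in raw_data_list
        if r.get("fund_name") == "My Fund" and r.get("share_name")
    )
    candidates = doc_fund_class_mapping[doc_fund_class_mapping["FundId"] == "F1"]
    if shares_in_doc == 1 and len(candidates) == 1:
        # Unambiguous on both sides: map directly, skip similarity scoring.
        print(candidates["SecId"].values[0])  # S1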
@@ -355,10 +453,10 @@ class DataMapping:
         extract_data_df = pd.DataFrame(self.raw_document_data_list)
         extract_data_df.reset_index(drop=True, inplace=True)
         mapping_data_df = pd.DataFrame(mapped_data_list)
         mapping_data_df.reset_index(drop=True, inplace=True)
         excel_data_file = os.path.join(
             self.output_data_excel_folder, f"{self.doc_id}.xlsx"
         )
@@ -373,7 +471,7 @@ class DataMapping:
         raw_name = ""
         if raw_share_name is not None and len(raw_share_name) > 0:
             raw_name = raw_share_name
             # some share names are very short,
             # so we need to combine with fund name
             raw_name_splits = raw_name.split()
             raw_fund_name_splits = raw_fund_name.split()
@@ -384,13 +482,13 @@ class DataMapping:
         return raw_name
 
     def matching_with_database(
         self,
         raw_name: str,
         raw_share_name: str = None,
         raw_fund_name: str = None,
         parent_id: str = None,
         matching_type: str = "fund",
-        process_cache: dict = {}
+        process_cache: dict = {},
     ):
         if len(self.doc_fund_name_list) == 0 and len(self.provider_fund_name_list) == 0:
             data_info["id"] = ""
@@ -402,7 +500,7 @@ class DataMapping:
             data_info["investment_type"] = investment_type
             data_info["similarity"] = 0
             return data_info
         if matching_type == "fund":
             doc_compare_name_list = self.doc_fund_name_list
             doc_compare_mapping = self.doc_fund_mapping
@@ -417,8 +515,9 @@ class DataMapping:
             doc_compare_mapping = self.doc_fund_class_mapping[
                 self.doc_fund_class_mapping["FundId"] == parent_id
             ]
-            provider_compare_mapping = self.provider_fund_class_mapping\
-                [self.provider_fund_class_mapping["FundId"] == parent_id]
+            provider_compare_mapping = self.provider_fund_class_mapping[
+                self.provider_fund_class_mapping["FundId"] == parent_id
+            ]
             if len(doc_compare_mapping) == 0:
                 if len(provider_compare_mapping) == 0:
                     doc_compare_name_list = self.doc_share_name_list
@@ -435,9 +534,10 @@ class DataMapping:
                 doc_compare_name_list = (
                     doc_compare_mapping["ShareClassName"].unique().tolist()
                 )
-                if len(provider_compare_mapping) == 0 or \
-                        len(provider_compare_mapping) < len(doc_compare_mapping):
+                if len(provider_compare_mapping) == 0 or len(
+                    provider_compare_mapping
+                ) < len(doc_compare_mapping):
                     provider_compare_name_list = doc_compare_name_list
                     provider_compare_mapping = doc_compare_mapping
                 else:
@@ -459,58 +559,68 @@ class DataMapping:
         if doc_compare_name_list is not None and len(doc_compare_name_list) > 0:
             _, pre_common_word_list = remove_common_word(doc_compare_name_list)
             max_similarity_name, max_similarity = get_most_similar_name(
                 raw_name,
                 doc_compare_name_list,
                 share_name=raw_share_name,
                 fund_name=raw_fund_name,
                 matching_type=matching_type,
-                process_cache=process_cache)
+                process_cache=process_cache,
+            )
             if matching_type == "fund":
                 threshold = 0.7
             else:
-                threshold = 0.9
+                if self.compare_with_provider:
+                    threshold = 0.9
+                else:
+                    threshold = 0.6
             if max_similarity is not None and max_similarity >= threshold:
                 data_info["id"] = doc_compare_mapping[
                     doc_compare_mapping[compare_name_dp] == max_similarity_name
                 ][compare_id_dp].values[0]
                 data_info["legal_name"] = max_similarity_name
                 data_info["similarity"] = max_similarity
             if data_info.get("id", None) is None or len(data_info.get("id", "")) == 0:
                 # set pre_common_word_list, reason: the document mapping for same fund maybe different with provider mapping
                 # the purpose is to get the most common word list, to improve the similarity.
-                max_similarity_name, max_similarity = get_most_similar_name(
-                    raw_name,
-                    provider_compare_name_list,
-                    share_name=raw_share_name,
-                    fund_name=raw_fund_name,
-                    matching_type=matching_type,
-                    pre_common_word_list=pre_common_word_list,
-                    process_cache=process_cache
-                )
-                threshold = 0.7
-                if matching_type == "share":
-                    threshold = 0.5
-                round_similarity = 0
-                if max_similarity is not None and isinstance(max_similarity, float):
-                    round_similarity = round(max_similarity, 1)
-                if round_similarity is not None and round_similarity >= threshold:
-                    data_info["id"] = provider_compare_mapping[
-                        provider_compare_mapping[compare_name_dp] == max_similarity_name
-                    ][compare_id_dp].values[0]
-                    data_info["legal_name"] = max_similarity_name
-                    data_info["similarity"] = max_similarity
-                else:
-                    if len(doc_compare_name_list) == 1:
-                        data_info["id"] = doc_compare_mapping[
-                            doc_compare_mapping[compare_name_dp] == doc_compare_name_list[0]
-                        ][compare_id_dp].values[0]
-                        data_info["legal_name"] = doc_compare_name_list[0]
-                        data_info["similarity"] = 1
-                    else:
-                        data_info["id"] = ""
-                        data_info["legal_name"] = ""
-                        data_info["similarity"] = 0
+                if self.compare_with_provider:
+                    max_similarity_name, max_similarity = get_most_similar_name(
+                        raw_name,
+                        provider_compare_name_list,
+                        share_name=raw_share_name,
+                        fund_name=raw_fund_name,
+                        matching_type=matching_type,
+                        pre_common_word_list=pre_common_word_list,
+                        process_cache=process_cache,
+                    )
+                    threshold = 0.7
+                    if matching_type == "share":
+                        threshold = 0.5
+                    round_similarity = 0
+                    if max_similarity is not None and isinstance(max_similarity, float):
+                        round_similarity = round(max_similarity, 1)
+                    if round_similarity is not None and round_similarity >= threshold:
+                        data_info["id"] = provider_compare_mapping[
+                            provider_compare_mapping[compare_name_dp] == max_similarity_name
+                        ][compare_id_dp].values[0]
+                        data_info["legal_name"] = max_similarity_name
+                        data_info["similarity"] = max_similarity
+                    else:
+                        if len(doc_compare_name_list) == 1:
+                            data_info["id"] = doc_compare_mapping[
+                                doc_compare_mapping[compare_name_dp]
+                                == doc_compare_name_list[0]
+                            ][compare_id_dp].values[0]
+                            data_info["legal_name"] = doc_compare_name_list[0]
+                            data_info["similarity"] = 1
+                        else:
+                            data_info["id"] = ""
+                            data_info["legal_name"] = ""
+                            data_info["similarity"] = 0
+                else:
+                    data_info["id"] = ""
+                    data_info["legal_name"] = ""
+                    data_info["similarity"] = 0
             data_info["investment_type"] = investment_type
         else:
             data_info["id"] = ""


@@ -61,23 +61,8 @@
         "---Example End---",
         "The output should be:",
         "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]",
-        "- 5. Reverse order of data columns from table text in PDF:",
-        "For this case, 1. the columns order is reversed, \n2. The fund name is in the end of row with number value in front of fund name.",
-        "---Example 1 Start---",
-        "Transaction\ncosts\n(gross)1\nBuy-sell\nspreads\nTransaction\ncosts (net)\nEquals\ninvestment fees and\ncosts\nThe investment fees and\ncosts are made up of\nPlus\nother\ninvestment\nfees and\ncosts\nPerformance\nfee\n% pa\nEntry %/\nExit %\n% pa\n% pa\n% pa\nReady-made portfolios\nSimple choice\n0.04\n0.10/0.10\n0.00\n0.62\n0.55\n0.07\nMLC Stable\n0.05\n0.10/0.10\n0.02\n0.80\n0.65\n0.15\nMLC Conservative Balanced",
-        "---Example 1 End---",
-        "For this case, Management fees and costs = Management fees with same reported name: Plus\nother\ninvestment\nfees and\ncosts",
-        "The output should be: ",
-        "{\"data\": [{\"fund name\": \"MLC Stable\", \"share name\": \"MLC Stable\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.07}, {\"fund name\": \"MLC Conservative Balanced\", \"share name\": \"MLC Conservative Balanced\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.65, \"management_fee\": 0.65, \"performance_fee\": 0.15}]",
-        "\n",
-        "---Example 2 Start---",
-        "\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n",
-        "---Example 2 End---",
-        "For this case, Management fees and costs = Management fees + Indirect Fee.",
-        "The output should be:",
-        "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
         "- 6. Please ignore these words as fund names, it means never extract these words as fund names. They are:",
         "\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"."
     ],
     "investment_level": {
         "total_annual_dollar_based_charges": "Total annual dollar based charges is share level data.",
@@ -136,7 +121,7 @@
     "special_rule": {
         "management_fee_and_costs": [
             "If there are multiple Management fee and costs reported names, here is the priority rule:",
-            "- With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
+            "A. With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
             "---Example Start---",
             "\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n",
             "---Example End---",
@@ -144,19 +129,24 @@
             "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]",
             "\n",
             "If there are multiple Management fee and costs sub-columns, here is the rule:",
-            "- With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
+            "B. With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
             "---Example Start---",
-            "\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n",
+            "\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n",
             "---Example End---",
             "The output should be:",
             "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
             "\n",
-            "- With \"Management fees\" and \"Administration fee\", sum the values from these two columns: \"Management fees\" + \"Administration fee\".",
-            "---Example Start---",
-            "\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nAdministration Fee\nManagement fees\nMLC diversified investment\noption\n1.62% p.a.\n0.02% p.a.\n0.03% p.a.\n0.09% p.a.\n1.58% p.a.\nMLC Horizon 4 Balanced\nPortfolio\n",
-            "---Example End---",
-            "The output should be:",
-            "{\"data\": [{\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.67, \"management_fee\": 1.58, \"administration_fees\": 0.09, \"performance_fee\": 0.03}]"
+            "C. If only find \"Management fees and costs\", please output the relevant as data point key: \"management_fee_and_costs\", instead of \"management_fee\".",
+            "---Example 1 Start---",
+            "The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
+            "---Example 1 End---",
+            "The output should be:",
+            "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}]",
+            "---Example 2 Start---",
+            "Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n",
+            "---Example 2 End---",
+            "The output should be:",
+            "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21}]"
         ],
         "buy_spread": [
             "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
@@ -263,7 +253,7 @@
     "date_of_last_hwm_reset_value": ["29 March 2023", "18 April 2024", "19 October 2021"],
     "date_of_last_performance_fee_restructure_value": ["12 August 2022", "15 March 2024", "11 November 2023"],
     "high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"],
-    "minimum_initial_investment_value": [0, 5, 12],
+    "minimum_initial_investment_value": [0, 5000, 10000],
     "recoverable_expenses_value": [0.12, 0.05, 0.06],
     "indirect_costs_value": [0.12, 0.16, 0.02]
 },
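These special_rule arrays hold prompt fragments that are joined into the extraction instructions at runtime. A minimal sketch of that assembly, with the strings taken from the config above but the join logic assumed rather than read from the repo:

    special_rule = {
        "management_fee_and_costs": [
            "If there are multiple Management fee and costs reported names, here is the priority rule:",
            "A. With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", "
            "pick up the values from \"Total Management fees and costs (net)\".",
        ]
    }

    # Rules such as A/B/C above become one newline-separated block in the prompt.
    prompt_rules = "\n".join(special_rule["management_fee_and_costs"])
    print(prompt_rules.splitlines()[0])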

main.py (168 lines changed)

@ -31,11 +31,14 @@ class EMEA_AR_Parsing:
output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/", output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
extract_way: str = "text", extract_way: str = "text",
drilldown_folder: str = r"/data/emea_ar/output/drilldown/", drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
compare_with_provider: bool = True
) -> None: ) -> None:
self.doc_id = doc_id self.doc_id = doc_id
self.doc_source = doc_source self.doc_source = doc_source
self.pdf_folder = pdf_folder self.pdf_folder = pdf_folder
os.makedirs(self.pdf_folder, exist_ok=True) os.makedirs(self.pdf_folder, exist_ok=True)
self.compare_with_provider = compare_with_provider
self.pdf_file = self.download_pdf() self.pdf_file = self.download_pdf()
self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False) self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
@ -72,11 +75,11 @@ class EMEA_AR_Parsing:
os.makedirs(self.output_mapping_data_folder, exist_ok=True) os.makedirs(self.output_mapping_data_folder, exist_ok=True)
self.filter_pages = FilterPages( self.filter_pages = FilterPages(
self.doc_id, self.doc_id,
self.pdf_file, self.pdf_file,
self.document_mapping_info_df, self.document_mapping_info_df,
self.doc_source, self.doc_source,
output_pdf_text_folder output_pdf_text_folder,
) )
self.page_text_dict = self.filter_pages.page_text_dict self.page_text_dict = self.filter_pages.page_text_dict
@ -87,7 +90,9 @@ class EMEA_AR_Parsing:
drilldown_folder = r"/data/emea_ar/output/drilldown/" drilldown_folder = r"/data/emea_ar/output/drilldown/"
os.makedirs(drilldown_folder, exist_ok=True) os.makedirs(drilldown_folder, exist_ok=True)
self.drilldown_folder = drilldown_folder self.drilldown_folder = drilldown_folder
misc_config_file = os.path.join(f"./configuration/{doc_source}/", "misc_config.json") misc_config_file = os.path.join(
f"./configuration/{doc_source}/", "misc_config.json"
)
if os.path.exists(misc_config_file): if os.path.exists(misc_config_file):
with open(misc_config_file, "r", encoding="utf-8") as f: with open(misc_config_file, "r", encoding="utf-8") as f:
misc_config = json.load(f) misc_config = json.load(f)
@ -249,6 +254,14 @@ class EMEA_AR_Parsing:
except Exception as e: except Exception as e:
logger.error(f"Error: {e}") logger.error(f"Error: {e}")
annotation_list = annotation_list_df.to_dict(orient="records") annotation_list = annotation_list_df.to_dict(orient="records")
try:
drilldown_json_file = os.path.join(
drilldown_data_folder, f"{doc_id}_drilldown.json"
)
with open(drilldown_json_file, "w", encoding="utf-8") as f:
json.dump(annotation_list, f, ensure_ascii=False, indent=4)
except Exception as e:
logger.error(f"Error: {e}")
return annotation_list return annotation_list
def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list: def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
@ -278,7 +291,8 @@ class EMEA_AR_Parsing:
data_from_gpt, data_from_gpt,
self.document_mapping_info_df, self.document_mapping_info_df,
self.output_mapping_data_folder, self.output_mapping_data_folder,
self.doc_source self.doc_source,
compare_with_provider=self.compare_with_provider
) )
return data_mapping.mapping_raw_data_entrance() return data_mapping.mapping_raw_data_entrance()
@ -334,6 +348,7 @@ def mapping_data(
output_mapping_data_folder=output_mapping_folder, output_mapping_data_folder=output_mapping_folder,
extract_way=extract_way, extract_way=extract_way,
drilldown_folder=drilldown_folder, drilldown_folder=drilldown_folder,
compare_with_provider=False
) )
doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data( doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(
re_run=re_run_extract_data re_run=re_run_extract_data
@ -501,19 +516,30 @@ def batch_start_job(
result_extract_data_df.to_excel( result_extract_data_df.to_excel(
writer, index=False, sheet_name="extract_data" writer, index=False, sheet_name="extract_data"
) )
if document_mapping_file is not None and len(document_mapping_file) > 0 and os.path.exists(document_mapping_file): if (
doc_source == "aus_prospectus"
and document_mapping_file is not None
and len(document_mapping_file) > 0
and os.path.exists(document_mapping_file)
):
try: try:
merged_total_data_folder = os.path.join(output_mapping_total_folder, "merged/") merged_total_data_folder = os.path.join(
output_mapping_total_folder, "merged/"
)
os.makedirs(merged_total_data_folder, exist_ok=True) os.makedirs(merged_total_data_folder, exist_ok=True)
data_file_base_name = os.path.basename(output_file) data_file_base_name = os.path.basename(output_file)
output_merged_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name) output_merged_data_file_path = os.path.join(
merge_output_data_aus_prospectus(output_file, document_mapping_file, output_merged_data_file_path) merged_total_data_folder, "merged_" + data_file_base_name
)
merge_output_data_aus_prospectus(
output_file, document_mapping_file, output_merged_data_file_path
)
except Exception as e: except Exception as e:
logger.error(f"Error: {e}") logger.error(f"Error: {e}")
if calculate_metrics: if calculate_metrics:
prediction_sheet_name = "total_mapping_data" prediction_sheet_name = "data_in_doc_mapping"
ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx" ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
ground_truth_sheet_name = "mapping_data" ground_truth_sheet_name = "mapping_data"
metrics_output_folder = r"/data/emea_ar/output/metrics/" metrics_output_folder = r"/data/emea_ar/output/metrics/"
@@ -770,11 +796,11 @@ def test_auto_generate_instructions():
def test_data_extraction_metrics():
-    data_type = "data_extraction"
+    data_type = "document_mapping_in_db"
    # prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_image_20240920033929.xlsx"
-    prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_text_20240922152517.xlsx"
+    prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_51_documents_by_text_20250127104008.xlsx"
    # prediction_file = r"/data/emea_ar/output/mapping_data/docs/by_text/excel/481475385.xlsx"
-    prediction_sheet_name = "mapping_data"
+    prediction_sheet_name = "data_in_doc_mapping"
    ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
    ground_truth_sheet_name = "mapping_data"
    metrics_output_folder = r"/data/emea_ar/output/metrics/"
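This is the same sheet rename applied in batch_start_job above: the metrics test now reads predictions from the data_in_doc_mapping sheet of the newer 51-document run, while the ground-truth workbook keeps its original mapping_data sheet.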
@@ -1015,9 +1041,9 @@ def batch_run_documents(
    page_filter_ground_truth_file = (
        r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
    )
-    re_run_extract_data = False
-    re_run_mapping_data = False
-    force_save_total_data = True
+    re_run_extract_data = True
+    re_run_mapping_data = True
+    force_save_total_data = False
    calculate_metrics = False
    extract_way = "text"
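Flipping re_run_extract_data and re_run_mapping_data to True forces fresh extraction and mapping passes instead of reusing per-document outputs cached on disk, and force_save_total_data = False skips rewriting the aggregated workbook. A sketch of the caching contract the re_run flags appear to follow (load_or_extract and the cache layout are illustrative stand-ins, not the repo's actual helper):

    import os
    import pandas as pd

    def load_or_extract(doc_id: str, cache_folder: str, re_run: bool = False) -> pd.DataFrame:
        cache_path = os.path.join(cache_folder, f"{doc_id}.xlsx")
        if not re_run and os.path.exists(cache_path):
            return pd.read_excel(cache_path)  # reuse the cached result
        df = pd.DataFrame()  # placeholder for the real extraction step
        df.to_excel(cache_path, index=False)  # refresh the cache
        return df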
@@ -1194,13 +1220,17 @@ def merge_output_data_aus_prospectus(
):
    # TODO: merge output data for aus prospectus, plan to realize it on 2025-01-16
    data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data")
-    document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="document_mapping")
+    document_mapping_df = pd.read_excel(
+        document_mapping_file, sheet_name="document_mapping"
+    )
    # set doc_id to be string type
    data_df["doc_id"] = data_df["doc_id"].astype(str)
    document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str)
    doc_id_list = data_df["doc_id"].unique().tolist()
-    datapoint_keyword_config_file = r"./configuration/aus_prospectus/datapoint_name.json"
+    datapoint_keyword_config_file = (
+        r"./configuration/aus_prospectus/datapoint_name.json"
+    )
    with open(datapoint_keyword_config_file, "r", encoding="utf-8") as f:
        datapoint_keyword_config = json.load(f)
    datapoint_name_list = list(datapoint_keyword_config.keys())
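Only the keys of datapoint_name.json are consumed here, so each top-level key must be a datapoint name. A plausible shape, assuming keyword lists as values (the example entries are assumptions, not the real config):

    # datapoint_name.json is read with json.load; only its keys matter here.
    datapoint_keyword_config = {
        "management_fee": ["management fee", "management costs"],
        "performance_fee": ["performance fee"],
    }
    datapoint_name_list = list(datapoint_keyword_config.keys())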
@@ -1212,7 +1242,9 @@ def merge_output_data_aus_prospectus(
                "EffectiveDate"
            ].values[0]
        )[0:10]
-        share_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1)]
+        share_doc_data_df = data_df[
+            (data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1)
+        ]
        exist_raw_name_list = []
        for index, row in share_doc_data_df.iterrows():
            doc_id = str(row["doc_id"])
@@ -1228,7 +1260,9 @@ def merge_output_data_aus_prospectus(
            fund_id = ""
            fund_legal_name = ""
            if share_class_id != "":
-                record_row = document_mapping_df[document_mapping_df["FundClassId"] == share_class_id]
+                record_row = document_mapping_df[
+                    document_mapping_df["FundClassId"] == share_class_id
+                ]
                if len(record_row) > 0:
                    fund_id = record_row["FundId"].values[0]
                    fund_legal_name = record_row["FundLegalName"].values[0]
@@ -1265,16 +1299,16 @@ def merge_output_data_aus_prospectus(
                doc_data_list.append(data)
            # find data from total_data_list by raw_name
            for data in doc_data_list:
-                if (
-                    data["raw_name"] == raw_name
-                ):
+                if data["raw_name"] == raw_name:
                    update_key = datapoint
                    data[update_key] = value
                    if page_index not in data["page_index"]:
                        data["page_index"].append(page_index)
                    break
-        fund_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)]
+        fund_doc_data_df = data_df[
+            (data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)
+        ]
        for index, row in fund_doc_data_df.iterrows():
            doc_id = str(row["doc_id"])
            page_index = int(row["page_index"])
@@ -1285,12 +1319,13 @@ def merge_output_data_aus_prospectus(
            value = row["value"]
            fund_id = row["investment_id"]
            fund_legal_name = row["investment_name"]
            exist = False
            if fund_id != "":
                for data in doc_data_list:
-                    if (fund_id != "" and data["fund_id"] == fund_id) or \
-                        (data["raw_fund_name"] == raw_fund_name):
+                    if (fund_id != "" and data["fund_id"] == fund_id) or (
+                        data["raw_fund_name"] == raw_fund_name
+                    ):
                        update_key = datapoint
                        data[update_key] = value
                        if page_index not in data["page_index"]:
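Fund-level rows are matched back into doc_data_list by database id first, then by the raw fund name scraped from the document; the rewritten condition is equivalent to this predicate (matches is a hypothetical extraction of the inline test):

    def matches(data: dict, fund_id: str, raw_fund_name: str) -> bool:
        # Prefer the database id; fall back to the raw document name.
        return (fund_id != "" and data["fund_id"] == fund_id) or (
            data["raw_fund_name"] == raw_fund_name
        )

Note that the enclosing if fund_id != "": already guarantees a non-empty id, so the inner fund_id != "" check is redundant and could be dropped.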
@@ -1323,6 +1358,7 @@ def merge_output_data_aus_prospectus(
if __name__ == "__main__":
+    # test_data_extraction_metrics()
    # data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx"
    # document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
    # merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
@@ -1347,13 +1383,19 @@ if __name__ == "__main__":
    # output_mapping_child_folder=output_mapping_child_folder)
    # special_doc_id_list = ["553242411"]
    doc_source = "aus_prospectus"
    if doc_source == "aus_prospectus":
-        document_sample_file = r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
+        # document_sample_file = (
+        #     r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
+        # )
+        document_sample_file = (
+            r"./sample_documents/aus_prospectus_17_documents_sample.txt"
+        )
        with open(document_sample_file, "r", encoding="utf-8") as f:
            special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
-        document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
+        # document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
+        document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
        # special_doc_id_list: list = [
        #     "539790009",
        #     "542300403",
@@ -1367,7 +1409,7 @@ if __name__ == "__main__":
        #     "555377021",
        #     "555654388",
        # ]
-        # special_doc_id_list: list = ["534287518"]
+        special_doc_id_list: list = ["377377369"]
        pdf_folder: str = r"/data/aus_prospectus/pdf/"
        output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
        output_extract_data_child_folder: str = (
@@ -1383,7 +1425,7 @@ if __name__ == "__main__":
            r"/data/aus_prospectus/output/mapping_data/total/"
        )
        drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
        batch_run_documents(
            doc_source=doc_source,
            special_doc_id_list=special_doc_id_list,
@@ -1397,7 +1439,61 @@ if __name__ == "__main__":
            drilldown_folder=drilldown_folder,
        )
    elif doc_source == "emea_ar":
-        special_doc_id_list = ["553242408"]
+        special_doc_id_list = [
+            "292989214",
+            "316237292",
+            "321733631",
+            "323390570",
+            "327956364",
+            "333207452",
+            "334718372",
+            "344636875",
+            "362246081",
+            "366179419",
+            "380945052",
+            "382366116",
+            "387202452",
+            "389171486",
+            "391456740",
+            "391736837",
+            "394778487",
+            "401684600",
+            "402113224",
+            "402181770",
+            "402397014",
+            "405803396",
+            "445102363",
+            "445256897",
+            "448265376",
+            "449555622",
+            "449623976",
+            "458291624",
+            "458359181",
+            "463081566",
+            "469138353",
+            "471641628",
+            "476492237",
+            "478585901",
+            "478586066",
+            "479042264",
+            "479793787",
+            "481475385",
+            "483617247",
+            "486378555",
+            "486383912",
+            "492121213",
+            "497497599",
+            "502693599",
+            "502821436",
+            "503194284",
+            "506559375",
+            "507967525",
+            "508854243",
+            "509845549",
+            "520879048",
+            "529925114",
+        ]
+        # special_doc_id_list = ["532438210"]
        batch_run_documents(
            doc_source=doc_source, special_doc_id_list=special_doc_id_list
        )
View File
@@ -0,0 +1,17 @@
+377377369
+397107472
+401212184
+409723592
+411062815
+412778803
+414751292
+462770987
+471206458
+391080133
+391080140
+410899007
+420339794
+441280757
+446324179
+454036250
+384508026
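These seventeen ids appear to be the new aus_prospectus_17_documents_sample.txt referenced by document_sample_file in the __main__ block above; the first id, 377377369, is also the one pinned in special_doc_id_list.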
View File
@@ -543,7 +543,7 @@ class PDFUtil:
        matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
        if len(matching_val_area) == 0:
            matching_val_area = page.search_for(text_block.replace('-\n', ''))
-        if len(matching_val_area) > 0 and len(text_block.strip().split()) == 1:
+        if len(matching_val_area) > 0 and len(text_block.strip().split()) < 3:
            new_matching_val_area = []
            for area in matching_val_area:
                # get text by text_bbox
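page.search_for comes from PyMuPDF and returns the rectangles where the needle occurs on the page; the change widens the extra bounding-box verification from single-word blocks to anything under three words, since short needles match too easily. A self-contained sketch of the lookup, with the re-check pass elided (find_block_areas is a hypothetical name):

    import fitz  # PyMuPDF

    def find_block_areas(page: fitz.Page, text_block: str) -> list:
        areas = page.search_for(text_block.replace("\n", "").replace("-", ""))
        if len(areas) == 0:
            areas = page.search_for(text_block.replace("-\n", ""))
        if len(areas) > 0 and len(text_block.strip().split()) < 3:
            # short blocks produce false positives, so each rectangle would
            # be re-checked against the text actually inside it (omitted)
            pass
        return areas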
View File
@@ -8,7 +8,7 @@ import dotenv
dotenv.load_dotenv()
-def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_ar/output/mapping/document/"):
+def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"./data/emea_ar/output/db_mapping/document/"):
    count = 1
    while True:
        try:
@@ -27,10 +27,13 @@ def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_a
                by=["FundName", "ShareClassName"]
            ).reset_index(drop=True)
            if output_folder is not None and len(output_folder) > 0:
-                os.makedirs(output_folder, exist_ok=True)
-                output_file = os.path.join(output_folder, f"{doc_id}.xlsx")
-                with pd.ExcelWriter(output_file) as writer:
-                    document_mapping_info_df.to_excel(writer, index=False)
+                try:
+                    os.makedirs(output_folder, exist_ok=True)
+                    output_file = os.path.join(output_folder, f"{doc_id}.xlsx")
+                    with pd.ExcelWriter(output_file) as writer:
+                        document_mapping_info_df.to_excel(writer, index=False)
+                except:
+                    pass
            return document_mapping_info_df
        except Exception as e:
            print(e)
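Persisting the mapping to Excel is now best-effort: a locked file or missing permissions no longer aborts the query, and the DataFrame is returned regardless. The same pattern sketched as a helper (cache_mapping is hypothetical); note that except Exception: would be the safer spelling, since a bare except also swallows KeyboardInterrupt and SystemExit:

    import os
    import pandas as pd

    def cache_mapping(df: pd.DataFrame, output_folder: str, doc_id: str) -> None:
        try:
            os.makedirs(output_folder, exist_ok=True)
            output_file = os.path.join(output_folder, f"{doc_id}.xlsx")
            with pd.ExcelWriter(output_file) as writer:
                df.to_excel(writer, index=False)
        except Exception:
            pass  # caching is an optimization; callers still get the DataFrame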
@@ -40,7 +43,7 @@ def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_a
            count += 1
-def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"/data/emea_ar/output/mapping/provider/"):
+def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"./data/emea_ar/output/db_mapping/provider/"):
    count = 1
    while True:
        try:
@@ -59,10 +62,13 @@ def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"/d
                .sort_values(by=['FundName', 'ShareClassName']) \
                .reset_index(drop=True)
            if output_folder is not None and len(output_folder) > 0:
-                os.makedirs(output_folder, exist_ok=True)
-                output_file = os.path.join(output_folder, f"{company_id}.xlsx")
-                with pd.ExcelWriter(output_file) as writer:
-                    investment_by_provider_df.to_excel(writer, index=False)
+                try:
+                    os.makedirs(output_folder, exist_ok=True)
+                    output_file = os.path.join(output_folder, f"{company_id}.xlsx")
+                    with pd.ExcelWriter(output_file) as writer:
+                        investment_by_provider_df.to_excel(writer, index=False)
+                except:
+                    pass
            return investment_by_provider_df
        except Exception as e:
            print(e)
@@ -73,7 +79,7 @@ def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"/d
def query_data_by_biz_type(biztype: str, para, return_df: bool):
-    sqlpass_url = "https://api.morningstar.com/sqlpassapi/v1/sql"
+    sqlpass_url = os.getenv("SQL_PASS_URL")
    url = sqlpass_url + "?sqlName={0}&params={1}".format(biztype, str(para))
    headers = {"ApiKey": os.getenv("SQL_PASS_KEY")}
    if return_df:
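With this change both the SQLPass endpoint and the API key come from the environment, which dotenv.load_dotenv() populates from a local .env file. A minimal sketch of the configuration and a JSON call path, assuming a plain GET as above (query_sqlpass is a hypothetical name and the URL value is a placeholder, not the real endpoint):

    # .env (placeholder values):
    #   SQL_PASS_URL=https://example.invalid/sqlpassapi/v1/sql
    #   SQL_PASS_KEY=<your key>
    import os
    import dotenv
    import requests

    dotenv.load_dotenv()

    def query_sqlpass(biztype: str, para) -> dict:
        url = os.getenv("SQL_PASS_URL") + "?sqlName={0}&params={1}".format(biztype, str(para))
        response = requests.get(url, headers={"ApiKey": os.getenv("SQL_PASS_KEY")})
        response.raise_for_status()  # surface HTTP errors instead of silent failures
        return response.json()

Keeping the URL out of the source also means test and production endpoints can be swapped per environment without a code change.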