Compare commits

..

10 Commits

Author SHA1 Message Date
Blade He f7d53acdde support get sqlpass api by configuration 2025-02-19 14:37:21 -06:00
Blade He a8810519f8 optimize instructions configuration
optimize drilldown part logic
2025-02-04 15:29:24 -06:00
Blade He f9ef4cec96 update sql_query cache file store location
Cache for at most 5 days, then clean from the local disk.
2025-01-31 10:59:54 -06:00
Blade He 7f37f3532f switch example document 2025-01-27 14:59:26 -06:00
Blade He 6f831e241c Merge branch 'aus_prospectus_ravi' 2025-01-27 12:32:42 -06:00
Blade He 41f8c307ff a little change 2025-01-27 12:32:36 -06:00
Blade He 47c41e492f 1. Only get name mapping data from the document mapping.
2. Compare name mapping metrics between Ravi's results and mine.
2025-01-27 12:29:49 -06:00
Blade He d9b0bed39a a little change 2025-01-22 09:57:42 -06:00
Blade He 350550d1b0 fix issue for removing item from list 2025-01-21 17:24:05 -06:00
Blade He e2b9bcbdbc initial abbreviation configurations 2025-01-21 17:09:45 -06:00
11 changed files with 1518 additions and 202 deletions

View File

@ -44,6 +44,8 @@ def emea_ar_data_extract():
output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/"
output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/"
drilldown_folder = r"./data/emea_ar/output/drilldown/"
db_mapping_document_folder = r"./data/emea_ar/output/db_mapping/document/"
db_mapping_provider_folder = r"./data/emea_ar/output/db_mapping/provider/"
extract_way = "text"
os.makedirs(pdf_folder, exist_ok=True)
@ -51,12 +53,16 @@ def emea_ar_data_extract():
os.makedirs(output_extract_data_folder, exist_ok=True)
os.makedirs(output_mapping_data_folder, exist_ok=True)
os.makedirs(drilldown_folder, exist_ok=True)
os.makedirs(db_mapping_document_folder, exist_ok=True)
os.makedirs(db_mapping_provider_folder, exist_ok=True)
clean_folder(pdf_folder)
clean_folder(output_pdf_text_folder)
clean_folder(output_extract_data_folder)
clean_folder(output_mapping_data_folder)
clean_folder(drilldown_folder)
clean_folder(db_mapping_document_folder)
clean_folder(db_mapping_provider_folder)
re_run_extract_data = False
re_run_mapping_data = False
@ -69,7 +75,8 @@ def emea_ar_data_extract():
output_extract_data_folder=output_extract_data_folder,
output_mapping_data_folder=output_mapping_data_folder,
extract_way=extract_way,
drilldown_folder=drilldown_folder)
drilldown_folder=drilldown_folder,
compare_with_provider=False)
doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
doc_mapping_data = emea_ar_parsing.mapping_data(
data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data

View File

@ -1,6 +1,6 @@
{
"total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
"management_fee_and_costs": {"english": ["management fees and cost", "Plus other investment fees and costs"]},
"management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs"]},
"management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]},
"performance_fee": {"english": ["performance fee", "performance fees"]},
"performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},

File diff suppressed because it is too large.

View File

@ -32,22 +32,24 @@ from openai import AzureOpenAI
ABB_JSON = dict()
def get_abb_json():
def get_abb_json(doc_source: str = "aus_prospectus"):
global ABB_JSON
with open("abbreviation_records.json", "r") as file:
# Load the JSON and convert keys to lowercase
ABB_JSON = {key.lower(): value for key, value in json.load(file).items()}
if len(ABB_JSON.keys()) == 0:
with open(f"./configuration/{doc_source}/abbreviation_records.json", "r") as file:
# Load the JSON and convert keys to lowercase
ABB_JSON = {key.lower(): value for key, value in json.load(file).items()}
def get_abbre_format_str(fundname):
def get_abbre_format_str(fundname, doc_source: str = "aus_prospectus"):
"""Replaces abbreviations in a fund name with their expanded forms."""
# Convert fund name to lowercase while matching
f_list = fundname.lower().split()
get_abb_json(doc_source)
updated_doc_fname_words = [ABB_JSON.get(word, word).lower() for word in f_list]
return " ".join(updated_doc_fname_words)
def replace_abbrevs_in_fundnames(fund_names_list):
def replace_abbrevs_in_fundnames(fund_names_list, doc_source: str = "aus_prospectus"):
"""Replaces abbreviations in a list of fund names."""
return [get_abbre_format_str(fund_name) for fund_name in fund_names_list]
return [get_abbre_format_str(fund_name, doc_source) for fund_name in fund_names_list]
### STEP 2 - Remove Stopwords
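Editor's note: the hunk above makes the abbreviation lookup configurable per document source — the JSON is loaded lazily from ./configuration/{doc_source}/abbreviation_records.json and cached in the module-level ABB_JSON before fund-name words are expanded. A minimal standalone sketch of the same idea (function names and the example entry are illustrative, not the repository's actual API):

import json
from functools import lru_cache

@lru_cache(maxsize=None)
def load_abbreviations(doc_source: str = "aus_prospectus") -> dict:
    # Lazily load and cache the per-source abbreviation map, with keys lowercased.
    path = f"./configuration/{doc_source}/abbreviation_records.json"
    with open(path, "r", encoding="utf-8") as f:
        return {key.lower(): value for key, value in json.load(f).items()}

def expand_fund_name(fund_name: str, doc_source: str = "aus_prospectus") -> str:
    # Replace each abbreviated word with its expanded form when a mapping exists.
    abbreviations = load_abbreviations(doc_source)
    return " ".join(abbreviations.get(word, word).lower() for word in fund_name.lower().split())

# e.g. expand_fund_name("Intl Eq Fund") -> "international equity fund", given a matching config entry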
@ -438,7 +440,7 @@ def format_response(doc_id, pred_fund, db_fund, clean_pred_name, clean_db_name,
return dt
def final_function_to_match(doc_id, pred_list, db_list, provider_name):
def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_source: str = "aus_prospectus"):
final_result = {}
df_data = []
unmatched_pred_list = pred_list.copy()
@ -456,12 +458,16 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
step0_matched_db_name_cosine= all_matched_fund_names_[0], step0_matched_db_name_jacc= all_matched_fund_names_[1], step0_matched_db_name_leven= all_matched_fund_names_[2],
step0_cosine=all_scores_[0], step0_jaccard=all_scores_[1], step0_levenshtein=all_scores_[2],
llm_flag=False))
unmatched_db_list.remove(db_list[matched_index])
unmatched_pred_list.remove(pred_list[index])
if db_list[matched_index] in unmatched_db_list:
unmatched_db_list.remove(db_list[matched_index])
# unmatched_db_list.remove(db_list[matched_index])
if pred_list[index] in unmatched_pred_list:
unmatched_pred_list.remove(pred_list[index])
# unmatched_pred_list.remove(pred_list[index])
else:
### STEP-1 Abbreviation replacement
cleaned_pred_name1 = replace_abbrevs_in_fundnames([pred_fund])[0]
cleaned_db_list1 = replace_abbrevs_in_fundnames(db_list)
cleaned_pred_name1 = replace_abbrevs_in_fundnames([pred_fund], doc_source)[0]
cleaned_db_list1 = replace_abbrevs_in_fundnames(db_list, doc_source)
# print("--> ",cleaned_db_list1, cleaned_pred_name1)
step1_result, matched_index, all_scores1_, all_matched_fund_names1_ = get_fund_match_final_score(cleaned_db_list1, cleaned_pred_name1)
# print(f"\nStep 1 - Abbreviation Replacement Result: {step1_result}")
@ -477,8 +483,12 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
step1_pred_name=cleaned_pred_name1, step1_db_name=cleaned_db_list1,
step1_matched_db_name_cosine= all_matched_fund_names1_[0], step1_matched_db_name_jacc= all_matched_fund_names1_[1], step1_matched_db_name_leven= all_matched_fund_names1_[2],
step1_cosine=all_scores1_[0], step1_jaccard=all_scores1_[1], step1_levenshtein=all_scores1_[2], llm_flag=False))
unmatched_db_list.remove(db_list[matched_index])
unmatched_pred_list.remove(pred_list[index])
if db_list[matched_index] in unmatched_db_list:
unmatched_db_list.remove(db_list[matched_index])
# unmatched_db_list.remove(db_list[matched_index])
if pred_list[index] in unmatched_pred_list:
unmatched_pred_list.remove(pred_list[index])
# unmatched_pred_list.remove(pred_list[index])
else:
### STEP-2 Remove Stopwords
cleaned_pred_name2 = remove_stopwords_nltk([cleaned_pred_name1])[0]
@ -501,8 +511,12 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
step2_pred_name=cleaned_pred_name2, step2_db_name=cleaned_db_list2,
step2_matched_db_name_cosine= all_matched_fund_names2_[0], step2_matched_db_name_jacc= all_matched_fund_names2_[1], step2_matched_db_name_leven= all_matched_fund_names2_[2],
step2_cosine=all_scores2_[0], step2_jaccard=all_scores2_[1], step2_levenshtein=all_scores2_[2],llm_flag=False))
unmatched_db_list.remove(db_list[matched_index])
unmatched_pred_list.remove(pred_list[index])
if db_list[matched_index] in unmatched_db_list:
unmatched_db_list.remove(db_list[matched_index])
# unmatched_db_list.remove(db_list[matched_index])
if pred_list[index] in unmatched_pred_list:
unmatched_pred_list.remove(pred_list[index])
# unmatched_pred_list.remove(pred_list[index])
else:
### STEP-3 Special Character Removal
cleaned_pred_name3 = remove_special_characters([cleaned_pred_name2])[0]
@ -527,8 +541,12 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
step3_pred_name=cleaned_pred_name3, step3_db_name=cleaned_db_list3,
step3_matched_db_name_cosine= all_matched_fund_names3_[0], step3_matched_db_name_jacc= all_matched_fund_names3_[1], step3_matched_db_name_leven= all_matched_fund_names3_[2],
step3_cosine=all_scores3_[0], step3_jaccard=all_scores3_[1], step3_levenshtein=all_scores3_[2],llm_flag=False))
unmatched_db_list.remove(db_list[matched_index])
unmatched_pred_list.remove(pred_list[index])
if db_list[matched_index] in unmatched_db_list:
unmatched_db_list.remove(db_list[matched_index])
# unmatched_db_list.remove(db_list[matched_index])
if pred_list[index] in unmatched_pred_list:
unmatched_pred_list.remove(pred_list[index])
# unmatched_pred_list.remove(pred_list[index])
else:
### STEP-4 Common Words Removal
cleaned_db_list4, _ = remove_common_words(cleaned_db_list3)
@ -565,8 +583,12 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
# print("unmatched_pred_list: ",unmatched_pred_list)
# print("db_list[matched_index]: ",db_list[matched_index])
# print("pred_list[index]: ",pred_list[index])
unmatched_db_list.remove(db_list[matched_index])
unmatched_pred_list.remove(pred_list[index])
if db_list[matched_index] in unmatched_db_list:
unmatched_db_list.remove(db_list[matched_index])
# unmatched_db_list.remove(db_list[matched_index])
if pred_list[index] in unmatched_pred_list:
unmatched_pred_list.remove(pred_list[index])
# unmatched_pred_list.remove(pred_list[index])
else:
df_data.append(format_response(doc_id, pred_list[index], db_list[matched_index], cleaned_pred_name4,
db_list[matched_index],
@ -595,11 +617,11 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name):
# print("==>>> DB LIST: ",unmatched_db_list)
# print("==>>> PRED LIST: ",unmatched_pred_list)
if len(unmatched_pred_list)!=0:
cleaned_unmatched_pred_list = replace_abbrevs_in_fundnames(unmatched_pred_list)
cleaned_unmatched_pred_list = replace_abbrevs_in_fundnames(unmatched_pred_list, doc_source)
cleaned_unmatched_pred_list = remove_stopwords_nltk(cleaned_unmatched_pred_list)
cleaned_unmatched_pred_list = remove_special_characters(cleaned_unmatched_pred_list)
cleaned_unmatched_db_list = replace_abbrevs_in_fundnames(unmatched_db_list)
cleaned_unmatched_db_list = replace_abbrevs_in_fundnames(unmatched_db_list, doc_source)
cleaned_unmatched_db_list = remove_stopwords_nltk(cleaned_unmatched_db_list)
cleaned_unmatched_db_list = remove_special_characters(cleaned_unmatched_db_list)
prompt_context = f"""

View File

@ -969,7 +969,9 @@ class DataExtraction:
if datapoint_name == "performance_fee":
datapoint_name = "performance fees"
else:
datapoint_name = datapoint_name.upper()
datapoint_name = self.datapoint_name_config.get(datapoint_name, "")
if len(datapoint_name) == 0:
datapoint_name = datapoint.upper()
reported_name = f"The {datapoint_name} reported name could be:\n{joined_reported_name}"
instructions.append(reported_name)
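Editor's note: the reported-name instruction now resolves a datapoint's display name from self.datapoint_name_config and only falls back to upper-casing when the config has no entry. A minimal sketch of that lookup-with-fallback (the config contents shown are assumed, not taken from the repository):

datapoint_name_config = {
    "management_fee_and_costs": "Management fees and costs",  # assumed example entry
}

def display_name(datapoint: str) -> str:
    # Prefer the configured display name; otherwise fall back to the raw key in upper case.
    name = datapoint_name_config.get(datapoint, "")
    return name if name else datapoint.upper()

print(display_name("management_fee_and_costs"))  # Management fees and costs
print(display_name("buy_spread"))                # BUY_SPREAD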

View File

@ -1,6 +1,7 @@
import os
import json
import pandas as pd
from copy import deepcopy
from utils.biz_utils import get_most_similar_name, remove_common_word
from utils.sql_query_util import (
query_document_fund_mapping,
@ -18,14 +19,18 @@ class DataMapping:
raw_document_data_list: list,
document_mapping_info_df: pd.DataFrame,
output_data_folder: str,
doc_source: str = "emea_ar"
doc_source: str = "emea_ar",
compare_with_provider: bool = True
):
self.doc_id = doc_id
self.datapoints = datapoints
self.doc_source = doc_source
self.compare_with_provider = compare_with_provider
self.raw_document_data_list = raw_document_data_list
if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
self.document_mapping_info_df = query_document_fund_mapping(
doc_id, rerun=False
)
else:
self.document_mapping_info_df = document_mapping_info_df
@ -44,7 +49,9 @@ class DataMapping:
def set_mapping_data_by_db(self, document_mapping_info_df: pd.DataFrame):
logger.info("Setting document mapping data")
if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
self.document_mapping_info_df = query_document_fund_mapping(self.doc_id, rerun=False)
self.document_mapping_info_df = query_document_fund_mapping(
self.doc_id, rerun=False
)
else:
self.document_mapping_info_df = document_mapping_info_df
if len(self.document_mapping_info_df) == 0:
@ -92,26 +99,27 @@ class DataMapping:
def get_provider_mapping(self):
if len(self.document_mapping_info_df) == 0:
return pd.DataFrame()
provider_id_list = (
self.document_mapping_info_df["ProviderId"].unique().tolist()
)
provider_id_list = self.document_mapping_info_df["ProviderId"].unique().tolist()
provider_mapping_list = []
for provider_id in provider_id_list:
provider_mapping_list.append(query_investment_by_provider(provider_id, rerun=False))
provider_mapping_list.append(
query_investment_by_provider(provider_id, rerun=False)
)
provider_mapping_df = pd.concat(provider_mapping_list)
provider_mapping_df = provider_mapping_df.drop_duplicates()
provider_mapping_df.reset_index(drop=True, inplace=True)
return provider_mapping_df
def mapping_raw_data_entrance(self):
if self.doc_source == "emear_ar":
if self.doc_source == "emea_ar":
return self.mapping_raw_data()
elif self.doc_source == "aus_prospectus":
return self.mapping_raw_data_aus()
return self.mapping_raw_data_generic()
else:
return self.mapping_raw_data()
# return self.mapping_raw_data_generic()
def mapping_raw_data_aus(self):
def mapping_raw_data_generic(self):
logger.info(f"Mapping raw data for AUS Prospectus document {self.doc_id}")
mapped_data_list = []
# Generate raw name based on fund name and share name by integrate_share_name
@ -128,7 +136,9 @@ class DataMapping:
raw_share_name = raw_data.get("share_name", "")
raw_data_keys = list(raw_data.keys())
if len(raw_share_name) > 0:
integrated_share_name = self.integrate_share_name(raw_fund_name, raw_share_name)
integrated_share_name = self.integrate_share_name(
raw_fund_name, raw_share_name
)
if integrated_share_name not in share_raw_name_list:
share_raw_name_list.append(integrated_share_name)
for datapoint in self.datapoints:
@ -144,7 +154,7 @@ class DataMapping:
"investment_type": 1,
"investment_id": "",
"investment_name": "",
"similarity": 0
"similarity": 0,
}
mapped_data_list.append(mapped_data)
else:
@ -162,19 +172,23 @@ class DataMapping:
"value": raw_data[datapoint],
"investment_type": 33,
"investment_id": "",
"investment_name": ""
"investment_name": "",
}
mapped_data_list.append(mapped_data)
# Mapping raw data with database
iter_count = 30
iter_count = 60
fund_match_result = {}
if len(fund_raw_name_list) > 0:
fund_match_result = self.get_raw_name_db_match_result(fund_raw_name_list, "fund", iter_count)
logger.info(f"Fund match result: \n{fund_match_result}")
fund_match_result = self.get_raw_name_db_match_result(
fund_raw_name_list, "fund", iter_count
)
# logger.info(f"Fund match result: \n{fund_match_result}")
share_match_result = {}
if len(share_raw_name_list) > 0:
share_match_result = self.get_raw_name_db_match_result(share_raw_name_list, "share", iter_count)
logger.info(f"Share match result: \n{share_match_result}")
share_match_result = self.get_raw_name_db_match_result(
share_raw_name_list, "share", iter_count
)
# logger.info(f"Share match result: \n{share_match_result}")
for mapped_data in mapped_data_list:
investment_type = mapped_data["investment_type"]
@ -182,9 +196,14 @@ class DataMapping:
if investment_type == 33:
if fund_match_result.get(raw_name) is not None:
matched_db_fund_name = fund_match_result[raw_name]
if matched_db_fund_name is not None and len(matched_db_fund_name) > 0:
if (
matched_db_fund_name is not None
and len(matched_db_fund_name) > 0
):
# get FundId from self.doc_fund_mapping
find_fund_df = self.doc_fund_mapping[self.doc_fund_mapping["FundName"] == matched_db_fund_name]
find_fund_df = self.doc_fund_mapping[
self.doc_fund_mapping["FundName"] == matched_db_fund_name
]
if find_fund_df is not None and len(find_fund_df) > 0:
fund_id = find_fund_df["FundId"].values[0]
mapped_data["investment_id"] = fund_id
@ -193,9 +212,15 @@ class DataMapping:
if investment_type == 1:
if share_match_result.get(raw_name) is not None:
matched_db_share_name = share_match_result[raw_name]
if matched_db_share_name is not None and len(matched_db_share_name) > 0:
if (
matched_db_share_name is not None
and len(matched_db_share_name) > 0
):
# get SecId from self.doc_fund_class_mapping
find_share_df = self.doc_fund_class_mapping[self.doc_fund_class_mapping["ShareClassName"] == matched_db_share_name]
find_share_df = self.doc_fund_class_mapping[
self.doc_fund_class_mapping["ShareClassName"]
== matched_db_share_name
]
if find_share_df is not None and len(find_share_df) > 0:
share_id = find_share_df["SecId"].values[0]
mapped_data["investment_id"] = share_id
@ -205,26 +230,64 @@ class DataMapping:
self.output_mapping_file(mapped_data_list)
return mapped_data_list
def get_raw_name_db_match_result(self, raw_name_list, investment_type: str, iter_count: int = 30):
def get_raw_name_db_match_result(
self, raw_name_list, investment_type: str, iter_count: int = 30
):
# split raw_name_list into chunks of iter_count elements each
# splitting keeps each request under ChatGPT's token limit
raw_name_list_parts = [raw_name_list[i:i + iter_count]
for i in range(0, len(raw_name_list), iter_count)]
raw_name_list_parts = [
raw_name_list[i : i + iter_count]
for i in range(0, len(raw_name_list), iter_count)
]
all_match_result = {}
doc_fund_name_list = deepcopy(self.doc_fund_name_list)
doc_share_name_list = deepcopy(self.doc_share_name_list)
for raw_name_list in raw_name_list_parts:
if investment_type == "fund":
match_result = final_function_to_match(doc_id=self.doc_id,
pred_list=raw_name_list,
db_list=self.doc_fund_name_list,
provider_name=self.provider_name)
match_result, doc_fund_name_list = self.get_final_function_to_match(
raw_name_list, doc_fund_name_list
)
else:
match_result = final_function_to_match(doc_id=self.doc_id,
pred_list=raw_name_list,
db_list=self.doc_share_name_list,
provider_name=self.provider_name)
match_result, doc_share_name_list = self.get_final_function_to_match(
raw_name_list, doc_share_name_list
)
all_match_result.update(match_result)
return all_match_result
def get_final_function_to_match(self, raw_name_list, db_name_list):
if len(db_name_list) == 0:
match_result = {}
for raw_name in raw_name_list:
match_result[raw_name] = ""
else:
match_result = final_function_to_match(
doc_id=self.doc_id,
pred_list=raw_name_list,
db_list=db_name_list,
provider_name=self.provider_name,
doc_source=self.doc_source
)
matched_name_list = list(match_result.values())
db_name_list = self.remove_matched_names(db_name_list, matched_name_list)
return match_result, db_name_list
def remove_matched_names(self, target_name_list: list, matched_name_list: list):
if len(matched_name_list) == 0:
return target_name_list
matched_name_list = list(set(matched_name_list))
matched_name_list = [
value for value in matched_name_list if value is not None and len(value) > 0
]
for matched_name in matched_name_list:
if (
matched_name is not None
and len(matched_name) > 0
and matched_name in target_name_list
):
target_name_list.remove(matched_name)
return target_name_list
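Editor's note: get_raw_name_db_match_result now processes raw names in iter_count-sized chunks (to stay under the LLM token limit) and, after each chunk, strips already-matched database names from the candidate list so later chunks cannot claim them again. A simplified sketch of that loop, using a stand-in matcher instead of final_function_to_match:

from copy import deepcopy

def exact_match(chunk: list, candidates: list) -> dict:
    # Illustrative matcher: exact, case-insensitive comparison only.
    lookup = {candidate.lower(): candidate for candidate in candidates}
    return {name: lookup.get(name.lower(), "") for name in chunk}

def match_in_chunks(raw_names: list, db_names: list, chunk_size: int = 60) -> dict:
    # Chunk the raw names so each matching request stays within the token limit.
    candidates = deepcopy(db_names)
    results = {}
    for start in range(0, len(raw_names), chunk_size):
        chunk = raw_names[start:start + chunk_size]
        if not candidates:
            results.update({name: "" for name in chunk})
            continue
        matched = exact_match(chunk, candidates)  # stand-in for final_function_to_match
        results.update(matched)
        # Drop matched DB names so later chunks cannot re-use them.
        for db_name in set(matched.values()):
            if db_name and db_name in candidates:
                candidates.remove(db_name)
    return results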
def mapping_raw_data(self):
"""
doc_id, page_index, datapoint, value,
@ -245,9 +308,14 @@ class DataMapping:
if raw_fund_name is None or len(raw_fund_name) == 0:
continue
raw_share_name = raw_data.get("share_name", "")
if len(self.doc_fund_name_list) == 0 and len(self.provider_fund_name_list) == 0:
if (
len(self.doc_fund_name_list) == 0
and len(self.provider_fund_name_list) == 0
):
if len(raw_share_name) > 0:
integrated_share_name = self.integrate_share_name(raw_fund_name, raw_share_name)
integrated_share_name = self.integrate_share_name(
raw_fund_name, raw_share_name
)
raw_data_keys = list(raw_data.keys())
for datapoint in self.datapoints:
if datapoint in raw_data_keys:
@ -262,7 +330,7 @@ class DataMapping:
"investment_type": 1,
"investment_id": "",
"investment_name": "",
"similarity": 0
"similarity": 0,
}
mapped_data_list.append(mapped_data)
else:
@ -279,13 +347,15 @@ class DataMapping:
"value": raw_data[datapoint],
"investment_type": 33,
"investment_id": "",
"investment_name": ""
"investment_name": "",
}
mapped_data_list.append(mapped_data)
else:
raw_name = ""
if raw_share_name is not None and len(raw_share_name) > 0:
raw_name = self.integrate_share_name(raw_fund_name, raw_share_name)
raw_name = self.integrate_share_name(
raw_fund_name, raw_share_name
)
if mapped_share_cache.get(raw_name) is not None:
investment_info = mapped_share_cache[raw_name]
else:
@ -298,14 +368,20 @@ class DataMapping:
)
fund_id = fund_info["id"]
mapped_fund_cache[raw_fund_name] = fund_info
investment_info = self.matching_with_database(
raw_name=raw_name,
raw_share_name=raw_share_name,
raw_fund_name=raw_fund_name,
parent_id=fund_id,
matching_type="share",
process_cache=process_cache
)
investment_info = {}
if len(fund_id) > 0:
investment_info = self.mapping_unique_raw_data(fund_id=fund_id,
raw_fund_name=raw_fund_name,
raw_data_list=raw_data_list)
if investment_info.get("id", None) is None or len(investment_info.get("id", "")) == 0:
investment_info = self.matching_with_database(
raw_name=raw_name,
raw_share_name=raw_share_name,
raw_fund_name=raw_fund_name,
parent_id=fund_id,
matching_type="share",
process_cache=process_cache,
)
mapped_share_cache[raw_name] = investment_info
elif raw_fund_name is not None and len(raw_fund_name) > 0:
raw_name = raw_fund_name
@ -322,7 +398,7 @@ class DataMapping:
"id": "",
"legal_name": "",
"investment_type": -1,
"similarity": 0
"similarity": 0,
}
raw_data_keys = list(raw_data.keys())
@ -339,13 +415,35 @@ class DataMapping:
"investment_type": investment_info["investment_type"],
"investment_id": investment_info["id"],
"investment_name": investment_info["legal_name"],
"similarity": investment_info["similarity"]
"similarity": investment_info["similarity"],
}
mapped_data_list.append(mapped_data)
self.output_mapping_file(mapped_data_list)
return mapped_data_list
def mapping_unique_raw_data(self, fund_id: str, raw_fund_name: str, raw_data_list: list):
share_count = 0
for raw_data in raw_data_list:
fund_name = raw_data.get("fund_name", "")
share_name = raw_data.get("share_name", "")
if fund_name == raw_fund_name and share_name is not None and len(share_name) > 0:
share_count += 1
if share_count > 1:
break
data_info = {}
if share_count == 1:
doc_compare_mapping = self.doc_fund_class_mapping[
self.doc_fund_class_mapping["FundId"] == fund_id
]
if len(doc_compare_mapping) == 1:
data_info["id"] = doc_compare_mapping["SecId"].values[0]
data_info["legal_name"] = doc_compare_mapping["ShareClassName"].values[0]
data_info["investment_type"] = 1
data_info["similarity"] = 1
return data_info
def output_mapping_file(self, mapped_data_list: list):
json_data_file = os.path.join(
self.output_data_json_folder, f"{self.doc_id}.json"
@ -390,7 +488,7 @@ class DataMapping:
raw_fund_name: str = None,
parent_id: str = None,
matching_type: str = "fund",
process_cache: dict = {}
process_cache: dict = {},
):
if len(self.doc_fund_name_list) == 0 and len(self.provider_fund_name_list) == 0:
data_info["id"] = ""
@ -417,8 +515,9 @@ class DataMapping:
doc_compare_mapping = self.doc_fund_class_mapping[
self.doc_fund_class_mapping["FundId"] == parent_id
]
provider_compare_mapping = self.provider_fund_class_mapping\
[self.provider_fund_class_mapping["FundId"] == parent_id]
provider_compare_mapping = self.provider_fund_class_mapping[
self.provider_fund_class_mapping["FundId"] == parent_id
]
if len(doc_compare_mapping) == 0:
if len(provider_compare_mapping) == 0:
doc_compare_name_list = self.doc_share_name_list
@ -436,8 +535,9 @@ class DataMapping:
doc_compare_mapping["ShareClassName"].unique().tolist()
)
if len(provider_compare_mapping) == 0 or \
len(provider_compare_mapping) < len(doc_compare_mapping):
if len(provider_compare_mapping) == 0 or len(
provider_compare_mapping
) < len(doc_compare_mapping):
provider_compare_name_list = doc_compare_name_list
provider_compare_mapping = doc_compare_mapping
else:
@ -464,11 +564,15 @@ class DataMapping:
share_name=raw_share_name,
fund_name=raw_fund_name,
matching_type=matching_type,
process_cache=process_cache)
process_cache=process_cache,
)
if matching_type == "fund":
threshold = 0.7
else:
threshold = 0.9
if self.compare_with_provider:
threshold = 0.9
else:
threshold = 0.6
if max_similarity is not None and max_similarity >= threshold:
data_info["id"] = doc_compare_mapping[
doc_compare_mapping[compare_name_dp] == max_similarity_name
@ -479,38 +583,44 @@ class DataMapping:
if data_info.get("id", None) is None or len(data_info.get("id", "")) == 0:
# set pre_common_word_list because the document mapping for the same fund may differ from the provider mapping
# the purpose is to reuse the most common word list and improve the similarity score
max_similarity_name, max_similarity = get_most_similar_name(
raw_name,
provider_compare_name_list,
share_name=raw_share_name,
fund_name=raw_fund_name,
matching_type=matching_type,
pre_common_word_list=pre_common_word_list,
process_cache=process_cache
)
threshold = 0.7
if matching_type == "share":
threshold = 0.5
round_similarity = 0
if max_similarity is not None and isinstance(max_similarity, float):
round_similarity = round(max_similarity, 1)
if round_similarity is not None and round_similarity >= threshold:
data_info["id"] = provider_compare_mapping[
provider_compare_mapping[compare_name_dp] == max_similarity_name
][compare_id_dp].values[0]
data_info["legal_name"] = max_similarity_name
data_info["similarity"] = max_similarity
else:
if len(doc_compare_name_list) == 1:
data_info["id"] = doc_compare_mapping[
doc_compare_mapping[compare_name_dp] == doc_compare_name_list[0]
if self.compare_with_provider:
max_similarity_name, max_similarity = get_most_similar_name(
raw_name,
provider_compare_name_list,
share_name=raw_share_name,
fund_name=raw_fund_name,
matching_type=matching_type,
pre_common_word_list=pre_common_word_list,
process_cache=process_cache,
)
threshold = 0.7
if matching_type == "share":
threshold = 0.5
round_similarity = 0
if max_similarity is not None and isinstance(max_similarity, float):
round_similarity = round(max_similarity, 1)
if round_similarity is not None and round_similarity >= threshold:
data_info["id"] = provider_compare_mapping[
provider_compare_mapping[compare_name_dp] == max_similarity_name
][compare_id_dp].values[0]
data_info["legal_name"] = doc_compare_name_list[0]
data_info["similarity"] = 1
data_info["legal_name"] = max_similarity_name
data_info["similarity"] = max_similarity
else:
data_info["id"] = ""
data_info["legal_name"] = ""
data_info["similarity"] = 0
if len(doc_compare_name_list) == 1:
data_info["id"] = doc_compare_mapping[
doc_compare_mapping[compare_name_dp]
== doc_compare_name_list[0]
][compare_id_dp].values[0]
data_info["legal_name"] = doc_compare_name_list[0]
data_info["similarity"] = 1
else:
data_info["id"] = ""
data_info["legal_name"] = ""
data_info["similarity"] = 0
else:
data_info["id"] = ""
data_info["legal_name"] = ""
data_info["similarity"] = 0
data_info["investment_type"] = investment_type
else:
data_info["id"] = ""

View File

@ -61,21 +61,6 @@
"---Example End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]",
"- 5. Reverse order of data columns from table text in PDF:",
"For this case, 1. the columns order is reversed, \n2. The fund name is in the end of row with number value in front of fund name.",
"---Example 1 Start---",
"Transaction\ncosts\n(gross)1\nBuy-sell\nspreads\nTransaction\ncosts (net)\nEquals\ninvestment fees and\ncosts\nThe investment fees and\ncosts are made up of\nPlus\nother\ninvestment\nfees and\ncosts\nPerformance\nfee\n% pa\nEntry %/\nExit %\n% pa\n% pa\n% pa\nReady-made portfolios\nSimple choice\n0.04\n0.10/0.10\n0.00\n0.62\n0.55\n0.07\nMLC Stable\n0.05\n0.10/0.10\n0.02\n0.80\n0.65\n0.15\nMLC Conservative Balanced",
"---Example 1 End---",
"For this case, Management fees and costs = Management fees with same reported name: Plus\nother\ninvestment\nfees and\ncosts",
"The output should be: ",
"{\"data\": [{\"fund name\": \"MLC Stable\", \"share name\": \"MLC Stable\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.07}, {\"fund name\": \"MLC Conservative Balanced\", \"share name\": \"MLC Conservative Balanced\", \"buy_spread\": 0.10, \"sell_spread\": 0.10, \"management_fee_and_costs\": 0.65, \"management_fee\": 0.65, \"performance_fee\": 0.15}]",
"\n",
"---Example 2 Start---",
"\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n",
"---Example 2 End---",
"For this case, Management fees and costs = Management fees + Indirect Fee.",
"The output should be:",
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
"- 6. Please ignore these words as fund names, it means never extract these words as fund names. They are:",
"\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"."
],
@ -136,7 +121,7 @@
"special_rule": {
"management_fee_and_costs": [
"If there are multiple Management fee and costs reported names, here is the priority rule:",
"- With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
"A. With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
"---Example Start---",
"\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n",
"---Example End---",
@ -144,19 +129,24 @@
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]",
"\n",
"If there are multiple Management fee and costs sub-columns, here is the rule:",
"- With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
"B. With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
"---Example Start---",
"\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n",
"\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n",
"---Example End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
"\n",
"- With \"Management fees\" and \"Administration fee\", sum the values from these two columns: \"Management fees\" + \"Administration fee\".",
"---Example Start---",
"\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nAdministration Fee\nManagement fees\nMLC diversified investment\noption\n1.62% p.a.\n0.02% p.a.\n0.03% p.a.\n0.09% p.a.\n1.58% p.a.\nMLC Horizon 4 Balanced\nPortfolio\n",
"---Example End---",
"C. If only find \"Management fees and costs\", please output the relevant as data point key: \"management_fee_and_costs\", instead of \"management_fee\".",
"---Example 1 Start---",
"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
"---Example 1 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.67, \"management_fee\": 1.58, \"administration_fees\": 0.09, \"performance_fee\": 0.03}]"
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}]",
"---Example 2 Start---",
"Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n",
"---Example 2 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21}]"
],
"buy_spread": [
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
@ -263,7 +253,7 @@
"date_of_last_hwm_reset_value": ["29 March 2023", "18 April 2024", "19 October 2021"],
"date_of_last_performance_fee_restructure_value": ["12 August 2022", "15 March 2024", "11 November 2023"],
"high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"],
"minimum_initial_investment_value": [0, 5, 12],
"minimum_initial_investment_value": [0, 5000, 10000],
"recoverable_expenses_value": [0.12, 0.05, 0.06],
"indirect_costs_value": [0.12, 0.16, 0.02]
},

152
main.py
View File

@ -31,11 +31,14 @@ class EMEA_AR_Parsing:
output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
extract_way: str = "text",
drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
compare_with_provider: bool = True
) -> None:
self.doc_id = doc_id
self.doc_source = doc_source
self.pdf_folder = pdf_folder
os.makedirs(self.pdf_folder, exist_ok=True)
self.compare_with_provider = compare_with_provider
self.pdf_file = self.download_pdf()
self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
@ -76,7 +79,7 @@ class EMEA_AR_Parsing:
self.pdf_file,
self.document_mapping_info_df,
self.doc_source,
output_pdf_text_folder
output_pdf_text_folder,
)
self.page_text_dict = self.filter_pages.page_text_dict
@ -87,7 +90,9 @@ class EMEA_AR_Parsing:
drilldown_folder = r"/data/emea_ar/output/drilldown/"
os.makedirs(drilldown_folder, exist_ok=True)
self.drilldown_folder = drilldown_folder
misc_config_file = os.path.join(f"./configuration/{doc_source}/", "misc_config.json")
misc_config_file = os.path.join(
f"./configuration/{doc_source}/", "misc_config.json"
)
if os.path.exists(misc_config_file):
with open(misc_config_file, "r", encoding="utf-8") as f:
misc_config = json.load(f)
@ -249,6 +254,14 @@ class EMEA_AR_Parsing:
except Exception as e:
logger.error(f"Error: {e}")
annotation_list = annotation_list_df.to_dict(orient="records")
try:
drilldown_json_file = os.path.join(
drilldown_data_folder, f"{doc_id}_drilldown.json"
)
with open(drilldown_json_file, "w", encoding="utf-8") as f:
json.dump(annotation_list, f, ensure_ascii=False, indent=4)
except Exception as e:
logger.error(f"Error: {e}")
return annotation_list
def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
@ -278,7 +291,8 @@ class EMEA_AR_Parsing:
data_from_gpt,
self.document_mapping_info_df,
self.output_mapping_data_folder,
self.doc_source
self.doc_source,
compare_with_provider=self.compare_with_provider
)
return data_mapping.mapping_raw_data_entrance()
@ -334,6 +348,7 @@ def mapping_data(
output_mapping_data_folder=output_mapping_folder,
extract_way=extract_way,
drilldown_folder=drilldown_folder,
compare_with_provider=False
)
doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(
re_run=re_run_extract_data
@ -502,18 +517,29 @@ def batch_start_job(
writer, index=False, sheet_name="extract_data"
)
if document_mapping_file is not None and len(document_mapping_file) > 0 and os.path.exists(document_mapping_file):
if (
doc_source == "aus_prospectus"
and document_mapping_file is not None
and len(document_mapping_file) > 0
and os.path.exists(document_mapping_file)
):
try:
merged_total_data_folder = os.path.join(output_mapping_total_folder, "merged/")
merged_total_data_folder = os.path.join(
output_mapping_total_folder, "merged/"
)
os.makedirs(merged_total_data_folder, exist_ok=True)
data_file_base_name = os.path.basename(output_file)
output_merged_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name)
merge_output_data_aus_prospectus(output_file, document_mapping_file, output_merged_data_file_path)
output_merged_data_file_path = os.path.join(
merged_total_data_folder, "merged_" + data_file_base_name
)
merge_output_data_aus_prospectus(
output_file, document_mapping_file, output_merged_data_file_path
)
except Exception as e:
logger.error(f"Error: {e}")
if calculate_metrics:
prediction_sheet_name = "total_mapping_data"
prediction_sheet_name = "data_in_doc_mapping"
ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
ground_truth_sheet_name = "mapping_data"
metrics_output_folder = r"/data/emea_ar/output/metrics/"
@ -770,11 +796,11 @@ def test_auto_generate_instructions():
def test_data_extraction_metrics():
data_type = "data_extraction"
data_type = "document_mapping_in_db"
# prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_image_20240920033929.xlsx"
prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_text_20240922152517.xlsx"
prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_51_documents_by_text_20250127104008.xlsx"
# prediction_file = r"/data/emea_ar/output/mapping_data/docs/by_text/excel/481475385.xlsx"
prediction_sheet_name = "mapping_data"
prediction_sheet_name = "data_in_doc_mapping"
ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
ground_truth_sheet_name = "mapping_data"
metrics_output_folder = r"/data/emea_ar/output/metrics/"
@ -1015,9 +1041,9 @@ def batch_run_documents(
page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
)
re_run_extract_data = False
re_run_mapping_data = False
force_save_total_data = True
re_run_extract_data = True
re_run_mapping_data = True
force_save_total_data = False
calculate_metrics = False
extract_way = "text"
@ -1194,13 +1220,17 @@ def merge_output_data_aus_prospectus(
):
# TODO: merge output data for aus prospectus, planned for 2025-01-16
data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data")
document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="document_mapping")
document_mapping_df = pd.read_excel(
document_mapping_file, sheet_name="document_mapping"
)
# set doc_id to be string type
data_df["doc_id"] = data_df["doc_id"].astype(str)
document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str)
doc_id_list = data_df["doc_id"].unique().tolist()
datapoint_keyword_config_file = r"./configuration/aus_prospectus/datapoint_name.json"
datapoint_keyword_config_file = (
r"./configuration/aus_prospectus/datapoint_name.json"
)
with open(datapoint_keyword_config_file, "r", encoding="utf-8") as f:
datapoint_keyword_config = json.load(f)
datapoint_name_list = list(datapoint_keyword_config.keys())
@ -1212,7 +1242,9 @@ def merge_output_data_aus_prospectus(
"EffectiveDate"
].values[0]
)[0:10]
share_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1)]
share_doc_data_df = data_df[
(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1)
]
exist_raw_name_list = []
for index, row in share_doc_data_df.iterrows():
doc_id = str(row["doc_id"])
@ -1228,7 +1260,9 @@ def merge_output_data_aus_prospectus(
fund_id = ""
fund_legal_name = ""
if share_class_id != "":
record_row = document_mapping_df[document_mapping_df["FundClassId"] == share_class_id]
record_row = document_mapping_df[
document_mapping_df["FundClassId"] == share_class_id
]
if len(record_row) > 0:
fund_id = record_row["FundId"].values[0]
fund_legal_name = record_row["FundLegalName"].values[0]
@ -1265,16 +1299,16 @@ def merge_output_data_aus_prospectus(
doc_data_list.append(data)
# find data from total_data_list by raw_name
for data in doc_data_list:
if (
data["raw_name"] == raw_name
):
if data["raw_name"] == raw_name:
update_key = datapoint
data[update_key] = value
if page_index not in data["page_index"]:
data["page_index"].append(page_index)
break
fund_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)]
fund_doc_data_df = data_df[
(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)
]
for index, row in fund_doc_data_df.iterrows():
doc_id = str(row["doc_id"])
page_index = int(row["page_index"])
@ -1289,8 +1323,9 @@ def merge_output_data_aus_prospectus(
exist = False
if fund_id != "":
for data in doc_data_list:
if (fund_id != "" and data["fund_id"] == fund_id) or \
(data["raw_fund_name"] == raw_fund_name):
if (fund_id != "" and data["fund_id"] == fund_id) or (
data["raw_fund_name"] == raw_fund_name
):
update_key = datapoint
data[update_key] = value
if page_index not in data["page_index"]:
@ -1323,6 +1358,7 @@ def merge_output_data_aus_prospectus(
if __name__ == "__main__":
# test_data_extraction_metrics()
# data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx"
# document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
# merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
@ -1350,10 +1386,16 @@ if __name__ == "__main__":
doc_source = "aus_prospectus"
if doc_source == "aus_prospectus":
document_sample_file = r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
# document_sample_file = (
# r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
# )
document_sample_file = (
r"./sample_documents/aus_prospectus_17_documents_sample.txt"
)
with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
# special_doc_id_list: list = [
# "539790009",
# "542300403",
@ -1367,7 +1409,7 @@ if __name__ == "__main__":
# "555377021",
# "555654388",
# ]
# special_doc_id_list: list = ["534287518"]
special_doc_id_list: list = ["377377369"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (
@ -1397,7 +1439,61 @@ if __name__ == "__main__":
drilldown_folder=drilldown_folder,
)
elif doc_source == "emea_ar":
special_doc_id_list = ["553242408"]
special_doc_id_list = [
"292989214",
"316237292",
"321733631",
"323390570",
"327956364",
"333207452",
"334718372",
"344636875",
"362246081",
"366179419",
"380945052",
"382366116",
"387202452",
"389171486",
"391456740",
"391736837",
"394778487",
"401684600",
"402113224",
"402181770",
"402397014",
"405803396",
"445102363",
"445256897",
"448265376",
"449555622",
"449623976",
"458291624",
"458359181",
"463081566",
"469138353",
"471641628",
"476492237",
"478585901",
"478586066",
"479042264",
"479793787",
"481475385",
"483617247",
"486378555",
"486383912",
"492121213",
"497497599",
"502693599",
"502821436",
"503194284",
"506559375",
"507967525",
"508854243",
"509845549",
"520879048",
"529925114",
]
# special_doc_id_list = ["532438210"]
batch_run_documents(
doc_source=doc_source, special_doc_id_list=special_doc_id_list
)

View File

@ -0,0 +1,17 @@
377377369
397107472
401212184
409723592
411062815
412778803
414751292
462770987
471206458
391080133
391080140
410899007
420339794
441280757
446324179
454036250
384508026

View File

@ -543,7 +543,7 @@ class PDFUtil:
matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
if len(matching_val_area) == 0:
matching_val_area = page.search_for(text_block.replace('-\n', ''))
if len(matching_val_area) > 0 and len(text_block.strip().split()) == 1:
if len(matching_val_area) > 0 and len(text_block.strip().split()) < 3:
new_matching_val_area = []
for area in matching_val_area:
# get text by text_bbox

View File

@ -8,7 +8,7 @@ import dotenv
dotenv.load_dotenv()
def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_ar/output/mapping/document/"):
def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"./data/emea_ar/output/db_mapping/document/"):
count = 1
while True:
try:
@ -27,10 +27,13 @@ def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_a
by=["FundName", "ShareClassName"]
).reset_index(drop=True)
if output_folder is not None and len(output_folder) > 0:
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, f"{doc_id}.xlsx")
with pd.ExcelWriter(output_file) as writer:
document_mapping_info_df.to_excel(writer, index=False)
try:
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, f"{doc_id}.xlsx")
with pd.ExcelWriter(output_file) as writer:
document_mapping_info_df.to_excel(writer, index=False)
except:
pass
return document_mapping_info_df
except Exception as e:
print(e)
@ -40,7 +43,7 @@ def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_a
count += 1
def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"/data/emea_ar/output/mapping/provider/"):
def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"./data/emea_ar/output/db_mapping/provider/"):
count = 1
while True:
try:
@ -59,10 +62,13 @@ def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"/d
.sort_values(by=['FundName', 'ShareClassName']) \
.reset_index(drop=True)
if output_folder is not None and len(output_folder) > 0:
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, f"{company_id}.xlsx")
with pd.ExcelWriter(output_file) as writer:
investment_by_provider_df.to_excel(writer, index=False)
try:
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, f"{company_id}.xlsx")
with pd.ExcelWriter(output_file) as writer:
investment_by_provider_df.to_excel(writer, index=False)
except:
pass
return investment_by_provider_df
except Exception as e:
print(e)
@ -73,7 +79,7 @@ def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"/d
def query_data_by_biz_type(biztype: str, para, return_df: bool):
sqlpass_url = "https://api.morningstar.com/sqlpassapi/v1/sql"
sqlpass_url = os.getenv("SQL_PASS_URL")
url = sqlpass_url + "?sqlName={0}&params={1}".format(biztype, str(para))
headers = {"ApiKey": os.getenv("SQL_PASS_KEY")}
if return_df:
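Editor's note: this last file moves the SQL Pass endpoint out of the source and into configuration — query_data_by_biz_type now reads SQL_PASS_URL (alongside the existing SQL_PASS_KEY) from environment variables loaded via python-dotenv, which is what the "support get sqlpass api by configuration" commit refers to. A minimal sketch of reading that configuration (the variable names come from the diff; the .env values are placeholders):

import os
import dotenv

dotenv.load_dotenv()  # expects a .env file, e.g. SQL_PASS_URL=... and SQL_PASS_KEY=...

sqlpass_url = os.getenv("SQL_PASS_URL")
headers = {"ApiKey": os.getenv("SQL_PASS_KEY")}
if not sqlpass_url:
    raise RuntimeError("SQL_PASS_URL is not configured")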