If total_annual_dollar_based_charges is found and its value is divisible by 52 or 12,

then set the fund name and share name to the document production name.
This commit is contained in:
Blade He 2025-03-28 01:33:33 -05:00
parent 46f86b124b
commit 355b145cf7
3 changed files with 114 additions and 45 deletions

View File

@ -289,6 +289,7 @@ class DataExtraction:
data_list = self.supplement_ttr_pension(data_list) data_list = self.supplement_ttr_pension(data_list)
data_list = self.align_fund_share_name(data_list) data_list = self.align_fund_share_name(data_list)
data_list = self.supplement_minimum_initial_investment(data_list) data_list = self.supplement_minimum_initial_investment(data_list)
data_list = self.check_total_annual_dollar_based_charges(data_list)
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list) data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
data_list = self.remove_duplicate_data(data_list) data_list = self.remove_duplicate_data(data_list)
if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name: if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name:
@ -503,6 +504,36 @@ class DataExtraction:
pass pass
return data_list return data_list
def check_total_annual_dollar_based_charges(self, data_list: list):
    """
    Re-label the first "divisible" total_annual_dollar_based_charges record.

    Walks every ``data_dict["extract_data"]["data"]`` item in order. The first
    item that has a non-empty ``fund_name`` and a
    ``total_annual_dollar_based_charges`` value which, divided by 52 or by 12,
    has at most four decimal places gets its ``fund_name`` and ``share_name``
    overwritten with ``self.document_production``. Scanning stops after that
    first match; at most one item is modified.

    Values that cannot be interpreted as a number (``None``, empty or
    non-numeric strings) are skipped instead of raising.

    :param data_list: list of dicts, each optionally holding
        ``{"extract_data": {"data": [<item dict>, ...]}}``.
    :return: the same ``data_list`` object, mutated in place.
    """
    # NOTE(review): 52 and 12 presumably stand for weekly / monthly charges
    # that were annualised - confirm with the data owners.
    divisors = (52, 12)
    # Tolerance for binary floating-point noise when testing whether a
    # quotient has no more than four decimal places.
    tolerance = 1e-9

    for data_dict in data_list:
        # "or" guards against keys that are present but explicitly None.
        extract_data = data_dict.get("extract_data") or {}
        for data_item in extract_data.get("data") or []:
            if not data_item.get("fund_name"):
                continue
            if "total_annual_dollar_based_charges" not in data_item:
                continue
            try:
                value = float(data_item["total_annual_dollar_based_charges"])
            except (TypeError, ValueError):
                # None, "" or other non-numeric content: nothing to check.
                continue
            for divisor in divisors:
                quotient = value / divisor
                if abs(quotient - round(quotient, 4)) < tolerance:
                    data_item["fund_name"] = self.document_production
                    data_item["share_name"] = self.document_production
                    # Only the first matching item is re-labelled.
                    return data_list
    return data_list
def post_adjust_for_value_with_production_name(self, data_list: list): def post_adjust_for_value_with_production_name(self, data_list: list):
""" """
If some datapoint with production name, then each fund/ share class in the same document for the datapoint should be with same value. If some datapoint with production name, then each fund/ share class in the same document for the datapoint should be with same value.

View File

@ -1448,7 +1448,7 @@ def get_aus_prospectus_document_category():
def test_post_adjust_extract_data(): def test_post_adjust_extract_data():
doc_id = "462780211" doc_id = "448576924"
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (
@ -1538,7 +1538,7 @@ if __name__ == "__main__":
with open(document_sample_file, "r", encoding="utf-8") as f: with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines() special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
if len(doc_id.strip()) > 0] if len(doc_id.strip()) > 0]
# special_doc_id_list = ["420339794"] # special_doc_id_list = ["448576924"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (

File diff suppressed because one or more lines are too long