If total_annual_dollar_based_charges is found and its value is divisible by 52 or 12,
then set the fund name and share name to the document production name
This commit is contained in:
parent
46f86b124b
commit
355b145cf7
|
|
@ -289,6 +289,7 @@ class DataExtraction:
|
|||
data_list = self.supplement_ttr_pension(data_list)
|
||||
data_list = self.align_fund_share_name(data_list)
|
||||
data_list = self.supplement_minimum_initial_investment(data_list)
|
||||
data_list = self.check_total_annual_dollar_based_charges(data_list)
|
||||
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
|
||||
data_list = self.remove_duplicate_data(data_list)
|
||||
if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name:
|
||||
|
|
@ -503,6 +504,36 @@ class DataExtraction:
|
|||
pass
|
||||
return data_list
|
||||
|
||||
def check_total_annual_dollar_based_charges(self, data_list: list):
    """
    Relabel the first total annual dollar based charge that looks weekly/monthly derived.

    Scans the items under ``extract_data -> data`` of every entry in ``data_list``.
    The first item that has a non-empty ``fund_name`` and a
    ``total_annual_dollar_based_charges`` value whose quotient by 52 or by 12 has
    at most four decimal places gets both ``fund_name`` and ``share_name``
    overwritten with ``self.document_production``. Scanning stops at that first
    match; all other items are left untouched.

    Values that cannot be interpreted as a number (empty string, ``None``,
    non-numeric text) are skipped instead of raising.

    Args:
        data_list: list of dicts, each optionally holding
            ``{"extract_data": {"data": [<item dict>, ...]}}``.

    Returns:
        The same ``data_list`` object, modified in place when a match is found.
    """
    import math

    def _has_at_most_four_decimals(quotient: float) -> bool:
        # Tolerant comparison: binary floating point can leave noise around
        # 1e-16 on an otherwise exact quotient, which a strict ``==`` rejects.
        return math.isclose(quotient, round(quotient, 4), rel_tol=0.0, abs_tol=1e-9)

    for data_dict in data_list:
        extract_data = data_dict.get("extract_data", {})
        for data_item in extract_data.get("data", []):
            if not data_item.get("fund_name", ""):
                continue
            if "total_annual_dollar_based_charges" not in data_item:
                continue
            try:
                value = float(data_item["total_annual_dollar_based_charges"])
            except (TypeError, ValueError, OverflowError):
                # Not a usable number, so there is nothing to test for divisibility.
                continue
            if _has_at_most_four_decimals(value / 52) or _has_at_most_four_decimals(value / 12):
                data_item["fund_name"] = self.document_production
                data_item["share_name"] = self.document_production
                # Only the first matching item across the whole list is relabelled.
                return data_list
    return data_list
|
||||
|
||||
def post_adjust_for_value_with_production_name(self, data_list: list):
|
||||
"""
|
||||
If some datapoint with production name, then each fund/ share class in the same document for the datapoint should be with same value.
|
||||
|
|
|
|||
4
main.py
4
main.py
|
|
@ -1448,7 +1448,7 @@ def get_aus_prospectus_document_category():
|
|||
|
||||
|
||||
def test_post_adjust_extract_data():
|
||||
doc_id = "462780211"
|
||||
doc_id = "448576924"
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
@ -1538,7 +1538,7 @@ if __name__ == "__main__":
|
|||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
|
||||
if len(doc_id.strip()) > 0]
|
||||
# special_doc_id_list = ["420339794"]
|
||||
# special_doc_id_list = ["448576924"]
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue