If total_annual_dollar_based_charges is found and its value is divisible by 52 or 12,
then set the fund name and share name to the document production name
This commit is contained in:
parent
46f86b124b
commit
355b145cf7
|
|
@ -289,6 +289,7 @@ class DataExtraction:
|
||||||
data_list = self.supplement_ttr_pension(data_list)
|
data_list = self.supplement_ttr_pension(data_list)
|
||||||
data_list = self.align_fund_share_name(data_list)
|
data_list = self.align_fund_share_name(data_list)
|
||||||
data_list = self.supplement_minimum_initial_investment(data_list)
|
data_list = self.supplement_minimum_initial_investment(data_list)
|
||||||
|
data_list = self.check_total_annual_dollar_based_charges(data_list)
|
||||||
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
|
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
|
||||||
data_list = self.remove_duplicate_data(data_list)
|
data_list = self.remove_duplicate_data(data_list)
|
||||||
if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name:
|
if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name:
|
||||||
|
|
@ -503,6 +504,36 @@ class DataExtraction:
|
||||||
pass
|
pass
|
||||||
return data_list
|
return data_list
|
||||||
|
|
||||||
|
def check_total_annual_dollar_based_charges(self, data_list: list):
    """
    Re-label the first total_annual_dollar_based_charges entry that looks like
    a weekly or monthly multiple.

    Each element of data_list is read as a dict holding
    ``extract_data -> data -> [data_item, ...]``. Items with an empty
    fund_name, or without a total_annual_dollar_based_charges key, are ignored.
    For the remaining items the value is divided by 52 and by 12; if either
    quotient is unchanged by rounding to 4 decimal places, the item's
    fund_name and share_name are overwritten with self.document_production.

    Only the first matching item across the whole data_list is changed; the
    scan stops right after it.
    NOTE(review): confirm that relabeling a single item (rather than every
    matching item) is the intended behavior.

    Args:
        data_list: list of per-page extraction dicts; modified in place.

    Returns:
        The same data_list object that was passed in.
    """
    for data_dict in data_list:
        # "or" also covers keys that are present but explicitly set to None.
        extract_data = data_dict.get("extract_data") or {}
        data = extract_data.get("data") or []
        for data_item in data:
            if not data_item.get("fund_name", ""):
                continue
            if "total_annual_dollar_based_charges" not in data_item:
                continue

            value = data_item["total_annual_dollar_based_charges"]
            if value is None:
                continue
            if isinstance(value, str):
                # Extracted values may arrive as text; blank or non-numeric
                # text cannot be checked, so skip it instead of raising
                # TypeError on the division below.
                try:
                    value = float(value)
                except ValueError:
                    continue

            value_divide_52 = value / 52
            value_divide_12 = value / 12
            if (
                value_divide_52 == round(value_divide_52, 4)
                or value_divide_12 == round(value_divide_12, 4)
            ):
                data_item["fund_name"] = self.document_production
                data_item["share_name"] = self.document_production
                # First match wins: stop scanning every remaining item.
                return data_list
    return data_list
|
||||||
|
|
||||||
def post_adjust_for_value_with_production_name(self, data_list: list):
|
def post_adjust_for_value_with_production_name(self, data_list: list):
|
||||||
"""
|
"""
|
||||||
If some datapoint with production name, then each fund/ share class in the same document for the datapoint should be with same value.
|
If some datapoint with production name, then each fund/ share class in the same document for the datapoint should be with same value.
|
||||||
|
|
|
||||||
4
main.py
4
main.py
|
|
@ -1448,7 +1448,7 @@ def get_aus_prospectus_document_category():
|
||||||
|
|
||||||
|
|
||||||
def test_post_adjust_extract_data():
|
def test_post_adjust_extract_data():
|
||||||
doc_id = "462780211"
|
doc_id = "448576924"
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
@ -1538,7 +1538,7 @@ if __name__ == "__main__":
|
||||||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
|
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
|
||||||
if len(doc_id.strip()) > 0]
|
if len(doc_id.strip()) > 0]
|
||||||
# special_doc_id_list = ["420339794"]
|
# special_doc_id_list = ["448576924"]
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
|
||||||
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue