1. Handle documents whose document type is not 1, 4, or 5 by falling back to the document source (doc_source).
2. Support the scenario where the "investment fees and costs including performance fees" statement appears on the performance fee data page instead of on the management fee and costs data page.
This commit is contained in:
parent
4b896f4460
commit
f333cc30f5
|
|
@ -1,9 +1,42 @@
|
||||||
{
|
{
|
||||||
"management_fee_including_performance_fee": {
|
"management_fee_including_performance_fee": {
|
||||||
"regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"],
|
"details": [
|
||||||
|
{"regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"],
|
||||||
"effective_datapoints": ["management_fee_and_costs"],
|
"effective_datapoints": ["management_fee_and_costs"],
|
||||||
"exclude_datapoints": ["performance_fee_costs"],
|
"exclude_datapoints": ["performance_fee_costs"]},
|
||||||
"provider_ids": ["0C00005549"],
|
{"regex_text": ["Investment\\s*fees\\s*and\\s*costs\\s*includ.*performance\\s*fees"],
|
||||||
"provider_names": ["Vision Super Pty Ltd"]
|
"effective_datapoints": ["performance_fee_costs"],
|
||||||
|
"exclude_datapoints": ["management_fee_and_costs"]}
|
||||||
|
],
|
||||||
|
"provider_ids": ["0C00006CX6", "0C000056BP", "0C000056KJ", "0C000056KK",
|
||||||
|
"0C000069VJ", "0C0000AL58", "0C00006B9E",
|
||||||
|
"0C00006BDB", "0C00006BDD", "0C00006BDG",
|
||||||
|
"0C000035YC", "0C0000CSKN", "0C00005549",
|
||||||
|
"0C000051C6", "0C00008JA0", "0C000093Z4",
|
||||||
|
"0C0000B5L6", "0C00006EGK", "0C00006EJI",
|
||||||
|
"0C00006FYL", "0C00006G0Q", "0C00006GIF",
|
||||||
|
"0C00006GNW", "0C00006GPU", "0C00006H46",
|
||||||
|
"0C00006H4J", "0C00006H4Q", "0C0000A5XQ",
|
||||||
|
"0C0000BBPL", "0C0000C2MS", "0C0000CVRL",
|
||||||
|
"0C0000AV6P", "0C00001XXQ", "0C00001XYR",
|
||||||
|
"0C00006AZB", "0C00006BN6", "0C00006BXE",
|
||||||
|
"0C00006CIK", "0C00006CJ2", "0C00006DOA",
|
||||||
|
"0C0000CAQF", "0C0000CAQH", "0C0000CAQO",
|
||||||
|
"0C0000CAQR"],
|
||||||
|
"provider_names": ["Bh Super Pty Ltd", "Mellett Super Pty Ltd", "LQ Super Pty Ltd",
|
||||||
|
"Q Super Pty Ltd", "RPM Super Pty Ltd", "VicSuper Pty Ltd",
|
||||||
|
"RMK Super Pty Ltd", "CCM Super Pty Ltd", "Judd Super Pty Ltd",
|
||||||
|
"JMJ Super Pty Ltd", "CARE Super Pty Ltd", "AvSuper Pty Ltd",
|
||||||
|
"Vision Super Pty Ltd", "AustralianSuper Pty Ltd", "First Super Pty Ltd",
|
||||||
|
"GeoVet Super Pty Ltd", "Gilby Super Pty Ltd", "Incani & Papadopoulos Super Pty Ltd",
|
||||||
|
"Gardner Super Pty Ltd", "Terlet Super Pty Ltd", "Rizzo Super Pty Ltd",
|
||||||
|
"Mellet Super Pty Ltd", "Smithley Super Pty Ltd", "Snowflake Super Pty Ltd",
|
||||||
|
"Fruitful Super Pty Ltd", "Seawell Super Pty Ltd", "St Super Pty Ltd",
|
||||||
|
"Christian Super Pty Ltd", "SCS Super Pty Ltd", "Aware Super Pty Ltd",
|
||||||
|
"Vanguard Super Pty Ltd", "United Super Pty Ltd", "National Australia Super Pty Ltd",
|
||||||
|
"AGEST Super Pty Ltd", "Huoncan Super Pty Ltd", "JHG Super Pty Ltd",
|
||||||
|
"Telstra Super Pty Ltd", "P & M Bellero Super Pty Ltd", "J J N A Super Pty Ltd",
|
||||||
|
"KSL Super Pty Ltd", "NESS Super Pty Ltd", "Prime Super Pty Ltd",
|
||||||
|
"PostSuper Pty Ltd", "Legal Super Pty Ltd"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -99,7 +99,7 @@ class DataExtraction:
|
||||||
len(list(self.special_datapoint_feature_config.keys())) == 0:
|
len(list(self.special_datapoint_feature_config.keys())) == 0:
|
||||||
return special_datapoint_feature
|
return special_datapoint_feature
|
||||||
for feature in list(self.special_datapoint_feature_config.keys()):
|
for feature in list(self.special_datapoint_feature_config.keys()):
|
||||||
special_datapoint_feature[feature] = {"page_index": []}
|
special_datapoint_feature[feature] = {}
|
||||||
return special_datapoint_feature
|
return special_datapoint_feature
|
||||||
|
|
||||||
def get_document_category_production(self):
|
def get_document_category_production(self):
|
||||||
|
|
@ -153,7 +153,6 @@ class DataExtraction:
|
||||||
pass
|
pass
|
||||||
return fund_name
|
return fund_name
|
||||||
|
|
||||||
|
|
||||||
def get_datapoint_page_info(self, datapoint_page_info: dict) -> dict:
|
def get_datapoint_page_info(self, datapoint_page_info: dict) -> dict:
|
||||||
"""
|
"""
|
||||||
If document source is aus_propectus and document category is MIS
|
If document source is aus_propectus and document category is MIS
|
||||||
|
|
@ -558,6 +557,8 @@ class DataExtraction:
|
||||||
"""
|
"""
|
||||||
If some datapoint with production name, then each fund/ share class in the same document for the datapoint should be with same value.
|
If some datapoint with production name, then each fund/ share class in the same document for the datapoint should be with same value.
|
||||||
"""
|
"""
|
||||||
|
if len(self.fund_name_list) < 3:
|
||||||
|
return data_list, []
|
||||||
raw_name_dict = self.get_raw_name_dict(data_list)
|
raw_name_dict = self.get_raw_name_dict(data_list)
|
||||||
raw_name_list = list(raw_name_dict.keys())
|
raw_name_list = list(raw_name_dict.keys())
|
||||||
if len(raw_name_list) < 3:
|
if len(raw_name_list) < 3:
|
||||||
|
|
@ -729,10 +730,34 @@ class DataExtraction:
|
||||||
|
|
||||||
def post_management_fee_exclude_performance_fee(self, data_list: list):
|
def post_management_fee_exclude_performance_fee(self, data_list: list):
|
||||||
adjust = False
|
adjust = False
|
||||||
|
|
||||||
mangement_fee_index_list = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\
|
mangement_fee_index_list = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\
|
||||||
get("page_index", [])
|
get("page_index", [])
|
||||||
if len(mangement_fee_index_list) == 0:
|
if len(mangement_fee_index_list) == 0:
|
||||||
return data_list, adjust
|
return data_list, adjust
|
||||||
|
effective_datapoint = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\
|
||||||
|
get("datapoint", "")
|
||||||
|
if effective_datapoint == "performance_fee_costs":
|
||||||
|
mangement_fee_index_list = []
|
||||||
|
exist_effective_datapoints = False
|
||||||
|
exist_exclude_datapoints = False
|
||||||
|
for data_dict in data_list:
|
||||||
|
page_index = data_dict.get("page_index", -1)
|
||||||
|
data = data_dict.get("extract_data", {}).get("data", [])
|
||||||
|
for data_item in data:
|
||||||
|
datapoints = [datapoint for datapoint in list(data_item.keys())
|
||||||
|
if datapoint == "management_fee_and_costs"]
|
||||||
|
if len(datapoints) > 0:
|
||||||
|
exist_effective_datapoints = True
|
||||||
|
datapoints = [datapoint for datapoint in list(data_item.keys())
|
||||||
|
if datapoint == "performance_fee_costs"]
|
||||||
|
if len(datapoints) > 0:
|
||||||
|
exist_exclude_datapoints = True
|
||||||
|
if exist_effective_datapoints and exist_exclude_datapoints:
|
||||||
|
break
|
||||||
|
if exist_effective_datapoints and not exist_exclude_datapoints:
|
||||||
|
if page_index not in mangement_fee_index_list:
|
||||||
|
mangement_fee_index_list.append(page_index)
|
||||||
min_page_index = min(mangement_fee_index_list)
|
min_page_index = min(mangement_fee_index_list)
|
||||||
performance_fee_item_list = []
|
performance_fee_item_list = []
|
||||||
for data_dict in data_list:
|
for data_dict in data_list:
|
||||||
|
|
@ -759,7 +784,7 @@ class DataExtraction:
|
||||||
keys = list(management_fee_data.keys())
|
keys = list(management_fee_data.keys())
|
||||||
fund_name = management_fee_data.get("fund_name", "")
|
fund_name = management_fee_data.get("fund_name", "")
|
||||||
share_name = management_fee_data.get("share_name", "")
|
share_name = management_fee_data.get("share_name", "")
|
||||||
if len(fund_name) == 0 or len(share_name) == 0:
|
if fund_name == "" or share_name == "":
|
||||||
continue
|
continue
|
||||||
if "management_fee_and_costs" in keys:
|
if "management_fee_and_costs" in keys:
|
||||||
management_fee_and_costs = management_fee_data.get("management_fee_and_costs", -1)
|
management_fee_and_costs = management_fee_data.get("management_fee_and_costs", -1)
|
||||||
|
|
@ -771,7 +796,9 @@ class DataExtraction:
|
||||||
for performance_fee_item in performance_fee_item_list:
|
for performance_fee_item in performance_fee_item_list:
|
||||||
pf_fund_name = performance_fee_item.get("fund_name", "")
|
pf_fund_name = performance_fee_item.get("fund_name", "")
|
||||||
pf_share_name = performance_fee_item.get("share_name", "")
|
pf_share_name = performance_fee_item.get("share_name", "")
|
||||||
if pf_fund_name == fund_name and pf_share_name == share_name:
|
if pf_fund_name == "" or pf_share_name == "":
|
||||||
|
continue
|
||||||
|
if pf_fund_name.lower() == fund_name.lower() and pf_share_name.lower() == share_name.lower():
|
||||||
performance_fee_costs = performance_fee_item.get("performance_fee_costs", -1)
|
performance_fee_costs = performance_fee_item.get("performance_fee_costs", -1)
|
||||||
try:
|
try:
|
||||||
performance_fee_costs = float(performance_fee_costs)
|
performance_fee_costs = float(performance_fee_costs)
|
||||||
|
|
@ -943,7 +970,7 @@ class DataExtraction:
|
||||||
previous_page_datapoints = []
|
previous_page_datapoints = []
|
||||||
previous_page_fund_name = None
|
previous_page_fund_name = None
|
||||||
for page_num, page_text in self.page_text_dict.items():
|
for page_num, page_text in self.page_text_dict.items():
|
||||||
# if page_num not in [37, 38]:
|
# if page_num not in [42]:
|
||||||
# continue
|
# continue
|
||||||
if page_num in handled_page_num_list:
|
if page_num in handled_page_num_list:
|
||||||
continue
|
continue
|
||||||
|
|
@ -1692,16 +1719,15 @@ class DataExtraction:
|
||||||
new_data_list.append(new_data)
|
new_data_list.append(new_data)
|
||||||
extract_data_info["data"] = new_data_list
|
extract_data_info["data"] = new_data_list
|
||||||
if page_text is not None and len(page_text) > 0:
|
if page_text is not None and len(page_text) > 0:
|
||||||
|
try:
|
||||||
self.set_datapoint_feature_properties(new_data_list, page_text, page_num)
|
self.set_datapoint_feature_properties(new_data_list, page_text, page_num)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error in setting datapoint feature properties: {e}")
|
||||||
return extract_data_info
|
return extract_data_info
|
||||||
|
|
||||||
def set_datapoint_feature_properties(self, data_list: list, page_text: str, page_num: int) -> None:
|
def set_datapoint_feature_properties(self, data_list: list, page_text: str, page_num: int) -> None:
|
||||||
for feature, properties in self.special_datapoint_feature_config.items():
|
for feature, properties in self.special_datapoint_feature_config.items():
|
||||||
regex_text_list = properties.get("regex_text", [])
|
if self.special_datapoint_feature.get(feature, {}).get("page_index", None) is not None:
|
||||||
if len(regex_text_list) == 0:
|
|
||||||
continue
|
|
||||||
effective_datapoints = properties.get("effective_datapoints", [])
|
|
||||||
if len(effective_datapoints) == 0:
|
|
||||||
continue
|
continue
|
||||||
provider_ids = properties.get("provider_ids", [])
|
provider_ids = properties.get("provider_ids", [])
|
||||||
if len(provider_ids) > 0:
|
if len(provider_ids) > 0:
|
||||||
|
|
@ -1714,7 +1740,18 @@ class DataExtraction:
|
||||||
break
|
break
|
||||||
if not is_current_provider:
|
if not is_current_provider:
|
||||||
continue
|
continue
|
||||||
exclude_datapoints = properties.get("exclude_datapoints", [])
|
detail_list = properties.get("details", [])
|
||||||
|
if len(detail_list) == 0:
|
||||||
|
continue
|
||||||
|
set_feature_property = False
|
||||||
|
for detail in detail_list:
|
||||||
|
regex_text_list = detail.get("regex_text", [])
|
||||||
|
if len(regex_text_list) == 0:
|
||||||
|
continue
|
||||||
|
effective_datapoints = detail.get("effective_datapoints", [])
|
||||||
|
if len(effective_datapoints) == 0:
|
||||||
|
continue
|
||||||
|
exclude_datapoints = detail.get("exclude_datapoints", [])
|
||||||
|
|
||||||
exist_effective_datapoints = False
|
exist_effective_datapoints = False
|
||||||
exist_exclude_datapoints = False
|
exist_exclude_datapoints = False
|
||||||
|
|
@ -1743,7 +1780,11 @@ class DataExtraction:
|
||||||
if found_regex_text:
|
if found_regex_text:
|
||||||
if self.special_datapoint_feature[feature].get("page_index", None) is None:
|
if self.special_datapoint_feature[feature].get("page_index", None) is None:
|
||||||
self.special_datapoint_feature[feature]["page_index"] = []
|
self.special_datapoint_feature[feature]["page_index"] = []
|
||||||
|
self.special_datapoint_feature[feature]["datapoint"] = effective_datapoints[0]
|
||||||
self.special_datapoint_feature[feature]["page_index"].append(page_num)
|
self.special_datapoint_feature[feature]["page_index"].append(page_num)
|
||||||
|
set_feature_property = True
|
||||||
|
if set_feature_property:
|
||||||
|
break
|
||||||
|
|
||||||
def split_multi_share_name(self, raw_share_name: str) -> list:
|
def split_multi_share_name(self, raw_share_name: str) -> list:
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -148,9 +148,9 @@ class FilterPages:
|
||||||
}
|
}
|
||||||
effective_date = self.document_mapping_info_df["EffectiveDate"].iloc[0]
|
effective_date = self.document_mapping_info_df["EffectiveDate"].iloc[0]
|
||||||
document_type = self.document_mapping_info_df["DocumentType"].iloc[0]
|
document_type = self.document_mapping_info_df["DocumentType"].iloc[0]
|
||||||
if document_type in [4, 5]:
|
if document_type in [4, 5] or self.doc_source == "emea_ar":
|
||||||
document_type = "ar"
|
document_type = "ar"
|
||||||
elif document_type == 1:
|
elif document_type == 1 or self.doc_source == "aus_prospectus":
|
||||||
document_type = "prospectus"
|
document_type = "prospectus"
|
||||||
language_id = self.document_mapping_info_df["Language"].iloc[0]
|
language_id = self.document_mapping_info_df["Language"].iloc[0]
|
||||||
language = self.language_config.get(language_id, None)
|
language = self.language_config.get(language_id, None)
|
||||||
|
|
|
||||||
|
|
@ -71,6 +71,7 @@
|
||||||
"If with multiple data values in same row, please extract the latest.",
|
"If with multiple data values in same row, please extract the latest.",
|
||||||
"\n",
|
"\n",
|
||||||
"4. Reported names:",
|
"4. Reported names:",
|
||||||
|
"**IGNORE THE TABLE WHICH TABLE HEADER IS WITH REPORTED NAME: \"Cost of product\"!!!**",
|
||||||
"Only output the values which with significant reported names.",
|
"Only output the values which with significant reported names.",
|
||||||
"Multiple data columns with same reported name but different post-fix:",
|
"Multiple data columns with same reported name but different post-fix:",
|
||||||
"If there are multiple reported names with different post-fix text, here is the priority rule:",
|
"If there are multiple reported names with different post-fix text, here is the priority rule:",
|
||||||
|
|
@ -122,7 +123,7 @@
|
||||||
"total_annual_dollar_based_charges": "Total annual dollar based charges is belong to decimal number, the value could be more than 100, e.g. 625.00",
|
"total_annual_dollar_based_charges": "Total annual dollar based charges is belong to decimal number, the value could be more than 100, e.g. 625.00",
|
||||||
"management_fee_and_costs": "Management fee and costs is belong to percentage number, the value should be less than 100.",
|
"management_fee_and_costs": "Management fee and costs is belong to percentage number, the value should be less than 100.",
|
||||||
"management_fee": "Management fee is belong to percentage number, the value should be less than 100.",
|
"management_fee": "Management fee is belong to percentage number, the value should be less than 100.",
|
||||||
"performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 100.",
|
"performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 10.",
|
||||||
"buy_spread": "Buy spread is belong to percentage number, the value should be less than 100.",
|
"buy_spread": "Buy spread is belong to percentage number, the value should be less than 100.",
|
||||||
"sell_spread": "Sell spread is belong to percentage number, the value should be less than 100.",
|
"sell_spread": "Sell spread is belong to percentage number, the value should be less than 100.",
|
||||||
"establishment_fee": "Establishment fee is belong to percentage number, the value should be less than 100.",
|
"establishment_fee": "Establishment fee is belong to percentage number, the value should be less than 100.",
|
||||||
|
|
|
||||||
11
main.py
11
main.py
|
|
@ -1531,17 +1531,18 @@ if __name__ == "__main__":
|
||||||
# document_sample_file = (
|
# document_sample_file = (
|
||||||
# r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt"
|
# r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt"
|
||||||
# )
|
# )
|
||||||
document_sample_file = (
|
|
||||||
r"./sample_documents/aus_prospectus_46_documents_sample.txt"
|
|
||||||
)
|
|
||||||
# document_sample_file = (
|
# document_sample_file = (
|
||||||
# r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
|
# r"./sample_documents/aus_prospectus_46_documents_sample.txt"
|
||||||
# )
|
# )
|
||||||
|
document_sample_file = (
|
||||||
|
r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
|
||||||
|
)
|
||||||
logger.info(f"Start to run document sample file: {document_sample_file}")
|
logger.info(f"Start to run document sample file: {document_sample_file}")
|
||||||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
|
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
|
||||||
if len(doc_id.strip()) > 0]
|
if len(doc_id.strip()) > 0]
|
||||||
# special_doc_id_list = ["527969661"]
|
# special_doc_id_list = ["470879332", "462780211", "561929947", "422100350"]
|
||||||
|
# special_doc_id_list = ["539999907", "455235248", "448576924"]
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue