1. Handle the scenario where the document type is not 1, 4, or 5.
2. Support the scenario where the "investment fees and costs including performance" statement appears on the performance fee data page instead of on the management fee and costs data page.
This commit is contained in:
parent
4b896f4460
commit
f333cc30f5
|
|
@ -1,9 +1,42 @@
|
|||
{
|
||||
"management_fee_including_performance_fee": {
|
||||
"regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"],
|
||||
"details": [
|
||||
{"regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"],
|
||||
"effective_datapoints": ["management_fee_and_costs"],
|
||||
"exclude_datapoints": ["performance_fee_costs"],
|
||||
"provider_ids": ["0C00005549"],
|
||||
"provider_names": ["Vision Super Pty Ltd"]
|
||||
"exclude_datapoints": ["performance_fee_costs"]},
|
||||
{"regex_text": ["Investment\\s*fees\\s*and\\s*costs\\s*includ.*performance\\s*fees"],
|
||||
"effective_datapoints": ["performance_fee_costs"],
|
||||
"exclude_datapoints": ["management_fee_and_costs"]}
|
||||
],
|
||||
"provider_ids": ["0C00006CX6", "0C000056BP", "0C000056KJ", "0C000056KK",
|
||||
"0C000069VJ", "0C0000AL58", "0C00006B9E",
|
||||
"0C00006BDB", "0C00006BDD", "0C00006BDG",
|
||||
"0C000035YC", "0C0000CSKN", "0C00005549",
|
||||
"0C000051C6", "0C00008JA0", "0C000093Z4",
|
||||
"0C0000B5L6", "0C00006EGK", "0C00006EJI",
|
||||
"0C00006FYL", "0C00006G0Q", "0C00006GIF",
|
||||
"0C00006GNW", "0C00006GPU", "0C00006H46",
|
||||
"0C00006H4J", "0C00006H4Q", "0C0000A5XQ",
|
||||
"0C0000BBPL", "0C0000C2MS", "0C0000CVRL",
|
||||
"0C0000AV6P", "0C00001XXQ", "0C00001XYR",
|
||||
"0C00006AZB", "0C00006BN6", "0C00006BXE",
|
||||
"0C00006CIK", "0C00006CJ2", "0C00006DOA",
|
||||
"0C0000CAQF", "0C0000CAQH", "0C0000CAQO",
|
||||
"0C0000CAQR"],
|
||||
"provider_names": ["Bh Super Pty Ltd", "Mellett Super Pty Ltd", "LQ Super Pty Ltd",
|
||||
"Q Super Pty Ltd", "RPM Super Pty Ltd", "VicSuper Pty Ltd",
|
||||
"RMK Super Pty Ltd", "CCM Super Pty Ltd", "Judd Super Pty Ltd",
|
||||
"JMJ Super Pty Ltd", "CARE Super Pty Ltd", "AvSuper Pty Ltd",
|
||||
"Vision Super Pty Ltd", "AustralianSuper Pty Ltd", "First Super Pty Ltd",
|
||||
"GeoVet Super Pty Ltd", "Gilby Super Pty Ltd", "Incani & Papadopoulos Super Pty Ltd",
|
||||
"Gardner Super Pty Ltd", "Terlet Super Pty Ltd", "Rizzo Super Pty Ltd",
|
||||
"Mellet Super Pty Ltd", "Smithley Super Pty Ltd", "Snowflake Super Pty Ltd",
|
||||
"Fruitful Super Pty Ltd", "Seawell Super Pty Ltd", "St Super Pty Ltd",
|
||||
"Christian Super Pty Ltd", "SCS Super Pty Ltd", "Aware Super Pty Ltd",
|
||||
"Vanguard Super Pty Ltd", "United Super Pty Ltd", "National Australia Super Pty Ltd",
|
||||
"AGEST Super Pty Ltd", "Huoncan Super Pty Ltd", "JHG Super Pty Ltd",
|
||||
"Telstra Super Pty Ltd", "P & M Bellero Super Pty Ltd", "J J N A Super Pty Ltd",
|
||||
"KSL Super Pty Ltd", "NESS Super Pty Ltd", "Prime Super Pty Ltd",
|
||||
"PostSuper Pty Ltd", "Legal Super Pty Ltd"]
|
||||
}
|
||||
}
|
||||
|
|
@ -99,7 +99,7 @@ class DataExtraction:
|
|||
len(list(self.special_datapoint_feature_config.keys())) == 0:
|
||||
return special_datapoint_feature
|
||||
for feature in list(self.special_datapoint_feature_config.keys()):
|
||||
special_datapoint_feature[feature] = {"page_index": []}
|
||||
special_datapoint_feature[feature] = {}
|
||||
return special_datapoint_feature
|
||||
|
||||
def get_document_category_production(self):
|
||||
|
|
@ -153,7 +153,6 @@ class DataExtraction:
|
|||
pass
|
||||
return fund_name
|
||||
|
||||
|
||||
def get_datapoint_page_info(self, datapoint_page_info: dict) -> dict:
|
||||
"""
|
||||
If document source is aus_propectus and document category is MIS
|
||||
|
|
@ -558,6 +557,8 @@ class DataExtraction:
|
|||
"""
|
||||
If some datapoint with production name, then each fund/ share class in the same document for the datapoint should be with same value.
|
||||
"""
|
||||
if len(self.fund_name_list) < 3:
|
||||
return data_list, []
|
||||
raw_name_dict = self.get_raw_name_dict(data_list)
|
||||
raw_name_list = list(raw_name_dict.keys())
|
||||
if len(raw_name_list) < 3:
|
||||
|
|
@ -729,10 +730,34 @@ class DataExtraction:
|
|||
|
||||
def post_management_fee_exclude_performance_fee(self, data_list: list):
|
||||
adjust = False
|
||||
|
||||
mangement_fee_index_list = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\
|
||||
get("page_index", [])
|
||||
if len(mangement_fee_index_list) == 0:
|
||||
return data_list, adjust
|
||||
effective_datapoint = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\
|
||||
get("datapoint", "")
|
||||
if effective_datapoint == "performance_fee_costs":
|
||||
mangement_fee_index_list = []
|
||||
exist_effective_datapoints = False
|
||||
exist_exclude_datapoints = False
|
||||
for data_dict in data_list:
|
||||
page_index = data_dict.get("page_index", -1)
|
||||
data = data_dict.get("extract_data", {}).get("data", [])
|
||||
for data_item in data:
|
||||
datapoints = [datapoint for datapoint in list(data_item.keys())
|
||||
if datapoint == "management_fee_and_costs"]
|
||||
if len(datapoints) > 0:
|
||||
exist_effective_datapoints = True
|
||||
datapoints = [datapoint for datapoint in list(data_item.keys())
|
||||
if datapoint == "performance_fee_costs"]
|
||||
if len(datapoints) > 0:
|
||||
exist_exclude_datapoints = True
|
||||
if exist_effective_datapoints and exist_exclude_datapoints:
|
||||
break
|
||||
if exist_effective_datapoints and not exist_exclude_datapoints:
|
||||
if page_index not in mangement_fee_index_list:
|
||||
mangement_fee_index_list.append(page_index)
|
||||
min_page_index = min(mangement_fee_index_list)
|
||||
performance_fee_item_list = []
|
||||
for data_dict in data_list:
|
||||
|
|
@ -759,7 +784,7 @@ class DataExtraction:
|
|||
keys = list(management_fee_data.keys())
|
||||
fund_name = management_fee_data.get("fund_name", "")
|
||||
share_name = management_fee_data.get("share_name", "")
|
||||
if len(fund_name) == 0 or len(share_name) == 0:
|
||||
if fund_name == "" or share_name == "":
|
||||
continue
|
||||
if "management_fee_and_costs" in keys:
|
||||
management_fee_and_costs = management_fee_data.get("management_fee_and_costs", -1)
|
||||
|
|
@ -771,7 +796,9 @@ class DataExtraction:
|
|||
for performance_fee_item in performance_fee_item_list:
|
||||
pf_fund_name = performance_fee_item.get("fund_name", "")
|
||||
pf_share_name = performance_fee_item.get("share_name", "")
|
||||
if pf_fund_name == fund_name and pf_share_name == share_name:
|
||||
if pf_fund_name == "" or pf_share_name == "":
|
||||
continue
|
||||
if pf_fund_name.lower() == fund_name.lower() and pf_share_name.lower() == share_name.lower():
|
||||
performance_fee_costs = performance_fee_item.get("performance_fee_costs", -1)
|
||||
try:
|
||||
performance_fee_costs = float(performance_fee_costs)
|
||||
|
|
@ -943,7 +970,7 @@ class DataExtraction:
|
|||
previous_page_datapoints = []
|
||||
previous_page_fund_name = None
|
||||
for page_num, page_text in self.page_text_dict.items():
|
||||
# if page_num not in [37, 38]:
|
||||
# if page_num not in [42]:
|
||||
# continue
|
||||
if page_num in handled_page_num_list:
|
||||
continue
|
||||
|
|
@ -1692,16 +1719,15 @@ class DataExtraction:
|
|||
new_data_list.append(new_data)
|
||||
extract_data_info["data"] = new_data_list
|
||||
if page_text is not None and len(page_text) > 0:
|
||||
try:
|
||||
self.set_datapoint_feature_properties(new_data_list, page_text, page_num)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in setting datapoint feature properties: {e}")
|
||||
return extract_data_info
|
||||
|
||||
def set_datapoint_feature_properties(self, data_list: list, page_text: str, page_num: int) -> None:
|
||||
for feature, properties in self.special_datapoint_feature_config.items():
|
||||
regex_text_list = properties.get("regex_text", [])
|
||||
if len(regex_text_list) == 0:
|
||||
continue
|
||||
effective_datapoints = properties.get("effective_datapoints", [])
|
||||
if len(effective_datapoints) == 0:
|
||||
if self.special_datapoint_feature.get(feature, {}).get("page_index", None) is not None:
|
||||
continue
|
||||
provider_ids = properties.get("provider_ids", [])
|
||||
if len(provider_ids) > 0:
|
||||
|
|
@ -1714,7 +1740,18 @@ class DataExtraction:
|
|||
break
|
||||
if not is_current_provider:
|
||||
continue
|
||||
exclude_datapoints = properties.get("exclude_datapoints", [])
|
||||
detail_list = properties.get("details", [])
|
||||
if len(detail_list) == 0:
|
||||
continue
|
||||
set_feature_property = False
|
||||
for detail in detail_list:
|
||||
regex_text_list = detail.get("regex_text", [])
|
||||
if len(regex_text_list) == 0:
|
||||
continue
|
||||
effective_datapoints = detail.get("effective_datapoints", [])
|
||||
if len(effective_datapoints) == 0:
|
||||
continue
|
||||
exclude_datapoints = detail.get("exclude_datapoints", [])
|
||||
|
||||
exist_effective_datapoints = False
|
||||
exist_exclude_datapoints = False
|
||||
|
|
@ -1743,7 +1780,11 @@ class DataExtraction:
|
|||
if found_regex_text:
|
||||
if self.special_datapoint_feature[feature].get("page_index", None) is None:
|
||||
self.special_datapoint_feature[feature]["page_index"] = []
|
||||
self.special_datapoint_feature[feature]["datapoint"] = effective_datapoints[0]
|
||||
self.special_datapoint_feature[feature]["page_index"].append(page_num)
|
||||
set_feature_property = True
|
||||
if set_feature_property:
|
||||
break
|
||||
|
||||
def split_multi_share_name(self, raw_share_name: str) -> list:
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -148,9 +148,9 @@ class FilterPages:
|
|||
}
|
||||
effective_date = self.document_mapping_info_df["EffectiveDate"].iloc[0]
|
||||
document_type = self.document_mapping_info_df["DocumentType"].iloc[0]
|
||||
if document_type in [4, 5]:
|
||||
if document_type in [4, 5] or self.doc_source == "emea_ar":
|
||||
document_type = "ar"
|
||||
elif document_type == 1:
|
||||
elif document_type == 1 or self.doc_source == "aus_prospectus":
|
||||
document_type = "prospectus"
|
||||
language_id = self.document_mapping_info_df["Language"].iloc[0]
|
||||
language = self.language_config.get(language_id, None)
|
||||
|
|
|
|||
|
|
@ -71,6 +71,7 @@
|
|||
"If with multiple data values in same row, please extract the latest.",
|
||||
"\n",
|
||||
"4. Reported names:",
|
||||
"**IGNORE THE TABLE WHICH TABLE HEADER IS WITH REPORTED NAME: \"Cost of product\"!!!**",
|
||||
"Only output the values which with significant reported names.",
|
||||
"Multiple data columns with same reported name but different post-fix:",
|
||||
"If there are multiple reported names with different post-fix text, here is the priority rule:",
|
||||
|
|
@ -122,7 +123,7 @@
|
|||
"total_annual_dollar_based_charges": "Total annual dollar based charges is belong to decimal number, the value could be more than 100, e.g. 625.00",
|
||||
"management_fee_and_costs": "Management fee and costs is belong to percentage number, the value should be less than 100.",
|
||||
"management_fee": "Management fee is belong to percentage number, the value should be less than 100.",
|
||||
"performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 100.",
|
||||
"performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 10.",
|
||||
"buy_spread": "Buy spread is belong to percentage number, the value should be less than 100.",
|
||||
"sell_spread": "Sell spread is belong to percentage number, the value should be less than 100.",
|
||||
"establishment_fee": "Establishment fee is belong to percentage number, the value should be less than 100.",
|
||||
|
|
|
|||
11
main.py
11
main.py
|
|
@ -1531,17 +1531,18 @@ if __name__ == "__main__":
|
|||
# document_sample_file = (
|
||||
# r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt"
|
||||
# )
|
||||
document_sample_file = (
|
||||
r"./sample_documents/aus_prospectus_46_documents_sample.txt"
|
||||
)
|
||||
# document_sample_file = (
|
||||
# r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
|
||||
# r"./sample_documents/aus_prospectus_46_documents_sample.txt"
|
||||
# )
|
||||
document_sample_file = (
|
||||
r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
|
||||
)
|
||||
logger.info(f"Start to run document sample file: {document_sample_file}")
|
||||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
|
||||
if len(doc_id.strip()) > 0]
|
||||
# special_doc_id_list = ["527969661"]
|
||||
# special_doc_id_list = ["470879332", "462780211", "561929947", "422100350"]
|
||||
# special_doc_id_list = ["539999907", "455235248", "448576924"]
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
|
|||
Loading…
Reference in New Issue