diff --git a/configuration/aus_prospectus/datapoint_keyword.json b/configuration/aus_prospectus/datapoint_keyword.json index a7514cf..2453ff3 100644 --- a/configuration/aus_prospectus/datapoint_keyword.json +++ b/configuration/aus_prospectus/datapoint_keyword.json @@ -1,7 +1,7 @@ { "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]}, - "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs", "Management costs"]}, - "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs"]}, + "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs"]}, + "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "investment fees and costs", "investment fee and costs", "Management costs"]}, "performance_fee": {"english": ["performance fee", "performance fees"]}, "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]}, "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]}, diff --git a/configuration/aus_prospectus/datapoint_reported_name.json b/configuration/aus_prospectus/datapoint_reported_name.json index 020f252..d480be9 100644 --- a/configuration/aus_prospectus/datapoint_reported_name.json +++ b/configuration/aus_prospectus/datapoint_reported_name.json @@ -1,7 +1,7 @@ { "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]}, - "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs", "Management costs"]}, - "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs"]}, + "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost"]}, + "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost"]}, "performance_fee": {"english": ["performance fee", "performance fees"]}, "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]}, "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]}, diff --git a/main.py b/main.py index b7c03aa..401514c 100644 --- a/main.py +++ b/main.py @@ -1042,8 +1042,8 @@ def batch_run_documents( page_filter_ground_truth_file = ( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" ) - re_run_extract_data = True - re_run_mapping_data = True + re_run_extract_data = False + re_run_mapping_data = False force_save_total_data = True calculate_metrics = False @@ -1531,7 +1531,7 @@ if __name__ == "__main__": # "555377021", # "555654388", # ] - special_doc_id_list: list = ["412778803"] + # special_doc_id_list: list = ["471206458"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = (