From 309bb714f675d4f8c4cda9df7d5e1b6cb87a6a9a Mon Sep 17 00:00:00 2001
From: Blade He
Date: Wed, 11 Dec 2024 16:49:04 -0600
Subject: [PATCH] Fix issue when parsing data via the Vision function.

---
 configuration/datapoint_keyword.json        |   6 +
 configuration/datapoint_reported_name.json  |   2 +
 core/data_extraction.py                     |  42 +++---
 .../data_extraction_prompts_config.json     |  23 +++-
 main.py                                     |  26 +++-
 prepare_data.py                             | 121 ++++++++++--------
 sample_documents/sample_documents_12_11.txt |  71 ++++++++++
 7 files changed, 220 insertions(+), 71 deletions(-)
 create mode 100644 sample_documents/sample_documents_12_11.txt

diff --git a/configuration/datapoint_keyword.json b/configuration/datapoint_keyword.json
index c71919e..b765dc9 100644
--- a/configuration/datapoint_keyword.json
+++ b/configuration/datapoint_keyword.json
@@ -398,6 +398,12 @@
       "Performancegebühren",
       "Performancevergütung",
       "Anlageerfolgsprämie",
+      "TER in % (inkl.",
+      "TER % (inkl.",
+      "TER in % (exkl.",
+      "TER % (exkl.",
+      "TER% (einschließlich",
+      "TER% (ohne",
       "An die Wertentwicklung des Fonds gebundene Gebühren",
       "Performancegebühr",
       "Performance-gebühr",
diff --git a/configuration/datapoint_reported_name.json b/configuration/datapoint_reported_name.json
index e1b0342..107b15b 100644
--- a/configuration/datapoint_reported_name.json
+++ b/configuration/datapoint_reported_name.json
@@ -37,6 +37,8 @@
     "german": [
       "Mit anteiliger Performance Fee in %",
       "TER inkl. Performance-Fee in % **)",
+      "TER% (einschließlich Anlageerfolgsprämie)",
+      "TER % (inkl. Anlageerfolgsprämie)",
       "Gesamtgebühren",
       "Kostenpauschale",
       "Gesamtkostenquote",
diff --git a/core/data_extraction.py b/core/data_extraction.py
index a8a9cd0..c9f652f 100644
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@@ -171,7 +171,7 @@ class DataExtraction:
         previous_page_datapoints = []
         previous_page_fund_name = None
         for page_num, page_text in self.page_text_dict.items():
-            # if page_num > 640 or page_num < 610:
+            # if page_num != 344:
             #     continue
             if page_num in handled_page_num_list:
                 continue
@@ -395,7 +395,7 @@ class DataExtraction:
                              previous_page_last_fund: str = None) -> dict:
        # If can't find numeric value, e.g. 
1.25 or 3,88
        # apply Vision ChatGPT to extract data
-        special_code_regex = r"\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x09|\x0a|\x0b|\x0c|\x0d|\x0e|\x0f|\x10|\x11|\x12|\x13|\x14|\x15|\x16|\x17|\x18|\x19|\x1a|\x1b|\x1c|\x1d|\x1e|\x1f"
+        special_code_regex = r"[\x10-\x1f]"  # control chars left behind by glyph-encoded PDF text
         special_code_all = [code for code in re.findall(special_code_regex, page_text)
                             if code != "\n"]
        page_text_line_count = len(page_text.split("\n"))
@@ -470,9 +470,12 @@
             data = json_repair.loads(response)
         except:
             data = {"data": []}
-        data = self.validate_data(extract_data_info=data,
-                                  page_text=page_text,
-                                  previous_page_last_fund=previous_page_last_fund)
+        try:
+            data = self.validate_data(extract_data_info=data,
+                                      page_text=page_text,
+                                      previous_page_last_fund=previous_page_last_fund)
+        except Exception:
+            pass  # best-effort validation; keep the unvalidated data

         data_dict = {"doc_id": self.doc_id}
         data_dict["page_index"] = page_num
@@ -574,7 +577,10 @@
             data = json_repair.loads(response)
         except:
             data = {"data": []}
-        data = self.validate_data(data, None, previous_page_last_fund)
+        try:
+            data = self.validate_data(data, None, previous_page_last_fund)
+        except Exception:
+            pass  # best-effort validation; keep the unvalidated data

         data_dict = {"doc_id": self.doc_id}
         data_dict["page_index"] = page_num
@@ -620,8 +626,8 @@
         if len(data_list) == 0:
             return extract_data_info
         remove_list = []
-        performance_fee_regex = r"Amount\s+of\s+the\s+performance\s+fees|Performance\s+Fees\s+amounts|Performance\s+fees\s+amounts|Commissioni\s+di\s+performance|Performance\s+Fee\s+"
-        nav_regex = r"based\s+on\s+(the\s+)?NAV|on\s+the\s+Share\s+Class\s+NAV|NAV\s+of\s+performance\s+fee|of\s+the\s+average\s+Net\s+Asset\s+Value|Attivi\s+in\s+gestione|Performance\s+Fee\s+of\s+NAV\s+in"
+        performance_fee_regex = r"Amount\s+of\s+the\s+performance\s+fees|Performance\s+Fees\s+amounts|Performance\s+fees\s+amounts|Commissioni\s+di\s+performance|Performance\s+Fee\s+|Performance\s+fees\s+charged"
+        nav_regex = r"based\s+on\s+(the\s+)?NAV|on\s+the\s+Share\s+Class\s+NAV|NAV\s+of\s+performance\s+fee|of\s+the\s+average\s+Net\s+Asset\s+Value|Attivi\s+in\s+gestione|Performance\s+Fee\s+of\s+NAV\s+in|share\s+class\s+dealing\s+NAV"
         if page_text is not None and len(page_text) > 0:
             performance_fee_search = re.search(performance_fee_regex, page_text)
             nav_search = re.search(nav_regex, page_text)
@@ -667,12 +673,18 @@
             for key in keys:
                 if self.datapoint_level_config.get(key, "") == "share_level":
                     if data.get("share name", "") == "":
-                        is_share_name = self.check_fund_name_as_share(fund_name)
-                        if not is_share_name:
-                            remove_list.append(data)
-                            break
-                        else:
-                            data["share name"] = fund_name
+                        include_key_words = False
+                        if key == "ter" and page_text is not None and len(page_text) > 0:
+                            ter_regex = r"TER\s+in\s+\%|TER\s*\%"
+                            ter_search = re.search(ter_regex, page_text)
+                            if ter_search is not None:
+                                include_key_words = True
+                        if not include_key_words:
+                            is_share_name = self.check_fund_name_as_share(fund_name)
+                            if not is_share_name:
+                                remove_list.append(data)
+                                break
+                        data["share name"] = fund_name
                 if data.get(key, "") == "":
                     data.pop(key)
         for remove_data in remove_list:
@@ -929,7 +941,7 @@
         example_count = 1
         none_value_example_count = 0
         for mul_reported_name in mul_reported_name_list:
-            if datapoint in ["ter", "performance_fee"] and example_count == 3:
+            if datapoint in ["ter", "performance_fee"] and example_count >= 3:
                 break
             value = value_examples[example_count % len(value_examples)]
             answer = {"fund name": fund_example, 
diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json
index f56caab..b7d37fb 100644
--- a/instructions/data_extraction_prompts_config.json
+++ b/instructions/data_extraction_prompts_config.json
@@ -216,6 +216,13 @@
         "The output should be:",
         "{\"data\": [{\"fund name\": \"D/S Strategie ausgewogen\", \"ter\": 1.15, \"performance_fee\": 0}]}",
         "The performance fees value is TER inkl. Performance-Fee in % **) - TER exkl. Performance-Fee in % **) = 1,15 - 1,15 = 0",
+        "Example 3:",
+        "-----Example Start-----",
+        "TER % \n(inkl. \nAnlageerfolgsprämie)\nTER %\n(exkl. \nAnlageerfolgsprämie)\nPIANO 400 Fund\n0,58 %\n0,58 %\n",
+        "-----Example End-----",
+        "The output should be:",
+        "{\"data\": [{\"fund name\": \"PIANO 400 Fund\", \"ter\": 0.58, \"performance_fee\": 0}]}",
+        "The performance fees value is TER % (inkl. Anlageerfolgsprämie) - TER % (exkl. Anlageerfolgsprämie) = 0,58 - 0,58 = 0",
         "Case 2:",
         "If a table has three value columns: \"TER including performance fees\", \"TER excluding performance fees\", \"Performance fees\", ",
         "the value in the \"Performance fees\" column could be \"-\", because \"TER including performance fees\" - \"TER excluding performance fees\" = 0, ",
@@ -230,7 +237,8 @@
         "As at September 30, 2022, the annualised total expense ratios of \\nthe sub-fund Pictet - Corto Europe Long Short are as follows: \\nCLASS \\nANNUALISED TER INCLUDING \\nPERFORMANCE FEES \\nANNUALISED TER EXCLUDING \\nPERFORMANCE FEES \\nSYNTHETIC TER \\nP EUR \\n1.66% \\n1.66% \\n1.98%",
         "-----Example End-----",
         "The output should be:",
-        "{\"data\": [{\"fund name\": \"Pictet Corto Europe Long Short\", \"share name\": \"P EUR\", \"ter\": 1.98, \"performance_fee\": 0}]}"
+        "{\"data\": [{\"fund name\": \"Pictet Corto Europe Long Short\", \"share name\": \"P EUR\", \"ter\": 1.98, \"performance_fee\": 0}]}",
+        "Attention: Always output the performance fee value calculated as (TER including performance fees - TER excluding performance fees), even when the result is 0; a computed 0 is an actual value, not a missing one."
       ]
     },
     {
@@ -258,6 +266,19 @@
       ]
     }
   ],
+  "tor": [
+    {
+      "title": "TOR with TER and multiple years:",
+      "contents": [
+        "When TOR and TER appear in the same table across multiple years, extract the TER and TOR values from the latest year's column.",
+        "---Example 1 Start---",
+        "APPENDIX 1 – TOTAL EXPENSE RATIOS AND PORTFOLIO TURNOVER RATIOS\nTotal Expense Ratios are based on the trading 12 months preceding the dates listed below. 
\nTER \nPTR* \nFor the period/year ended \n2024\n2023\n2024\n2023\nYacktman \nClass A US$ \n1.70%\n1.71%\nTotal Sub-Fund \n(5.94)%\n(5.57)%\nDriehaus Emerging \nClass A US$ \n1.76%\n1.89%\nTotal Sub-Fund \n101.51%\n89.41%",
+        "---Example 1 End---",
+        "The output should be:",
+        "{\"data\": [{\"fund name\": \"Yacktman\", \"share name\": \"Class A US$\", \"ter\": 1.70}, {\"fund name\": \"Yacktman\", \"tor\": -5.94}, {\"fund name\": \"Driehaus Emerging\", \"share name\": \"Class A US$\", \"ter\": 1.76}, {\"fund name\": \"Driehaus Emerging\", \"tor\": 101.51}]}"
+      ]
+    }
+  ],
   "extreme_complex": [
     {
       "title": "Complex Data Table Structure",
diff --git a/main.py b/main.py
index f3862cb..7eea57b 100644
--- a/main.py
+++ b/main.py
@@ -887,11 +887,11 @@ def batch_run_documents():
     calculate_metrics = False
     extract_way = "text"

-    special_doc_id_list = ["435128656"]
+    special_doc_id_list = []
     if len(special_doc_id_list) == 0:
         force_save_total_data = True
     # file_base_name_candidates = ["sample_document_complex", "emea_case_from_word_complex"]
-    file_base_name_candidates = ["sample_document_complex"]
+    file_base_name_candidates = ["sample_documents_12_11"]
     for document_list_file in document_list_files:
         file_base_name = os.path.basename(document_list_file).replace(".txt", "")
         if (file_base_name_candidates is not None and
@@ -931,9 +931,29 @@ def batch_run_documents():
                 force_save_total_data=force_save_total_data,
                 calculate_metrics=calculate_metrics,
             )
-
+
+
+def batch_initial_document():
+    sample_document_list_folder = r'./sample_documents/'
+    document_list_file = os.path.join(sample_document_list_folder, "sample_documents_12_11.txt")
+    with open(document_list_file, "r", encoding="utf-8") as f:
+        doc_id_list = f.readlines()
+    doc_id_list = [doc_id.strip() for doc_id in doc_id_list]
+    pdf_folder = r"/data/emea_ar/pdf/"
+    page_filter_ground_truth_file = (
+        r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
+    )
+    output_extract_data_child_folder = r"/data/emea_ar/output/extract_data/docs/"
+    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
+    for doc_id in tqdm(doc_id_list):
+        logger.info(f"Start to initialize document: {doc_id}")
+        emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
+                                          pdf_folder=pdf_folder,
+                                          output_extract_data_folder=output_extract_data_child_folder,
+                                          output_mapping_data_folder=output_mapping_child_folder)
+

 if __name__ == "__main__":
+    # batch_initial_document()
     batch_run_documents()

     # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
diff --git a/prepare_data.py b/prepare_data.py
index 43c1102..f80990c 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -118,7 +118,9 @@ def analyze_json_error():
 def statistics_document(
     pdf_folder: str,
     doc_mapping_file_path: str,
-    sheet_name: str = "all_data",
+    doc_ar_data_file_path: str,
+    mapping_sheet_name: str = "Sheet1",
+    ar_data_sheet_name: str = "doc_ar_data_in_db",
     output_folder: str = "/data/emea_ar/basic_information/English/",
     output_file: str = "doc_mapping_statistics_data.xlsx"
 ):
@@ -139,7 +141,7 @@ def statistics_document(
     describe_stat_df_list = []

     # statistics document mapping information
-    doc_mapping_data = pd.read_excel(doc_mapping_file_path, sheet_name=sheet_name)
+    doc_mapping_data = pd.read_excel(doc_mapping_file_path, sheet_name=mapping_sheet_name)

     # statistics doc_mapping_data for counting FundId count based on DocumentId
     logger.info(
@@ -260,7 +262,7 @@ def statistics_document(
         docid = 
os.path.basename(pdf_file).split(".")[0]
         doc = fitz.open(pdf_file)
         page_num = doc.page_count
-        doc_page_num_list.append({"docid": docid, "page_num": page_num})
+        doc_page_num_list.append({"DocumentId": docid, "page_num": page_num})
         doc.close()
     doc_page_num_df = pd.DataFrame(doc_page_num_list)
     # order by page_num in descending order
@@ -274,8 +276,8 @@ def statistics_document(
     describe_stat_df = pd.concat(describe_stat_df_list)
     describe_stat_df.reset_index(drop=True, inplace=True)
-
-    doc_dp_result = get_document_with_all_4_data_points(None, None, doc_mapping_data)
+    doc_ar_data = pd.read_excel(doc_ar_data_file_path, sheet_name=ar_data_sheet_name)
+    doc_dp_result = get_document_with_all_4_data_points(None, None, doc_ar_data)
     doc_dp_data_list = []
     for doc_id in doc_id_list:
         doc_id = int(doc_id)
@@ -292,15 +294,26 @@ def statistics_document(
     doc_dp_data_df = pd.DataFrame(doc_dp_data_list)
     doc_dp_data_df = doc_dp_data_df.sort_values(by="DocumentId", ascending=True)
     doc_dp_data_df.reset_index(drop=True, inplace=True)
-
+
+    # cast DocumentId to string in every DataFrame so the merges align on dtype
+    doc_page_num_df["DocumentId"] = doc_page_num_df["DocumentId"].astype(str)
+    doc_fund_count["DocumentId"] = doc_fund_count["DocumentId"].astype(str)
+    doc_share_class_count["DocumentId"] = doc_share_class_count["DocumentId"].astype(str)
+    doc_dp_data_df["DocumentId"] = doc_dp_data_df["DocumentId"].astype(str)
+
+    # merge doc_fund_count, doc_share_class_count and doc_dp_data_df onto doc_page_num_df by DocumentId
+    doc_page_num_df = doc_page_num_df.merge(doc_fund_count, on="DocumentId", how="left")
+    doc_page_num_df = doc_page_num_df.merge(doc_share_class_count, on="DocumentId", how="left")
+    doc_page_num_df = doc_page_num_df.merge(doc_dp_data_df, on="DocumentId", how="left")
+
     # save statistics data to excel
     with pd.ExcelWriter(stat_file) as writer:
-        doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False)
-        doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False)
-        doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
-        doc_share_class_count.to_excel(
-            writer, sheet_name="doc_share_class_count", index=False
-        )
+        doc_page_num_df.to_excel(writer, sheet_name="doc_level_stats", index=False)
+        # doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False)
+        # doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
+        # doc_share_class_count.to_excel(
+        #     writer, sheet_name="doc_share_class_count", index=False
+        # )
         provider_fund_count.to_excel(
             writer, sheet_name="provider_fund_count", index=False
         )
@@ -315,6 +328,41 @@ def statistics_document(
     )


+def get_document_with_all_4_data_points(folder: str, file_name: str, data: pd.DataFrame):
+    if data is None:
+        file_path = os.path.join(folder, file_name)
+        if os.path.exists(file_path):
+            data = pd.read_excel(file_path, sheet_name="doc_ar_data_in_db")
+        else:
+            logger.error(f"Invalid file path: {file_path}")
+            return
+    # document ids where noTor is 0
+    noTor_0_doc_id_list = data[data["noTor"] == 0]["DocumentId"].unique().tolist()
+
+    # document ids where share_noTer is 0
+    share_noTer_0_doc_id_list = data[data["share_noTer"] == 0]["DocumentId"].unique().tolist()
+
+    # document ids where share_noOgc is 0
+    share_noOgc_0_doc_id_list = data[data["share_noOgc"] == 0]["DocumentId"].unique().tolist()
+
+    # document ids where share_noPerfFee is 0
+    share_noPerfFee_0_doc_id_list = data[data["share_noPerfFee"] == 0]["DocumentId"].unique().tolist()
+
+    
logger.info(f"noTor_0_doc_id_list: {len(noTor_0_doc_id_list)}") + logger.info(f"share_noTer_0_doc_id_list: {len(share_noTer_0_doc_id_list)}") + logger.info(f"share_noOgc_0_doc_id_list: {len(share_noOgc_0_doc_id_list)}") + logger.info(f"share_noPerfFee_0_doc_id_list: {len(share_noPerfFee_0_doc_id_list)}") + + all_4_data_points_doc_id_list = list(set(noTor_0_doc_id_list) & set(share_noTer_0_doc_id_list) & set(share_noOgc_0_doc_id_list) & set(share_noPerfFee_0_doc_id_list)) + + logger.info(f"all_4_data_points_doc_id_list: {len(all_4_data_points_doc_id_list)}") + result = {"tor": noTor_0_doc_id_list, + "ter": share_noTer_0_doc_id_list, + "ogc": share_noOgc_0_doc_id_list, + "perf_fee": share_noPerfFee_0_doc_id_list} + return result + + def statistics_provider_mapping(provider_mapping_data_file: str, output_folder: str): if ( provider_mapping_data_file is None @@ -1312,40 +1360,6 @@ def calc_typical_doc_metrics_v1(): -def get_document_with_all_4_data_points(folder: str, file_name: str, data: pd.DataFrame): - if data is None: - file_path = os.path.join(folder, file_name) - if os.path.exists(file_path): - data = pd.read_excel(file_path, sheet_name="doc_ar_data_in_db") - else: - logger.error(f"Invalid file path: {file_path}") - return - # get document id list which noTor is 0 - noTor_0_doc_id_list = data[data["noTor"] == 0]["DocumentId"].unique().tolist() - - # get document id list which share_noTer is 0 - share_noTer_0_doc_id_list = data[data["share_noTer"] == 0]["DocumentId"].unique().tolist() - - # get document id list which share_noOgc is 0 - share_noOgc_0_doc_id_list = data[data["share_noOgc"] == 0]["DocumentId"].unique().tolist() - - # get document id list which share_noPerfFee is 0 - share_noPerfFee_0_doc_id_list = data[data["share_noPerfFee"] == 0]["DocumentId"].unique().tolist() - - logger.info(f"noTor_0_doc_id_list: {len(noTor_0_doc_id_list)}") - logger.info(f"share_noTer_0_doc_id_list: {len(share_noTer_0_doc_id_list)}") - logger.info(f"share_noOgc_0_doc_id_list: {len(share_noOgc_0_doc_id_list)}") - logger.info(f"share_noPerfFee_0_doc_id_list: {len(share_noPerfFee_0_doc_id_list)}") - - all_4_data_points_doc_id_list = list(set(noTor_0_doc_id_list) & set(share_noTer_0_doc_id_list) & set(share_noOgc_0_doc_id_list) & set(share_noPerfFee_0_doc_id_list)) - - logger.info(f"all_4_data_points_doc_id_list: {len(all_4_data_points_doc_id_list)}") - result = {"tor": noTor_0_doc_id_list, - "ter": share_noTer_0_doc_id_list, - "ogc": share_noOgc_0_doc_id_list, - "perf_fee": share_noPerfFee_0_doc_id_list} - return result - if __name__ == "__main__": folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/" file_name = "doc_ar_data_for_emea_11_06.xlsx" @@ -1356,7 +1370,7 @@ if __name__ == "__main__": doc_provider_file_path = ( r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx" ) - doc_mapping_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document_mapping.xlsx" + doc_ar_data_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document_mapping.xlsx" provider_mapping_data_file = ( r"/data/emea_ar/basic_information/English/provider_mapping_data.xlsx" ) @@ -1392,13 +1406,16 @@ if __name__ == "__main__": # pdf_folder) - doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/doc_ar_data_for_emea_sample_documents.xlsx" - output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/" + doc_ar_data_file_path = 
r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_ar_data_12_11.xlsx" + doc_mapping_data_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx" + output_data_folder = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/" statistics_document(pdf_folder=pdf_folder, - doc_mapping_file_path=doc_mapping_file_path, - sheet_name="doc_ar_data_in_db", + doc_mapping_file_path=doc_mapping_data_file_path, + doc_ar_data_file_path=doc_ar_data_file_path, + mapping_sheet_name="Sheet1", + ar_data_sheet_name="doc_ar_data_in_db", output_folder=output_data_folder, - output_file="doc_ar_data_sample_documents_statistics.xlsx") + output_file="doc_ar_data_sample_documents_12_11_statistics.xlsx") # get_document_extracted_share_diff_by_db() # statistics_provider_mapping( # provider_mapping_data_file=provider_mapping_data_file, diff --git a/sample_documents/sample_documents_12_11.txt b/sample_documents/sample_documents_12_11.txt new file mode 100644 index 0000000..f68ec55 --- /dev/null +++ b/sample_documents/sample_documents_12_11.txt @@ -0,0 +1,71 @@ +553242365 +553242368 +553242344 +553242337 +553242341 +553242327 +553242328 +553242318 +553242315 +553242316 +553242317 +553242310 +553242311 +553242309 +553242308 +553241057 +553241054 +553241040 +553241030 +553240841 +553240837 +553240623 +553240621 +553240619 +553240614 +553240613 +553240611 +553240612 +553240607 +553240604 +553224104 +553165689 +553081760 +553081761 +553081718 +553078775 +553078771 +553078766 +553078762 +553077970 +553066906 +553066859 +553043543 +553035628 +553035441 +553035433 +553035423 +553035418 +553034857 +553407290 +553406874 +553317485 +553315608 +553250296 +553250271 +553250232 +553242443 +553242414 +553242416 +553242409 +553242411 +553242406 +553242408 +553242396 +553242392 +553242384 +553242385 +553242386 +553242374 +553242362 +553242363 \ No newline at end of file