Fix issue in parsing data via the Vision function.

Blade He 2024-12-11 16:49:04 -06:00
parent d673a99e21
commit 309bb714f6
7 changed files with 220 additions and 71 deletions

View File

@ -398,6 +398,12 @@
"Performancegebühren", "Performancegebühren",
"Performancevergütung", "Performancevergütung",
"Anlageerfolgsprämie", "Anlageerfolgsprämie",
"TER in % (inkl.",
"TER % (inkl.",
"TER in % (exkl.",
"TER % (exkl.",
"TER% (einschließlich",
"TER% (ohne",
"An die Wertentwicklung des Fonds gebundene Gebühren", "An die Wertentwicklung des Fonds gebundene Gebühren",
"Performancegebühr", "Performancegebühr",
"Performance-gebühr", "Performance-gebühr",

View File

@ -37,6 +37,8 @@
"german": [ "german": [
"Mit anteiliger Performance Fee in %", "Mit anteiliger Performance Fee in %",
"TER inkl. Performance-Fee in % **)", "TER inkl. Performance-Fee in % **)",
"TER% (einschließlich Anlageerfolgsprämie)",
"TER % (inkl. Anlageerfolgsprämie)",
"Gesamtgebühren", "Gesamtgebühren",
"Kostenpauschale", "Kostenpauschale",
"Gesamtkostenquote", "Gesamtkostenquote",

View File

@ -171,7 +171,7 @@ class DataExtraction:
         previous_page_datapoints = []
         previous_page_fund_name = None
         for page_num, page_text in self.page_text_dict.items():
-            # if page_num > 640 or page_num < 610:
+            # if page_num != 344:
             #     continue
             if page_num in handled_page_num_list:
                 continue
@ -395,7 +395,7 @@ class DataExtraction:
                                  previous_page_last_fund: str = None) -> dict:
         # If can't find numeric value, e.g. 1.25 or 3,88
         # apply Vision ChatGPT to extract data
-        special_code_regex = r"\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x09|\x0a|\x0b|\x0c|\x0d|\x0e|\x0f|\x10|\x11|\x12|\x13|\x14|\x15|\x16|\x17|\x18|\x19|\x1a|\x1b|\x1c|\x1d|\x1e|\x1f"
+        special_code_regex = r"\x10|\x11|\x12|\x13|\x14|\x15|\x16|\x17|\x18|\x19|\x1a|\x1b|\x1c|\x1d|\x1e|\x1f"
         special_code_all = [code for code in re.findall(special_code_regex, page_text)
                             if code != "\n"]
         page_text_line_count = len(page_text.split("\n"))
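The narrowed pattern drops \x00-\x0f, so tabs and newlines no longer count as "special codes" (the old pattern matched them, which is why the list comprehension had to filter "\n" back out). A quick sketch with a made-up page_text; the character classes are equivalent spellings of the alternation patterns in the diff:

import re

old_regex = r"[\x00-\x1f]"  # matched \t, \n, \r as well
new_regex = r"[\x10-\x1f]"  # only the upper control-character range

page_text = "Fonds A\t1,25 %\nFonds B\x14 3,88 %"
print(re.findall(old_regex, page_text))  # ['\t', '\n', '\x14']
print(re.findall(new_regex, page_text))  # ['\x14']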
@ -470,9 +470,12 @@ class DataExtraction:
             data = json_repair.loads(response)
         except:
             data = {"data": []}
-        data = self.validate_data(extract_data_info=data,
-                                  page_text=page_text,
-                                  previous_page_last_fund=previous_page_last_fund)
+        try:
+            data = self.validate_data(extract_data_info=data,
+                                      page_text=page_text,
+                                      previous_page_last_fund=previous_page_last_fund)
+        except:
+            pass

         data_dict = {"doc_id": self.doc_id}
         data_dict["page_index"] = page_num
@ -574,7 +577,10 @@ class DataExtraction:
             data = json_repair.loads(response)
         except:
             data = {"data": []}
-        data = self.validate_data(data, None, previous_page_last_fund)
+        try:
+            data = self.validate_data(data, None, previous_page_last_fund)
+        except:
+            pass

         data_dict = {"doc_id": self.doc_id}
         data_dict["page_index"] = page_num
@ -620,8 +626,8 @@ class DataExtraction:
         if len(data_list) == 0:
             return extract_data_info
         remove_list = []
-        performance_fee_regex = r"Amount\s+of\s+the\s+performance\s+fees|Performance\s+Fees\s+amounts|Performance\s+fees\s+amounts|Commissioni\s+di\s+performance|Performance\s+Fee\s+"
-        nav_regex = r"based\s+on\s+(the\s+)?NAV|on\s+the\s+Share\s+Class\s+NAV|NAV\s+of\s+performance\s+fee|of\s+the\s+average\s+Net\s+Asset\s+Value|Attivi\s+in\s+gestione|Performance\s+Fee\s+of\s+NAV\s+in"
+        performance_fee_regex = r"Amount\s+of\s+the\s+performance\s+fees|Performance\s+Fees\s+amounts|Performance\s+fees\s+amounts|Commissioni\s+di\s+performance|Performance\s+Fee\s+|Performance\s+fees\s+charged"
+        nav_regex = r"based\s+on\s+(the\s+)?NAV|on\s+the\s+Share\s+Class\s+NAV|NAV\s+of\s+performance\s+fee|of\s+the\s+average\s+Net\s+Asset\s+Value|Attivi\s+in\s+gestione|Performance\s+Fee\s+of\s+NAV\s+in|share\s+class\s+dealing\s+NAV"
         if page_text is not None and len(page_text) > 0:
             performance_fee_search = re.search(performance_fee_regex, page_text)
             nav_search = re.search(nav_regex, page_text)
@ -667,12 +673,18 @@ class DataExtraction:
             for key in keys:
                 if self.datapoint_level_config.get(key, "") == "share_level":
                     if data.get("share name", "") == "":
-                        is_share_name = self.check_fund_name_as_share(fund_name)
-                        if not is_share_name:
-                            remove_list.append(data)
-                            break
-                        else:
-                            data["share name"] = fund_name
+                        include_key_words = False
+                        if key == "ter" and page_text is not None and len(page_text) > 0:
+                            ter_regex = r"TER\s+in\s+\%|TER\s*\%"
+                            ter_search = re.search(ter_regex, page_text)
+                            if ter_search is not None:
+                                include_key_words = True
+                        if not include_key_words:
+                            is_share_name = self.check_fund_name_as_share(fund_name)
+                            if not is_share_name:
+                                remove_list.append(data)
+                                break
+                        data["share name"] = fund_name
                 if data.get(key, "") == "":
                     data.pop(key)
             for remove_data in remove_list:
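The new branch relaxes the share-name requirement on TER-table pages: when the page text matches a "TER %"-style header, a row with an empty share name inherits the fund name instead of being dropped. A simplified, runnable sketch of that decision (it omits the check_fund_name_as_share fallback; inputs are made up):

import re

ter_regex = r"TER\s+in\s+\%|TER\s*\%"  # the pattern added in this commit

def resolve_share_name(fund_name, share_name, page_text):
    if share_name:
        return share_name
    if page_text and re.search(ter_regex, page_text):
        return fund_name  # TER-table page: reuse the fund name as share name
    return None           # elsewhere the row would be marked for removal

print(resolve_share_name("PIANO 400 Fund", "", "TER % (inkl. Anlageerfolgsprämie)"))
# PIANO 400 Fund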
@ -929,7 +941,7 @@ class DataExtraction:
         example_count = 1
         none_value_example_count = 0
         for mul_reported_name in mul_reported_name_list:
-            if datapoint in ["ter", "performance_fee"] and example_count == 3:
+            if datapoint in ["ter", "performance_fee"] and example_count >= 3:
                 break
             value = value_examples[example_count % len(value_examples)]
             answer = {"fund name": fund_example,

View File

@ -216,6 +216,13 @@
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"D/S Strategie ausgewogen\", \"ter\": 1.15, \"performance_fee\": 0}]}", "{\"data\": [{\"fund name\": \"D/S Strategie ausgewogen\", \"ter\": 1.15, \"performance_fee\": 0}]}",
"The performance fees value is TER inkl. Performance-Fee in % **) - TER exkl. Performance-Fee in % **) = 1,15 - 1,15 = 0", "The performance fees value is TER inkl. Performance-Fee in % **) - TER exkl. Performance-Fee in % **) = 1,15 - 1,15 = 0",
"Example 3:",
"-----Example Start-----",
"TER % \n(inkl. \nAnlageerfolgsprämie)\nTER %\n(exkl. \nAnlageerfolgsprämie)\nPIANO 400 Fund\n0,58 %\n0,58 %\n",
"-----Example End-----",
"The output should be:",
"{\"data\": [{\"fund name\": \"PIANO 400 Fund\", \"ter\": 0.58, \"performance_fee\": 0}]}",
"The performance fees value is TER % (inkl. Anlageerfolgsprämie) - TER % (exkl. Anlageerfolgsprämie) = 0,58 - 0,58 = 0",
"Case 2:", "Case 2:",
"If some table is with three value columns: \"TER including performance fees\", \"TER excluding performance fees\", \"Performance fees\", ", "If some table is with three value columns: \"TER including performance fees\", \"TER excluding performance fees\", \"Performance fees\", ",
"The Performance fees value in column: Performance fees, chould be \"-\", because of \"TER including performance fees\" - \"TER excluding performance fees\" = 0, ", "The Performance fees value in column: Performance fees, chould be \"-\", because of \"TER including performance fees\" - \"TER excluding performance fees\" = 0, ",
@ -230,7 +237,8 @@
"As at September 30, 2022, the annualised total expense ratios of \\nthe sub-fund Pictet - Corto Europe Long Short are as follows: \\nCLASS \\nANNUALISED TER INCLUDING \\nPERFORMANCE FEES \\nANNUALISED TER EXCLUDING \\nPERFORMANCE FEES \\nSYNTHETIC TER \\nP EUR \\n1.66% \\n1.66% \\n1.98%", "As at September 30, 2022, the annualised total expense ratios of \\nthe sub-fund Pictet - Corto Europe Long Short are as follows: \\nCLASS \\nANNUALISED TER INCLUDING \\nPERFORMANCE FEES \\nANNUALISED TER EXCLUDING \\nPERFORMANCE FEES \\nSYNTHETIC TER \\nP EUR \\n1.66% \\n1.66% \\n1.98%",
"-----Example End-----", "-----Example End-----",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Pictet Corto Europe Long Short\", \"share name\": \"P EUR\", \"ter\": 1.98, \"performance_fee\": 0}]}" "{\"data\": [{\"fund name\": \"Pictet Corto Europe Long Short\", \"share name\": \"P EUR\", \"ter\": 1.98, \"performance_fee\": 0}]}",
"Attention: Please always output performance fee value including 0 after calculation as (TER including performance fees - TER excluding performance fees), although the value is 0, but it's with actual value."
] ]
}, },
{ {
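The new Example 3 and the closing "Attention" line both hinge on the same arithmetic: performance fee = TER incl. minus TER excl., computed from German-formatted numbers, and 0 is a reportable value. As a runnable sketch (the helper name is illustrative, not part of the repo):

def de_number(value):
    # "0,58 %" -> 0.58: strip the percent sign, comma decimal to dot
    return float(value.replace("%", "").replace(",", ".").strip())

ter_incl = de_number("0,58 %")   # TER % (inkl. Anlageerfolgsprämie)
ter_excl = de_number("0,58 %")   # TER % (exkl. Anlageerfolgsprämie)
print(round(ter_incl - ter_excl, 4))  # 0.0 - still reported; 0 is a real value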
@ -258,6 +266,19 @@
 ]
 }
 ],
+"tor": [
+{
+"title": "TOR with TER and multiple years:",
+"contents": [
+"TOR and TER are in the same table with multiple year columns; please extract the TER and TOR values from the latest year column.",
+"---Example 1 Start---",
+"APPENDIX 1 TOTAL EXPENSE RATIOS AND PORTFOLIO TURNOVER RATIOS\nTotal Expense Ratios are based on the trading 12 months preceding the dates listed below. \nTER \nPTR* \nFor the period/year ended \n2024\n2023\n2024\n2023\nYacktman \nClass A US$ \n1.70%\n1.71%\nTotal Sub-Fund \n(5.94)%\n(5.57)%\nDriehaus Emerging \nClass A US$ \n1.76%\n1.89%\nTotal Sub-Fund \n101.51%\n89.41%",
+"---Example 1 End---",
+"The output should be:",
+"{\"data\": [{\"fund name\": \"Yacktman\", \"share name\": \"Class A US$\", \"ter\": 1.70}, {\"fund name\": \"Yacktman\", \"tor\": -5.94}, {\"fund name\": \"Driehaus Emerging\", \"share name\": \"Class A US$\", \"ter\": 1.76}, {\"fund name\": \"Driehaus Emerging\", \"tor\": 101.51}]}"
+]
+}
+],
 "extreme_complex": [
 {
 "title": "Complex Data Table Structure",

main.py (24 changed lines)
View File

@ -887,11 +887,11 @@ def batch_run_documents():
     calculate_metrics = False
     extract_way = "text"
-    special_doc_id_list = ["435128656"]
+    special_doc_id_list = []
     if len(special_doc_id_list) == 0:
         force_save_total_data = True
     # file_base_name_candidates = ["sample_document_complex", "emea_case_from_word_complex"]
-    file_base_name_candidates = ["sample_document_complex"]
+    file_base_name_candidates = ["sample_documents_12_11"]
     for document_list_file in document_list_files:
         file_base_name = os.path.basename(document_list_file).replace(".txt", "")
         if (file_base_name_candidates is not None and
@ -933,7 +933,27 @@ def batch_run_documents():
     )

+def batch_initial_document():
+    sample_document_list_folder = r'./sample_documents/'
+    document_list_file = os.path.join(sample_document_list_folder, "sample_documents_12_11.txt")
+    with open(document_list_file, "r", encoding="utf-8") as f:
+        doc_id_list = f.readlines()
+    doc_id_list = [doc_id.strip() for doc_id in doc_id_list]
+    pdf_folder = r"/data/emea_ar/pdf/"
+    page_filter_ground_truth_file = (
+        r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
+    )
+    output_extract_data_child_folder = r"/data/emea_ar/output/extract_data/docs/"
+    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
+    for doc_id in tqdm(doc_id_list):
+        logger.info(f"Start to initial document: {doc_id}")
+        emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
+                                          pdf_folder=pdf_folder,
+                                          output_extract_data_folder=output_extract_data_child_folder,
+                                          output_mapping_data_folder=output_mapping_child_folder)
+

 if __name__ == "__main__":
+    # batch_initial_document()
     batch_run_documents()
     # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"

View File

@ -118,7 +118,9 @@ def analyze_json_error():
 def statistics_document(
     pdf_folder: str,
     doc_mapping_file_path: str,
-    sheet_name: str = "all_data",
+    doc_ar_data_file_path: str,
+    mapping_sheet_name: str = "Sheet1",
+    ar_data_sheet_name: str = "doc_ar_data_in_db",
     output_folder: str = "/data/emea_ar/basic_information/English/",
     output_file: str = "doc_mapping_statistics_data.xlsx"
 ):
@ -139,7 +141,7 @@ def statistics_document(
     describe_stat_df_list = []
     # statistics document mapping information
-    doc_mapping_data = pd.read_excel(doc_mapping_file_path, sheet_name=sheet_name)
+    doc_mapping_data = pd.read_excel(doc_mapping_file_path, sheet_name=mapping_sheet_name)
     # statistics doc_mapping_data for counting FundId count based on DocumentId
     logger.info(
@ -260,7 +262,7 @@ def statistics_document(
         docid = os.path.basename(pdf_file).split(".")[0]
         doc = fitz.open(pdf_file)
         page_num = doc.page_count
-        doc_page_num_list.append({"docid": docid, "page_num": page_num})
+        doc_page_num_list.append({"DocumentId": docid, "page_num": page_num})
         doc.close()
     doc_page_num_df = pd.DataFrame(doc_page_num_list)
     # order by page_num in descending order
@ -274,8 +276,8 @@ def statistics_document(
     describe_stat_df = pd.concat(describe_stat_df_list)
     describe_stat_df.reset_index(drop=True, inplace=True)

-    doc_dp_result = get_document_with_all_4_data_points(None, None, doc_mapping_data)
+    doc_ar_data = pd.read_excel(doc_ar_data_file_path, sheet_name=ar_data_sheet_name)
+    doc_dp_result = get_document_with_all_4_data_points(None, None, doc_ar_data)
     doc_dp_data_list = []
     for doc_id in doc_id_list:
         doc_id = int(doc_id)
@ -293,14 +295,25 @@ def statistics_document(
     doc_dp_data_df = doc_dp_data_df.sort_values(by="DocumentId", ascending=True)
     doc_dp_data_df.reset_index(drop=True, inplace=True)

+    # set all of DocumentId in DataFrame objects to be string type
+    doc_page_num_df["DocumentId"] = doc_page_num_df["DocumentId"].astype(str)
+    doc_fund_count["DocumentId"] = doc_fund_count["DocumentId"].astype(str)
+    doc_share_class_count["DocumentId"] = doc_share_class_count["DocumentId"].astype(str)
+    doc_dp_data_df["DocumentId"] = doc_dp_data_df["DocumentId"].astype(str)
+    # merge statistics data for doc_page_num_df, doc_dp_data_df, doc_fund_count, doc_share_class_count based on DocumentId
+    doc_page_num_df = doc_page_num_df.merge(doc_fund_count, on="DocumentId", how="left")
+    doc_page_num_df = doc_page_num_df.merge(doc_share_class_count, on="DocumentId", how="left")
+    doc_page_num_df = doc_page_num_df.merge(doc_dp_data_df, on="DocumentId", how="left")
     # save statistics data to excel
     with pd.ExcelWriter(stat_file) as writer:
-        doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False)
-        doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False)
-        doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
-        doc_share_class_count.to_excel(
-            writer, sheet_name="doc_share_class_count", index=False
-        )
+        doc_page_num_df.to_excel(writer, sheet_name="doc_level_stats", index=False)
+        # doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False)
+        # doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
+        # doc_share_class_count.to_excel(
+        #     writer, sheet_name="doc_share_class_count", index=False
+        # )
         provider_fund_count.to_excel(
             writer, sheet_name="provider_fund_count", index=False
         )
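The astype(str) block above exists because DocumentId arrives as int64 in some frames and as str in others, and pandas will not merge an int64 key against an object key (recent versions raise a ValueError about merging int64 and object columns). Casting every key to str first makes the three left-merges line up; a tiny self-contained demonstration with made-up values:

import pandas as pd

left = pd.DataFrame({"DocumentId": [553242365], "page_num": [120]})
right = pd.DataFrame({"DocumentId": ["553242365"], "fund_count": [8]})

# Align key dtypes before merging, as the diff does.
left["DocumentId"] = left["DocumentId"].astype(str)
right["DocumentId"] = right["DocumentId"].astype(str)
print(left.merge(right, on="DocumentId", how="left"))
#   DocumentId  page_num  fund_count
# 0  553242365       120           8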
@ -315,6 +328,41 @@ def statistics_document(
     )

+def get_document_with_all_4_data_points(folder: str, file_name: str, data: pd.DataFrame):
+    if data is None:
+        file_path = os.path.join(folder, file_name)
+        if os.path.exists(file_path):
+            data = pd.read_excel(file_path, sheet_name="doc_ar_data_in_db")
+        else:
+            logger.error(f"Invalid file path: {file_path}")
+            return
+    # get document id list which noTor is 0
+    noTor_0_doc_id_list = data[data["noTor"] == 0]["DocumentId"].unique().tolist()
+    # get document id list which share_noTer is 0
+    share_noTer_0_doc_id_list = data[data["share_noTer"] == 0]["DocumentId"].unique().tolist()
+    # get document id list which share_noOgc is 0
+    share_noOgc_0_doc_id_list = data[data["share_noOgc"] == 0]["DocumentId"].unique().tolist()
+    # get document id list which share_noPerfFee is 0
+    share_noPerfFee_0_doc_id_list = data[data["share_noPerfFee"] == 0]["DocumentId"].unique().tolist()
+    logger.info(f"noTor_0_doc_id_list: {len(noTor_0_doc_id_list)}")
+    logger.info(f"share_noTer_0_doc_id_list: {len(share_noTer_0_doc_id_list)}")
+    logger.info(f"share_noOgc_0_doc_id_list: {len(share_noOgc_0_doc_id_list)}")
+    logger.info(f"share_noPerfFee_0_doc_id_list: {len(share_noPerfFee_0_doc_id_list)}")
+    all_4_data_points_doc_id_list = list(set(noTor_0_doc_id_list) & set(share_noTer_0_doc_id_list) & set(share_noOgc_0_doc_id_list) & set(share_noPerfFee_0_doc_id_list))
+    logger.info(f"all_4_data_points_doc_id_list: {len(all_4_data_points_doc_id_list)}")
+    result = {"tor": noTor_0_doc_id_list,
+              "ter": share_noTer_0_doc_id_list,
+              "ogc": share_noOgc_0_doc_id_list,
+              "perf_fee": share_noPerfFee_0_doc_id_list}
+    return result
+

 def statistics_provider_mapping(provider_mapping_data_file: str, output_folder: str):
     if (
         provider_mapping_data_file is None
@ -1312,40 +1360,6 @@ def calc_typical_doc_metrics_v1():
-def get_document_with_all_4_data_points(folder: str, file_name: str, data: pd.DataFrame):
-    if data is None:
-        file_path = os.path.join(folder, file_name)
-        if os.path.exists(file_path):
-            data = pd.read_excel(file_path, sheet_name="doc_ar_data_in_db")
-        else:
-            logger.error(f"Invalid file path: {file_path}")
-            return
-    # get document id list which noTor is 0
-    noTor_0_doc_id_list = data[data["noTor"] == 0]["DocumentId"].unique().tolist()
-    # get document id list which share_noTer is 0
-    share_noTer_0_doc_id_list = data[data["share_noTer"] == 0]["DocumentId"].unique().tolist()
-    # get document id list which share_noOgc is 0
-    share_noOgc_0_doc_id_list = data[data["share_noOgc"] == 0]["DocumentId"].unique().tolist()
-    # get document id list which share_noPerfFee is 0
-    share_noPerfFee_0_doc_id_list = data[data["share_noPerfFee"] == 0]["DocumentId"].unique().tolist()
-    logger.info(f"noTor_0_doc_id_list: {len(noTor_0_doc_id_list)}")
-    logger.info(f"share_noTer_0_doc_id_list: {len(share_noTer_0_doc_id_list)}")
-    logger.info(f"share_noOgc_0_doc_id_list: {len(share_noOgc_0_doc_id_list)}")
-    logger.info(f"share_noPerfFee_0_doc_id_list: {len(share_noPerfFee_0_doc_id_list)}")
-    all_4_data_points_doc_id_list = list(set(noTor_0_doc_id_list) & set(share_noTer_0_doc_id_list) & set(share_noOgc_0_doc_id_list) & set(share_noPerfFee_0_doc_id_list))
-    logger.info(f"all_4_data_points_doc_id_list: {len(all_4_data_points_doc_id_list)}")
-    result = {"tor": noTor_0_doc_id_list,
-              "ter": share_noTer_0_doc_id_list,
-              "ogc": share_noOgc_0_doc_id_list,
-              "perf_fee": share_noPerfFee_0_doc_id_list}
-    return result

 if __name__ == "__main__":
     folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
     file_name = "doc_ar_data_for_emea_11_06.xlsx"
@ -1356,7 +1370,7 @@ if __name__ == "__main__":
     doc_provider_file_path = (
         r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
     )
-    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document_mapping.xlsx"
+    doc_ar_data_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document_mapping.xlsx"
     provider_mapping_data_file = (
         r"/data/emea_ar/basic_information/English/provider_mapping_data.xlsx"
     )
@ -1392,13 +1406,16 @@ if __name__ == "__main__":
     #     pdf_folder)

-    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/doc_ar_data_for_emea_sample_documents.xlsx"
-    output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/"
+    doc_ar_data_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_ar_data_12_11.xlsx"
+    doc_mapping_data_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx"
+    output_data_folder = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/"
     statistics_document(pdf_folder=pdf_folder,
-                        doc_mapping_file_path=doc_mapping_file_path,
-                        sheet_name="doc_ar_data_in_db",
+                        doc_mapping_file_path=doc_mapping_data_file_path,
+                        doc_ar_data_file_path=doc_ar_data_file_path,
+                        mapping_sheet_name="Sheet1",
+                        ar_data_sheet_name="doc_ar_data_in_db",
                         output_folder=output_data_folder,
-                        output_file="doc_ar_data_sample_documents_statistics.xlsx")
+                        output_file="doc_ar_data_sample_documents_12_11_statistics.xlsx")
     # get_document_extracted_share_diff_by_db()
     # statistics_provider_mapping(
     #     provider_mapping_data_file=provider_mapping_data_file,

View File

@ -0,0 +1,71 @@
553242365
553242368
553242344
553242337
553242341
553242327
553242328
553242318
553242315
553242316
553242317
553242310
553242311
553242309
553242308
553241057
553241054
553241040
553241030
553240841
553240837
553240623
553240621
553240619
553240614
553240613
553240611
553240612
553240607
553240604
553224104
553165689
553081760
553081761
553081718
553078775
553078771
553078766
553078762
553077970
553066906
553066859
553043543
553035628
553035441
553035433
553035423
553035418
553034857
553407290
553406874
553317485
553315608
553250296
553250271
553250232
553242443
553242414
553242416
553242409
553242411
553242406
553242408
553242396
553242392
553242384
553242385
553242386
553242374
553242362
553242363