From 309bb714f675d4f8c4cda9df7d5e1b6cb87a6a9a Mon Sep 17 00:00:00 2001
From: Blade He
Date: Wed, 11 Dec 2024 16:49:04 -0600
Subject: [PATCH] Fix issue when parsing data via the Vision function.

---
 configuration/datapoint_keyword.json        |   6 +
 configuration/datapoint_reported_name.json  |   2 +
 core/data_extraction.py                     |  42 +++---
 .../data_extraction_prompts_config.json     |  23 +++-
 main.py                                     |  26 +++-
 prepare_data.py                             | 121 ++++++++++--------
 sample_documents/sample_documents_12_11.txt |  71 ++++++++++
 7 files changed, 220 insertions(+), 71 deletions(-)
 create mode 100644 sample_documents/sample_documents_12_11.txt

diff --git a/configuration/datapoint_keyword.json b/configuration/datapoint_keyword.json
index c71919e..b765dc9 100644
--- a/configuration/datapoint_keyword.json
+++ b/configuration/datapoint_keyword.json
@@ -398,6 +398,12 @@
       "Performancegebühren",
       "Performancevergütung",
       "Anlageerfolgsprämie",
+      "TER in % (inkl.",
+      "TER % (inkl.",
+      "TER in % (exkl.",
+      "TER % (exkl.",
+      "TER% (einschließlich",
+      "TER% (ohne",
       "An die Wertentwicklung des Fonds gebundene Gebühren",
       "Performancegebühr",
       "Performance-gebühr",
diff --git a/configuration/datapoint_reported_name.json b/configuration/datapoint_reported_name.json
index e1b0342..107b15b 100644
--- a/configuration/datapoint_reported_name.json
+++ b/configuration/datapoint_reported_name.json
@@ -37,6 +37,8 @@
     "german": [
       "Mit anteiliger Performance Fee in %",
       "TER inkl. Performance-Fee in % **)",
+      "TER% (einschließlich Anlageerfolgsprämie)",
+      "TER % (inkl. Anlageerfolgsprämie)",
       "Gesamtgebühren",
       "Kostenpauschale",
       "Gesamtkostenquote",
diff --git a/core/data_extraction.py b/core/data_extraction.py
index a8a9cd0..c9f652f 100644
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@@ -171,7 +171,7 @@ class DataExtraction:
         previous_page_datapoints = []
         previous_page_fund_name = None
         for page_num, page_text in self.page_text_dict.items():
-            # if page_num > 640 or page_num < 610:
+            # if page_num != 344:
             #     continue
             if page_num in handled_page_num_list:
                 continue
@@ -395,7 +395,7 @@ class DataExtraction:
                              previous_page_last_fund: str = None) -> dict:
        # If can't find numeric value, e.g. 
1.25 or 3,88
        # apply Vision ChatGPT to extract data
-        special_code_regex = r"\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x09|\x0a|\x0b|\x0c|\x0d|\x0e|\x0f|\x10|\x11|\x12|\x13|\x14|\x15|\x16|\x17|\x18|\x19|\x1a|\x1b|\x1c|\x1d|\x1e|\x1f"
+        special_code_regex = r"[\x10-\x1f]"  # control chars left behind by glyph-encoded PDF text
         special_code_all = [code for code in re.findall(special_code_regex, page_text)
                             if code != "\n"]
        page_text_line_count = len(page_text.split("\n"))
@@ -470,9 +470,12 @@
             data = json_repair.loads(response)
         except:
             data = {"data": []}
-        data = self.validate_data(extract_data_info=data,
-                                  page_text=page_text,
-                                  previous_page_last_fund=previous_page_last_fund)
+        try:
+            data = self.validate_data(extract_data_info=data,
+                                      page_text=page_text,
+                                      previous_page_last_fund=previous_page_last_fund)
+        except Exception:
+            pass  # best-effort validation; keep the unvalidated data

         data_dict = {"doc_id": self.doc_id}
         data_dict["page_index"] = page_num
@@ -574,7 +577,10 @@
             data = json_repair.loads(response)
         except:
             data = {"data": []}
-        data = self.validate_data(data, None, previous_page_last_fund)
+        try:
+            data = self.validate_data(data, None, previous_page_last_fund)
+        except Exception:
+            pass  # best-effort validation; keep the unvalidated data

         data_dict = {"doc_id": self.doc_id}
         data_dict["page_index"] = page_num
@@ -620,8 +626,8 @@
         if len(data_list) == 0:
             return extract_data_info
         remove_list = []
-        performance_fee_regex = r"Amount\s+of\s+the\s+performance\s+fees|Performance\s+Fees\s+amounts|Performance\s+fees\s+amounts|Commissioni\s+di\s+performance|Performance\s+Fee\s+"
-        nav_regex = r"based\s+on\s+(the\s+)?NAV|on\s+the\s+Share\s+Class\s+NAV|NAV\s+of\s+performance\s+fee|of\s+the\s+average\s+Net\s+Asset\s+Value|Attivi\s+in\s+gestione|Performance\s+Fee\s+of\s+NAV\s+in"
+        performance_fee_regex = r"Amount\s+of\s+the\s+performance\s+fees|Performance\s+Fees\s+amounts|Performance\s+fees\s+amounts|Commissioni\s+di\s+performance|Performance\s+Fee\s+|Performance\s+fees\s+charged"
+        nav_regex = r"based\s+on\s+(the\s+)?NAV|on\s+the\s+Share\s+Class\s+NAV|NAV\s+of\s+performance\s+fee|of\s+the\s+average\s+Net\s+Asset\s+Value|Attivi\s+in\s+gestione|Performance\s+Fee\s+of\s+NAV\s+in|share\s+class\s+dealing\s+NAV"
         if page_text is not None and len(page_text) > 0:
             performance_fee_search = re.search(performance_fee_regex, page_text)
             nav_search = re.search(nav_regex, page_text)
@@ -667,12 +673,18 @@
             for key in keys:
                 if self.datapoint_level_config.get(key, "") == "share_level":
                     if data.get("share name", "") == "":
-                        is_share_name = self.check_fund_name_as_share(fund_name)
-                        if not is_share_name:
-                            remove_list.append(data)
-                            break
-                        else:
-                            data["share name"] = fund_name
+                        include_key_words = False
+                        if key == "ter" and page_text is not None and len(page_text) > 0:
+                            ter_regex = r"TER\s+in\s+\%|TER\s*\%"
+                            ter_search = re.search(ter_regex, page_text)
+                            if ter_search is not None:
+                                include_key_words = True
+                        if not include_key_words:
+                            is_share_name = self.check_fund_name_as_share(fund_name)
+                            if not is_share_name:
+                                remove_list.append(data)
+                                break
+                        data["share name"] = fund_name
                 if data.get(key, "") == "":
                     data.pop(key)
         for remove_data in remove_list:
@@ -929,7 +941,7 @@
         example_count = 1
         none_value_example_count = 0
         for mul_reported_name in mul_reported_name_list:
-            if datapoint in ["ter", "performance_fee"] and example_count == 3:
+            if datapoint in ["ter", "performance_fee"] and example_count >= 3:
                 break
             value = value_examples[example_count % len(value_examples)]
             answer = {"fund name": fund_example, 
diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json
index f56caab..b7d37fb 100644
--- a/instructions/data_extraction_prompts_config.json
+++ b/instructions/data_extraction_prompts_config.json
@@ -216,6 +216,13 @@
         "The output should be:",
         "{\"data\": [{\"fund name\": \"D/S Strategie ausgewogen\", \"ter\": 1.15, \"performance_fee\": 0}]}",
         "The performance fees value is TER inkl. Performance-Fee in % **) - TER exkl. Performance-Fee in % **) = 1,15 - 1,15 = 0",
+        "Example 3:",
+        "-----Example Start-----",
+        "TER % \n(inkl. \nAnlageerfolgsprämie)\nTER %\n(exkl. \nAnlageerfolgsprämie)\nPIANO 400 Fund\n0,58 %\n0,58 %\n",
+        "-----Example End-----",
+        "The output should be:",
+        "{\"data\": [{\"fund name\": \"PIANO 400 Fund\", \"ter\": 0.58, \"performance_fee\": 0}]}",
+        "The performance fees value is TER % (inkl. Anlageerfolgsprämie) - TER % (exkl. Anlageerfolgsprämie) = 0,58 - 0,58 = 0",
         "Case 2:",
         "If a table has three value columns: \"TER including performance fees\", \"TER excluding performance fees\", \"Performance fees\", ",
         "the value in the \"Performance fees\" column could be \"-\", because \"TER including performance fees\" - \"TER excluding performance fees\" = 0, ",
@@ -230,7 +237,8 @@
         "As at September 30, 2022, the annualised total expense ratios of \\nthe sub-fund Pictet - Corto Europe Long Short are as follows: \\nCLASS \\nANNUALISED TER INCLUDING \\nPERFORMANCE FEES \\nANNUALISED TER EXCLUDING \\nPERFORMANCE FEES \\nSYNTHETIC TER \\nP EUR \\n1.66% \\n1.66% \\n1.98%",
         "-----Example End-----",
         "The output should be:",
-        "{\"data\": [{\"fund name\": \"Pictet Corto Europe Long Short\", \"share name\": \"P EUR\", \"ter\": 1.98, \"performance_fee\": 0}]}"
+        "{\"data\": [{\"fund name\": \"Pictet Corto Europe Long Short\", \"share name\": \"P EUR\", \"ter\": 1.98, \"performance_fee\": 0}]}",
+        "Attention: Always output the performance fee value calculated as (TER including performance fees - TER excluding performance fees), even when the result is 0; a computed 0 is an actual value, not a missing one."
       ]
     },
     {
@@ -258,6 +266,19 @@
       ]
     }
   ],
+  "tor": [
+    {
+      "title": "TOR with TER and multiple years:",
+      "contents": [
+        "When TOR and TER appear in the same table across multiple years, extract the TER and TOR values from the latest year's column.",
+        "---Example 1 Start---",
+        "APPENDIX 1 – TOTAL EXPENSE RATIOS AND PORTFOLIO TURNOVER RATIOS\nTotal Expense Ratios are based on the trading 12 months preceding the dates listed below. 
\nTER \nPTR* \nFor the period/year ended \n2024\n2023\n2024\n2023\nYacktman \nClass A US$ \n1.70%\n1.71%\nTotal Sub-Fund \n(5.94)%\n(5.57)%\nDriehaus Emerging \nClass A US$ \n1.76%\n1.89%\nTotal Sub-Fund \n101.51%\n89.41%",
+        "---Example 1 End---",
+        "The output should be:",
+        "{\"data\": [{\"fund name\": \"Yacktman\", \"share name\": \"Class A US$\", \"ter\": 1.70}, {\"fund name\": \"Yacktman\", \"tor\": -5.94}, {\"fund name\": \"Driehaus Emerging\", \"share name\": \"Class A US$\", \"ter\": 1.76}, {\"fund name\": \"Driehaus Emerging\", \"tor\": 101.51}]}"
+      ]
+    }
+  ],
   "extreme_complex": [
     {
       "title": "Complex Data Table Structure",
diff --git a/main.py b/main.py
index f3862cb..7eea57b 100644
--- a/main.py
+++ b/main.py
@@ -887,11 +887,11 @@ def batch_run_documents():
     calculate_metrics = False
     extract_way = "text"

-    special_doc_id_list = ["435128656"]
+    special_doc_id_list = []
     if len(special_doc_id_list) == 0:
         force_save_total_data = True
     # file_base_name_candidates = ["sample_document_complex", "emea_case_from_word_complex"]
-    file_base_name_candidates = ["sample_document_complex"]
+    file_base_name_candidates = ["sample_documents_12_11"]
     for document_list_file in document_list_files:
         file_base_name = os.path.basename(document_list_file).replace(".txt", "")
         if (file_base_name_candidates is not None and
@@ -931,9 +931,29 @@ def batch_run_documents():
                 force_save_total_data=force_save_total_data,
                 calculate_metrics=calculate_metrics,
             )
-
+
+
+def batch_initial_document():
+    sample_document_list_folder = r'./sample_documents/'
+    document_list_file = os.path.join(sample_document_list_folder, "sample_documents_12_11.txt")
+    with open(document_list_file, "r", encoding="utf-8") as f:
+        doc_id_list = f.readlines()
+    doc_id_list = [doc_id.strip() for doc_id in doc_id_list]
+    pdf_folder = r"/data/emea_ar/pdf/"
+    page_filter_ground_truth_file = (
+        r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
+    )
+    output_extract_data_child_folder = r"/data/emea_ar/output/extract_data/docs/"
+    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
+    for doc_id in tqdm(doc_id_list):
+        logger.info(f"Start to initialize document: {doc_id}")
+        emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
+                                          pdf_folder=pdf_folder,
+                                          output_extract_data_folder=output_extract_data_child_folder,
+                                          output_mapping_data_folder=output_mapping_child_folder)
+

 if __name__ == "__main__":
+    # batch_initial_document()
     batch_run_documents()

     # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
diff --git a/prepare_data.py b/prepare_data.py
index 43c1102..f80990c 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -118,7 +118,9 @@ def analyze_json_error():
 def statistics_document(
     pdf_folder: str,
     doc_mapping_file_path: str,
-    sheet_name: str = "all_data",
+    doc_ar_data_file_path: str,
+    mapping_sheet_name: str = "Sheet1",
+    ar_data_sheet_name: str = "doc_ar_data_in_db",
     output_folder: str = "/data/emea_ar/basic_information/English/",
     output_file: str = "doc_mapping_statistics_data.xlsx"
 ):
@@ -139,7 +141,7 @@ def statistics_document(
     describe_stat_df_list = []

     # statistics document mapping information
-    doc_mapping_data = pd.read_excel(doc_mapping_file_path, sheet_name=sheet_name)
+    doc_mapping_data = pd.read_excel(doc_mapping_file_path, sheet_name=mapping_sheet_name)

     # statistics doc_mapping_data for counting FundId count based on DocumentId
     logger.info(
@@ -260,7 +262,7 @@ def statistics_document(
         docid = 
os.path.basename(pdf_file).split(".")[0]
         doc = fitz.open(pdf_file)
         page_num = doc.page_count
-        doc_page_num_list.append({"docid": docid, "page_num": page_num})
+        doc_page_num_list.append({"DocumentId": docid, "page_num": page_num})
         doc.close()
     doc_page_num_df = pd.DataFrame(doc_page_num_list)
     # order by page_num in descending order
@@ -274,8 +276,8 @@ def statistics_document(
     describe_stat_df = pd.concat(describe_stat_df_list)
     describe_stat_df.reset_index(drop=True, inplace=True)
-
-    doc_dp_result = get_document_with_all_4_data_points(None, None, doc_mapping_data)
+    doc_ar_data = pd.read_excel(doc_ar_data_file_path, sheet_name=ar_data_sheet_name)
+    doc_dp_result = get_document_with_all_4_data_points(None, None, doc_ar_data)
     doc_dp_data_list = []
     for doc_id in doc_id_list:
         doc_id = int(doc_id)
@@ -292,15 +294,26 @@ def statistics_document(
     doc_dp_data_df = pd.DataFrame(doc_dp_data_list)
     doc_dp_data_df = doc_dp_data_df.sort_values(by="DocumentId", ascending=True)
     doc_dp_data_df.reset_index(drop=True, inplace=True)
-
+
+    # cast DocumentId to string in every DataFrame so the merges align on dtype
+    doc_page_num_df["DocumentId"] = doc_page_num_df["DocumentId"].astype(str)
+    doc_fund_count["DocumentId"] = doc_fund_count["DocumentId"].astype(str)
+    doc_share_class_count["DocumentId"] = doc_share_class_count["DocumentId"].astype(str)
+    doc_dp_data_df["DocumentId"] = doc_dp_data_df["DocumentId"].astype(str)
+
+    # merge doc_fund_count, doc_share_class_count and doc_dp_data_df onto doc_page_num_df by DocumentId
+    doc_page_num_df = doc_page_num_df.merge(doc_fund_count, on="DocumentId", how="left")
+    doc_page_num_df = doc_page_num_df.merge(doc_share_class_count, on="DocumentId", how="left")
+    doc_page_num_df = doc_page_num_df.merge(doc_dp_data_df, on="DocumentId", how="left")
+
     # save statistics data to excel
     with pd.ExcelWriter(stat_file) as writer:
-        doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False)
-        doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False)
-        doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
-        doc_share_class_count.to_excel(
-            writer, sheet_name="doc_share_class_count", index=False
-        )
+        doc_page_num_df.to_excel(writer, sheet_name="doc_level_stats", index=False)
+        # doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False)
+        # doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
+        # doc_share_class_count.to_excel(
+        #     writer, sheet_name="doc_share_class_count", index=False
+        # )
         provider_fund_count.to_excel(
             writer, sheet_name="provider_fund_count", index=False
         )
@@ -315,6 +328,41 @@ def statistics_document(
     )


+def get_document_with_all_4_data_points(folder: str, file_name: str, data: pd.DataFrame):
+    if data is None:
+        file_path = os.path.join(folder, file_name)
+        if os.path.exists(file_path):
+            data = pd.read_excel(file_path, sheet_name="doc_ar_data_in_db")
+        else:
+            logger.error(f"Invalid file path: {file_path}")
+            return
+    # document ids where noTor is 0
+    noTor_0_doc_id_list = data[data["noTor"] == 0]["DocumentId"].unique().tolist()
+
+    # document ids where share_noTer is 0
+    share_noTer_0_doc_id_list = data[data["share_noTer"] == 0]["DocumentId"].unique().tolist()
+
+    # document ids where share_noOgc is 0
+    share_noOgc_0_doc_id_list = data[data["share_noOgc"] == 0]["DocumentId"].unique().tolist()
+
+    # document ids where share_noPerfFee is 0
+    share_noPerfFee_0_doc_id_list = data[data["share_noPerfFee"] == 0]["DocumentId"].unique().tolist()
+
+    
logger.info(f"noTor_0_doc_id_list: {len(noTor_0_doc_id_list)}") + logger.info(f"share_noTer_0_doc_id_list: {len(share_noTer_0_doc_id_list)}") + logger.info(f"share_noOgc_0_doc_id_list: {len(share_noOgc_0_doc_id_list)}") + logger.info(f"share_noPerfFee_0_doc_id_list: {len(share_noPerfFee_0_doc_id_list)}") + + all_4_data_points_doc_id_list = list(set(noTor_0_doc_id_list) & set(share_noTer_0_doc_id_list) & set(share_noOgc_0_doc_id_list) & set(share_noPerfFee_0_doc_id_list)) + + logger.info(f"all_4_data_points_doc_id_list: {len(all_4_data_points_doc_id_list)}") + result = {"tor": noTor_0_doc_id_list, + "ter": share_noTer_0_doc_id_list, + "ogc": share_noOgc_0_doc_id_list, + "perf_fee": share_noPerfFee_0_doc_id_list} + return result + + def statistics_provider_mapping(provider_mapping_data_file: str, output_folder: str): if ( provider_mapping_data_file is None @@ -1312,40 +1360,6 @@ def calc_typical_doc_metrics_v1(): -def get_document_with_all_4_data_points(folder: str, file_name: str, data: pd.DataFrame): - if data is None: - file_path = os.path.join(folder, file_name) - if os.path.exists(file_path): - data = pd.read_excel(file_path, sheet_name="doc_ar_data_in_db") - else: - logger.error(f"Invalid file path: {file_path}") - return - # get document id list which noTor is 0 - noTor_0_doc_id_list = data[data["noTor"] == 0]["DocumentId"].unique().tolist() - - # get document id list which share_noTer is 0 - share_noTer_0_doc_id_list = data[data["share_noTer"] == 0]["DocumentId"].unique().tolist() - - # get document id list which share_noOgc is 0 - share_noOgc_0_doc_id_list = data[data["share_noOgc"] == 0]["DocumentId"].unique().tolist() - - # get document id list which share_noPerfFee is 0 - share_noPerfFee_0_doc_id_list = data[data["share_noPerfFee"] == 0]["DocumentId"].unique().tolist() - - logger.info(f"noTor_0_doc_id_list: {len(noTor_0_doc_id_list)}") - logger.info(f"share_noTer_0_doc_id_list: {len(share_noTer_0_doc_id_list)}") - logger.info(f"share_noOgc_0_doc_id_list: {len(share_noOgc_0_doc_id_list)}") - logger.info(f"share_noPerfFee_0_doc_id_list: {len(share_noPerfFee_0_doc_id_list)}") - - all_4_data_points_doc_id_list = list(set(noTor_0_doc_id_list) & set(share_noTer_0_doc_id_list) & set(share_noOgc_0_doc_id_list) & set(share_noPerfFee_0_doc_id_list)) - - logger.info(f"all_4_data_points_doc_id_list: {len(all_4_data_points_doc_id_list)}") - result = {"tor": noTor_0_doc_id_list, - "ter": share_noTer_0_doc_id_list, - "ogc": share_noOgc_0_doc_id_list, - "perf_fee": share_noPerfFee_0_doc_id_list} - return result - if __name__ == "__main__": folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/" file_name = "doc_ar_data_for_emea_11_06.xlsx" @@ -1356,7 +1370,7 @@ if __name__ == "__main__": doc_provider_file_path = ( r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx" ) - doc_mapping_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document_mapping.xlsx" + doc_ar_data_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document_mapping.xlsx" provider_mapping_data_file = ( r"/data/emea_ar/basic_information/English/provider_mapping_data.xlsx" ) @@ -1392,13 +1406,16 @@ if __name__ == "__main__": # pdf_folder) - doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/doc_ar_data_for_emea_sample_documents.xlsx" - output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/" + doc_ar_data_file_path = 
r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_ar_data_12_11.xlsx" + doc_mapping_data_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx" + output_data_folder = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/" statistics_document(pdf_folder=pdf_folder, - doc_mapping_file_path=doc_mapping_file_path, - sheet_name="doc_ar_data_in_db", + doc_mapping_file_path=doc_mapping_data_file_path, + doc_ar_data_file_path=doc_ar_data_file_path, + mapping_sheet_name="Sheet1", + ar_data_sheet_name="doc_ar_data_in_db", output_folder=output_data_folder, - output_file="doc_ar_data_sample_documents_statistics.xlsx") + output_file="doc_ar_data_sample_documents_12_11_statistics.xlsx") # get_document_extracted_share_diff_by_db() # statistics_provider_mapping( # provider_mapping_data_file=provider_mapping_data_file, diff --git a/sample_documents/sample_documents_12_11.txt b/sample_documents/sample_documents_12_11.txt new file mode 100644 index 0000000..f68ec55 --- /dev/null +++ b/sample_documents/sample_documents_12_11.txt @@ -0,0 +1,71 @@ +553242365 +553242368 +553242344 +553242337 +553242341 +553242327 +553242328 +553242318 +553242315 +553242316 +553242317 +553242310 +553242311 +553242309 +553242308 +553241057 +553241054 +553241040 +553241030 +553240841 +553240837 +553240623 +553240621 +553240619 +553240614 +553240613 +553240611 +553240612 +553240607 +553240604 +553224104 +553165689 +553081760 +553081761 +553081718 +553078775 +553078771 +553078766 +553078762 +553077970 +553066906 +553066859 +553043543 +553035628 +553035441 +553035433 +553035423 +553035418 +553034857 +553407290 +553406874 +553317485 +553315608 +553250296 +553250271 +553250232 +553242443 +553242414 +553242416 +553242409 +553242411 +553242406 +553242408 +553242396 +553242392 +553242384 +553242385 +553242386 +553242374 +553242362 +553242363 \ No newline at end of file