Fix issue in parsing data via the Vision function.

Blade He 2024-12-11 16:49:04 -06:00
parent d673a99e21
commit 309bb714f6
7 changed files with 220 additions and 71 deletions

View File

@ -398,6 +398,12 @@
"Performancegebühren", "Performancegebühren",
"Performancevergütung", "Performancevergütung",
"Anlageerfolgsprämie", "Anlageerfolgsprämie",
"TER in % (inkl.",
"TER % (inkl.",
"TER in % (exkl.",
"TER % (exkl.",
"TER% (einschließlich",
"TER% (ohne",
"An die Wertentwicklung des Fonds gebundene Gebühren", "An die Wertentwicklung des Fonds gebundene Gebühren",
"Performancegebühr", "Performancegebühr",
"Performance-gebühr", "Performance-gebühr",

View File

@ -37,6 +37,8 @@
"german": [ "german": [
"Mit anteiliger Performance Fee in %", "Mit anteiliger Performance Fee in %",
"TER inkl. Performance-Fee in % **)", "TER inkl. Performance-Fee in % **)",
"TER% (einschließlich Anlageerfolgsprämie)",
"TER % (inkl. Anlageerfolgsprämie)",
"Gesamtgebühren", "Gesamtgebühren",
"Kostenpauschale", "Kostenpauschale",
"Gesamtkostenquote", "Gesamtkostenquote",

View File

@ -171,7 +171,7 @@ class DataExtraction:
         previous_page_datapoints = []
         previous_page_fund_name = None
         for page_num, page_text in self.page_text_dict.items():
-            # if page_num > 640 or page_num < 610:
+            # if page_num != 344:
             #     continue
             if page_num in handled_page_num_list:
                 continue
@ -395,7 +395,7 @@ class DataExtraction:
                                  previous_page_last_fund: str = None) -> dict:
         # If can't find numeric value, e.g. 1.25 or 3,88
         # apply Vision ChatGPT to extract data
-        special_code_regex = r"\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x09|\x0a|\x0b|\x0c|\x0d|\x0e|\x0f|\x10|\x11|\x12|\x13|\x14|\x15|\x16|\x17|\x18|\x19|\x1a|\x1b|\x1c|\x1d|\x1e|\x1f"
+        special_code_regex = r"\x10|\x11|\x12|\x13|\x14|\x15|\x16|\x17|\x18|\x19|\x1a|\x1b|\x1c|\x1d|\x1e|\x1f"
         special_code_all = [code for code in re.findall(special_code_regex, page_text)
                             if code != "\n"]
         page_text_line_count = len(page_text.split("\n"))
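The narrowed pattern drops \x00-\x0f, so tabs and newlines no longer count as "special codes" (the old pattern matched them, which is why the list comprehension had to filter "\n" back out). A quick sketch with a made-up page_text; the character classes are equivalent spellings of the alternation patterns in the diff:

import re

old_regex = r"[\x00-\x1f]"  # matched \t, \n, \r as well
new_regex = r"[\x10-\x1f]"  # only the upper control-character range

page_text = "Fonds A\t1,25 %\nFonds B\x14 3,88 %"
print(re.findall(old_regex, page_text))  # ['\t', '\n', '\x14']
print(re.findall(new_regex, page_text))  # ['\x14']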
@ -470,9 +470,12 @@ class DataExtraction:
             data = json_repair.loads(response)
         except:
             data = {"data": []}
-        data = self.validate_data(extract_data_info=data,
-                                  page_text=page_text,
-                                  previous_page_last_fund=previous_page_last_fund)
+        try:
+            data = self.validate_data(extract_data_info=data,
+                                      page_text=page_text,
+                                      previous_page_last_fund=previous_page_last_fund)
+        except:
+            pass

         data_dict = {"doc_id": self.doc_id}
         data_dict["page_index"] = page_num
@ -574,7 +577,10 @@ class DataExtraction:
             data = json_repair.loads(response)
         except:
             data = {"data": []}
-        data = self.validate_data(data, None, previous_page_last_fund)
+        try:
+            data = self.validate_data(data, None, previous_page_last_fund)
+        except:
+            pass

         data_dict = {"doc_id": self.doc_id}
         data_dict["page_index"] = page_num
@ -620,8 +626,8 @@ class DataExtraction:
         if len(data_list) == 0:
             return extract_data_info
         remove_list = []
-        performance_fee_regex = r"Amount\s+of\s+the\s+performance\s+fees|Performance\s+Fees\s+amounts|Performance\s+fees\s+amounts|Commissioni\s+di\s+performance|Performance\s+Fee\s+"
-        nav_regex = r"based\s+on\s+(the\s+)?NAV|on\s+the\s+Share\s+Class\s+NAV|NAV\s+of\s+performance\s+fee|of\s+the\s+average\s+Net\s+Asset\s+Value|Attivi\s+in\s+gestione|Performance\s+Fee\s+of\s+NAV\s+in"
+        performance_fee_regex = r"Amount\s+of\s+the\s+performance\s+fees|Performance\s+Fees\s+amounts|Performance\s+fees\s+amounts|Commissioni\s+di\s+performance|Performance\s+Fee\s+|Performance\s+fees\s+charged"
+        nav_regex = r"based\s+on\s+(the\s+)?NAV|on\s+the\s+Share\s+Class\s+NAV|NAV\s+of\s+performance\s+fee|of\s+the\s+average\s+Net\s+Asset\s+Value|Attivi\s+in\s+gestione|Performance\s+Fee\s+of\s+NAV\s+in|share\s+class\s+dealing\s+NAV"
         if page_text is not None and len(page_text) > 0:
             performance_fee_search = re.search(performance_fee_regex, page_text)
             nav_search = re.search(nav_regex, page_text)
@ -667,12 +673,18 @@ class DataExtraction:
             for key in keys:
                 if self.datapoint_level_config.get(key, "") == "share_level":
                     if data.get("share name", "") == "":
-                        is_share_name = self.check_fund_name_as_share(fund_name)
-                        if not is_share_name:
-                            remove_list.append(data)
-                            break
-                        else:
-                            data["share name"] = fund_name
+                        include_key_words = False
+                        if key == "ter" and page_text is not None and len(page_text) > 0:
+                            ter_regex = r"TER\s+in\s+\%|TER\s*\%"
+                            ter_search = re.search(ter_regex, page_text)
+                            if ter_search is not None:
+                                include_key_words = True
+                        if not include_key_words:
+                            is_share_name = self.check_fund_name_as_share(fund_name)
+                            if not is_share_name:
+                                remove_list.append(data)
+                                break
+                        data["share name"] = fund_name
                 if data.get(key, "") == "":
                     data.pop(key)
             for remove_data in remove_list:
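The new branch relaxes the share-name requirement on TER-table pages: when the page text matches a "TER %"-style header, a row with an empty share name inherits the fund name instead of being dropped. A simplified, runnable sketch of that decision (it omits the check_fund_name_as_share fallback; inputs are made up):

import re

ter_regex = r"TER\s+in\s+\%|TER\s*\%"  # the pattern added in this commit

def resolve_share_name(fund_name, share_name, page_text):
    if share_name:
        return share_name
    if page_text and re.search(ter_regex, page_text):
        return fund_name  # TER-table page: reuse the fund name as share name
    return None           # elsewhere the row would be marked for removal

print(resolve_share_name("PIANO 400 Fund", "", "TER % (inkl. Anlageerfolgsprämie)"))
# PIANO 400 Fund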
@ -929,7 +941,7 @@ class DataExtraction:
         example_count = 1
         none_value_example_count = 0
         for mul_reported_name in mul_reported_name_list:
-            if datapoint in ["ter", "performance_fee"] and example_count == 3:
+            if datapoint in ["ter", "performance_fee"] and example_count >= 3:
                 break
             value = value_examples[example_count % len(value_examples)]
             answer = {"fund name": fund_example,

View File

@ -216,6 +216,13 @@
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"D/S Strategie ausgewogen\", \"ter\": 1.15, \"performance_fee\": 0}]}", "{\"data\": [{\"fund name\": \"D/S Strategie ausgewogen\", \"ter\": 1.15, \"performance_fee\": 0}]}",
"The performance fees value is TER inkl. Performance-Fee in % **) - TER exkl. Performance-Fee in % **) = 1,15 - 1,15 = 0", "The performance fees value is TER inkl. Performance-Fee in % **) - TER exkl. Performance-Fee in % **) = 1,15 - 1,15 = 0",
"Example 3:",
"-----Example Start-----",
"TER % \n(inkl. \nAnlageerfolgsprämie)\nTER %\n(exkl. \nAnlageerfolgsprämie)\nPIANO 400 Fund\n0,58 %\n0,58 %\n",
"-----Example End-----",
"The output should be:",
"{\"data\": [{\"fund name\": \"PIANO 400 Fund\", \"ter\": 0.58, \"performance_fee\": 0}]}",
"The performance fees value is TER % (inkl. Anlageerfolgsprämie) - TER % (exkl. Anlageerfolgsprämie) = 0,58 - 0,58 = 0",
"Case 2:", "Case 2:",
"If some table is with three value columns: \"TER including performance fees\", \"TER excluding performance fees\", \"Performance fees\", ", "If some table is with three value columns: \"TER including performance fees\", \"TER excluding performance fees\", \"Performance fees\", ",
"The Performance fees value in column: Performance fees, chould be \"-\", because of \"TER including performance fees\" - \"TER excluding performance fees\" = 0, ", "The Performance fees value in column: Performance fees, chould be \"-\", because of \"TER including performance fees\" - \"TER excluding performance fees\" = 0, ",
@ -230,7 +237,8 @@
"As at September 30, 2022, the annualised total expense ratios of \\nthe sub-fund Pictet - Corto Europe Long Short are as follows: \\nCLASS \\nANNUALISED TER INCLUDING \\nPERFORMANCE FEES \\nANNUALISED TER EXCLUDING \\nPERFORMANCE FEES \\nSYNTHETIC TER \\nP EUR \\n1.66% \\n1.66% \\n1.98%", "As at September 30, 2022, the annualised total expense ratios of \\nthe sub-fund Pictet - Corto Europe Long Short are as follows: \\nCLASS \\nANNUALISED TER INCLUDING \\nPERFORMANCE FEES \\nANNUALISED TER EXCLUDING \\nPERFORMANCE FEES \\nSYNTHETIC TER \\nP EUR \\n1.66% \\n1.66% \\n1.98%",
"-----Example End-----", "-----Example End-----",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Pictet Corto Europe Long Short\", \"share name\": \"P EUR\", \"ter\": 1.98, \"performance_fee\": 0}]}" "{\"data\": [{\"fund name\": \"Pictet Corto Europe Long Short\", \"share name\": \"P EUR\", \"ter\": 1.98, \"performance_fee\": 0}]}",
"Attention: Please always output performance fee value including 0 after calculation as (TER including performance fees - TER excluding performance fees), although the value is 0, but it's with actual value."
] ]
}, },
{ {
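The new Example 3 and the closing "Attention" line both hinge on the same arithmetic: performance fee = TER incl. minus TER excl., computed from German-formatted numbers, and 0 is a reportable value. As a runnable sketch (the helper name is illustrative, not part of the repo):

def de_number(value):
    # "0,58 %" -> 0.58: strip the percent sign, comma decimal to dot
    return float(value.replace("%", "").replace(",", ".").strip())

ter_incl = de_number("0,58 %")   # TER % (inkl. Anlageerfolgsprämie)
ter_excl = de_number("0,58 %")   # TER % (exkl. Anlageerfolgsprämie)
print(round(ter_incl - ter_excl, 4))  # 0.0 - still reported; 0 is a real value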
@ -258,6 +266,19 @@
 ]
 }
 ],
+"tor": [
+{
+"title": "TOR with TER and multiple years:",
+"contents": [
+"TOR and TER are in the same table with multiple year columns; please extract the TER and TOR values from the latest year column.",
+"---Example 1 Start---",
+"APPENDIX 1 TOTAL EXPENSE RATIOS AND PORTFOLIO TURNOVER RATIOS\nTotal Expense Ratios are based on the trading 12 months preceding the dates listed below. \nTER \nPTR* \nFor the period/year ended \n2024\n2023\n2024\n2023\nYacktman \nClass A US$ \n1.70%\n1.71%\nTotal Sub-Fund \n(5.94)%\n(5.57)%\nDriehaus Emerging \nClass A US$ \n1.76%\n1.89%\nTotal Sub-Fund \n101.51%\n89.41%",
+"---Example 1 End---",
+"The output should be:",
+"{\"data\": [{\"fund name\": \"Yacktman\", \"share name\": \"Class A US$\", \"ter\": 1.70}, {\"fund name\": \"Yacktman\", \"tor\": -5.94}, {\"fund name\": \"Driehaus Emerging\", \"share name\": \"Class A US$\", \"ter\": 1.76}, {\"fund name\": \"Driehaus Emerging\", \"tor\": 101.51}]}"
+]
+}
+],
 "extreme_complex": [
 {
 "title": "Complex Data Table Structure",

main.py (24 changed lines)
View File

@ -887,11 +887,11 @@ def batch_run_documents():
     calculate_metrics = False
     extract_way = "text"
-    special_doc_id_list = ["435128656"]
+    special_doc_id_list = []
     if len(special_doc_id_list) == 0:
         force_save_total_data = True
     # file_base_name_candidates = ["sample_document_complex", "emea_case_from_word_complex"]
-    file_base_name_candidates = ["sample_document_complex"]
+    file_base_name_candidates = ["sample_documents_12_11"]
     for document_list_file in document_list_files:
         file_base_name = os.path.basename(document_list_file).replace(".txt", "")
         if (file_base_name_candidates is not None and
@ -933,7 +933,27 @@ def batch_run_documents():
     )

+def batch_initial_document():
+    sample_document_list_folder = r'./sample_documents/'
+    document_list_file = os.path.join(sample_document_list_folder, "sample_documents_12_11.txt")
+    with open(document_list_file, "r", encoding="utf-8") as f:
+        doc_id_list = f.readlines()
+    doc_id_list = [doc_id.strip() for doc_id in doc_id_list]
+    pdf_folder = r"/data/emea_ar/pdf/"
+    page_filter_ground_truth_file = (
+        r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
+    )
+    output_extract_data_child_folder = r"/data/emea_ar/output/extract_data/docs/"
+    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
+    for doc_id in tqdm(doc_id_list):
+        logger.info(f"Start to initial document: {doc_id}")
+        emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
+                                          pdf_folder=pdf_folder,
+                                          output_extract_data_folder=output_extract_data_child_folder,
+                                          output_mapping_data_folder=output_mapping_child_folder)
+

 if __name__ == "__main__":
+    # batch_initial_document()
     batch_run_documents()
     # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"

View File

@ -118,7 +118,9 @@ def analyze_json_error():
 def statistics_document(
     pdf_folder: str,
     doc_mapping_file_path: str,
-    sheet_name: str = "all_data",
+    doc_ar_data_file_path: str,
+    mapping_sheet_name: str = "Sheet1",
+    ar_data_sheet_name: str = "doc_ar_data_in_db",
     output_folder: str = "/data/emea_ar/basic_information/English/",
     output_file: str = "doc_mapping_statistics_data.xlsx"
 ):
@ -139,7 +141,7 @@ def statistics_document(
     describe_stat_df_list = []
     # statistics document mapping information
-    doc_mapping_data = pd.read_excel(doc_mapping_file_path, sheet_name=sheet_name)
+    doc_mapping_data = pd.read_excel(doc_mapping_file_path, sheet_name=mapping_sheet_name)
     # statistics doc_mapping_data for counting FundId count based on DocumentId
     logger.info(
@ -260,7 +262,7 @@ def statistics_document(
         docid = os.path.basename(pdf_file).split(".")[0]
         doc = fitz.open(pdf_file)
         page_num = doc.page_count
-        doc_page_num_list.append({"docid": docid, "page_num": page_num})
+        doc_page_num_list.append({"DocumentId": docid, "page_num": page_num})
         doc.close()
     doc_page_num_df = pd.DataFrame(doc_page_num_list)
     # order by page_num in descending order
@ -274,8 +276,8 @@ def statistics_document(
     describe_stat_df = pd.concat(describe_stat_df_list)
     describe_stat_df.reset_index(drop=True, inplace=True)

-    doc_dp_result = get_document_with_all_4_data_points(None, None, doc_mapping_data)
+    doc_ar_data = pd.read_excel(doc_ar_data_file_path, sheet_name=ar_data_sheet_name)
+    doc_dp_result = get_document_with_all_4_data_points(None, None, doc_ar_data)
     doc_dp_data_list = []
     for doc_id in doc_id_list:
         doc_id = int(doc_id)
@ -293,14 +295,25 @@ def statistics_document(
     doc_dp_data_df = doc_dp_data_df.sort_values(by="DocumentId", ascending=True)
     doc_dp_data_df.reset_index(drop=True, inplace=True)

+    # set all of DocumentId in DataFrame objects to be string type
+    doc_page_num_df["DocumentId"] = doc_page_num_df["DocumentId"].astype(str)
+    doc_fund_count["DocumentId"] = doc_fund_count["DocumentId"].astype(str)
+    doc_share_class_count["DocumentId"] = doc_share_class_count["DocumentId"].astype(str)
+    doc_dp_data_df["DocumentId"] = doc_dp_data_df["DocumentId"].astype(str)
+    # merge statistics data for doc_page_num_df, doc_dp_data_df, doc_fund_count, doc_share_class_count based on DocumentId
+    doc_page_num_df = doc_page_num_df.merge(doc_fund_count, on="DocumentId", how="left")
+    doc_page_num_df = doc_page_num_df.merge(doc_share_class_count, on="DocumentId", how="left")
+    doc_page_num_df = doc_page_num_df.merge(doc_dp_data_df, on="DocumentId", how="left")
     # save statistics data to excel
     with pd.ExcelWriter(stat_file) as writer:
-        doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False)
-        doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False)
-        doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
-        doc_share_class_count.to_excel(
-            writer, sheet_name="doc_share_class_count", index=False
-        )
+        doc_page_num_df.to_excel(writer, sheet_name="doc_level_stats", index=False)
+        # doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False)
+        # doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
+        # doc_share_class_count.to_excel(
+        #     writer, sheet_name="doc_share_class_count", index=False
+        # )
         provider_fund_count.to_excel(
             writer, sheet_name="provider_fund_count", index=False
         )
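The astype(str) block above exists because DocumentId arrives as int64 in some frames and as str in others, and pandas will not merge an int64 key against an object key (recent versions raise a ValueError about merging int64 and object columns). Casting every key to str first makes the three left-merges line up; a tiny self-contained demonstration with made-up values:

import pandas as pd

left = pd.DataFrame({"DocumentId": [553242365], "page_num": [120]})
right = pd.DataFrame({"DocumentId": ["553242365"], "fund_count": [8]})

# Align key dtypes before merging, as the diff does.
left["DocumentId"] = left["DocumentId"].astype(str)
right["DocumentId"] = right["DocumentId"].astype(str)
print(left.merge(right, on="DocumentId", how="left"))
#   DocumentId  page_num  fund_count
# 0  553242365       120           8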
@ -315,6 +328,41 @@ def statistics_document(
     )

+def get_document_with_all_4_data_points(folder: str, file_name: str, data: pd.DataFrame):
+    if data is None:
+        file_path = os.path.join(folder, file_name)
+        if os.path.exists(file_path):
+            data = pd.read_excel(file_path, sheet_name="doc_ar_data_in_db")
+        else:
+            logger.error(f"Invalid file path: {file_path}")
+            return
+    # get document id list which noTor is 0
+    noTor_0_doc_id_list = data[data["noTor"] == 0]["DocumentId"].unique().tolist()
+    # get document id list which share_noTer is 0
+    share_noTer_0_doc_id_list = data[data["share_noTer"] == 0]["DocumentId"].unique().tolist()
+    # get document id list which share_noOgc is 0
+    share_noOgc_0_doc_id_list = data[data["share_noOgc"] == 0]["DocumentId"].unique().tolist()
+    # get document id list which share_noPerfFee is 0
+    share_noPerfFee_0_doc_id_list = data[data["share_noPerfFee"] == 0]["DocumentId"].unique().tolist()
+    logger.info(f"noTor_0_doc_id_list: {len(noTor_0_doc_id_list)}")
+    logger.info(f"share_noTer_0_doc_id_list: {len(share_noTer_0_doc_id_list)}")
+    logger.info(f"share_noOgc_0_doc_id_list: {len(share_noOgc_0_doc_id_list)}")
+    logger.info(f"share_noPerfFee_0_doc_id_list: {len(share_noPerfFee_0_doc_id_list)}")
+    all_4_data_points_doc_id_list = list(set(noTor_0_doc_id_list) & set(share_noTer_0_doc_id_list) & set(share_noOgc_0_doc_id_list) & set(share_noPerfFee_0_doc_id_list))
+    logger.info(f"all_4_data_points_doc_id_list: {len(all_4_data_points_doc_id_list)}")
+    result = {"tor": noTor_0_doc_id_list,
+              "ter": share_noTer_0_doc_id_list,
+              "ogc": share_noOgc_0_doc_id_list,
+              "perf_fee": share_noPerfFee_0_doc_id_list}
+    return result
+

 def statistics_provider_mapping(provider_mapping_data_file: str, output_folder: str):
     if (
         provider_mapping_data_file is None
@ -1312,40 +1360,6 @@ def calc_typical_doc_metrics_v1():
-def get_document_with_all_4_data_points(folder: str, file_name: str, data: pd.DataFrame):
-    if data is None:
-        file_path = os.path.join(folder, file_name)
-        if os.path.exists(file_path):
-            data = pd.read_excel(file_path, sheet_name="doc_ar_data_in_db")
-        else:
-            logger.error(f"Invalid file path: {file_path}")
-            return
-    # get document id list which noTor is 0
-    noTor_0_doc_id_list = data[data["noTor"] == 0]["DocumentId"].unique().tolist()
-    # get document id list which share_noTer is 0
-    share_noTer_0_doc_id_list = data[data["share_noTer"] == 0]["DocumentId"].unique().tolist()
-    # get document id list which share_noOgc is 0
-    share_noOgc_0_doc_id_list = data[data["share_noOgc"] == 0]["DocumentId"].unique().tolist()
-    # get document id list which share_noPerfFee is 0
-    share_noPerfFee_0_doc_id_list = data[data["share_noPerfFee"] == 0]["DocumentId"].unique().tolist()
-    logger.info(f"noTor_0_doc_id_list: {len(noTor_0_doc_id_list)}")
-    logger.info(f"share_noTer_0_doc_id_list: {len(share_noTer_0_doc_id_list)}")
-    logger.info(f"share_noOgc_0_doc_id_list: {len(share_noOgc_0_doc_id_list)}")
-    logger.info(f"share_noPerfFee_0_doc_id_list: {len(share_noPerfFee_0_doc_id_list)}")
-    all_4_data_points_doc_id_list = list(set(noTor_0_doc_id_list) & set(share_noTer_0_doc_id_list) & set(share_noOgc_0_doc_id_list) & set(share_noPerfFee_0_doc_id_list))
-    logger.info(f"all_4_data_points_doc_id_list: {len(all_4_data_points_doc_id_list)}")
-    result = {"tor": noTor_0_doc_id_list,
-              "ter": share_noTer_0_doc_id_list,
-              "ogc": share_noOgc_0_doc_id_list,
-              "perf_fee": share_noPerfFee_0_doc_id_list}
-    return result

 if __name__ == "__main__":
     folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
     file_name = "doc_ar_data_for_emea_11_06.xlsx"
@ -1356,7 +1370,7 @@ if __name__ == "__main__":
     doc_provider_file_path = (
         r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
     )
-    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document_mapping.xlsx"
+    doc_ar_data_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document_mapping.xlsx"
     provider_mapping_data_file = (
         r"/data/emea_ar/basic_information/English/provider_mapping_data.xlsx"
     )
@ -1392,13 +1406,16 @@ if __name__ == "__main__":
     #     pdf_folder)

-    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/doc_ar_data_for_emea_sample_documents.xlsx"
-    output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/"
+    doc_ar_data_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_ar_data_12_11.xlsx"
+    doc_mapping_data_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx"
+    output_data_folder = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/"
     statistics_document(pdf_folder=pdf_folder,
-                        doc_mapping_file_path=doc_mapping_file_path,
-                        sheet_name="doc_ar_data_in_db",
+                        doc_mapping_file_path=doc_mapping_data_file_path,
+                        doc_ar_data_file_path=doc_ar_data_file_path,
+                        mapping_sheet_name="Sheet1",
+                        ar_data_sheet_name="doc_ar_data_in_db",
                         output_folder=output_data_folder,
-                        output_file="doc_ar_data_sample_documents_statistics.xlsx")
+                        output_file="doc_ar_data_sample_documents_12_11_statistics.xlsx")
     # get_document_extracted_share_diff_by_db()
     # statistics_provider_mapping(
     #     provider_mapping_data_file=provider_mapping_data_file,

View File

@ -0,0 +1,71 @@
553242365
553242368
553242344
553242337
553242341
553242327
553242328
553242318
553242315
553242316
553242317
553242310
553242311
553242309
553242308
553241057
553241054
553241040
553241030
553240841
553240837
553240623
553240621
553240619
553240614
553240613
553240611
553240612
553240607
553240604
553224104
553165689
553081760
553081761
553081718
553078775
553078771
553078766
553078762
553077970
553066906
553066859
553043543
553035628
553035441
553035433
553035423
553035418
553034857
553407290
553406874
553317485
553315608
553250296
553250271
553250232
553242443
553242414
553242416
553242409
553242411
553242406
553242408
553242396
553242392
553242384
553242385
553242386
553242374
553242362
553242363