Fix issue with parsing data via the Vision function.

parent d673a99e21
commit 309bb714f6
@@ -398,6 +398,12 @@
     "Performancegebühren",
     "Performancevergütung",
     "Anlageerfolgsprämie",
+    "TER in % (inkl.",
+    "TER % (inkl.",
+    "TER in % (exkl.",
+    "TER % (exkl.",
+    "TER% (einschließlich",
+    "TER% (ohne",
     "An die Wertentwicklung des Fonds gebundene Gebühren",
     "Performancegebühr",
     "Performance-gebühr",

@@ -37,6 +37,8 @@
     "german": [
         "Mit anteiliger Performance Fee in %",
         "TER inkl. Performance-Fee in % **)",
+        "TER% (einschließlich Anlageerfolgsprämie)",
+        "TER % (inkl. Anlageerfolgsprämie)",
         "Gesamtgebühren",
         "Kostenpauschale",
         "Gesamtkostenquote",

@@ -171,7 +171,7 @@ class DataExtraction:
         previous_page_datapoints = []
         previous_page_fund_name = None
         for page_num, page_text in self.page_text_dict.items():
-            # if page_num > 640 or page_num < 610:
+            # if page_num != 344:
             #     continue
             if page_num in handled_page_num_list:
                 continue

@@ -395,7 +395,7 @@ class DataExtraction:
                      previous_page_last_fund: str = None) -> dict:
         # If we can't find a numeric value, e.g. 1.25 or 3,88,
         # apply Vision ChatGPT to extract data
-        special_code_regex = r"\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x09|\x0a|\x0b|\x0c|\x0d|\x0e|\x0f|\x10|\x11|\x12|\x13|\x14|\x15|\x16|\x17|\x18|\x19|\x1a|\x1b|\x1c|\x1d|\x1e|\x1f"
+        special_code_regex = r"\x10|\x11|\x12|\x13|\x14|\x15|\x16|\x17|\x18|\x19|\x1a|\x1b|\x1c|\x1d|\x1e|\x1f"
         special_code_all = [code for code in re.findall(special_code_regex, page_text)
                             if code != "\n"]
         page_text_line_count = len(page_text.split("\n"))

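For context: the old pattern flagged every C0 control character (\x00-\x1f), which also matches ordinary tab (\x09) and newline (\x0a) and is why the `code != "\n"` filter exists; the narrowed pattern only counts \x10-\x1f. A minimal sketch of the fallback heuristic, with a hypothetical helper name and the long alternation collapsed into an equivalent character class:

import re

# Narrowed pattern from this commit: only \x10-\x1f signal garbled extraction.
SPECIAL_CODE_REGEX = r"[\x10-\x1f]"  # equivalent to the \x10|\x11|...|\x1f alternation

def should_use_vision(page_text: str) -> bool:
    # Hypothetical helper: route a page to the Vision model when the extracted
    # text still contains control codes that clean PDF text should not carry.
    special_codes = [c for c in re.findall(SPECIAL_CODE_REGEX, page_text) if c != "\n"]
    return len(special_codes) > 0

print(should_use_vision("TER 1.25\x14 %"))  # True: a control code survived extraction
print(should_use_vision("TER 1.25 %"))      # False: clean text
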
@@ -470,9 +470,12 @@ class DataExtraction:
             data = json_repair.loads(response)
         except:
             data = {"data": []}
+        try:
             data = self.validate_data(extract_data_info=data,
                                       page_text=page_text,
                                       previous_page_last_fund=previous_page_last_fund)
+        except:
+            pass

         data_dict = {"doc_id": self.doc_id}
         data_dict["page_index"] = page_num

@@ -574,7 +577,10 @@ class DataExtraction:
             data = json_repair.loads(response)
         except:
             data = {"data": []}
+        try:
             data = self.validate_data(data, None, previous_page_last_fund)
+        except:
+            pass

         data_dict = {"doc_id": self.doc_id}
         data_dict["page_index"] = page_num

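Both call sites now guard validate_data with a try/except so that a validation failure keeps the unvalidated payload instead of aborting the whole page. A sketch of the pattern, assuming only that validate_data may raise on malformed model output (the wrapper name is hypothetical):

def safe_validate(extractor, data, page_text=None, previous_page_last_fund=None):
    # Mirror of the commit's guard: if validation raises, fall back to the
    # raw extraction result rather than losing the page.
    try:
        return extractor.validate_data(extract_data_info=data,
                                       page_text=page_text,
                                       previous_page_last_fund=previous_page_last_fund)
    except Exception:
        return data

Catching Exception, rather than the bare except: used in the diff, would leave KeyboardInterrupt and SystemExit alone, which is usually the safer default.
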
@@ -620,8 +626,8 @@ class DataExtraction:
         if len(data_list) == 0:
             return extract_data_info
         remove_list = []
-        performance_fee_regex = r"Amount\s+of\s+the\s+performance\s+fees|Performance\s+Fees\s+amounts|Performance\s+fees\s+amounts|Commissioni\s+di\s+performance|Performance\s+Fee\s+"
-        nav_regex = r"based\s+on\s+(the\s+)?NAV|on\s+the\s+Share\s+Class\s+NAV|NAV\s+of\s+performance\s+fee|of\s+the\s+average\s+Net\s+Asset\s+Value|Attivi\s+in\s+gestione|Performance\s+Fee\s+of\s+NAV\s+in"
+        performance_fee_regex = r"Amount\s+of\s+the\s+performance\s+fees|Performance\s+Fees\s+amounts|Performance\s+fees\s+amounts|Commissioni\s+di\s+performance|Performance\s+Fee\s+|Performance\s+fees\s+charged"
+        nav_regex = r"based\s+on\s+(the\s+)?NAV|on\s+the\s+Share\s+Class\s+NAV|NAV\s+of\s+performance\s+fee|of\s+the\s+average\s+Net\s+Asset\s+Value|Attivi\s+in\s+gestione|Performance\s+Fee\s+of\s+NAV\s+in|share\s+class\s+dealing\s+NAV"
         if page_text is not None and len(page_text) > 0:
             performance_fee_search = re.search(performance_fee_regex, page_text)
             nav_search = re.search(nav_regex, page_text)

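Each filter gains one alternation: wording like "Performance fees charged" now counts as a performance-fee mention, and "share class dealing NAV" now counts as a NAV basis. A quick check of just the new branches against hypothetical report snippets:

import re

performance_fee_regex = r"Performance\s+fees\s+charged"  # new branch only
nav_regex = r"share\s+class\s+dealing\s+NAV"             # new branch only

assert re.search(performance_fee_regex, "Performance fees charged during the year")
assert re.search(nav_regex, "calculated on the share class dealing NAV")
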
@@ -667,11 +673,17 @@ class DataExtraction:
         for key in keys:
             if self.datapoint_level_config.get(key, "") == "share_level":
                 if data.get("share name", "") == "":
+                    include_key_words = False
+                    if key == "ter" and page_text is not None and len(page_text) > 0:
+                        ter_regex = r"TER\s+in\s+\%|TER\s*\%"
+                        ter_search = re.search(ter_regex, page_text)
+                        if ter_search is not None:
+                            include_key_words = True
+                    if not include_key_words:
                         is_share_name = self.check_fund_name_as_share(fund_name)
                         if not is_share_name:
                             remove_list.append(data)
                             break
-                    else:
                         data["share name"] = fund_name
                 if data.get(key, "") == "":
                     data.pop(key)

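The new branch keeps a row that lacks a share name whenever the page carries a "TER in %" / "TER %" style column header, on the premise that such tables report TER at fund level, so the missing share name is expected rather than a sign of a bad row. The check in isolation, with hypothetical header strings:

import re

def page_has_ter_column(page_text: str) -> bool:
    # Same pattern as the commit: "TER in %" or "TER %" / "TER%".
    ter_regex = r"TER\s+in\s+\%|TER\s*\%"
    return re.search(ter_regex, page_text) is not None

assert page_has_ter_column("TER in % (inkl. Anlageerfolgsprämie)")
assert page_has_ter_column("TER% (ohne Anlageerfolgsprämie)")
assert not page_has_ter_column("Total expense ratio of the fund")
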
@@ -929,7 +941,7 @@ class DataExtraction:
         example_count = 1
         none_value_example_count = 0
         for mul_reported_name in mul_reported_name_list:
-            if datapoint in ["ter", "performance_fee"] and example_count == 3:
+            if datapoint in ["ter", "performance_fee"] and example_count >= 3:
                 break
             value = value_examples[example_count % len(value_examples)]
             answer = {"fund name": fund_example,

@@ -216,6 +216,13 @@
     "The output should be:",
     "{\"data\": [{\"fund name\": \"D/S Strategie ausgewogen\", \"ter\": 1.15, \"performance_fee\": 0}]}",
     "The performance fees value is TER inkl. Performance-Fee in % **) - TER exkl. Performance-Fee in % **) = 1,15 - 1,15 = 0",
+    "Example 3:",
+    "-----Example Start-----",
+    "TER % \n(inkl. \nAnlageerfolgsprämie)\nTER %\n(exkl. \nAnlageerfolgsprämie)\nPIANO 400 Fund\n0,58 %\n0,58 %\n",
+    "-----Example End-----",
+    "The output should be:",
+    "{\"data\": [{\"fund name\": \"PIANO 400 Fund\", \"ter\": 0.58, \"performance_fee\": 0}]}",
+    "The performance fees value is TER % (inkl. Anlageerfolgsprämie) - TER % (exkl. Anlageerfolgsprämie) = 0,58 - 0,58 = 0",
     "Case 2:",
     "If a table has three value columns: \"TER including performance fees\", \"TER excluding performance fees\", \"Performance fees\", ",
     "The Performance fees value in the column \"Performance fees\" could be \"-\", because \"TER including performance fees\" - \"TER excluding performance fees\" = 0, ",

@@ -230,7 +237,8 @@
     "As at September 30, 2022, the annualised total expense ratios of \\nthe sub-fund Pictet - Corto Europe Long Short are as follows: \\nCLASS \\nANNUALISED TER INCLUDING \\nPERFORMANCE FEES \\nANNUALISED TER EXCLUDING \\nPERFORMANCE FEES \\nSYNTHETIC TER \\nP EUR \\n1.66% \\n1.66% \\n1.98%",
     "-----Example End-----",
     "The output should be:",
-    "{\"data\": [{\"fund name\": \"Pictet Corto Europe Long Short\", \"share name\": \"P EUR\", \"ter\": 1.98, \"performance_fee\": 0}]}"
+    "{\"data\": [{\"fund name\": \"Pictet Corto Europe Long Short\", \"share name\": \"P EUR\", \"ter\": 1.98, \"performance_fee\": 0}]}",
+    "Attention: Always output the performance fee value computed as (TER including performance fees - TER excluding performance fees); even when the result is 0, output it, because 0 is an actual value."
     ]
 },
 {

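The prompt derives the performance fee as "TER incl." minus "TER excl.", and the new "Attention" line forces the model to emit the result even when it is 0. The same arithmetic in code, including the German decimal comma used by the examples (helper names are hypothetical):

def parse_percent(value: str) -> float:
    # "1,15" or "0,58 %" -> 1.15 / 0.58 (German decimal comma).
    return float(value.replace("%", "").strip().replace(",", "."))

def performance_fee(ter_incl: str, ter_excl: str) -> float:
    # Performance fee = TER incl. performance fee - TER excl. performance fee;
    # 0 is a real value and must still be reported.
    return round(parse_percent(ter_incl) - parse_percent(ter_excl), 4)

assert performance_fee("1,15", "1,15") == 0      # D/S Strategie ausgewogen example
assert performance_fee("0,58 %", "0,58 %") == 0  # PIANO 400 Fund example
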
@@ -258,6 +266,19 @@
         ]
     }
 ],
+"tor": [
+    {
+        "title": "TOR with TER and multiple years:",
+        "contents": [
+            "TOR and TER are in the same table with multiple years; please extract the TER and TOR values from the latest year column.",
+            "---Example 1 Start---",
+            "APPENDIX 1 – TOTAL EXPENSE RATIOS AND PORTFOLIO TURNOVER RATIOS\nTotal Expense Ratios are based on the trading 12 months preceding the dates listed below. \nTER \nPTR* \nFor the period/year ended \n2024\n2023\n2024\n2023\nYacktman \nClass A US$ \n1.70%\n1.71%\nTotal Sub-Fund \n(5.94)%\n(5.57)%\nDriehaus Emerging \nClass A US$ \n1.76%\n1.89%\nTotal Sub-Fund \n101.51%\n89.41%",
+            "---Example 1 End---",
+            "The output should be:",
+            "{\"data\": [{\"fund name\": \"Yacktman\", \"share name\": \"Class A US$\", \"ter\": 1.70}, {\"fund name\": \"Yacktman\", \"tor\": -5.94}, {\"fund name\": \"Driehaus Emerging\", \"share name\": \"Class A US$\", \"ter\": 1.76}, {\"fund name\": \"Driehaus Emerging\", \"tor\": 101.51}]}"
+        ]
+    }
+],
 "extreme_complex": [
     {
         "title": "Complex Data Table Structure",

main.py (24 changes)

@@ -887,11 +887,11 @@ def batch_run_documents():
     calculate_metrics = False

     extract_way = "text"
-    special_doc_id_list = ["435128656"]
+    special_doc_id_list = []
     if len(special_doc_id_list) == 0:
         force_save_total_data = True
     # file_base_name_candidates = ["sample_document_complex", "emea_case_from_word_complex"]
-    file_base_name_candidates = ["sample_document_complex"]
+    file_base_name_candidates = ["sample_documents_12_11"]
     for document_list_file in document_list_files:
         file_base_name = os.path.basename(document_list_file).replace(".txt", "")
         if (file_base_name_candidates is not None and

@@ -933,7 +933,27 @@ def batch_run_documents():
             )


+def batch_initial_document():
+    sample_document_list_folder = r'./sample_documents/'
+    document_list_file = os.path.join(sample_document_list_folder, "sample_documents_12_11.txt")
+    with open(document_list_file, "r", encoding="utf-8") as f:
+        doc_id_list = f.readlines()
+    doc_id_list = [doc_id.strip() for doc_id in doc_id_list]
+    pdf_folder = r"/data/emea_ar/pdf/"
+    page_filter_ground_truth_file = (
+        r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
+    )
+    output_extract_data_child_folder = r"/data/emea_ar/output/extract_data/docs/"
+    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
+    for doc_id in tqdm(doc_id_list):
+        logger.info(f"Start to initialize document: {doc_id}")
+        emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
+                                          pdf_folder=pdf_folder,
+                                          output_extract_data_folder=output_extract_data_child_folder,
+                                          output_mapping_data_folder=output_mapping_child_folder)
+

 if __name__ == "__main__":
+    # batch_initial_document()
     batch_run_documents()

     # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"

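batch_initial_document expects sample_documents_12_11.txt to hold one document id per line; the 71-line id list added at the end of this commit appears to be that file. A small round-trip of the read logic (the temporary path is illustrative only):

import os

path = "/tmp/sample_documents_demo.txt"
with open(path, "w", encoding="utf-8") as f:
    f.write("553242365\n553242368\n")
with open(path, "r", encoding="utf-8") as f:
    doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
assert doc_id_list == ["553242365", "553242368"]
os.remove(path)
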
prepare_data.py (119 changes)

@@ -118,7 +118,9 @@ def analyze_json_error():
 def statistics_document(
     pdf_folder: str,
     doc_mapping_file_path: str,
-    sheet_name: str = "all_data",
+    doc_ar_data_file_path: str,
+    mapping_sheet_name: str = "Sheet1",
+    ar_data_sheet_name: str = "doc_ar_data_in_db",
     output_folder: str = "/data/emea_ar/basic_information/English/",
     output_file: str = "doc_mapping_statistics_data.xlsx"
 ):

@@ -139,7 +141,7 @@ def statistics_document(

     describe_stat_df_list = []
     # statistics document mapping information
-    doc_mapping_data = pd.read_excel(doc_mapping_file_path, sheet_name=sheet_name)
+    doc_mapping_data = pd.read_excel(doc_mapping_file_path, sheet_name=mapping_sheet_name)

     # statistics doc_mapping_data for counting FundId count based on DocumentId
     logger.info(

@@ -260,7 +262,7 @@ def statistics_document(
         docid = os.path.basename(pdf_file).split(".")[0]
         doc = fitz.open(pdf_file)
         page_num = doc.page_count
-        doc_page_num_list.append({"docid": docid, "page_num": page_num})
+        doc_page_num_list.append({"DocumentId": docid, "page_num": page_num})
         doc.close()
     doc_page_num_df = pd.DataFrame(doc_page_num_list)
     # order by page_num in descending order

@@ -274,8 +276,8 @@ def statistics_document(
     describe_stat_df = pd.concat(describe_stat_df_list)
     describe_stat_df.reset_index(drop=True, inplace=True)

-    doc_dp_result = get_document_with_all_4_data_points(None, None, doc_mapping_data)
+    doc_ar_data = pd.read_excel(doc_ar_data_file_path, sheet_name=ar_data_sheet_name)
+    doc_dp_result = get_document_with_all_4_data_points(None, None, doc_ar_data)
     doc_dp_data_list = []
     for doc_id in doc_id_list:
         doc_id = int(doc_id)

@@ -293,14 +295,25 @@ def statistics_document(
     doc_dp_data_df = doc_dp_data_df.sort_values(by="DocumentId", ascending=True)
     doc_dp_data_df.reset_index(drop=True, inplace=True)

+    # set all DocumentId columns in the DataFrame objects to string type
+    doc_page_num_df["DocumentId"] = doc_page_num_df["DocumentId"].astype(str)
+    doc_fund_count["DocumentId"] = doc_fund_count["DocumentId"].astype(str)
+    doc_share_class_count["DocumentId"] = doc_share_class_count["DocumentId"].astype(str)
+    doc_dp_data_df["DocumentId"] = doc_dp_data_df["DocumentId"].astype(str)
+
+    # merge doc_fund_count, doc_share_class_count and doc_dp_data_df into doc_page_num_df on DocumentId
+    doc_page_num_df = doc_page_num_df.merge(doc_fund_count, on="DocumentId", how="left")
+    doc_page_num_df = doc_page_num_df.merge(doc_share_class_count, on="DocumentId", how="left")
+    doc_page_num_df = doc_page_num_df.merge(doc_dp_data_df, on="DocumentId", how="left")
+
     # save statistics data to excel
     with pd.ExcelWriter(stat_file) as writer:
-        doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False)
-        doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False)
-        doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
-        doc_share_class_count.to_excel(
-            writer, sheet_name="doc_share_class_count", index=False
-        )
+        doc_page_num_df.to_excel(writer, sheet_name="doc_level_stats", index=False)
+        # doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False)
+        # doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
+        # doc_share_class_count.to_excel(
+        #     writer, sheet_name="doc_share_class_count", index=False
+        # )
         provider_fund_count.to_excel(
             writer, sheet_name="provider_fund_count", index=False
         )

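The astype(str) casts matter because doc_page_num_df["DocumentId"] is built from PDF file names (strings) while ids loaded from Excel typically come back as integers; merging on mismatched key dtypes either raises or silently yields no matches. A minimal sketch of the failure mode and the fix:

import pandas as pd

left = pd.DataFrame({"DocumentId": ["553242365"], "page_num": [120]})  # str key
right = pd.DataFrame({"DocumentId": [553242365], "fund_count": [7]})   # int key

# Recent pandas raises ValueError when merging object and int64 key columns;
# casting both sides to str makes the join well-defined.
right["DocumentId"] = right["DocumentId"].astype(str)
merged = left.merge(right, on="DocumentId", how="left")
assert merged.loc[0, "fund_count"] == 7
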
@@ -315,6 +328,41 @@ def statistics_document(
     )


+def get_document_with_all_4_data_points(folder: str, file_name: str, data: pd.DataFrame):
+    if data is None:
+        file_path = os.path.join(folder, file_name)
+        if os.path.exists(file_path):
+            data = pd.read_excel(file_path, sheet_name="doc_ar_data_in_db")
+        else:
+            logger.error(f"Invalid file path: {file_path}")
+            return
+    # get document id list for which noTor is 0
+    noTor_0_doc_id_list = data[data["noTor"] == 0]["DocumentId"].unique().tolist()
+
+    # get document id list for which share_noTer is 0
+    share_noTer_0_doc_id_list = data[data["share_noTer"] == 0]["DocumentId"].unique().tolist()
+
+    # get document id list for which share_noOgc is 0
+    share_noOgc_0_doc_id_list = data[data["share_noOgc"] == 0]["DocumentId"].unique().tolist()
+
+    # get document id list for which share_noPerfFee is 0
+    share_noPerfFee_0_doc_id_list = data[data["share_noPerfFee"] == 0]["DocumentId"].unique().tolist()
+
+    logger.info(f"noTor_0_doc_id_list: {len(noTor_0_doc_id_list)}")
+    logger.info(f"share_noTer_0_doc_id_list: {len(share_noTer_0_doc_id_list)}")
+    logger.info(f"share_noOgc_0_doc_id_list: {len(share_noOgc_0_doc_id_list)}")
+    logger.info(f"share_noPerfFee_0_doc_id_list: {len(share_noPerfFee_0_doc_id_list)}")
+
+    all_4_data_points_doc_id_list = list(set(noTor_0_doc_id_list) & set(share_noTer_0_doc_id_list) & set(share_noOgc_0_doc_id_list) & set(share_noPerfFee_0_doc_id_list))
+
+    logger.info(f"all_4_data_points_doc_id_list: {len(all_4_data_points_doc_id_list)}")
+    result = {"tor": noTor_0_doc_id_list,
+              "ter": share_noTer_0_doc_id_list,
+              "ogc": share_noOgc_0_doc_id_list,
+              "perf_fee": share_noPerfFee_0_doc_id_list}
+    return result
+
+
 def statistics_provider_mapping(provider_mapping_data_file: str, output_folder: str):
     if (
         provider_mapping_data_file is None

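The helper now sits next to statistics_document, which calls it. Its contract: given the doc_ar_data_in_db sheet, return per datapoint the document ids whose "no data" counter is 0, i.e. the datapoint is fully populated. A hypothetical in-memory call, which is why folder and file_name may be None:

import pandas as pd

data = pd.DataFrame({
    "DocumentId": [553242365, 553242368],
    "noTor": [0, 1],
    "share_noTer": [0, 0],
    "share_noOgc": [0, 0],
    "share_noPerfFee": [0, 0],
})
result = get_document_with_all_4_data_points(None, None, data)
assert result["tor"] == [553242365]  # only the first document has TOR data
assert len(result["ter"]) == 2       # both documents have TER data
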
@@ -1312,40 +1360,6 @@ def calc_typical_doc_metrics_v1():


-def get_document_with_all_4_data_points(folder: str, file_name: str, data: pd.DataFrame):
-    if data is None:
-        file_path = os.path.join(folder, file_name)
-        if os.path.exists(file_path):
-            data = pd.read_excel(file_path, sheet_name="doc_ar_data_in_db")
-        else:
-            logger.error(f"Invalid file path: {file_path}")
-            return
-    # get document id list for which noTor is 0
-    noTor_0_doc_id_list = data[data["noTor"] == 0]["DocumentId"].unique().tolist()
-
-    # get document id list for which share_noTer is 0
-    share_noTer_0_doc_id_list = data[data["share_noTer"] == 0]["DocumentId"].unique().tolist()
-
-    # get document id list for which share_noOgc is 0
-    share_noOgc_0_doc_id_list = data[data["share_noOgc"] == 0]["DocumentId"].unique().tolist()
-
-    # get document id list for which share_noPerfFee is 0
-    share_noPerfFee_0_doc_id_list = data[data["share_noPerfFee"] == 0]["DocumentId"].unique().tolist()
-
-    logger.info(f"noTor_0_doc_id_list: {len(noTor_0_doc_id_list)}")
-    logger.info(f"share_noTer_0_doc_id_list: {len(share_noTer_0_doc_id_list)}")
-    logger.info(f"share_noOgc_0_doc_id_list: {len(share_noOgc_0_doc_id_list)}")
-    logger.info(f"share_noPerfFee_0_doc_id_list: {len(share_noPerfFee_0_doc_id_list)}")
-
-    all_4_data_points_doc_id_list = list(set(noTor_0_doc_id_list) & set(share_noTer_0_doc_id_list) & set(share_noOgc_0_doc_id_list) & set(share_noPerfFee_0_doc_id_list))
-
-    logger.info(f"all_4_data_points_doc_id_list: {len(all_4_data_points_doc_id_list)}")
-    result = {"tor": noTor_0_doc_id_list,
-              "ter": share_noTer_0_doc_id_list,
-              "ogc": share_noOgc_0_doc_id_list,
-              "perf_fee": share_noPerfFee_0_doc_id_list}
-    return result
-

 if __name__ == "__main__":
     folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
     file_name = "doc_ar_data_for_emea_11_06.xlsx"

@@ -1356,7 +1370,7 @@ if __name__ == "__main__":
     doc_provider_file_path = (
         r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
     )
-    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document_mapping.xlsx"
+    doc_ar_data_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document_mapping.xlsx"
     provider_mapping_data_file = (
         r"/data/emea_ar/basic_information/English/provider_mapping_data.xlsx"
     )

@@ -1392,13 +1406,16 @@ if __name__ == "__main__":
     # pdf_folder)


-    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/doc_ar_data_for_emea_sample_documents.xlsx"
-    output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/"
+    doc_ar_data_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_ar_data_12_11.xlsx"
+    doc_mapping_data_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx"
+    output_data_folder = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/"
     statistics_document(pdf_folder=pdf_folder,
-                        doc_mapping_file_path=doc_mapping_file_path,
-                        sheet_name="doc_ar_data_in_db",
+                        doc_mapping_file_path=doc_mapping_data_file_path,
+                        doc_ar_data_file_path=doc_ar_data_file_path,
+                        mapping_sheet_name="Sheet1",
+                        ar_data_sheet_name="doc_ar_data_in_db",
                         output_folder=output_data_folder,
-                        output_file="doc_ar_data_sample_documents_statistics.xlsx")
+                        output_file="doc_ar_data_sample_documents_12_11_statistics.xlsx")
     # get_document_extracted_share_diff_by_db()
     # statistics_provider_mapping(
     #     provider_mapping_data_file=provider_mapping_data_file,

(new file, 71 lines; likely sample_documents/sample_documents_12_11.txt, the id list read by batch_initial_document)
@@ -0,0 +1,71 @@
+553242365
+553242368
+553242344
+553242337
+553242341
+553242327
+553242328
+553242318
+553242315
+553242316
+553242317
+553242310
+553242311
+553242309
+553242308
+553241057
+553241054
+553241040
+553241030
+553240841
+553240837
+553240623
+553240621
+553240619
+553240614
+553240613
+553240611
+553240612
+553240607
+553240604
+553224104
+553165689
+553081760
+553081761
+553081718
+553078775
+553078771
+553078766
+553078762
+553077970
+553066906
+553066859
+553043543
+553035628
+553035441
+553035433
+553035423
+553035418
+553034857
+553407290
+553406874
+553317485
+553315608
+553250296
+553250271
+553250232
+553242443
+553242414
+553242416
+553242409
+553242411
+553242406
+553242408
+553242396
+553242392
+553242384
+553242385
+553242386
+553242374
+553242362
+553242363