From 0349033eaf866cc03b041c9838341ae50716b3cf Mon Sep 17 00:00:00 2001 From: Blade He Date: Wed, 6 Nov 2024 16:39:42 -0600 Subject: [PATCH] update for more statistics methods --- main.py | 71 +++++++++++++++++++++++++++-- prepare_data.py | 119 +++++++++++++++++++++++++++++++++++------------- 2 files changed, 156 insertions(+), 34 deletions(-) diff --git a/main.py b/main.py index aba2060..1ca1d8f 100644 --- a/main.py +++ b/main.py @@ -922,13 +922,78 @@ if __name__ == "__main__": "539087870", "536343790" ] + + # document samples 2024-11-06 + # check_db_mapping_doc_id_list = ["546483469", + # "546375582", + # "546375575", + # "546375576", + # "546375577", + # "546375568", + # "546371033", + # "546632761", + # "546632544", + # "546632464", + # "546724583", + # "546724552", + # "546694677", + # "546660422", + # "546638908", + # "546632845", + # "546105299", + # "546085481", + # "546078693", + # "546078650", + # "546289930", + # "546289910", + # "542967371", + # "542798238", + # "546048730", + # "546048143", + # "546047619", + # "546047528", + # "546046730", + # "546919329"] + + # document samples: 30 documents, all with 4 data points + check_db_mapping_doc_id_list = ["479742284", + "501380497", + "501380553", + "501380775", + "501380801", + "501600428", + "501600429", + "501600541", + "501600549", + "503659548", + "506326520", + "507720522", + "507928179", + "508981020", + "509133771", + "509743502", + "514636951", + "514636952", + "514636953", + "514636954", + "514636955", + "514636957", + "514636958", + "514636959", + "514636985", + "514636988", + "514636990", + "514636993", + "514636994", + "539794746" + ] special_doc_id_list = check_db_mapping_doc_id_list - special_doc_id_list = ["532422720"] + # special_doc_id_list = [] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = False - re_run_mapping_data = True - force_save_total_data = False + re_run_mapping_data = False + force_save_total_data = True calculate_metrics = False extract_ways = ["text"] diff --git a/prepare_data.py b/prepare_data.py index c728819..3306c09 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -249,34 +249,54 @@ def statistics_document( doc_id_list = [str(docid) for docid in doc_mapping_data["DocumentId"].unique().tolist()] # statistics document page number - pdf_files = glob(os.path.join(pdf_folder, "*.pdf")) - logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}") - logger.info("statistics document page number") - doc_page_num_list = [] - for pdf_file in tqdm(pdf_files): - pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "") - if pdf_base_name not in doc_id_list: - continue - docid = os.path.basename(pdf_file).split(".")[0] - doc = fitz.open(pdf_file) - page_num = doc.page_count - doc_page_num_list.append({"docid": docid, "page_num": page_num}) - doc.close() - doc_page_num_df = pd.DataFrame(doc_page_num_list) - # order by page_num in descending order - doc_page_num_df = doc_page_num_df.sort_values(by="page_num", ascending=False) - # statistics page_num by describe and transform to DataFrame - doc_page_num_stat_df = get_describe_stat( - doc_page_num_df, "page_num", "doc_page_num" - ) - describe_stat_df_list.append(doc_page_num_stat_df) + # pdf_files = glob(os.path.join(pdf_folder, "*.pdf")) + # logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}") + # logger.info("statistics document page number") + # doc_page_num_list = [] + # for pdf_file in tqdm(pdf_files): + # pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "") + # if pdf_base_name not in doc_id_list: + # continue + # docid = os.path.basename(pdf_file).split(".")[0] + # doc = fitz.open(pdf_file) + # page_num = doc.page_count + # doc_page_num_list.append({"docid": docid, "page_num": page_num}) + # doc.close() + # doc_page_num_df = pd.DataFrame(doc_page_num_list) + # # order by page_num in descending order + # doc_page_num_df = doc_page_num_df.sort_values(by="page_num", ascending=False) + # # statistics page_num by describe and transform to DataFrame + # doc_page_num_stat_df = get_describe_stat( + # doc_page_num_df, "page_num", "doc_page_num" + # ) + # describe_stat_df_list.append(doc_page_num_stat_df) describe_stat_df = pd.concat(describe_stat_df_list) describe_stat_df.reset_index(drop=True, inplace=True) + + + doc_dp_result = get_document_with_all_4_data_points(None, None, doc_mapping_data) + doc_dp_data_list = [] + for doc_id in doc_id_list: + doc_id = int(doc_id) + doc_dp_data = {"DocumentId": doc_id, "tor": 0, "ter": 0, "ogc": 0, "perf_fee": 0} + if doc_id in doc_dp_result["tor"]: + doc_dp_data["tor"] = 1 + if doc_id in doc_dp_result["ter"]: + doc_dp_data["ter"] = 1 + if doc_id in doc_dp_result["ogc"]: + doc_dp_data["ogc"] = 1 + if doc_id in doc_dp_result["perf_fee"]: + doc_dp_data["perf_fee"] = 1 + doc_dp_data_list.append(doc_dp_data) + doc_dp_data_df = pd.DataFrame(doc_dp_data_list) + doc_dp_data_df = doc_dp_data_df.sort_values(by="DocumentId", ascending=True) + doc_dp_data_df.reset_index(drop=True, inplace=True) # save statistics data to excel with pd.ExcelWriter(stat_file) as writer: - doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False) + # doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False) + doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False) doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False) doc_share_class_count.to_excel( writer, sheet_name="doc_share_class_count", index=False @@ -1289,12 +1309,49 @@ def calc_typical_doc_metrics_v1(): final_data_df.to_excel( writer, sheet_name="metrics", index=False ) - + +def get_document_with_all_4_data_points(folder: str, file_name: str, data: pd.DataFrame): + if data is None: + file_path = os.path.join(folder, file_name) + if os.path.exists(file_path): + data = pd.read_excel(file_path, sheet_name="doc_ar_data_in_db") + else: + logger.error(f"Invalid file path: {file_path}") + return + # get document id list which noTor is 0 + noTor_0_doc_id_list = data[data["noTor"] == 0]["DocumentId"].unique().tolist() + + # get document id list which share_noTer is 0 + share_noTer_0_doc_id_list = data[data["share_noTer"] == 0]["DocumentId"].unique().tolist() + + # get document id list which share_noOgc is 0 + share_noOgc_0_doc_id_list = data[data["share_noOgc"] == 0]["DocumentId"].unique().tolist() + + # get document id list which share_noPerfFee is 0 + share_noPerfFee_0_doc_id_list = data[data["share_noPerfFee"] == 0]["DocumentId"].unique().tolist() + + logger.info(f"noTor_0_doc_id_list: {len(noTor_0_doc_id_list)}") + logger.info(f"share_noTer_0_doc_id_list: {len(share_noTer_0_doc_id_list)}") + logger.info(f"share_noOgc_0_doc_id_list: {len(share_noOgc_0_doc_id_list)}") + logger.info(f"share_noPerfFee_0_doc_id_list: {len(share_noPerfFee_0_doc_id_list)}") + + all_4_data_points_doc_id_list = list(set(noTor_0_doc_id_list) & set(share_noTer_0_doc_id_list) & set(share_noOgc_0_doc_id_list) & set(share_noPerfFee_0_doc_id_list)) + + logger.info(f"all_4_data_points_doc_id_list: {len(all_4_data_points_doc_id_list)}") + result = {"tor": noTor_0_doc_id_list, + "ter": share_noTer_0_doc_id_list, + "ogc": share_noOgc_0_doc_id_list, + "perf_fee": share_noPerfFee_0_doc_id_list} + return result + if __name__ == "__main__": + folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/" + file_name = "doc_ar_data_for_emea_11_06.xlsx" + # get_document_with_all_4_data_points(folder, file_name, None) # calc_typical_doc_metrics_v1() - calc_typical_doc_metrics_v2() + # calc_typical_doc_metrics_v2() doc_provider_file_path = ( r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx" @@ -1335,13 +1392,13 @@ if __name__ == "__main__": # pdf_folder) - doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/doc_ar_data_for_final_list_emea_documents.xlsx" - output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/" - # statistics_document(pdf_folder=pdf_folder, - # doc_mapping_file_path=doc_mapping_file_path, - # sheet_name="doc_ar_data_in_db", - # output_folder=output_data_folder, - # output_file="doc_ar_data_statistics.xlsx") + doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_doc_with_all_4_dp/doc_ar_data_with_all_4_dp.xlsx" + output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_doc_with_all_4_dp/" + statistics_document(pdf_folder=pdf_folder, + doc_mapping_file_path=doc_mapping_file_path, + sheet_name="doc_ar_data_in_db", + output_folder=output_data_folder, + output_file="doc_ar_data_with_all_4_dp_statistics.xlsx") # get_document_extracted_share_diff_by_db() # statistics_provider_mapping( # provider_mapping_data_file=provider_mapping_data_file,