diff --git a/.gitignore b/.gitignore index d1f36cc..38bc785 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ /test_calc_metrics.py /test_metrics /data +/sample_documents/japan_prospectus.txt diff --git a/drilldown_practice.py b/drilldown_practice.py index a536a32..48742a6 100644 --- a/drilldown_practice.py +++ b/drilldown_practice.py @@ -10,11 +10,7 @@ from utils.logger import logger from utils.pdf_util import PDFUtil -def drilldown_documents(): - # doc_id: str, - pdf_folder = r"/data/emea_ar/pdf/" - drilldown_folder = r"/data/emea_ar/output/drilldown/" - extract_data_folder = r'/data/emea_ar/output/extract_data/docs/by_text/json/' +def drilldown_documents(pdf_folder: str, extract_data_folder: str, drilldown_folder: str): extract_files = glob(extract_data_folder + '*.json') for index, json_file in enumerate(tqdm(extract_files)): @@ -156,5 +152,8 @@ def calculate_metrics(): if __name__ == "__main__": + pdf_folder = r"/data/emea_ar/pdf/" + drilldown_folder = r"/data/emea_ar/output/drilldown/" + extract_data_folder = r'/data/emea_ar/output/extract_data/docs/by_text/json/' drilldown_documents(pdf_folder=pdf_folder, extract_data_folder=extract_data_folder, drilldown_folder=drilldown_folder) # calculate_metrics() \ No newline at end of file diff --git a/main.py b/main.py index 7eea57b..f7867e1 100644 --- a/main.py +++ b/main.py @@ -935,11 +935,11 @@ def batch_run_documents(): def batch_initial_document(): sample_document_list_folder = r'./sample_documents/' - document_list_file = os.path.join(sample_document_list_folder, "sample_documents_12_11.txt") + document_list_file = os.path.join(sample_document_list_folder, "japan_prospectus.txt") with open(document_list_file, "r", encoding="utf-8") as f: doc_id_list = f.readlines() doc_id_list = [doc_id.strip() for doc_id in doc_id_list] - pdf_folder = r"/data/emea_ar/pdf/" + pdf_folder = r"/data/illume/japan_prospectus/pdf/" page_filter_ground_truth_file = ( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" ) @@ -952,9 +952,88 @@ def batch_initial_document(): 
output_extract_data_folder=output_extract_data_child_folder, output_mapping_data_folder=output_mapping_child_folder) + +def merge_output_data(data_file_path: str, + document_mapping_file: str, + output_data_file_path: str): + data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data") + document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="doc_date") + # set doc_id to be string type + data_df["doc_id"] = data_df["doc_id"].astype(str) + document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str) + """ + doc_id page_index raw_name datapoint value raw_check comment investment_type investment_id investment_name similarity + 553242368 344 Deutsche MSCI World Index Fund tor 61 33 FS0000AY1Y Xtrackers MSCI World Index Fund 0.75 + 553242368 344 db x-trackers EUR Liquid Corporate 12.5 UCITS ETF - Klasse 1C ter 0.35 1 F000018PY1 Xtrackers EUR Corporate Green Bond UCITS ETF 1C 0.462 + """ + doc_id_list = data_df["doc_id"].unique().tolist() + data_point_dict = { + "tor": "TurnoverRatio", + "ter": "NetExpenseRatio", + "ogc": "OngoingCharge", + "performance_fee": "PerformanceFee" + } + total_data_list = [] + for doc_id in tqdm(doc_id_list): + doc_data_list = [] + doc_data_df = data_df[data_df["doc_id"] == doc_id] + doc_date = str(document_mapping_df[document_mapping_df["DocumentId"] == doc_id]["EffectiveDate"].values[0])[0:10] + exist_raw_name_list = [] + for index, row in doc_data_df.iterrows(): + doc_id = str(row["doc_id"]) + page_index = int(row["page_index"]) + raw_name = str(row["raw_name"]) + datapoint = str(row["datapoint"]) + value = row["value"] + investment_type = row["investment_type"] + investment_id = row["investment_id"] + investment_name = row["investment_name"] + + exist = False + for exist_raw_name_info in exist_raw_name_list: + exist_raw_name = exist_raw_name_info["raw_name"] + exist_investment_type = exist_raw_name_info["investment_type"] + if exist_raw_name == raw_name and exist_investment_type == 
investment_type: + exist = True + break + if not exist: + data = { + "DocumentId": doc_id, + "investment_type": investment_type, + "investment_id": investment_id, + "investment_name": investment_name, + "EffectiveDate": doc_date, + "page_index": [], + "RawName": raw_name, + "NetExpenseRatio": "", + "OngoingCharge": "", + "TurnoverRatio": "", + "PerformanceFee": "" + } + exist_raw_name_list.append({"raw_name": raw_name, "investment_type": investment_type}) + doc_data_list.append(data) + # find data from total_data_list by raw_name + for data in doc_data_list: + if data["RawName"] == raw_name and data["investment_type"] == investment_type: + update_key = data_point_dict[datapoint] + data[update_key] = value + if page_index not in data["page_index"]: + data["page_index"].append(page_index) + break + total_data_list.extend(doc_data_list) + total_data_df = pd.DataFrame(total_data_list) + total_data_df.fillna("", inplace=True) + with pd.ExcelWriter(output_data_file_path) as writer: + total_data_df.to_excel(writer, index=False, sheet_name="total_data") + + if __name__ == "__main__": + data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_mapping_data_info_44_documents_by_text_20241211185546.xlsx" + document_mapping_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx" + output_data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_merged_data_info.xlsx" + merge_output_data(data_file_path, document_mapping_file_path, output_data_file_path) # batch_initial_document() - batch_run_documents() + # batch_run_documents() # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx" # original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx" diff --git a/prepare_data.py 
b/prepare_data.py index f80990c..1f6618a 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -276,35 +276,39 @@ def statistics_document( describe_stat_df = pd.concat(describe_stat_df_list) describe_stat_df.reset_index(drop=True, inplace=True) - doc_ar_data = pd.read_excel(doc_ar_data_file_path, sheet_name=ar_data_sheet_name) - doc_dp_result = get_document_with_all_4_data_points(None, None, doc_ar_data) - doc_dp_data_list = [] - for doc_id in doc_id_list: - doc_id = int(doc_id) - doc_dp_data = {"DocumentId": doc_id, "tor": 0, "ter": 0, "ogc": 0, "perf_fee": 0} - if doc_id in doc_dp_result["tor"]: - doc_dp_data["tor"] = 1 - if doc_id in doc_dp_result["ter"]: - doc_dp_data["ter"] = 1 - if doc_id in doc_dp_result["ogc"]: - doc_dp_data["ogc"] = 1 - if doc_id in doc_dp_result["perf_fee"]: - doc_dp_data["perf_fee"] = 1 - doc_dp_data_list.append(doc_dp_data) - doc_dp_data_df = pd.DataFrame(doc_dp_data_list) - doc_dp_data_df = doc_dp_data_df.sort_values(by="DocumentId", ascending=True) - doc_dp_data_df.reset_index(drop=True, inplace=True) + doc_dp_data_df = None + if doc_ar_data_file_path is not None and os.path.exists(doc_ar_data_file_path): + doc_ar_data = pd.read_excel(doc_ar_data_file_path, sheet_name=ar_data_sheet_name) + doc_dp_result = get_document_with_all_4_data_points(None, None, doc_ar_data) + doc_dp_data_list = [] + for doc_id in doc_id_list: + doc_id = int(doc_id) + doc_dp_data = {"DocumentId": doc_id, "tor": 0, "ter": 0, "ogc": 0, "perf_fee": 0} + if doc_id in doc_dp_result["tor"]: + doc_dp_data["tor"] = 1 + if doc_id in doc_dp_result["ter"]: + doc_dp_data["ter"] = 1 + if doc_id in doc_dp_result["ogc"]: + doc_dp_data["ogc"] = 1 + if doc_id in doc_dp_result["perf_fee"]: + doc_dp_data["perf_fee"] = 1 + doc_dp_data_list.append(doc_dp_data) + doc_dp_data_df = pd.DataFrame(doc_dp_data_list) + doc_dp_data_df = doc_dp_data_df.sort_values(by="DocumentId", ascending=True) + doc_dp_data_df.reset_index(drop=True, inplace=True) # set all of DocumentId in DataFrame 
objects to be string type doc_page_num_df["DocumentId"] = doc_page_num_df["DocumentId"].astype(str) doc_fund_count["DocumentId"] = doc_fund_count["DocumentId"].astype(str) doc_share_class_count["DocumentId"] = doc_share_class_count["DocumentId"].astype(str) - doc_dp_data_df["DocumentId"] = doc_dp_data_df["DocumentId"].astype(str) + if doc_dp_data_df is not None: + doc_dp_data_df["DocumentId"] = doc_dp_data_df["DocumentId"].astype(str) # merge statistics data for doc_page_num_df, doc_dp_data_df, doc_fund_count, doc_share_class_count based on DocumentId doc_page_num_df = doc_page_num_df.merge(doc_fund_count, on="DocumentId", how="left") doc_page_num_df = doc_page_num_df.merge(doc_share_class_count, on="DocumentId", how="left") - doc_page_num_df = doc_page_num_df.merge(doc_dp_data_df, on="DocumentId", how="left") + if doc_dp_data_df is not None: + doc_page_num_df = doc_page_num_df.merge(doc_dp_data_df, on="DocumentId", how="left") # save statistics data to excel with pd.ExcelWriter(stat_file) as writer: @@ -1395,7 +1399,10 @@ if __name__ == "__main__": # sheet_name="Sheet1", # doc_id_column="Document Id", # pdf_path=pdf_folder) - # output_pdf_page_text(pdf_folder, output_folder) + + pdf_folder = r"/data/illume/japan_prospectus/pdf/" + output_folder = r"/data/illume/japan_prospectus/pdf_txt/" + output_pdf_page_text(pdf_folder, output_folder) # extract_pdf_table(pdf_folder, output_folder) # analyze_json_error() @@ -1409,13 +1416,20 @@ if __name__ == "__main__": doc_ar_data_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_ar_data_12_11.xlsx" doc_mapping_data_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx" output_data_folder = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/" - statistics_document(pdf_folder=pdf_folder, - doc_mapping_file_path=doc_mapping_data_file_path, - 
doc_ar_data_file_path=doc_ar_data_file_path, - mapping_sheet_name="Sheet1", - ar_data_sheet_name="doc_ar_data_in_db", - output_folder=output_data_folder, - output_file="doc_ar_data_sample_documents_12_11_statistics.xlsx") + output_file="doc_ar_data_sample_documents_12_11_statistics.xlsx" + + pdf_folder = r"/data/illume/japan_prospectus/pdf/" + doc_ar_data_file_path = None + doc_mapping_data_file_path = r"/data/illume/japan_prospectus/materials/document_mapping.xlsx" + output_data_folder = r"/data/illume/japan_prospectus/materials/" + output_file = "japan_prospectus_statistics.xlsx" + # statistics_document(pdf_folder=pdf_folder, + # doc_mapping_file_path=doc_mapping_data_file_path, + # doc_ar_data_file_path=doc_ar_data_file_path, + # mapping_sheet_name="Sheet1", + # ar_data_sheet_name="doc_ar_data_in_db", + # output_folder=output_data_folder, + # output_file=output_file) # get_document_extracted_share_diff_by_db() # statistics_provider_mapping( # provider_mapping_data_file=provider_mapping_data_file,