From d41fae3dba0b777a2fd94e7929defced0b89fbf2 Mon Sep 17 00:00:00 2001 From: Blade He Date: Fri, 17 Jan 2025 16:26:31 -0600 Subject: [PATCH] prepare for 100 multi-funds document samples --- app_emea_ar.py | 2 + .../aus_prospectus/document_dp_pages.json | 9 -- core/page_filter.py | 57 ++++++- main.py | 143 +++++++++--------- prepare_data.py | 124 +++++++++++---- ...pectus_100_documents_multi_fund_sample.txt | 100 ++++++++++++ 6 files changed, 323 insertions(+), 112 deletions(-) create mode 100644 sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt diff --git a/app_emea_ar.py b/app_emea_ar.py index 5a04592..105be43 100644 --- a/app_emea_ar.py +++ b/app_emea_ar.py @@ -47,11 +47,13 @@ def emea_ar_data_extract(): extract_way = "text" os.makedirs(pdf_folder, exist_ok=True) + os.makedirs(output_pdf_text_folder, exist_ok=True) os.makedirs(output_extract_data_folder, exist_ok=True) os.makedirs(output_mapping_data_folder, exist_ok=True) os.makedirs(drilldown_folder, exist_ok=True) clean_folder(pdf_folder) + clean_folder(output_pdf_text_folder) clean_folder(output_extract_data_folder) clean_folder(output_mapping_data_folder) clean_folder(drilldown_folder) diff --git a/configuration/aus_prospectus/document_dp_pages.json b/configuration/aus_prospectus/document_dp_pages.json index 940dc20..2c63c08 100644 --- a/configuration/aus_prospectus/document_dp_pages.json +++ b/configuration/aus_prospectus/document_dp_pages.json @@ -1,11 +1,2 @@ { - "539790009": [39, 40, 45, 46, 47], - "542300403": [12], - "542306317": [4, 15, 16, 17, 18], - "547567013": [12, 13, 14, 15, 16, 17, 33, 34, 35, 39, 40, 42, 43, 44, 45], - "552505237": [16, 17, 18, 19, 25, 26, 27], - "552505278": [12, 13, 14, 15, 16, 17, 18, 19, 20, 27, 28, 29], - "554431052": [34, 35, 36, 41, 42, 43], - "555377021": [21, 22, 23, 24, 25, 26], - "555654388": [35, 36] } diff --git a/core/page_filter.py b/core/page_filter.py index 534720c..e64bfe6 100644 --- a/core/page_filter.py +++ b/core/page_filter.py @@ -12,10 +12,16 @@ from utils.pdf_util import get_pdf_pages_by_html class FilterPages: def __init__( - self, doc_id: str, pdf_file: str, document_mapping_info_df: pd.DataFrame, doc_source: str = "emea_ar" + self, + doc_id: str, + pdf_file: str, + document_mapping_info_df: pd.DataFrame, + doc_source: str = "emea_ar", + output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/", ) -> None: self.doc_id = doc_id self.pdf_file = pdf_file + self.output_pdf_text_folder = output_pdf_text_folder self.configuration_folder = f"./configuration/{doc_source}/" misc_config_file = os.path.join(self.configuration_folder, "misc_config.json") if os.path.exists(misc_config_file): @@ -45,15 +51,64 @@ class FilterPages: def get_pdf_page_text_dict(self) -> dict: page_text_dict = {} + # if exist page text file, load it + self.page_text_file = "" + pdf2html_page_text_file = "" + pymupdf_page_text_file = "" + if self.output_pdf_text_folder is not None and len(self.output_pdf_text_folder) > 0: + os.makedirs(self.output_pdf_text_folder, exist_ok=True) + pdf2html_output_pdf_text_folder = os.path.join( + self.output_pdf_text_folder, "pdf2html/" + ) + os.makedirs(pdf2html_output_pdf_text_folder, exist_ok=True) + pymupdf_output_pdf_text_folder = os.path.join( + self.output_pdf_text_folder, "pymupdf/" + ) + os.makedirs(pymupdf_output_pdf_text_folder, exist_ok=True) + pdf2html_page_text_file = os.path.join( + pdf2html_output_pdf_text_folder, f"{self.doc_id}_page_text.json" + ) + pymupdf_page_text_file = os.path.join( + pymupdf_output_pdf_text_folder, f"{self.doc_id}_page_text.json" + ) + if os.path.exists(pdf2html_page_text_file): + self.apply_pdf2html = True + self.page_text_file = pdf2html_page_text_file + with open(pdf2html_page_text_file, "r", encoding="utf-8") as f: + page_text_dict = json.load(f) + elif os.path.exists(pymupdf_page_text_file): + self.apply_pdf2html = False + self.page_text_file = pymupdf_page_text_file + with open(pymupdf_page_text_file, "r", encoding="utf-8") as f: + page_text_dict = json.load(f) + else: + pass + if len(page_text_dict.keys()) > 0: + logger.info(f"Load page text from file: {self.page_text_file}") + # transfer the keys of page_text_dict to be int + page_text_dict = {int(k): v for k, v in page_text_dict.items()} + return page_text_dict + if self.apply_pdf2html: page_text_dict = get_pdf_pages_by_html(self.pdf_file, pdf_info_type="pdf_path") if len(page_text_dict.keys()) == 0: pdf_util = PDFUtil(self.pdf_file) success, text, page_text_dict = pdf_util.extract_text() self.apply_pdf2html = False + self.page_text_file = pymupdf_page_text_file + else: + self.page_text_file = pdf2html_page_text_file else: pdf_util = PDFUtil(self.pdf_file) success, text, page_text_dict = pdf_util.extract_text() + self.page_text_file = pymupdf_page_text_file + + if len(self.page_text_file) > 0: + try: + with open(self.page_text_file, "w", encoding="utf-8") as f: + json.dump(page_text_dict, f, ensure_ascii=False, indent=4) + except Exception as e: + logger.error(f"Error: {e}") return page_text_dict def get_configuration_from_file(self) -> dict: diff --git a/main.py b/main.py index 738c211..c30af06 100644 --- a/main.py +++ b/main.py @@ -71,27 +71,13 @@ class EMEA_AR_Parsing: os.makedirs(self.output_mapping_data_folder, exist_ok=True) self.filter_pages = FilterPages( - self.doc_id, self.pdf_file, self.document_mapping_info_df, self.doc_source + self.doc_id, + self.pdf_file, + self.document_mapping_info_df, + self.doc_source, + output_pdf_text_folder ) self.page_text_dict = self.filter_pages.page_text_dict - try: - os.makedirs(output_pdf_text_folder, exist_ok=True) - if self.filter_pages.apply_pdf2html: - output_pdf_text_folder = os.path.join( - output_pdf_text_folder, "pdf2html/" - ) - else: - output_pdf_text_folder = os.path.join( - output_pdf_text_folder, "pymupdf/" - ) - os.makedirs(output_pdf_text_folder, exist_ok=True) - self.page_text_file = os.path.join( - output_pdf_text_folder, f"{self.doc_id}_page_text.json" - ) - with open(self.page_text_file, "w", encoding="utf-8") as f: - json.dump(self.page_text_dict, f, ensure_ascii=False, indent=4) - except Exception as e: - logger.error(f"Error: {e}") self.datapoint_page_info, self.result_details = self.get_datapoint_page_info() self.datapoints = self.get_datapoints_from_datapoint_page_info() @@ -1079,6 +1065,7 @@ def batch_initial_document( document_list_file: str = "sample_document_complex.txt", doc_source: str = "emea_ar", pdf_folder: str = r"/data/emea_ar/pdf/", + output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/", output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/", output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/", ): @@ -1094,6 +1081,7 @@ def batch_initial_document( doc_id=doc_id, doc_source=doc_source, pdf_folder=pdf_folder, + output_pdf_text_folder=output_pdf_text_folder, output_extract_data_folder=output_extract_data_child_folder, output_mapping_data_folder=output_mapping_child_folder, ) @@ -1319,74 +1307,81 @@ def merge_output_data_aus_prospectus( if __name__ == "__main__": - data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx" - document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx" - merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/' - os.makedirs(merged_total_data_folder, exist_ok=True) - data_file_base_name = os.path.basename(data_file_path) - output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name) - merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path) + # data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx" + # document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx" + # merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/' + # os.makedirs(merged_total_data_folder, exist_ok=True) + # data_file_base_name = os.path.basename(data_file_path) + # output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name) + # merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path) + # doc_source = "aus_prospectus" # sample_document_list_folder: str = r'./sample_documents/' - # document_list_file: str = "aus_prospectus.txt" + # document_list_file: str = "aus_prospectus_100_documents_multi_fund_sample.txt" # pdf_folder: str = r"/data/aus_prospectus/pdf/" + # output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" # output_extract_data_child_folder: str = r"/data/aus_prospectus/output/extract_data/docs/" # output_mapping_child_folder: str = r"/data/aus_prospectus/output/mapping_data/docs/" # batch_initial_document(sample_document_list_folder=sample_document_list_folder, # document_list_file=document_list_file, + # doc_source=doc_source, # pdf_folder=pdf_folder, + # output_pdf_text_folder=output_pdf_text_folder, # output_extract_data_child_folder=output_extract_data_child_folder, # output_mapping_child_folder=output_mapping_child_folder) # special_doc_id_list = ["553242411"] - # doc_source = "aus_prospectus" - # if doc_source == "aus_prospectus": - # special_doc_id_list: list = [ - # "539790009", - # "542300403", - # "542301117", - # "542306317", - # "547567013", - # "552505237", - # "552505278", - # "554431052", - # "554851189", - # "555377021", - # "555654388", - # ] - # # special_doc_id_list: list = ["554851189"] - # pdf_folder: str = r"/data/aus_prospectus/pdf/" - # output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" - # output_extract_data_child_folder: str = ( - # r"/data/aus_prospectus/output/extract_data/docs/" - # ) - # output_extract_data_total_folder: str = ( - # r"/data/aus_prospectus/output/extract_data/total/" - # ) - # output_mapping_child_folder: str = ( - # r"/data/aus_prospectus/output/mapping_data/docs/" - # ) - # output_mapping_total_folder: str = ( - # r"/data/aus_prospectus/output/mapping_data/total/" - # ) - # drilldown_folder = r"/data/aus_prospectus/output/drilldown/" - # batch_run_documents( - # doc_source=doc_source, - # special_doc_id_list=special_doc_id_list, - # pdf_folder=pdf_folder, - # output_pdf_text_folder=output_pdf_text_folder, - # output_extract_data_child_folder=output_extract_data_child_folder, - # output_extract_data_total_folder=output_extract_data_total_folder, - # output_mapping_child_folder=output_mapping_child_folder, - # output_mapping_total_folder=output_mapping_total_folder, - # drilldown_folder=drilldown_folder, - # ) - # elif doc_source == "emea_ar": - # special_doc_id_list = ["553242408"] - # batch_run_documents( - # doc_source=doc_source, special_doc_id_list=special_doc_id_list - # ) + doc_source = "aus_prospectus" + if doc_source == "aus_prospectus": + document_sample_file = r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt" + with open(document_sample_file, "r", encoding="utf-8") as f: + special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] + # special_doc_id_list: list = [ + # "539790009", + # "542300403", + # "542301117", + # "542306317", + # "547567013", + # "552505237", + # "552505278", + # "554431052", + # "554851189", + # "555377021", + # "555654388", + # ] + # special_doc_id_list: list = ["554851189"] + pdf_folder: str = r"/data/aus_prospectus/pdf/" + output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" + output_extract_data_child_folder: str = ( + r"/data/aus_prospectus/output/extract_data/docs/" + ) + output_extract_data_total_folder: str = ( + r"/data/aus_prospectus/output/extract_data/total/" + ) + output_mapping_child_folder: str = ( + r"/data/aus_prospectus/output/mapping_data/docs/" + ) + output_mapping_total_folder: str = ( + r"/data/aus_prospectus/output/mapping_data/total/" + ) + drilldown_folder = r"/data/aus_prospectus/output/drilldown/" + batch_run_documents( + doc_source=doc_source, + special_doc_id_list=special_doc_id_list, + pdf_folder=pdf_folder, + output_pdf_text_folder=output_pdf_text_folder, + output_extract_data_child_folder=output_extract_data_child_folder, + output_extract_data_total_folder=output_extract_data_total_folder, + output_mapping_child_folder=output_mapping_child_folder, + output_mapping_total_folder=output_mapping_total_folder, + drilldown_folder=drilldown_folder, + ) + elif doc_source == "emea_ar": + special_doc_id_list = ["553242408"] + batch_run_documents( + doc_source=doc_source, special_doc_id_list=special_doc_id_list + ) # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx" # original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx" diff --git a/prepare_data.py b/prepare_data.py index 969e64b..b5bef69 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -12,7 +12,6 @@ import json_repair from utils.logger import logger from utils.pdf_download import download_pdf_from_documents_warehouse from utils.pdf_util import PDFUtil -from pdf_table_extraction import PDFTableExtraction def get_unique_docids_from_doc_provider_data(doc_provider_file_path: str): @@ -78,23 +77,6 @@ def output_pdf_page_text(pdf_folder: str, output_folder: str): logger.info(f"Successfully extracted text from {pdf_file}") -def extract_pdf_table(pdf_folder: str, output_folder: str): - if pdf_folder is None or len(pdf_folder) == 0 or not os.path.exists(pdf_folder): - logger.error(f"Invalid pdf_folder: {pdf_folder}") - return - if output_folder is None or len(output_folder) == 0: - logger.error(f"Invalid output_folder: {output_folder}") - return - os.makedirs(output_folder, exist_ok=True) - - pdf_files = glob(os.path.join(pdf_folder, "*.pdf")) - logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}") - for pdf_file in pdf_files: - logger.info(f"Start processing {pdf_file}") - pdf_table_extraction = PDFTableExtraction(pdf_file, output_folder) - pdf_table_extraction.extract_tables() - - def analyze_json_error(): text_file = r"/data/emea_ar/output/pdf_table_prompts/445877368_4.txt" with open(text_file, "r", encoding="utf-8") as file: @@ -1385,15 +1367,100 @@ def merge_aus_document_prospectus_data(): aus_document_prospectus_data.to_excel( writer, sheet_name="aus_document_prospectus", index=False ) + +def pdf_exist(): + data_folder = r"/data/aus_prospectus/basic_information/from_2024_documents/" + data_file = os.path.join(data_folder, "aus_100_document_prospectus_multi_fund.xlsx") + percentile_result_df = pd.read_excel(data_file, sheet_name="percentile_result") + document_id_list = percentile_result_df["DocumentId"].unique().tolist() + + pdf_doc_path = r"/data/aus_prospectus/pdf/" + for doc_id in document_id_list: + pdf_file_path = os.path.join(pdf_doc_path, f"{doc_id}.pdf") + if not os.path.exists(pdf_file_path): + logger.error(f"pdf file not exist: {pdf_file_path}") + else: + logger.info(f"pdf file exist: {pdf_file_path}") + -def get_pdf_2_html(): - pass +def prepare_multi_fund_aus_prospectus_document(): + data_folder = r"/data/aus_prospectus/basic_information/from_2024_documents/" + document_mapping_file = os.path.join(data_folder, "document_mapping.xlsx") + document_data_file = os.path.join(data_folder, "aus_document_prospectus.xlsx") + + document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="document_mapping") + document_fund_count_df = pd.read_excel(document_mapping_file, sheet_name="document_fund_count") + + document_data_df = pd.read_excel(document_data_file, sheet_name="aus_document_prospectus") + document_data_df.fillna("", inplace=True) + # get data from document_data_df which SecurityName is not empty string + document_data_df = document_data_df[document_data_df["SecurityName"] != ""] + document_id_list = document_data_df["DocumentId"].unique().tolist() + + # get document which fund count > 1 + document_fund_count_df = document_fund_count_df[document_fund_count_df["DocumentId"].isin(document_id_list)] + document_fund_count_df = document_fund_count_df[document_fund_count_df["DistinctFundCount"] > 1] + document_fund_count_df = document_fund_count_df.sort_values(by="DistinctFundCount", ascending=False) + # Calculate percentile + percentiles = [0, 0.3, 0.6, 1] + quantile_values = document_fund_count_df['DistinctFundCount'].quantile(percentiles) + + # Group by percentile + bins = [quantile_values[0], quantile_values[0.3], quantile_values[0.6], quantile_values[1]] + document_fund_count_df['Percentile_Group'] = pd.cut(document_fund_count_df['DistinctFundCount'], bins=bins, labels=["0-30", "30-60", "60-100"], include_lowest=True) + + # Get relevant samples based on percentile group + percentile_result = pd.DataFrame() + for group, count in zip(["0-30", "30-60", "60-100"], [30, 30, 40]): + group_df = document_fund_count_df[document_fund_count_df['Percentile_Group'] == group] + sampled_df = group_df.sample(n=min(len(group_df), count), random_state=42) + percentile_result = pd.concat([percentile_result, sampled_df], ignore_index=True) + percentile_result.reset_index(drop=True, inplace=True) + document_id_list = percentile_result["DocumentId"].unique().tolist() + final_document_mapping_df = document_mapping_df[document_mapping_df["DocumentId"].isin(document_id_list)] + # order by DocumentId, FundLegalName, FundClassLegalName + final_document_mapping_df = final_document_mapping_df.sort_values(by=["DocumentId", "FundLegalName", "FundClassLegalName"], ascending=True) + final_document_mapping_df.reset_index(drop=True, inplace=True) + + # get CompanyId, CompanyName from final_document_mapping_df + final_document_provider_df = final_document_mapping_df[["CompanyId", "CompanyName"]].drop_duplicates() + # order by CompanyName + final_document_provider_df = final_document_provider_df.sort_values(by="CompanyName", ascending=True) + final_document_provider_df.reset_index(drop=True, inplace=True) + + final_document_data_df = document_data_df[document_data_df["DocumentId"].isin(document_id_list)] + # order by DocumentId, FundLegalName, FundClassLegalName + final_document_data_df = final_document_data_df.sort_values(by=["DocumentId", "FundLegalName", "FundClassLegalName"], ascending=True) + final_document_data_df.reset_index(drop=True, inplace=True) + + output_file = os.path.join(data_folder, "aus_100_document_prospectus_multi_fund.xlsx") + with pd.ExcelWriter(output_file) as writer: + final_document_mapping_df.to_excel( + writer, sheet_name="document_mapping", index=False + ) + final_document_provider_df.to_excel( + writer, sheet_name="document_provider", index=False + ) + final_document_data_df.to_excel( + writer, sheet_name="aus_document_data", index=False + ) + percentile_result.to_excel( + writer, sheet_name="percentile_result", index=False + ) + output_sample_document_file = os.path.join(r"./sample_documents/", + "aus_prospectus_100_documents_multi_fund_sample.txt") + # output document id to txt file + with open(output_sample_document_file, "w") as f: + for doc_id in document_id_list: + f.write(f"{doc_id}\n") if __name__ == "__main__": - merge_aus_document_prospectus_data() + # pdf_exist() + prepare_multi_fund_aus_prospectus_document() + # merge_aus_document_prospectus_data() folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/" file_name = "doc_ar_data_for_emea_11_06.xlsx" # get_document_with_all_4_data_points(folder, file_name, None) @@ -1447,18 +1514,19 @@ if __name__ == "__main__": output_data_folder = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/" output_file="doc_ar_data_sample_documents_12_11_statistics.xlsx" - pdf_folder = r"/data/illume/japan_prospectus/pdf/" - doc_ar_data_file_path = None - doc_mapping_data_file_path = r"/data/illume/japan_prospectus/materials/document_mapping.xlsx" - output_data_folder = r"/data/illume/japan_prospectus/materials/" - output_file = "japan_prospectus_statistics.xlsx" + # pdf_folder = r"/data/aus_prospectus/pdf/" + # doc_ar_data_file_path = None + # doc_mapping_data_file_path = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" + # output_data_folder = r"/data/aus_prospectus/basic_information/from_2024_documents/" + # output_file = "aus_100_document_prospectus_multi_fund_statistics.xlsx" # statistics_document(pdf_folder=pdf_folder, # doc_mapping_file_path=doc_mapping_data_file_path, # doc_ar_data_file_path=doc_ar_data_file_path, - # mapping_sheet_name="Sheet1", - # ar_data_sheet_name="doc_ar_data_in_db", + # mapping_sheet_name="document_mapping", + # ar_data_sheet_name="aus_document_data", # output_folder=output_data_folder, # output_file=output_file) + # get_document_extracted_share_diff_by_db() # statistics_provider_mapping( # provider_mapping_data_file=provider_mapping_data_file, diff --git a/sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt b/sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt new file mode 100644 index 0000000..56b7c04 --- /dev/null +++ b/sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt @@ -0,0 +1,100 @@ +539559156 +544218181 +541357691 +539601979 +518496603 +498429337 +515319827 +500162212 +557034899 +520954335 +555654388 +521324579 +539791416 +520663234 +534711945 +551941891 +509391262 +521618542 +557362556 +556248266 +497151360 +521625399 +539559165 +539601349 +557362553 +524631914 +530101994 +533499706 +557526104 +521618713 +536014247 +534287530 +553449194 +526644754 +550769189 +526200526 +539604078 +535821734 +511351053 +515041837 +550026958 +554124220 +539794750 +539580710 +526200513 +557065365 +551941902 +535827969 +535825845 +557526108 +544886026 +539794737 +540028470 +552995727 +517738771 +521630976 +539601982 +529881356 +546269437 +521822717 +515499515 +521617820 +539791633 +524248225 +492008222 +522608195 +515978340 +560485097 +546289968 +558726012 +547567013 +492202154 +520698753 +540453691 +539791362 +535825766 +534287518 +552727485 +521617806 +539266817 +557362542 +522879836 +535821752 +528208771 +518492363 +552505270 +502017146 +545500792 +539604126 +503209562 +521822710 +550533961 +513111748 +545750078 +539581666 +521609521 +542630657 +491033056 +560260579 +521618962