From 353bc2859910170a45fd66ba5b29ff98fb7c7a50 Mon Sep 17 00:00:00 2001 From: Blade He Date: Tue, 11 Feb 2025 11:49:53 -0600 Subject: [PATCH] update a little --- main.py | 11 ++-- prepare_data.py | 46 ++++++++++------ .../aus_prospectus_52_documents_sample.txt | 52 +++++++++++++++++++ 3 files changed, 89 insertions(+), 20 deletions(-) create mode 100644 sample_documents/aus_prospectus_52_documents_sample.txt diff --git a/main.py b/main.py index 4074d4c..89dba42 100644 --- a/main.py +++ b/main.py @@ -1377,7 +1377,7 @@ if __name__ == "__main__": # doc_source = "aus_prospectus" # sample_document_list_folder: str = r'./sample_documents/' - # document_list_file: str = "aus_prospectus_100_documents_multi_fund_sample.txt" + # document_list_file: str = "aus_prospectus_52_documents_sample.txt" # pdf_folder: str = r"/data/aus_prospectus/pdf/" # output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" # output_extract_data_child_folder: str = r"/data/aus_prospectus/output/extract_data/docs/" @@ -1397,13 +1397,16 @@ if __name__ == "__main__": # document_sample_file = ( # r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt" # ) + # document_sample_file = ( + # r"./sample_documents/aus_prospectus_17_documents_sample.txt" + # ) document_sample_file = ( - r"./sample_documents/aus_prospectus_17_documents_sample.txt" + r"./sample_documents/aus_prospectus_52_documents_sample.txt" ) with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] # document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" - document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" + document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx" # special_doc_id_list: list = [ # "539790009", # "542300403", @@ -1417,7 +1420,7 @@ if __name__ == "__main__": # "555377021", # "555654388", # ] - special_doc_id_list: list = ["377377369"] + # special_doc_id_list: list = ["377377369"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( diff --git a/prepare_data.py b/prepare_data.py index b5bef69..4ded59f 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -1345,14 +1345,19 @@ def calc_typical_doc_metrics_v1(): ) -def merge_aus_document_prospectus_data(): +def merge_aus_document_prospectus_data(aus_data_folder: str = r"/data/aus_prospectus/basic_information/from_2024_documents/", + aus_document_mapping_file: str = r"document_mapping.xlsx", + aus_prospectus_data_file: str = r"aus_prospectus_data.xlsx", + document_mapping_sheet: str = "document_mapping", + output_file: str = r"aus_document_prospectus.xlsx", + output_sheet: str = "aus_document_prospectus"): """ Merge AUS document and prospectus data. """ - aus_document_file = r"/data/aus_prospectus/basic_information/from_2024_documents/document_mapping.xlsx" - aus_prospectus_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_prospectus_data.xlsx" - aus_document_data = pd.read_excel(aus_document_file, sheet_name="document_mapping") - aus_prospectus_data = pd.read_excel(aus_prospectus_file) + aus_document_mapping_file = os.path.join(aus_data_folder, aus_document_mapping_file) + aus_prospectus_data_file = os.path.join(aus_data_folder, aus_prospectus_data_file) + aus_document_data = pd.read_excel(aus_document_mapping_file, sheet_name=document_mapping_sheet) + aus_prospectus_data = pd.read_excel(aus_prospectus_data_file) aus_document_data["DocumentId"] = aus_document_data["DocumentId"].astype(str) @@ -1362,10 +1367,10 @@ def merge_aus_document_prospectus_data(): on=["FundClassId", "EffectiveDate"], how="left", ) - aus_document_prospectus_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_document_prospectus.xlsx" + aus_document_prospectus_file = os.path.join(aus_data_folder, output_file) with pd.ExcelWriter(aus_document_prospectus_file) as writer: aus_document_prospectus_data.to_excel( - writer, sheet_name="aus_document_prospectus", index=False + writer, sheet_name=output_sheet, index=False ) @@ -1384,15 +1389,19 @@ def pdf_exist(): logger.info(f"pdf file exist: {pdf_file_path}") -def prepare_multi_fund_aus_prospectus_document(): - data_folder = r"/data/aus_prospectus/basic_information/from_2024_documents/" - document_mapping_file = os.path.join(data_folder, "document_mapping.xlsx") - document_data_file = os.path.join(data_folder, "aus_document_prospectus.xlsx") +def prepare_multi_fund_aus_prospectus_document(data_folder: str = r"/data/aus_prospectus/basic_information/from_2024_documents/", + document_mapping_file: str = "document_mapping.xlsx", + document_mapping_sheet: str = "document_mapping", + document_fund_count_sheet: str = "document_fund_count", + document_data_file: str = "aus_document_prospectus.xlsx", + document_data_sheet: str = "aus_document_prospectus"): + document_mapping_file = os.path.join(data_folder, document_mapping_file) + document_data_file = os.path.join(data_folder, document_data_file) - document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="document_mapping") - document_fund_count_df = pd.read_excel(document_mapping_file, sheet_name="document_fund_count") + document_mapping_df = pd.read_excel(document_mapping_file, sheet_name=document_mapping_sheet) + document_fund_count_df = pd.read_excel(document_mapping_file, sheet_name=document_fund_count_sheet) - document_data_df = pd.read_excel(document_data_file, sheet_name="aus_document_prospectus") + document_data_df = pd.read_excel(document_data_file, sheet_name=document_data_sheet) document_data_df.fillna("", inplace=True) # get data from document_data_df which SecurityName is not empty string document_data_df = document_data_df[document_data_df["SecurityName"] != ""] @@ -1459,8 +1468,13 @@ def prepare_multi_fund_aus_prospectus_document(): if __name__ == "__main__": # pdf_exist() - prepare_multi_fund_aus_prospectus_document() - # merge_aus_document_prospectus_data() + # prepare_multi_fund_aus_prospectus_document() + merge_aus_document_prospectus_data(aus_data_folder=r"/data/aus_prospectus/basic_information/biz_rule/", + aus_document_mapping_file="phase1_document_mapping.xlsx", + aus_prospectus_data_file="phase1_aus_prospectus_data.xlsx", + document_mapping_sheet="document_mapping", + output_file="phase1_aus_document_prospectus.xlsx", + output_sheet="aus_document_prospectus") folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/" file_name = "doc_ar_data_for_emea_11_06.xlsx" # get_document_with_all_4_data_points(folder, file_name, None) diff --git a/sample_documents/aus_prospectus_52_documents_sample.txt b/sample_documents/aus_prospectus_52_documents_sample.txt new file mode 100644 index 0000000..64cdaab --- /dev/null +++ b/sample_documents/aus_prospectus_52_documents_sample.txt @@ -0,0 +1,52 @@ +420255482 +417904228 +448577874 +466381788 +502017146 +480854115 +481913497 +479996918 +475093006 +492202154 +495604527 +471624689 +397667293 +447335411 +490252419 +498429268 +369105359 +481728671 +466227438 +489870941 +481909371 +495604806 +557526143 +369219625 +377425488 +420281919 +420333972 +425940066 +439596540 +406568432 +411560137 +412398851 +412698096 +413157970 +319457827 +337806248 +341026731 +344001344 +362917516 +471895618 +450135866 +454032956 +471026558 +471052717 +471608905 +476325788 +478569026 +480716611 +480718722 +485628750 +486915646 +486927510 \ No newline at end of file