update a little

This commit is contained in:
Blade He 2025-02-11 11:49:53 -06:00
parent 01e2a0e38d
commit 353bc28599
3 changed files with 89 additions and 20 deletions

11
main.py
View File

@ -1377,7 +1377,7 @@ if __name__ == "__main__":
# doc_source = "aus_prospectus"
# sample_document_list_folder: str = r'./sample_documents/'
# document_list_file: str = "aus_prospectus_100_documents_multi_fund_sample.txt"
# document_list_file: str = "aus_prospectus_52_documents_sample.txt"
# pdf_folder: str = r"/data/aus_prospectus/pdf/"
# output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
# output_extract_data_child_folder: str = r"/data/aus_prospectus/output/extract_data/docs/"
@ -1397,13 +1397,16 @@ if __name__ == "__main__":
# document_sample_file = (
# r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
# )
# document_sample_file = (
# r"./sample_documents/aus_prospectus_17_documents_sample.txt"
# )
document_sample_file = (
r"./sample_documents/aus_prospectus_17_documents_sample.txt"
r"./sample_documents/aus_prospectus_52_documents_sample.txt"
)
with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
# special_doc_id_list: list = [
# "539790009",
# "542300403",
@ -1417,7 +1420,7 @@ if __name__ == "__main__":
# "555377021",
# "555654388",
# ]
special_doc_id_list: list = ["377377369"]
# special_doc_id_list: list = ["377377369"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (

View File

@ -1345,14 +1345,19 @@ def calc_typical_doc_metrics_v1():
)
def merge_aus_document_prospectus_data():
def merge_aus_document_prospectus_data(aus_data_folder: str = r"/data/aus_prospectus/basic_information/from_2024_documents/",
aus_document_mapping_file: str = r"document_mapping.xlsx",
aus_prospectus_data_file: str = r"aus_prospectus_data.xlsx",
document_mapping_sheet: str = "document_mapping",
output_file: str = r"aus_document_prospectus.xlsx",
output_sheet: str = "aus_document_prospectus"):
"""
Merge AUS document and prospectus data.
"""
aus_document_file = r"/data/aus_prospectus/basic_information/from_2024_documents/document_mapping.xlsx"
aus_prospectus_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_prospectus_data.xlsx"
aus_document_data = pd.read_excel(aus_document_file, sheet_name="document_mapping")
aus_prospectus_data = pd.read_excel(aus_prospectus_file)
aus_document_mapping_file = os.path.join(aus_data_folder, aus_document_mapping_file)
aus_prospectus_data_file = os.path.join(aus_data_folder, aus_prospectus_data_file)
aus_document_data = pd.read_excel(aus_document_mapping_file, sheet_name=document_mapping_sheet)
aus_prospectus_data = pd.read_excel(aus_prospectus_data_file)
aus_document_data["DocumentId"] = aus_document_data["DocumentId"].astype(str)
@ -1362,10 +1367,10 @@ def merge_aus_document_prospectus_data():
on=["FundClassId", "EffectiveDate"],
how="left",
)
aus_document_prospectus_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_document_prospectus.xlsx"
aus_document_prospectus_file = os.path.join(aus_data_folder, output_file)
with pd.ExcelWriter(aus_document_prospectus_file) as writer:
aus_document_prospectus_data.to_excel(
writer, sheet_name="aus_document_prospectus", index=False
writer, sheet_name=output_sheet, index=False
)
@ -1384,15 +1389,19 @@ def pdf_exist():
logger.info(f"pdf file exist: {pdf_file_path}")
def prepare_multi_fund_aus_prospectus_document():
data_folder = r"/data/aus_prospectus/basic_information/from_2024_documents/"
document_mapping_file = os.path.join(data_folder, "document_mapping.xlsx")
document_data_file = os.path.join(data_folder, "aus_document_prospectus.xlsx")
def prepare_multi_fund_aus_prospectus_document(data_folder: str = r"/data/aus_prospectus/basic_information/from_2024_documents/",
document_mapping_file: str = "document_mapping.xlsx",
document_mapping_sheet: str = "document_mapping",
document_fund_count_sheet: str = "document_fund_count",
document_data_file: str = "aus_document_prospectus.xlsx",
document_data_sheet: str = "aus_document_prospectus"):
document_mapping_file = os.path.join(data_folder, document_mapping_file)
document_data_file = os.path.join(data_folder, document_data_file)
document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="document_mapping")
document_fund_count_df = pd.read_excel(document_mapping_file, sheet_name="document_fund_count")
document_mapping_df = pd.read_excel(document_mapping_file, sheet_name=document_mapping_sheet)
document_fund_count_df = pd.read_excel(document_mapping_file, sheet_name=document_fund_count_sheet)
document_data_df = pd.read_excel(document_data_file, sheet_name="aus_document_prospectus")
document_data_df = pd.read_excel(document_data_file, sheet_name=document_data_sheet)
document_data_df.fillna("", inplace=True)
# keep only the rows of document_data_df whose SecurityName is not an empty string
document_data_df = document_data_df[document_data_df["SecurityName"] != ""]
@ -1459,8 +1468,13 @@ def prepare_multi_fund_aus_prospectus_document():
if __name__ == "__main__":
# pdf_exist()
prepare_multi_fund_aus_prospectus_document()
# merge_aus_document_prospectus_data()
# prepare_multi_fund_aus_prospectus_document()
merge_aus_document_prospectus_data(aus_data_folder=r"/data/aus_prospectus/basic_information/biz_rule/",
aus_document_mapping_file="phase1_document_mapping.xlsx",
aus_prospectus_data_file="phase1_aus_prospectus_data.xlsx",
document_mapping_sheet="document_mapping",
output_file="phase1_aus_document_prospectus.xlsx",
output_sheet="aus_document_prospectus")
folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
file_name = "doc_ar_data_for_emea_11_06.xlsx"
# get_document_with_all_4_data_points(folder, file_name, None)

View File

@ -0,0 +1,52 @@
420255482
417904228
448577874
466381788
502017146
480854115
481913497
479996918
475093006
492202154
495604527
471624689
397667293
447335411
490252419
498429268
369105359
481728671
466227438
489870941
481909371
495604806
557526143
369219625
377425488
420281919
420333972
425940066
439596540
406568432
411560137
412398851
412698096
413157970
319457827
337806248
341026731
344001344
362917516
471895618
450135866
454032956
471026558
471052717
471608905
476325788
478569026
480716611
480718722
485628750
486915646
486927510