update a little
This commit is contained in:
parent
01e2a0e38d
commit
353bc28599
11
main.py
11
main.py
|
|
@ -1377,7 +1377,7 @@ if __name__ == "__main__":
|
|||
|
||||
# doc_source = "aus_prospectus"
|
||||
# sample_document_list_folder: str = r'./sample_documents/'
|
||||
# document_list_file: str = "aus_prospectus_100_documents_multi_fund_sample.txt"
|
||||
# document_list_file: str = "aus_prospectus_52_documents_sample.txt"
|
||||
# pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
# output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
# output_extract_data_child_folder: str = r"/data/aus_prospectus/output/extract_data/docs/"
|
||||
|
|
@ -1397,13 +1397,16 @@ if __name__ == "__main__":
|
|||
# document_sample_file = (
|
||||
# r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
|
||||
# )
|
||||
# document_sample_file = (
|
||||
# r"./sample_documents/aus_prospectus_17_documents_sample.txt"
|
||||
# )
|
||||
document_sample_file = (
|
||||
r"./sample_documents/aus_prospectus_17_documents_sample.txt"
|
||||
r"./sample_documents/aus_prospectus_52_documents_sample.txt"
|
||||
)
|
||||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
|
||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
|
||||
document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
|
||||
document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
|
||||
# special_doc_id_list: list = [
|
||||
# "539790009",
|
||||
# "542300403",
|
||||
|
|
@ -1417,7 +1420,7 @@ if __name__ == "__main__":
|
|||
# "555377021",
|
||||
# "555654388",
|
||||
# ]
|
||||
special_doc_id_list: list = ["377377369"]
|
||||
# special_doc_id_list: list = ["377377369"]
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
|
|||
|
|
@ -1345,14 +1345,19 @@ def calc_typical_doc_metrics_v1():
|
|||
)
|
||||
|
||||
|
||||
def merge_aus_document_prospectus_data():
|
||||
def merge_aus_document_prospectus_data(aus_data_folder: str = r"/data/aus_prospectus/basic_information/from_2024_documents/",
|
||||
aus_document_mapping_file: str = r"document_mapping.xlsx",
|
||||
aus_prospectus_data_file: str = r"aus_prospectus_data.xlsx",
|
||||
document_mapping_sheet: str = "document_mapping",
|
||||
output_file: str = r"aus_document_prospectus.xlsx",
|
||||
output_sheet: str = "aus_document_prospectus"):
|
||||
"""
|
||||
Merge AUS document and prospectus data.
|
||||
"""
|
||||
aus_document_file = r"/data/aus_prospectus/basic_information/from_2024_documents/document_mapping.xlsx"
|
||||
aus_prospectus_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_prospectus_data.xlsx"
|
||||
aus_document_data = pd.read_excel(aus_document_file, sheet_name="document_mapping")
|
||||
aus_prospectus_data = pd.read_excel(aus_prospectus_file)
|
||||
aus_document_mapping_file = os.path.join(aus_data_folder, aus_document_mapping_file)
|
||||
aus_prospectus_data_file = os.path.join(aus_data_folder, aus_prospectus_data_file)
|
||||
aus_document_data = pd.read_excel(aus_document_mapping_file, sheet_name=document_mapping_sheet)
|
||||
aus_prospectus_data = pd.read_excel(aus_prospectus_data_file)
|
||||
|
||||
aus_document_data["DocumentId"] = aus_document_data["DocumentId"].astype(str)
|
||||
|
||||
|
|
@ -1362,10 +1367,10 @@ def merge_aus_document_prospectus_data():
|
|||
on=["FundClassId", "EffectiveDate"],
|
||||
how="left",
|
||||
)
|
||||
aus_document_prospectus_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_document_prospectus.xlsx"
|
||||
aus_document_prospectus_file = os.path.join(aus_data_folder, output_file)
|
||||
with pd.ExcelWriter(aus_document_prospectus_file) as writer:
|
||||
aus_document_prospectus_data.to_excel(
|
||||
writer, sheet_name="aus_document_prospectus", index=False
|
||||
writer, sheet_name=output_sheet, index=False
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -1384,15 +1389,19 @@ def pdf_exist():
|
|||
logger.info(f"pdf file exist: {pdf_file_path}")
|
||||
|
||||
|
||||
def prepare_multi_fund_aus_prospectus_document():
|
||||
data_folder = r"/data/aus_prospectus/basic_information/from_2024_documents/"
|
||||
document_mapping_file = os.path.join(data_folder, "document_mapping.xlsx")
|
||||
document_data_file = os.path.join(data_folder, "aus_document_prospectus.xlsx")
|
||||
def prepare_multi_fund_aus_prospectus_document(data_folder: str = r"/data/aus_prospectus/basic_information/from_2024_documents/",
|
||||
document_mapping_file: str = "document_mapping.xlsx",
|
||||
document_mapping_sheet: str = "document_mapping",
|
||||
document_fund_count_sheet: str = "document_fund_count",
|
||||
document_data_file: str = "aus_document_prospectus.xlsx",
|
||||
document_data_sheet: str = "aus_document_prospectus"):
|
||||
document_mapping_file = os.path.join(data_folder, document_mapping_file)
|
||||
document_data_file = os.path.join(data_folder, document_data_file)
|
||||
|
||||
document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="document_mapping")
|
||||
document_fund_count_df = pd.read_excel(document_mapping_file, sheet_name="document_fund_count")
|
||||
document_mapping_df = pd.read_excel(document_mapping_file, sheet_name=document_mapping_sheet)
|
||||
document_fund_count_df = pd.read_excel(document_mapping_file, sheet_name=document_fund_count_sheet)
|
||||
|
||||
document_data_df = pd.read_excel(document_data_file, sheet_name="aus_document_prospectus")
|
||||
document_data_df = pd.read_excel(document_data_file, sheet_name=document_data_sheet)
|
||||
document_data_df.fillna("", inplace=True)
|
||||
# get data from document_data_df which SecurityName is not empty string
|
||||
document_data_df = document_data_df[document_data_df["SecurityName"] != ""]
|
||||
|
|
@ -1459,8 +1468,13 @@ def prepare_multi_fund_aus_prospectus_document():
|
|||
|
||||
if __name__ == "__main__":
|
||||
# pdf_exist()
|
||||
prepare_multi_fund_aus_prospectus_document()
|
||||
# merge_aus_document_prospectus_data()
|
||||
# prepare_multi_fund_aus_prospectus_document()
|
||||
merge_aus_document_prospectus_data(aus_data_folder=r"/data/aus_prospectus/basic_information/biz_rule/",
|
||||
aus_document_mapping_file="phase1_document_mapping.xlsx",
|
||||
aus_prospectus_data_file="phase1_aus_prospectus_data.xlsx",
|
||||
document_mapping_sheet="document_mapping",
|
||||
output_file="phase1_aus_document_prospectus.xlsx",
|
||||
output_sheet="aus_document_prospectus")
|
||||
folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
|
||||
file_name = "doc_ar_data_for_emea_11_06.xlsx"
|
||||
# get_document_with_all_4_data_points(folder, file_name, None)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,52 @@
|
|||
420255482
|
||||
417904228
|
||||
448577874
|
||||
466381788
|
||||
502017146
|
||||
480854115
|
||||
481913497
|
||||
479996918
|
||||
475093006
|
||||
492202154
|
||||
495604527
|
||||
471624689
|
||||
397667293
|
||||
447335411
|
||||
490252419
|
||||
498429268
|
||||
369105359
|
||||
481728671
|
||||
466227438
|
||||
489870941
|
||||
481909371
|
||||
495604806
|
||||
557526143
|
||||
369219625
|
||||
377425488
|
||||
420281919
|
||||
420333972
|
||||
425940066
|
||||
439596540
|
||||
406568432
|
||||
411560137
|
||||
412398851
|
||||
412698096
|
||||
413157970
|
||||
319457827
|
||||
337806248
|
||||
341026731
|
||||
344001344
|
||||
362917516
|
||||
471895618
|
||||
450135866
|
||||
454032956
|
||||
471026558
|
||||
471052717
|
||||
471608905
|
||||
476325788
|
||||
478569026
|
||||
480716611
|
||||
480718722
|
||||
485628750
|
||||
486915646
|
||||
486927510
|
||||
Loading…
Reference in New Issue