update a little
This commit is contained in:
parent
01e2a0e38d
commit
353bc28599
11
main.py
11
main.py
|
|
@ -1377,7 +1377,7 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
# doc_source = "aus_prospectus"
|
# doc_source = "aus_prospectus"
|
||||||
# sample_document_list_folder: str = r'./sample_documents/'
|
# sample_document_list_folder: str = r'./sample_documents/'
|
||||||
# document_list_file: str = "aus_prospectus_100_documents_multi_fund_sample.txt"
|
# document_list_file: str = "aus_prospectus_52_documents_sample.txt"
|
||||||
# pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
# pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
# output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
# output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
# output_extract_data_child_folder: str = r"/data/aus_prospectus/output/extract_data/docs/"
|
# output_extract_data_child_folder: str = r"/data/aus_prospectus/output/extract_data/docs/"
|
||||||
|
|
@ -1397,13 +1397,16 @@ if __name__ == "__main__":
|
||||||
# document_sample_file = (
|
# document_sample_file = (
|
||||||
# r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
|
# r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
|
||||||
# )
|
# )
|
||||||
|
# document_sample_file = (
|
||||||
|
# r"./sample_documents/aus_prospectus_17_documents_sample.txt"
|
||||||
|
# )
|
||||||
document_sample_file = (
|
document_sample_file = (
|
||||||
r"./sample_documents/aus_prospectus_17_documents_sample.txt"
|
r"./sample_documents/aus_prospectus_52_documents_sample.txt"
|
||||||
)
|
)
|
||||||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
|
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
|
||||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
|
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
|
||||||
document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
|
document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
|
||||||
# special_doc_id_list: list = [
|
# special_doc_id_list: list = [
|
||||||
# "539790009",
|
# "539790009",
|
||||||
# "542300403",
|
# "542300403",
|
||||||
|
|
@ -1417,7 +1420,7 @@ if __name__ == "__main__":
|
||||||
# "555377021",
|
# "555377021",
|
||||||
# "555654388",
|
# "555654388",
|
||||||
# ]
|
# ]
|
||||||
special_doc_id_list: list = ["377377369"]
|
# special_doc_id_list: list = ["377377369"]
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
|
||||||
|
|
@ -1345,14 +1345,19 @@ def calc_typical_doc_metrics_v1():
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def merge_aus_document_prospectus_data():
|
def merge_aus_document_prospectus_data(aus_data_folder: str = r"/data/aus_prospectus/basic_information/from_2024_documents/",
|
||||||
|
aus_document_mapping_file: str = r"document_mapping.xlsx",
|
||||||
|
aus_prospectus_data_file: str = r"aus_prospectus_data.xlsx",
|
||||||
|
document_mapping_sheet: str = "document_mapping",
|
||||||
|
output_file: str = r"aus_document_prospectus.xlsx",
|
||||||
|
output_sheet: str = "aus_document_prospectus"):
|
||||||
"""
|
"""
|
||||||
Merge AUS document and prospectus data.
|
Merge AUS document and prospectus data.
|
||||||
"""
|
"""
|
||||||
aus_document_file = r"/data/aus_prospectus/basic_information/from_2024_documents/document_mapping.xlsx"
|
aus_document_mapping_file = os.path.join(aus_data_folder, aus_document_mapping_file)
|
||||||
aus_prospectus_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_prospectus_data.xlsx"
|
aus_prospectus_data_file = os.path.join(aus_data_folder, aus_prospectus_data_file)
|
||||||
aus_document_data = pd.read_excel(aus_document_file, sheet_name="document_mapping")
|
aus_document_data = pd.read_excel(aus_document_mapping_file, sheet_name=document_mapping_sheet)
|
||||||
aus_prospectus_data = pd.read_excel(aus_prospectus_file)
|
aus_prospectus_data = pd.read_excel(aus_prospectus_data_file)
|
||||||
|
|
||||||
aus_document_data["DocumentId"] = aus_document_data["DocumentId"].astype(str)
|
aus_document_data["DocumentId"] = aus_document_data["DocumentId"].astype(str)
|
||||||
|
|
||||||
|
|
@ -1362,10 +1367,10 @@ def merge_aus_document_prospectus_data():
|
||||||
on=["FundClassId", "EffectiveDate"],
|
on=["FundClassId", "EffectiveDate"],
|
||||||
how="left",
|
how="left",
|
||||||
)
|
)
|
||||||
aus_document_prospectus_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_document_prospectus.xlsx"
|
aus_document_prospectus_file = os.path.join(aus_data_folder, output_file)
|
||||||
with pd.ExcelWriter(aus_document_prospectus_file) as writer:
|
with pd.ExcelWriter(aus_document_prospectus_file) as writer:
|
||||||
aus_document_prospectus_data.to_excel(
|
aus_document_prospectus_data.to_excel(
|
||||||
writer, sheet_name="aus_document_prospectus", index=False
|
writer, sheet_name=output_sheet, index=False
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1384,15 +1389,19 @@ def pdf_exist():
|
||||||
logger.info(f"pdf file exist: {pdf_file_path}")
|
logger.info(f"pdf file exist: {pdf_file_path}")
|
||||||
|
|
||||||
|
|
||||||
def prepare_multi_fund_aus_prospectus_document():
|
def prepare_multi_fund_aus_prospectus_document(data_folder: str = r"/data/aus_prospectus/basic_information/from_2024_documents/",
|
||||||
data_folder = r"/data/aus_prospectus/basic_information/from_2024_documents/"
|
document_mapping_file: str = "document_mapping.xlsx",
|
||||||
document_mapping_file = os.path.join(data_folder, "document_mapping.xlsx")
|
document_mapping_sheet: str = "document_mapping",
|
||||||
document_data_file = os.path.join(data_folder, "aus_document_prospectus.xlsx")
|
document_fund_count_sheet: str = "document_fund_count",
|
||||||
|
document_data_file: str = "aus_document_prospectus.xlsx",
|
||||||
|
document_data_sheet: str = "aus_document_prospectus"):
|
||||||
|
document_mapping_file = os.path.join(data_folder, document_mapping_file)
|
||||||
|
document_data_file = os.path.join(data_folder, document_data_file)
|
||||||
|
|
||||||
document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="document_mapping")
|
document_mapping_df = pd.read_excel(document_mapping_file, sheet_name=document_mapping_sheet)
|
||||||
document_fund_count_df = pd.read_excel(document_mapping_file, sheet_name="document_fund_count")
|
document_fund_count_df = pd.read_excel(document_mapping_file, sheet_name=document_fund_count_sheet)
|
||||||
|
|
||||||
document_data_df = pd.read_excel(document_data_file, sheet_name="aus_document_prospectus")
|
document_data_df = pd.read_excel(document_data_file, sheet_name=document_data_sheet)
|
||||||
document_data_df.fillna("", inplace=True)
|
document_data_df.fillna("", inplace=True)
|
||||||
# get data from document_data_df which SecurityName is not empty string
|
# get data from document_data_df which SecurityName is not empty string
|
||||||
document_data_df = document_data_df[document_data_df["SecurityName"] != ""]
|
document_data_df = document_data_df[document_data_df["SecurityName"] != ""]
|
||||||
|
|
@ -1459,8 +1468,13 @@ def prepare_multi_fund_aus_prospectus_document():
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# pdf_exist()
|
# pdf_exist()
|
||||||
prepare_multi_fund_aus_prospectus_document()
|
# prepare_multi_fund_aus_prospectus_document()
|
||||||
# merge_aus_document_prospectus_data()
|
merge_aus_document_prospectus_data(aus_data_folder=r"/data/aus_prospectus/basic_information/biz_rule/",
|
||||||
|
aus_document_mapping_file="phase1_document_mapping.xlsx",
|
||||||
|
aus_prospectus_data_file="phase1_aus_prospectus_data.xlsx",
|
||||||
|
document_mapping_sheet="document_mapping",
|
||||||
|
output_file="phase1_aus_document_prospectus.xlsx",
|
||||||
|
output_sheet="aus_document_prospectus")
|
||||||
folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
|
folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
|
||||||
file_name = "doc_ar_data_for_emea_11_06.xlsx"
|
file_name = "doc_ar_data_for_emea_11_06.xlsx"
|
||||||
# get_document_with_all_4_data_points(folder, file_name, None)
|
# get_document_with_all_4_data_points(folder, file_name, None)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,52 @@
|
||||||
|
420255482
|
||||||
|
417904228
|
||||||
|
448577874
|
||||||
|
466381788
|
||||||
|
502017146
|
||||||
|
480854115
|
||||||
|
481913497
|
||||||
|
479996918
|
||||||
|
475093006
|
||||||
|
492202154
|
||||||
|
495604527
|
||||||
|
471624689
|
||||||
|
397667293
|
||||||
|
447335411
|
||||||
|
490252419
|
||||||
|
498429268
|
||||||
|
369105359
|
||||||
|
481728671
|
||||||
|
466227438
|
||||||
|
489870941
|
||||||
|
481909371
|
||||||
|
495604806
|
||||||
|
557526143
|
||||||
|
369219625
|
||||||
|
377425488
|
||||||
|
420281919
|
||||||
|
420333972
|
||||||
|
425940066
|
||||||
|
439596540
|
||||||
|
406568432
|
||||||
|
411560137
|
||||||
|
412398851
|
||||||
|
412698096
|
||||||
|
413157970
|
||||||
|
319457827
|
||||||
|
337806248
|
||||||
|
341026731
|
||||||
|
344001344
|
||||||
|
362917516
|
||||||
|
471895618
|
||||||
|
450135866
|
||||||
|
454032956
|
||||||
|
471026558
|
||||||
|
471052717
|
||||||
|
471608905
|
||||||
|
476325788
|
||||||
|
478569026
|
||||||
|
480716611
|
||||||
|
480718722
|
||||||
|
485628750
|
||||||
|
486915646
|
||||||
|
486927510
|
||||||
Loading…
Reference in New Issue