support output merged data format
This commit is contained in:
parent
2eace81f51
commit
fb4a6402f0
132
main.py
132
main.py
|
|
@ -1190,22 +1190,16 @@ def merge_output_data_aus_prospectus(
|
||||||
):
|
):
|
||||||
# TODO: merge output data for aus prospectus, plan to realize it on 2025-01-16
|
# TODO: merge output data for aus prospectus, plan to realize it on 2025-01-16
|
||||||
data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data")
|
data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data")
|
||||||
document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="doc_date")
|
document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="Sheet1")
|
||||||
# set doc_id to be string type
|
# set doc_id to be string type
|
||||||
data_df["doc_id"] = data_df["doc_id"].astype(str)
|
data_df["doc_id"] = data_df["doc_id"].astype(str)
|
||||||
document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str)
|
document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str)
|
||||||
"""
|
|
||||||
doc_id page_index raw_name datapoint value raw_check comment investment_type investment_id investment_name similarity
|
|
||||||
553242368 344 Deutsche MSCI World Index Fund tor 61 33 FS0000AY1Y Xtrackers MSCI World Index Fund 0.75
|
|
||||||
553242368 344 db x-trackers EUR Liquid Corporate 12.5 UCITS ETF - Klasse 1C ter 0.35 1 F000018PY1 Xtrackers EUR Corporate Green Bond UCITS ETF 1C 0.462
|
|
||||||
"""
|
|
||||||
doc_id_list = data_df["doc_id"].unique().tolist()
|
doc_id_list = data_df["doc_id"].unique().tolist()
|
||||||
data_point_dict = {
|
datapoint_keyword_config_file = r"./configuration/aus_prospectus/datapoint_name.json"
|
||||||
"tor": "TurnoverRatio",
|
with open(datapoint_keyword_config_file, "r", encoding="utf-8") as f:
|
||||||
"ter": "NetExpenseRatio",
|
datapoint_keyword_config = json.load(f)
|
||||||
"ogc": "OngoingCharge",
|
datapoint_name_list = list(datapoint_keyword_config.keys())
|
||||||
"performance_fee": "PerformanceFee",
|
|
||||||
}
|
|
||||||
total_data_list = []
|
total_data_list = []
|
||||||
for doc_id in tqdm(doc_id_list):
|
for doc_id in tqdm(doc_id_list):
|
||||||
doc_data_list = []
|
doc_data_list = []
|
||||||
|
|
@ -1245,11 +1239,9 @@ def merge_output_data_aus_prospectus(
|
||||||
"EffectiveDate": doc_date,
|
"EffectiveDate": doc_date,
|
||||||
"page_index": [],
|
"page_index": [],
|
||||||
"RawName": raw_name,
|
"RawName": raw_name,
|
||||||
"NetExpenseRatio": "",
|
|
||||||
"OngoingCharge": "",
|
|
||||||
"TurnoverRatio": "",
|
|
||||||
"PerformanceFee": "",
|
|
||||||
}
|
}
|
||||||
|
for datapoint_name in datapoint_name_list:
|
||||||
|
data[datapoint_name] = ""
|
||||||
exist_raw_name_list.append(
|
exist_raw_name_list.append(
|
||||||
{"raw_name": raw_name, "investment_type": investment_type}
|
{"raw_name": raw_name, "investment_type": investment_type}
|
||||||
)
|
)
|
||||||
|
|
@ -1260,7 +1252,7 @@ def merge_output_data_aus_prospectus(
|
||||||
data["RawName"] == raw_name
|
data["RawName"] == raw_name
|
||||||
and data["investment_type"] == investment_type
|
and data["investment_type"] == investment_type
|
||||||
):
|
):
|
||||||
update_key = data_point_dict[datapoint]
|
update_key = datapoint
|
||||||
data[update_key] = value
|
data[update_key] = value
|
||||||
if page_index not in data["page_index"]:
|
if page_index not in data["page_index"]:
|
||||||
data["page_index"].append(page_index)
|
data["page_index"].append(page_index)
|
||||||
|
|
@ -1273,10 +1265,13 @@ def merge_output_data_aus_prospectus(
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_mapping_data_info_44_documents_by_text_20241211185546.xlsx"
|
data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250115185745.xlsx"
|
||||||
# document_mapping_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx"
|
document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
|
||||||
# output_data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_merged_data_info.xlsx"
|
merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
|
||||||
# merge_output_data(data_file_path, document_mapping_file_path, output_data_file_path)
|
os.makedirs(merged_total_data_folder, exist_ok=True)
|
||||||
|
data_file_base_name = os.path.basename(data_file_path)
|
||||||
|
output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name)
|
||||||
|
merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path)
|
||||||
|
|
||||||
# sample_document_list_folder: str = r'./sample_documents/'
|
# sample_document_list_folder: str = r'./sample_documents/'
|
||||||
# document_list_file: str = "aus_prospectus.txt"
|
# document_list_file: str = "aus_prospectus.txt"
|
||||||
|
|
@ -1290,53 +1285,54 @@ if __name__ == "__main__":
|
||||||
# output_mapping_child_folder=output_mapping_child_folder)
|
# output_mapping_child_folder=output_mapping_child_folder)
|
||||||
|
|
||||||
# special_doc_id_list = ["553242411"]
|
# special_doc_id_list = ["553242411"]
|
||||||
doc_source = "emea_ar"
|
|
||||||
if doc_source == "aus_prospectus":
|
# doc_source = "emea_ar"
|
||||||
special_doc_id_list: list = [
|
# if doc_source == "aus_prospectus":
|
||||||
"539790009",
|
# special_doc_id_list: list = [
|
||||||
"542300403",
|
# "539790009",
|
||||||
"542301117",
|
# "542300403",
|
||||||
"542306317",
|
# "542301117",
|
||||||
"547567013",
|
# "542306317",
|
||||||
"552505237",
|
# "547567013",
|
||||||
"552505278",
|
# "552505237",
|
||||||
"554431052",
|
# "552505278",
|
||||||
"554851189",
|
# "554431052",
|
||||||
"555377021",
|
# "554851189",
|
||||||
"555654388",
|
# "555377021",
|
||||||
]
|
# "555654388",
|
||||||
special_doc_id_list: list = ["554851189"]
|
# ]
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
# special_doc_id_list: list = ["554851189"]
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
# pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_extract_data_child_folder: str = (
|
# output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
r"/data/aus_prospectus/output/extract_data/docs/"
|
# output_extract_data_child_folder: str = (
|
||||||
)
|
# r"/data/aus_prospectus/output/extract_data/docs/"
|
||||||
output_extract_data_total_folder: str = (
|
# )
|
||||||
r"/data/aus_prospectus/output/extract_data/total/"
|
# output_extract_data_total_folder: str = (
|
||||||
)
|
# r"/data/aus_prospectus/output/extract_data/total/"
|
||||||
output_mapping_child_folder: str = (
|
# )
|
||||||
r"/data/aus_prospectus/output/mapping_data/docs/"
|
# output_mapping_child_folder: str = (
|
||||||
)
|
# r"/data/aus_prospectus/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder: str = (
|
# )
|
||||||
r"/data/aus_prospectus/output/mapping_data/total/"
|
# output_mapping_total_folder: str = (
|
||||||
)
|
# r"/data/aus_prospectus/output/mapping_data/total/"
|
||||||
drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
|
# )
|
||||||
batch_run_documents(
|
# drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
|
||||||
doc_source=doc_source,
|
# batch_run_documents(
|
||||||
special_doc_id_list=special_doc_id_list,
|
# doc_source=doc_source,
|
||||||
pdf_folder=pdf_folder,
|
# special_doc_id_list=special_doc_id_list,
|
||||||
output_pdf_text_folder=output_pdf_text_folder,
|
# pdf_folder=pdf_folder,
|
||||||
output_extract_data_child_folder=output_extract_data_child_folder,
|
# output_pdf_text_folder=output_pdf_text_folder,
|
||||||
output_extract_data_total_folder=output_extract_data_total_folder,
|
# output_extract_data_child_folder=output_extract_data_child_folder,
|
||||||
output_mapping_child_folder=output_mapping_child_folder,
|
# output_extract_data_total_folder=output_extract_data_total_folder,
|
||||||
output_mapping_total_folder=output_mapping_total_folder,
|
# output_mapping_child_folder=output_mapping_child_folder,
|
||||||
drilldown_folder=drilldown_folder,
|
# output_mapping_total_folder=output_mapping_total_folder,
|
||||||
)
|
# drilldown_folder=drilldown_folder,
|
||||||
elif doc_source == "emea_ar":
|
# )
|
||||||
special_doc_id_list = ["553242408"]
|
# elif doc_source == "emea_ar":
|
||||||
batch_run_documents(
|
# special_doc_id_list = ["553242408"]
|
||||||
doc_source=doc_source, special_doc_id_list=special_doc_id_list
|
# batch_run_documents(
|
||||||
)
|
# doc_source=doc_source, special_doc_id_list=special_doc_id_list
|
||||||
|
# )
|
||||||
|
|
||||||
# new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
|
# new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
|
||||||
# original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"
|
# original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue