a little change
This commit is contained in:
parent
a89aa9c4de
commit
ace0ac2674
102
main.py
102
main.py
|
|
@ -914,7 +914,7 @@ def batch_run_documents(special_doc_id_list: list = None,
|
||||||
)
|
)
|
||||||
re_run_extract_data = True
|
re_run_extract_data = True
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = True
|
||||||
force_save_total_data = False
|
force_save_total_data = True
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
||||||
extract_way = "text"
|
extract_way = "text"
|
||||||
|
|
@ -1060,6 +1060,81 @@ def merge_output_data(data_file_path: str,
|
||||||
total_data_df.to_excel(writer, index=False, sheet_name="total_data")
|
total_data_df.to_excel(writer, index=False, sheet_name="total_data")
|
||||||
|
|
||||||
|
|
||||||
|
def merge_output_data_aus_prospectus(data_file_path: str,
                                     document_mapping_file: str,
                                     output_data_file_path: str) -> None:
    """
    Pivot AUS prospectus mapping data into one row per investment and save it.

    The sheet "total_mapping_data" of ``data_file_path`` holds one row per
    extracted datapoint, with the columns read here being:
        doc_id, page_index, raw_name, datapoint, value,
        investment_type, investment_id, investment_name
    e.g. doc_id=553242368, page_index=344,
         raw_name="Deutsche MSCI World Index Fund", datapoint="tor", value=61,
         investment_type=33, investment_id="FS0000AY1Y".

    Within a document, all rows sharing the same (raw_name, investment_type)
    are folded into a single output row: each datapoint value lands in its
    own column (tor -> TurnoverRatio, ter -> NetExpenseRatio,
    ogc -> OngoingCharge, performance_fee -> PerformanceFee) and the distinct
    page indexes are collected in the "page_index" list. investment_id and
    investment_name are taken from the first row of each group; when the same
    datapoint occurs more than once in a group the last value wins.

    Args:
        data_file_path: Excel file containing the "total_mapping_data" sheet.
        document_mapping_file: Excel file containing the "doc_date" sheet with
            "DocumentId" and "EffectiveDate" columns.
        output_data_file_path: Excel file to write; the result is stored in
            the sheet "total_data".
    """
    data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data")
    document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="doc_date")
    # Document ids are compared as strings on both sides.
    data_df["doc_id"] = data_df["doc_id"].astype(str)
    document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str)

    data_point_dict = {
        "tor": "TurnoverRatio",
        "ter": "NetExpenseRatio",
        "ogc": "OngoingCharge",
        "performance_fee": "PerformanceFee",
    }

    total_data_list = []
    # sort=False keeps documents in order of first appearance, like unique().
    for doc_id, doc_data_df in tqdm(data_df.groupby("doc_id", sort=False)):
        date_values = document_mapping_df.loc[
            document_mapping_df["DocumentId"] == doc_id, "EffectiveDate"
        ].values
        # A document without a mapping entry gets a blank date instead of
        # aborting the whole merge with an IndexError.
        doc_date = str(date_values[0])[0:10] if len(date_values) > 0 else ""

        # (raw_name, investment_type) -> merged output row; dicts preserve
        # insertion order, so rows come out in order of first appearance.
        merged_rows = {}
        for _, row in doc_data_df.iterrows():
            page_index = int(row["page_index"])
            raw_name = str(row["raw_name"])
            datapoint = str(row["datapoint"])
            investment_type = row["investment_type"]

            # NaN never equals itself, so a missing investment_type must be
            # normalised before it can be used as a grouping key; otherwise
            # each such row would open a new, permanently empty output row.
            key = (raw_name, None if pd.isna(investment_type) else investment_type)
            data = merged_rows.get(key)
            if data is None:
                data = {
                    "DocumentId": doc_id,
                    "investment_type": investment_type,
                    "investment_id": row["investment_id"],
                    "investment_name": row["investment_name"],
                    "EffectiveDate": doc_date,
                    "page_index": [],
                    "RawName": raw_name,
                    "NetExpenseRatio": "",
                    "OngoingCharge": "",
                    "TurnoverRatio": "",
                    "PerformanceFee": "",
                }
                merged_rows[key] = data

            # Datapoints without a configured column keep their own name so
            # their values are preserved rather than raising KeyError.
            data[data_point_dict.get(datapoint, datapoint)] = row["value"]
            if page_index not in data["page_index"]:
                data["page_index"].append(page_index)

        total_data_list.extend(merged_rows.values())

    total_data_df = pd.DataFrame(total_data_list).fillna("")
    with pd.ExcelWriter(output_data_file_path) as writer:
        total_data_df.to_excel(writer, index=False, sheet_name="total_data")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_mapping_data_info_44_documents_by_text_20241211185546.xlsx"
|
# data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_mapping_data_info_44_documents_by_text_20241211185546.xlsx"
|
||||||
# document_mapping_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx"
|
# document_mapping_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx"
|
||||||
|
|
@ -1079,19 +1154,18 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
|
|
||||||
# special_doc_id_list = ["553242411"]
|
# special_doc_id_list = ["553242411"]
|
||||||
# special_doc_id_list: list = ["539790009",
|
special_doc_id_list: list = ["539790009",
|
||||||
# "542300403",
|
"542300403",
|
||||||
# "542301117",
|
"542301117",
|
||||||
# "542306317",
|
"542306317",
|
||||||
# "547567013",
|
"547567013",
|
||||||
# "552505237",
|
"552505237",
|
||||||
# "552505278",
|
"552505278",
|
||||||
# "554431052",
|
"554431052",
|
||||||
# "554851189",
|
"554851189",
|
||||||
# "555377021",
|
"555377021",
|
||||||
# "555654388"]
|
"555654388"]
|
||||||
special_doc_id_list: list = ["539790009", "542301117"]
|
# special_doc_id_list: list = ["539790009", "542301117"]
|
||||||
special_doc_id_list: list = ["539790009"]
|
|
||||||
pdf_folder:str = r"/data/aus_prospectus/pdf/"
|
pdf_folder:str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder:str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder:str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder:str = r"/data/aus_prospectus/output/extract_data/docs/"
|
output_extract_data_child_folder:str = r"/data/aus_prospectus/output/extract_data/docs/"
|
||||||
|
|
|
||||||
|
|
@ -1367,9 +1367,9 @@ def merge_aus_document_prospectus_data():
|
||||||
"""
|
"""
|
||||||
Merge AUS document and prospectus data.
|
Merge AUS document and prospectus data.
|
||||||
"""
|
"""
|
||||||
aus_document_file = r"/data/aus_prospectus/basic_information/document_mapping.xlsx"
|
aus_document_file = r"/data/aus_prospectus/basic_information/from_2024_documents/document_mapping.xlsx"
|
||||||
aus_prospectus_file = r"/data/aus_prospectus/basic_information/aus_prospectus_data.xlsx"
|
aus_prospectus_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_prospectus_data.xlsx"
|
||||||
aus_document_data = pd.read_excel(aus_document_file)
|
aus_document_data = pd.read_excel(aus_document_file, sheet_name="document_mapping")
|
||||||
aus_prospectus_data = pd.read_excel(aus_prospectus_file)
|
aus_prospectus_data = pd.read_excel(aus_prospectus_file)
|
||||||
|
|
||||||
aus_document_data["DocumentId"] = aus_document_data["DocumentId"].astype(str)
|
aus_document_data["DocumentId"] = aus_document_data["DocumentId"].astype(str)
|
||||||
|
|
@ -1380,7 +1380,7 @@ def merge_aus_document_prospectus_data():
|
||||||
on=["FundClassId", "EffectiveDate"],
|
on=["FundClassId", "EffectiveDate"],
|
||||||
how="left",
|
how="left",
|
||||||
)
|
)
|
||||||
aus_document_prospectus_file = r"/data/aus_prospectus/aus_document_prospectus.xlsx"
|
aus_document_prospectus_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_document_prospectus.xlsx"
|
||||||
with pd.ExcelWriter(aus_document_prospectus_file) as writer:
|
with pd.ExcelWriter(aus_document_prospectus_file) as writer:
|
||||||
aus_document_prospectus_data.to_excel(
|
aus_document_prospectus_data.to_excel(
|
||||||
writer, sheet_name="aus_document_prospectus", index=False
|
writer, sheet_name="aus_document_prospectus", index=False
|
||||||
|
|
@ -1393,7 +1393,7 @@ def get_pdf_2_html():
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# merge_aus_document_prospectus_data()
|
merge_aus_document_prospectus_data()
|
||||||
folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
|
folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
|
||||||
file_name = "doc_ar_data_for_emea_11_06.xlsx"
|
file_name = "doc_ar_data_for_emea_11_06.xlsx"
|
||||||
# get_document_with_all_4_data_points(folder, file_name, None)
|
# get_document_with_all_4_data_points(folder, file_name, None)
|
||||||
|
|
@ -1431,7 +1431,7 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
pdf_folder = r"/data/aus_prospectus/pdf/"
|
pdf_folder = r"/data/aus_prospectus/pdf/"
|
||||||
output_folder = r"/data/aus_prospectus/pdf_txt/"
|
output_folder = r"/data/aus_prospectus/pdf_txt/"
|
||||||
output_pdf_page_text(pdf_folder, output_folder)
|
# output_pdf_page_text(pdf_folder, output_folder)
|
||||||
|
|
||||||
# extract_pdf_table(pdf_folder, output_folder)
|
# extract_pdf_table(pdf_folder, output_folder)
|
||||||
# analyze_json_error()
|
# analyze_json_error()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue