Update output data to follow the template format

This commit is contained in:
Blade He 2025-01-17 11:41:58 -06:00
parent f10ff8ee33
commit b93a8d55e8
1 changed file with 67 additions and 13 deletions

80
main.py
View File

@ -1017,7 +1017,7 @@ def batch_run_documents(
)
re_run_extract_data = True
re_run_mapping_data = True
force_save_total_data = False
force_save_total_data = True
calculate_metrics = False
extract_way = "text"
@ -1203,22 +1203,31 @@ def merge_output_data_aus_prospectus(
total_data_list = []
for doc_id in tqdm(doc_id_list):
doc_data_list = []
doc_data_df = data_df[data_df["doc_id"] == doc_id]
doc_date = str(
document_mapping_df[document_mapping_df["DocumentId"] == doc_id][
"EffectiveDate"
].values[0]
)[0:10]
share_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1)]
exist_raw_name_list = []
for index, row in doc_data_df.iterrows():
for index, row in share_doc_data_df.iterrows():
doc_id = str(row["doc_id"])
page_index = int(row["page_index"])
raw_fund_name = str(row["raw_fund_name"])
raw_share_name = str(row["raw_share_name"])
raw_name = str(row["raw_name"])
datapoint = str(row["datapoint"])
value = row["value"]
investment_type = row["investment_type"]
investment_id = row["investment_id"]
investment_name = row["investment_name"]
share_class_id = row["investment_id"]
share_class_legal_name = row["investment_name"]
fund_id = ""
fund_legal_name = ""
if share_class_id != "":
record_row = document_mapping_df[document_mapping_df["FundClassId"] == share_class_id]
if len(record_row) > 0:
fund_id = record_row["FundId"].values[0]
fund_legal_name = record_row["FundLegalName"].values[0]
exist = False
for exist_raw_name_info in exist_raw_name_list:
@ -1233,9 +1242,13 @@ def merge_output_data_aus_prospectus(
if not exist:
data = {
"DocumentId": doc_id,
"investment_type": investment_type,
"investment_id": investment_id,
"investment_name": investment_name,
"raw_fund_name": raw_fund_name,
"raw_share_name": raw_share_name,
"raw_name": raw_name,
"fund_id": fund_id,
"fund_name": fund_legal_name,
"sec_id": share_class_id,
"sec_name": share_class_legal_name,
"EffectiveDate": doc_date,
"page_index": [],
"RawName": raw_name,
@ -1249,14 +1262,55 @@ def merge_output_data_aus_prospectus(
# find data from total_data_list by raw_name
for data in doc_data_list:
if (
data["RawName"] == raw_name
and data["investment_type"] == investment_type
data["raw_name"] == raw_name
):
update_key = datapoint
data[update_key] = value
if page_index not in data["page_index"]:
data["page_index"].append(page_index)
break
fund_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)]
for index, row in fund_doc_data_df.iterrows():
doc_id = str(row["doc_id"])
page_index = int(row["page_index"])
raw_fund_name = str(row["raw_fund_name"])
raw_share_name = ""
raw_name = str(row["raw_name"])
datapoint = str(row["datapoint"])
value = row["value"]
fund_id = row["investment_id"]
fund_legal_name = row["investment_name"]
exist = False
if fund_id != "":
for data in doc_data_list:
if (fund_id != "" and data["fund_id"] == fund_id) or \
(data["raw_fund_name"] == raw_fund_name):
update_key = datapoint
data[update_key] = value
if page_index not in data["page_index"]:
data["page_index"].append(page_index)
exist = True
if not exist:
data = {
"DocumentId": doc_id,
"raw_fund_name": raw_fund_name,
"raw_share_name": "",
"raw_name": raw_name,
"fund_id": fund_id,
"fund_name": fund_legal_name,
"sec_id": "",
"sec_name": "",
"EffectiveDate": doc_date,
"page_index": [page_index],
"RawName": raw_name,
}
for datapoint_name in datapoint_name_list:
data[datapoint_name] = ""
data[datapoint] = value
doc_data_list.append(data)
total_data_list.extend(doc_data_list)
total_data_df = pd.DataFrame(total_data_list)
total_data_df.fillna("", inplace=True)
@ -1265,7 +1319,7 @@ def merge_output_data_aus_prospectus(
if __name__ == "__main__":
data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250115185745.xlsx"
data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx"
document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
os.makedirs(merged_total_data_folder, exist_ok=True)
@ -1286,7 +1340,7 @@ if __name__ == "__main__":
# special_doc_id_list = ["553242411"]
# doc_source = "emea_ar"
# doc_source = "aus_prospectus"
# if doc_source == "aus_prospectus":
# special_doc_id_list: list = [
# "539790009",
@ -1301,7 +1355,7 @@ if __name__ == "__main__":
# "555377021",
# "555654388",
# ]
# special_doc_id_list: list = ["554851189"]
# # special_doc_id_list: list = ["554851189"]
# pdf_folder: str = r"/data/aus_prospectus/pdf/"
# output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
# output_extract_data_child_folder: str = (