Update output data to follow the template format
This commit is contained in:
parent
f10ff8ee33
commit
b93a8d55e8
80
main.py
80
main.py
|
|
@ -1017,7 +1017,7 @@ def batch_run_documents(
|
|||
)
|
||||
re_run_extract_data = True
|
||||
re_run_mapping_data = True
|
||||
force_save_total_data = False
|
||||
force_save_total_data = True
|
||||
calculate_metrics = False
|
||||
|
||||
extract_way = "text"
|
||||
|
|
@ -1203,22 +1203,31 @@ def merge_output_data_aus_prospectus(
|
|||
total_data_list = []
|
||||
for doc_id in tqdm(doc_id_list):
|
||||
doc_data_list = []
|
||||
doc_data_df = data_df[data_df["doc_id"] == doc_id]
|
||||
doc_date = str(
|
||||
document_mapping_df[document_mapping_df["DocumentId"] == doc_id][
|
||||
"EffectiveDate"
|
||||
].values[0]
|
||||
)[0:10]
|
||||
share_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1)]
|
||||
exist_raw_name_list = []
|
||||
for index, row in doc_data_df.iterrows():
|
||||
for index, row in share_doc_data_df.iterrows():
|
||||
doc_id = str(row["doc_id"])
|
||||
page_index = int(row["page_index"])
|
||||
raw_fund_name = str(row["raw_fund_name"])
|
||||
raw_share_name = str(row["raw_share_name"])
|
||||
raw_name = str(row["raw_name"])
|
||||
datapoint = str(row["datapoint"])
|
||||
value = row["value"]
|
||||
investment_type = row["investment_type"]
|
||||
investment_id = row["investment_id"]
|
||||
investment_name = row["investment_name"]
|
||||
share_class_id = row["investment_id"]
|
||||
share_class_legal_name = row["investment_name"]
|
||||
fund_id = ""
|
||||
fund_legal_name = ""
|
||||
if share_class_id != "":
|
||||
record_row = document_mapping_df[document_mapping_df["FundClassId"] == share_class_id]
|
||||
if len(record_row) > 0:
|
||||
fund_id = record_row["FundId"].values[0]
|
||||
fund_legal_name = record_row["FundLegalName"].values[0]
|
||||
|
||||
exist = False
|
||||
for exist_raw_name_info in exist_raw_name_list:
|
||||
|
|
@ -1233,9 +1242,13 @@ def merge_output_data_aus_prospectus(
|
|||
if not exist:
|
||||
data = {
|
||||
"DocumentId": doc_id,
|
||||
"investment_type": investment_type,
|
||||
"investment_id": investment_id,
|
||||
"investment_name": investment_name,
|
||||
"raw_fund_name": raw_fund_name,
|
||||
"raw_share_name": raw_share_name,
|
||||
"raw_name": raw_name,
|
||||
"fund_id": fund_id,
|
||||
"fund_name": fund_legal_name,
|
||||
"sec_id": share_class_id,
|
||||
"sec_name": share_class_legal_name,
|
||||
"EffectiveDate": doc_date,
|
||||
"page_index": [],
|
||||
"RawName": raw_name,
|
||||
|
|
@ -1249,14 +1262,55 @@ def merge_output_data_aus_prospectus(
|
|||
# find data from total_data_list by raw_name
|
||||
for data in doc_data_list:
|
||||
if (
|
||||
data["RawName"] == raw_name
|
||||
and data["investment_type"] == investment_type
|
||||
data["raw_name"] == raw_name
|
||||
):
|
||||
update_key = datapoint
|
||||
data[update_key] = value
|
||||
if page_index not in data["page_index"]:
|
||||
data["page_index"].append(page_index)
|
||||
break
|
||||
|
||||
fund_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)]
|
||||
for index, row in fund_doc_data_df.iterrows():
|
||||
doc_id = str(row["doc_id"])
|
||||
page_index = int(row["page_index"])
|
||||
raw_fund_name = str(row["raw_fund_name"])
|
||||
raw_share_name = ""
|
||||
raw_name = str(row["raw_name"])
|
||||
datapoint = str(row["datapoint"])
|
||||
value = row["value"]
|
||||
fund_id = row["investment_id"]
|
||||
fund_legal_name = row["investment_name"]
|
||||
|
||||
exist = False
|
||||
if fund_id != "":
|
||||
for data in doc_data_list:
|
||||
if (fund_id != "" and data["fund_id"] == fund_id) or \
|
||||
(data["raw_fund_name"] == raw_fund_name):
|
||||
update_key = datapoint
|
||||
data[update_key] = value
|
||||
if page_index not in data["page_index"]:
|
||||
data["page_index"].append(page_index)
|
||||
exist = True
|
||||
|
||||
if not exist:
|
||||
data = {
|
||||
"DocumentId": doc_id,
|
||||
"raw_fund_name": raw_fund_name,
|
||||
"raw_share_name": "",
|
||||
"raw_name": raw_name,
|
||||
"fund_id": fund_id,
|
||||
"fund_name": fund_legal_name,
|
||||
"sec_id": "",
|
||||
"sec_name": "",
|
||||
"EffectiveDate": doc_date,
|
||||
"page_index": [page_index],
|
||||
"RawName": raw_name,
|
||||
}
|
||||
for datapoint_name in datapoint_name_list:
|
||||
data[datapoint_name] = ""
|
||||
data[datapoint] = value
|
||||
doc_data_list.append(data)
|
||||
total_data_list.extend(doc_data_list)
|
||||
total_data_df = pd.DataFrame(total_data_list)
|
||||
total_data_df.fillna("", inplace=True)
|
||||
|
|
@ -1265,7 +1319,7 @@ def merge_output_data_aus_prospectus(
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250115185745.xlsx"
|
||||
data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx"
|
||||
document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
|
||||
merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
|
||||
os.makedirs(merged_total_data_folder, exist_ok=True)
|
||||
|
|
@ -1286,7 +1340,7 @@ if __name__ == "__main__":
|
|||
|
||||
# special_doc_id_list = ["553242411"]
|
||||
|
||||
# doc_source = "emea_ar"
|
||||
# doc_source = "aus_prospectus"
|
||||
# if doc_source == "aus_prospectus":
|
||||
# special_doc_id_list: list = [
|
||||
# "539790009",
|
||||
|
|
@ -1301,7 +1355,7 @@ if __name__ == "__main__":
|
|||
# "555377021",
|
||||
# "555654388",
|
||||
# ]
|
||||
# special_doc_id_list: list = ["554851189"]
|
||||
# # special_doc_id_list: list = ["554851189"]
|
||||
# pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
# output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
# output_extract_data_child_folder: str = (
|
||||
|
|
|
|||
Loading…
Reference in New Issue