update for output data as template
This commit is contained in:
parent
f10ff8ee33
commit
b93a8d55e8
80
main.py
80
main.py
|
|
@ -1017,7 +1017,7 @@ def batch_run_documents(
|
||||||
)
|
)
|
||||||
re_run_extract_data = True
|
re_run_extract_data = True
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = True
|
||||||
force_save_total_data = False
|
force_save_total_data = True
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
||||||
extract_way = "text"
|
extract_way = "text"
|
||||||
|
|
@ -1203,22 +1203,31 @@ def merge_output_data_aus_prospectus(
|
||||||
total_data_list = []
|
total_data_list = []
|
||||||
for doc_id in tqdm(doc_id_list):
|
for doc_id in tqdm(doc_id_list):
|
||||||
doc_data_list = []
|
doc_data_list = []
|
||||||
doc_data_df = data_df[data_df["doc_id"] == doc_id]
|
|
||||||
doc_date = str(
|
doc_date = str(
|
||||||
document_mapping_df[document_mapping_df["DocumentId"] == doc_id][
|
document_mapping_df[document_mapping_df["DocumentId"] == doc_id][
|
||||||
"EffectiveDate"
|
"EffectiveDate"
|
||||||
].values[0]
|
].values[0]
|
||||||
)[0:10]
|
)[0:10]
|
||||||
|
share_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1)]
|
||||||
exist_raw_name_list = []
|
exist_raw_name_list = []
|
||||||
for index, row in doc_data_df.iterrows():
|
for index, row in share_doc_data_df.iterrows():
|
||||||
doc_id = str(row["doc_id"])
|
doc_id = str(row["doc_id"])
|
||||||
page_index = int(row["page_index"])
|
page_index = int(row["page_index"])
|
||||||
|
raw_fund_name = str(row["raw_fund_name"])
|
||||||
|
raw_share_name = str(row["raw_share_name"])
|
||||||
raw_name = str(row["raw_name"])
|
raw_name = str(row["raw_name"])
|
||||||
datapoint = str(row["datapoint"])
|
datapoint = str(row["datapoint"])
|
||||||
value = row["value"]
|
value = row["value"]
|
||||||
investment_type = row["investment_type"]
|
investment_type = row["investment_type"]
|
||||||
investment_id = row["investment_id"]
|
share_class_id = row["investment_id"]
|
||||||
investment_name = row["investment_name"]
|
share_class_legal_name = row["investment_name"]
|
||||||
|
fund_id = ""
|
||||||
|
fund_legal_name = ""
|
||||||
|
if share_class_id != "":
|
||||||
|
record_row = document_mapping_df[document_mapping_df["FundClassId"] == share_class_id]
|
||||||
|
if len(record_row) > 0:
|
||||||
|
fund_id = record_row["FundId"].values[0]
|
||||||
|
fund_legal_name = record_row["FundLegalName"].values[0]
|
||||||
|
|
||||||
exist = False
|
exist = False
|
||||||
for exist_raw_name_info in exist_raw_name_list:
|
for exist_raw_name_info in exist_raw_name_list:
|
||||||
|
|
@ -1233,9 +1242,13 @@ def merge_output_data_aus_prospectus(
|
||||||
if not exist:
|
if not exist:
|
||||||
data = {
|
data = {
|
||||||
"DocumentId": doc_id,
|
"DocumentId": doc_id,
|
||||||
"investment_type": investment_type,
|
"raw_fund_name": raw_fund_name,
|
||||||
"investment_id": investment_id,
|
"raw_share_name": raw_share_name,
|
||||||
"investment_name": investment_name,
|
"raw_name": raw_name,
|
||||||
|
"fund_id": fund_id,
|
||||||
|
"fund_name": fund_legal_name,
|
||||||
|
"sec_id": share_class_id,
|
||||||
|
"sec_name": share_class_legal_name,
|
||||||
"EffectiveDate": doc_date,
|
"EffectiveDate": doc_date,
|
||||||
"page_index": [],
|
"page_index": [],
|
||||||
"RawName": raw_name,
|
"RawName": raw_name,
|
||||||
|
|
@ -1249,14 +1262,55 @@ def merge_output_data_aus_prospectus(
|
||||||
# find data from total_data_list by raw_name
|
# find data from total_data_list by raw_name
|
||||||
for data in doc_data_list:
|
for data in doc_data_list:
|
||||||
if (
|
if (
|
||||||
data["RawName"] == raw_name
|
data["raw_name"] == raw_name
|
||||||
and data["investment_type"] == investment_type
|
|
||||||
):
|
):
|
||||||
update_key = datapoint
|
update_key = datapoint
|
||||||
data[update_key] = value
|
data[update_key] = value
|
||||||
if page_index not in data["page_index"]:
|
if page_index not in data["page_index"]:
|
||||||
data["page_index"].append(page_index)
|
data["page_index"].append(page_index)
|
||||||
break
|
break
|
||||||
|
|
||||||
|
fund_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)]
|
||||||
|
for index, row in fund_doc_data_df.iterrows():
|
||||||
|
doc_id = str(row["doc_id"])
|
||||||
|
page_index = int(row["page_index"])
|
||||||
|
raw_fund_name = str(row["raw_fund_name"])
|
||||||
|
raw_share_name = ""
|
||||||
|
raw_name = str(row["raw_name"])
|
||||||
|
datapoint = str(row["datapoint"])
|
||||||
|
value = row["value"]
|
||||||
|
fund_id = row["investment_id"]
|
||||||
|
fund_legal_name = row["investment_name"]
|
||||||
|
|
||||||
|
exist = False
|
||||||
|
if fund_id != "":
|
||||||
|
for data in doc_data_list:
|
||||||
|
if (fund_id != "" and data["fund_id"] == fund_id) or \
|
||||||
|
(data["raw_fund_name"] == raw_fund_name):
|
||||||
|
update_key = datapoint
|
||||||
|
data[update_key] = value
|
||||||
|
if page_index not in data["page_index"]:
|
||||||
|
data["page_index"].append(page_index)
|
||||||
|
exist = True
|
||||||
|
|
||||||
|
if not exist:
|
||||||
|
data = {
|
||||||
|
"DocumentId": doc_id,
|
||||||
|
"raw_fund_name": raw_fund_name,
|
||||||
|
"raw_share_name": "",
|
||||||
|
"raw_name": raw_name,
|
||||||
|
"fund_id": fund_id,
|
||||||
|
"fund_name": fund_legal_name,
|
||||||
|
"sec_id": "",
|
||||||
|
"sec_name": "",
|
||||||
|
"EffectiveDate": doc_date,
|
||||||
|
"page_index": [page_index],
|
||||||
|
"RawName": raw_name,
|
||||||
|
}
|
||||||
|
for datapoint_name in datapoint_name_list:
|
||||||
|
data[datapoint_name] = ""
|
||||||
|
data[datapoint] = value
|
||||||
|
doc_data_list.append(data)
|
||||||
total_data_list.extend(doc_data_list)
|
total_data_list.extend(doc_data_list)
|
||||||
total_data_df = pd.DataFrame(total_data_list)
|
total_data_df = pd.DataFrame(total_data_list)
|
||||||
total_data_df.fillna("", inplace=True)
|
total_data_df.fillna("", inplace=True)
|
||||||
|
|
@ -1265,7 +1319,7 @@ def merge_output_data_aus_prospectus(
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250115185745.xlsx"
|
data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx"
|
||||||
document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
|
document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
|
||||||
merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
|
merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
|
||||||
os.makedirs(merged_total_data_folder, exist_ok=True)
|
os.makedirs(merged_total_data_folder, exist_ok=True)
|
||||||
|
|
@ -1286,7 +1340,7 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
# special_doc_id_list = ["553242411"]
|
# special_doc_id_list = ["553242411"]
|
||||||
|
|
||||||
# doc_source = "emea_ar"
|
# doc_source = "aus_prospectus"
|
||||||
# if doc_source == "aus_prospectus":
|
# if doc_source == "aus_prospectus":
|
||||||
# special_doc_id_list: list = [
|
# special_doc_id_list: list = [
|
||||||
# "539790009",
|
# "539790009",
|
||||||
|
|
@ -1301,7 +1355,7 @@ if __name__ == "__main__":
|
||||||
# "555377021",
|
# "555377021",
|
||||||
# "555654388",
|
# "555654388",
|
||||||
# ]
|
# ]
|
||||||
# special_doc_id_list: list = ["554851189"]
|
# # special_doc_id_list: list = ["554851189"]
|
||||||
# pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
# pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
# output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
# output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
# output_extract_data_child_folder: str = (
|
# output_extract_data_child_folder: str = (
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue