a little change
This commit is contained in:
parent
d9b0bed39a
commit
41f8c307ff
131
main.py
131
main.py
|
|
@ -31,11 +31,14 @@ class EMEA_AR_Parsing:
|
||||||
output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
|
output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
|
||||||
extract_way: str = "text",
|
extract_way: str = "text",
|
||||||
drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
|
drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
|
||||||
|
compare_with_provider: bool = True
|
||||||
) -> None:
|
) -> None:
|
||||||
self.doc_id = doc_id
|
self.doc_id = doc_id
|
||||||
self.doc_source = doc_source
|
self.doc_source = doc_source
|
||||||
self.pdf_folder = pdf_folder
|
self.pdf_folder = pdf_folder
|
||||||
os.makedirs(self.pdf_folder, exist_ok=True)
|
os.makedirs(self.pdf_folder, exist_ok=True)
|
||||||
|
self.compare_with_provider = compare_with_provider
|
||||||
|
|
||||||
self.pdf_file = self.download_pdf()
|
self.pdf_file = self.download_pdf()
|
||||||
self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
|
self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
|
||||||
|
|
||||||
|
|
@ -76,7 +79,7 @@ class EMEA_AR_Parsing:
|
||||||
self.pdf_file,
|
self.pdf_file,
|
||||||
self.document_mapping_info_df,
|
self.document_mapping_info_df,
|
||||||
self.doc_source,
|
self.doc_source,
|
||||||
output_pdf_text_folder
|
output_pdf_text_folder,
|
||||||
)
|
)
|
||||||
self.page_text_dict = self.filter_pages.page_text_dict
|
self.page_text_dict = self.filter_pages.page_text_dict
|
||||||
|
|
||||||
|
|
@ -87,7 +90,9 @@ class EMEA_AR_Parsing:
|
||||||
drilldown_folder = r"/data/emea_ar/output/drilldown/"
|
drilldown_folder = r"/data/emea_ar/output/drilldown/"
|
||||||
os.makedirs(drilldown_folder, exist_ok=True)
|
os.makedirs(drilldown_folder, exist_ok=True)
|
||||||
self.drilldown_folder = drilldown_folder
|
self.drilldown_folder = drilldown_folder
|
||||||
misc_config_file = os.path.join(f"./configuration/{doc_source}/", "misc_config.json")
|
misc_config_file = os.path.join(
|
||||||
|
f"./configuration/{doc_source}/", "misc_config.json"
|
||||||
|
)
|
||||||
if os.path.exists(misc_config_file):
|
if os.path.exists(misc_config_file):
|
||||||
with open(misc_config_file, "r", encoding="utf-8") as f:
|
with open(misc_config_file, "r", encoding="utf-8") as f:
|
||||||
misc_config = json.load(f)
|
misc_config = json.load(f)
|
||||||
|
|
@ -278,7 +283,8 @@ class EMEA_AR_Parsing:
|
||||||
data_from_gpt,
|
data_from_gpt,
|
||||||
self.document_mapping_info_df,
|
self.document_mapping_info_df,
|
||||||
self.output_mapping_data_folder,
|
self.output_mapping_data_folder,
|
||||||
self.doc_source
|
self.doc_source,
|
||||||
|
compare_with_provider=self.compare_with_provider
|
||||||
)
|
)
|
||||||
return data_mapping.mapping_raw_data_entrance()
|
return data_mapping.mapping_raw_data_entrance()
|
||||||
|
|
||||||
|
|
@ -334,6 +340,7 @@ def mapping_data(
|
||||||
output_mapping_data_folder=output_mapping_folder,
|
output_mapping_data_folder=output_mapping_folder,
|
||||||
extract_way=extract_way,
|
extract_way=extract_way,
|
||||||
drilldown_folder=drilldown_folder,
|
drilldown_folder=drilldown_folder,
|
||||||
|
compare_with_provider=False
|
||||||
)
|
)
|
||||||
doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(
|
doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(
|
||||||
re_run=re_run_extract_data
|
re_run=re_run_extract_data
|
||||||
|
|
@ -502,18 +509,28 @@ def batch_start_job(
|
||||||
writer, index=False, sheet_name="extract_data"
|
writer, index=False, sheet_name="extract_data"
|
||||||
)
|
)
|
||||||
|
|
||||||
if document_mapping_file is not None and len(document_mapping_file) > 0 and os.path.exists(document_mapping_file):
|
if (
|
||||||
|
document_mapping_file is not None
|
||||||
|
and len(document_mapping_file) > 0
|
||||||
|
and os.path.exists(document_mapping_file)
|
||||||
|
):
|
||||||
try:
|
try:
|
||||||
merged_total_data_folder = os.path.join(output_mapping_total_folder, "merged/")
|
merged_total_data_folder = os.path.join(
|
||||||
|
output_mapping_total_folder, "merged/"
|
||||||
|
)
|
||||||
os.makedirs(merged_total_data_folder, exist_ok=True)
|
os.makedirs(merged_total_data_folder, exist_ok=True)
|
||||||
data_file_base_name = os.path.basename(output_file)
|
data_file_base_name = os.path.basename(output_file)
|
||||||
output_merged_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name)
|
output_merged_data_file_path = os.path.join(
|
||||||
merge_output_data_aus_prospectus(output_file, document_mapping_file, output_merged_data_file_path)
|
merged_total_data_folder, "merged_" + data_file_base_name
|
||||||
|
)
|
||||||
|
merge_output_data_aus_prospectus(
|
||||||
|
output_file, document_mapping_file, output_merged_data_file_path
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error: {e}")
|
logger.error(f"Error: {e}")
|
||||||
|
|
||||||
if calculate_metrics:
|
if calculate_metrics:
|
||||||
prediction_sheet_name = "total_mapping_data"
|
prediction_sheet_name = "data_in_doc_mapping"
|
||||||
ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
|
ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
|
||||||
ground_truth_sheet_name = "mapping_data"
|
ground_truth_sheet_name = "mapping_data"
|
||||||
metrics_output_folder = r"/data/emea_ar/output/metrics/"
|
metrics_output_folder = r"/data/emea_ar/output/metrics/"
|
||||||
|
|
@ -770,11 +787,11 @@ def test_auto_generate_instructions():
|
||||||
|
|
||||||
|
|
||||||
def test_data_extraction_metrics():
|
def test_data_extraction_metrics():
|
||||||
data_type = "data_extraction"
|
data_type = "document_mapping_in_db"
|
||||||
# prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_image_20240920033929.xlsx"
|
# prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_image_20240920033929.xlsx"
|
||||||
prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_text_20240922152517.xlsx"
|
prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_51_documents_by_text_20250127104008.xlsx"
|
||||||
# prediction_file = r"/data/emea_ar/output/mapping_data/docs/by_text/excel/481475385.xlsx"
|
# prediction_file = r"/data/emea_ar/output/mapping_data/docs/by_text/excel/481475385.xlsx"
|
||||||
prediction_sheet_name = "mapping_data"
|
prediction_sheet_name = "data_in_doc_mapping"
|
||||||
ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
|
ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
|
||||||
ground_truth_sheet_name = "mapping_data"
|
ground_truth_sheet_name = "mapping_data"
|
||||||
metrics_output_folder = r"/data/emea_ar/output/metrics/"
|
metrics_output_folder = r"/data/emea_ar/output/metrics/"
|
||||||
|
|
@ -1015,7 +1032,7 @@ def batch_run_documents(
|
||||||
page_filter_ground_truth_file = (
|
page_filter_ground_truth_file = (
|
||||||
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
||||||
)
|
)
|
||||||
re_run_extract_data = True
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = True
|
||||||
force_save_total_data = False
|
force_save_total_data = False
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
@ -1194,13 +1211,17 @@ def merge_output_data_aus_prospectus(
|
||||||
):
|
):
|
||||||
# TODO: merge output data for aus prospectus, plan to realize it on 2025-01-16
|
# TODO: merge output data for aus prospectus, plan to realize it on 2025-01-16
|
||||||
data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data")
|
data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data")
|
||||||
document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="document_mapping")
|
document_mapping_df = pd.read_excel(
|
||||||
|
document_mapping_file, sheet_name="document_mapping"
|
||||||
|
)
|
||||||
# set doc_id to be string type
|
# set doc_id to be string type
|
||||||
data_df["doc_id"] = data_df["doc_id"].astype(str)
|
data_df["doc_id"] = data_df["doc_id"].astype(str)
|
||||||
document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str)
|
document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str)
|
||||||
|
|
||||||
doc_id_list = data_df["doc_id"].unique().tolist()
|
doc_id_list = data_df["doc_id"].unique().tolist()
|
||||||
datapoint_keyword_config_file = r"./configuration/aus_prospectus/datapoint_name.json"
|
datapoint_keyword_config_file = (
|
||||||
|
r"./configuration/aus_prospectus/datapoint_name.json"
|
||||||
|
)
|
||||||
with open(datapoint_keyword_config_file, "r", encoding="utf-8") as f:
|
with open(datapoint_keyword_config_file, "r", encoding="utf-8") as f:
|
||||||
datapoint_keyword_config = json.load(f)
|
datapoint_keyword_config = json.load(f)
|
||||||
datapoint_name_list = list(datapoint_keyword_config.keys())
|
datapoint_name_list = list(datapoint_keyword_config.keys())
|
||||||
|
|
@ -1212,7 +1233,9 @@ def merge_output_data_aus_prospectus(
|
||||||
"EffectiveDate"
|
"EffectiveDate"
|
||||||
].values[0]
|
].values[0]
|
||||||
)[0:10]
|
)[0:10]
|
||||||
share_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1)]
|
share_doc_data_df = data_df[
|
||||||
|
(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1)
|
||||||
|
]
|
||||||
exist_raw_name_list = []
|
exist_raw_name_list = []
|
||||||
for index, row in share_doc_data_df.iterrows():
|
for index, row in share_doc_data_df.iterrows():
|
||||||
doc_id = str(row["doc_id"])
|
doc_id = str(row["doc_id"])
|
||||||
|
|
@ -1228,7 +1251,9 @@ def merge_output_data_aus_prospectus(
|
||||||
fund_id = ""
|
fund_id = ""
|
||||||
fund_legal_name = ""
|
fund_legal_name = ""
|
||||||
if share_class_id != "":
|
if share_class_id != "":
|
||||||
record_row = document_mapping_df[document_mapping_df["FundClassId"] == share_class_id]
|
record_row = document_mapping_df[
|
||||||
|
document_mapping_df["FundClassId"] == share_class_id
|
||||||
|
]
|
||||||
if len(record_row) > 0:
|
if len(record_row) > 0:
|
||||||
fund_id = record_row["FundId"].values[0]
|
fund_id = record_row["FundId"].values[0]
|
||||||
fund_legal_name = record_row["FundLegalName"].values[0]
|
fund_legal_name = record_row["FundLegalName"].values[0]
|
||||||
|
|
@ -1265,16 +1290,16 @@ def merge_output_data_aus_prospectus(
|
||||||
doc_data_list.append(data)
|
doc_data_list.append(data)
|
||||||
# find data from total_data_list by raw_name
|
# find data from total_data_list by raw_name
|
||||||
for data in doc_data_list:
|
for data in doc_data_list:
|
||||||
if (
|
if data["raw_name"] == raw_name:
|
||||||
data["raw_name"] == raw_name
|
|
||||||
):
|
|
||||||
update_key = datapoint
|
update_key = datapoint
|
||||||
data[update_key] = value
|
data[update_key] = value
|
||||||
if page_index not in data["page_index"]:
|
if page_index not in data["page_index"]:
|
||||||
data["page_index"].append(page_index)
|
data["page_index"].append(page_index)
|
||||||
break
|
break
|
||||||
|
|
||||||
fund_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)]
|
fund_doc_data_df = data_df[
|
||||||
|
(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)
|
||||||
|
]
|
||||||
for index, row in fund_doc_data_df.iterrows():
|
for index, row in fund_doc_data_df.iterrows():
|
||||||
doc_id = str(row["doc_id"])
|
doc_id = str(row["doc_id"])
|
||||||
page_index = int(row["page_index"])
|
page_index = int(row["page_index"])
|
||||||
|
|
@ -1289,8 +1314,9 @@ def merge_output_data_aus_prospectus(
|
||||||
exist = False
|
exist = False
|
||||||
if fund_id != "":
|
if fund_id != "":
|
||||||
for data in doc_data_list:
|
for data in doc_data_list:
|
||||||
if (fund_id != "" and data["fund_id"] == fund_id) or \
|
if (fund_id != "" and data["fund_id"] == fund_id) or (
|
||||||
(data["raw_fund_name"] == raw_fund_name):
|
data["raw_fund_name"] == raw_fund_name
|
||||||
|
):
|
||||||
update_key = datapoint
|
update_key = datapoint
|
||||||
data[update_key] = value
|
data[update_key] = value
|
||||||
if page_index not in data["page_index"]:
|
if page_index not in data["page_index"]:
|
||||||
|
|
@ -1323,6 +1349,7 @@ def merge_output_data_aus_prospectus(
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
# test_data_extraction_metrics()
|
||||||
# data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx"
|
# data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx"
|
||||||
# document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
|
# document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
|
||||||
# merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
|
# merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
|
||||||
|
|
@ -1350,7 +1377,9 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
doc_source = "emea_ar"
|
doc_source = "emea_ar"
|
||||||
if doc_source == "aus_prospectus":
|
if doc_source == "aus_prospectus":
|
||||||
document_sample_file = r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
|
document_sample_file = (
|
||||||
|
r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
|
||||||
|
)
|
||||||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
|
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
|
||||||
document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
|
document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
|
||||||
|
|
@ -1397,7 +1426,61 @@ if __name__ == "__main__":
|
||||||
drilldown_folder=drilldown_folder,
|
drilldown_folder=drilldown_folder,
|
||||||
)
|
)
|
||||||
elif doc_source == "emea_ar":
|
elif doc_source == "emea_ar":
|
||||||
special_doc_id_list = ["553242408"]
|
special_doc_id_list = [
|
||||||
|
"292989214",
|
||||||
|
"316237292",
|
||||||
|
"321733631",
|
||||||
|
"323390570",
|
||||||
|
"327956364",
|
||||||
|
"333207452",
|
||||||
|
"334718372",
|
||||||
|
"344636875",
|
||||||
|
"362246081",
|
||||||
|
"366179419",
|
||||||
|
"380945052",
|
||||||
|
"382366116",
|
||||||
|
"387202452",
|
||||||
|
"389171486",
|
||||||
|
"391456740",
|
||||||
|
"391736837",
|
||||||
|
"394778487",
|
||||||
|
"401684600",
|
||||||
|
"402113224",
|
||||||
|
"402181770",
|
||||||
|
"402397014",
|
||||||
|
"405803396",
|
||||||
|
"445102363",
|
||||||
|
"445256897",
|
||||||
|
"448265376",
|
||||||
|
"449555622",
|
||||||
|
"449623976",
|
||||||
|
"458291624",
|
||||||
|
"458359181",
|
||||||
|
"463081566",
|
||||||
|
"469138353",
|
||||||
|
"471641628",
|
||||||
|
"476492237",
|
||||||
|
"478585901",
|
||||||
|
"478586066",
|
||||||
|
"479042264",
|
||||||
|
"479793787",
|
||||||
|
"481475385",
|
||||||
|
"483617247",
|
||||||
|
"486378555",
|
||||||
|
"486383912",
|
||||||
|
"492121213",
|
||||||
|
"497497599",
|
||||||
|
"502693599",
|
||||||
|
"502821436",
|
||||||
|
"503194284",
|
||||||
|
"506559375",
|
||||||
|
"507967525",
|
||||||
|
"508854243",
|
||||||
|
"509845549",
|
||||||
|
"520879048",
|
||||||
|
"529925114",
|
||||||
|
]
|
||||||
|
special_doc_id_list = ["471641628"]
|
||||||
batch_run_documents(
|
batch_run_documents(
|
||||||
doc_source=doc_source, special_doc_id_list=special_doc_id_list
|
doc_source=doc_source, special_doc_id_list=special_doc_id_list
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue