Implement the merge_output_data function; whether to keep this output format depends on confirmation with the data and developer teams
This commit is contained in:
parent
309bb714f6
commit
65e752e25a
|
|
@ -6,3 +6,4 @@
|
||||||
/test_calc_metrics.py
|
/test_calc_metrics.py
|
||||||
/test_metrics
|
/test_metrics
|
||||||
/data
|
/data
|
||||||
|
/sample_documents/japan_prospectus.txt
|
||||||
|
|
|
||||||
|
|
@ -10,11 +10,7 @@ from utils.logger import logger
|
||||||
from utils.pdf_util import PDFUtil
|
from utils.pdf_util import PDFUtil
|
||||||
|
|
||||||
|
|
||||||
def drilldown_documents():
|
def drilldown_documents(pdf_folder: str, extract_data_folder: str, drilldown_folder: str):
|
||||||
# doc_id: str,
|
|
||||||
pdf_folder = r"/data/emea_ar/pdf/"
|
|
||||||
drilldown_folder = r"/data/emea_ar/output/drilldown/"
|
|
||||||
extract_data_folder = r'/data/emea_ar/output/extract_data/docs/by_text/json/'
|
|
||||||
extract_files = glob(extract_data_folder + '*.json')
|
extract_files = glob(extract_data_folder + '*.json')
|
||||||
|
|
||||||
for index, json_file in enumerate(tqdm(extract_files)):
|
for index, json_file in enumerate(tqdm(extract_files)):
|
||||||
|
|
@ -156,5 +152,8 @@ def calculate_metrics():
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
pdf_folder = r"/data/emea_ar/pdf/"
|
||||||
|
drilldown_folder = r"/data/emea_ar/output/drilldown/"
|
||||||
|
extract_data_folder = r'/data/emea_ar/output/extract_data/docs/by_text/json/'
|
||||||
drilldown_documents()
|
# drilldown_documents now requires its three folders as parameters; pass the
# values defined just above instead of calling it with no arguments.
drilldown_documents(pdf_folder=pdf_folder,
                    extract_data_folder=extract_data_folder,
                    drilldown_folder=drilldown_folder)
|
||||||
# calculate_metrics()
|
# calculate_metrics()
|
||||||
85
main.py
85
main.py
|
|
@ -935,11 +935,11 @@ def batch_run_documents():
|
||||||
|
|
||||||
def batch_initial_document():
|
def batch_initial_document():
|
||||||
sample_document_list_folder = r'./sample_documents/'
|
sample_document_list_folder = r'./sample_documents/'
|
||||||
document_list_file = os.path.join(sample_document_list_folder, "sample_documents_12_11.txt")
|
document_list_file = os.path.join(sample_document_list_folder, "japan_prospectus.txt")
|
||||||
with open(document_list_file, "r", encoding="utf-8") as f:
|
with open(document_list_file, "r", encoding="utf-8") as f:
|
||||||
doc_id_list = f.readlines()
|
doc_id_list = f.readlines()
|
||||||
doc_id_list = [doc_id.strip() for doc_id in doc_id_list]
|
doc_id_list = [doc_id.strip() for doc_id in doc_id_list]
|
||||||
pdf_folder = r"/data/emea_ar/pdf/"
|
pdf_folder = r"/data/illume/japan_prospectus/pdf/"
|
||||||
page_filter_ground_truth_file = (
|
page_filter_ground_truth_file = (
|
||||||
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
||||||
)
|
)
|
||||||
|
|
@ -952,9 +952,88 @@ def batch_initial_document():
|
||||||
output_extract_data_folder=output_extract_data_child_folder,
|
output_extract_data_folder=output_extract_data_child_folder,
|
||||||
output_mapping_data_folder=output_mapping_child_folder)
|
output_mapping_data_folder=output_mapping_child_folder)
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_document_rows(doc_id, doc_data_df, doc_date, data_point_dict):
    """Collapse the per-datapoint rows of one document into one record per investment.

    Rows sharing the same (raw_name, investment_type) pair are merged into a
    single dict holding one column per datapoint plus the list of pages the
    values were found on. Records are returned in order of first appearance.

    Args:
        doc_id: Document id written to the "DocumentId" field of every record.
        doc_data_df: Rows of the mapping data belonging to this document.
        doc_date: Effective date string written to "EffectiveDate".
        data_point_dict: Maps datapoint codes (e.g. "ter") to output column names.

    Returns:
        A list of record dicts, one per distinct (raw_name, investment_type).
    """
    records = {}
    for _, row in doc_data_df.iterrows():
        update_key = data_point_dict.get(str(row["datapoint"]))
        if update_key is None:
            # Unknown datapoint code: there is no output column for it, so
            # skip the row instead of aborting the whole merge with KeyError.
            continue

        raw_name = str(row["raw_name"])
        investment_type = row["investment_type"]
        record_key = (raw_name, investment_type)
        record = records.get(record_key)
        if record is None:
            # First row seen for this investment: it supplies the id and name.
            record = {
                "DocumentId": doc_id,
                "investment_type": investment_type,
                "investment_id": row["investment_id"],
                "investment_name": row["investment_name"],
                "EffectiveDate": doc_date,
                "page_index": [],
                "RawName": raw_name,
                "NetExpenseRatio": "",
                "OngoingCharge": "",
                "TurnoverRatio": "",
                "PerformanceFee": ""
            }
            records[record_key] = record

        # A later row for the same datapoint overwrites the earlier value.
        record[update_key] = row["value"]

        page_index = row["page_index"]
        if pd.notna(page_index):
            # A blank page cell cannot be converted to int; just leave it out.
            page_index = int(page_index)
            if page_index not in record["page_index"]:
                record["page_index"].append(page_index)
    return list(records.values())


def merge_output_data(data_file_path: str,
                      document_mapping_file: str,
                      output_data_file_path: str):
    """Merge per-datapoint mapping rows into one row per investment and save to Excel.

    Reads the "total_mapping_data" sheet of ``data_file_path`` and the
    "doc_date" sheet of ``document_mapping_file``, merges the rows of every
    document by (raw_name, investment_type) and writes the result to the
    "total_data" sheet of ``output_data_file_path``.

    Args:
        data_file_path: Excel file with the columns doc_id, page_index,
            raw_name, datapoint, value, investment_type, investment_id and
            investment_name.
        document_mapping_file: Excel file with the columns DocumentId and
            EffectiveDate.
        output_data_file_path: Path of the Excel file to write.
    """
    data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data")
    document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="doc_date")
    # set doc_id to be string type so both sheets use the same key type
    data_df["doc_id"] = data_df["doc_id"].astype(str)
    document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str)
    # Sample of the input rows (total_mapping_data sheet):
    # doc_id     page_index  raw_name                        datapoint  value  ...  investment_type  investment_id  investment_name                  similarity
    # 553242368  344         Deutsche MSCI World Index Fund  tor        61          33               FS0000AY1Y     Xtrackers MSCI World Index Fund  0.75

    # Build the DocumentId -> date lookup once; the first entry per document
    # wins, and only the leading 10 characters (the date part) are kept.
    doc_date_by_id = {}
    for mapped_id, effective_date in zip(document_mapping_df["DocumentId"].values,
                                         document_mapping_df["EffectiveDate"].values):
        doc_date_by_id.setdefault(mapped_id, str(effective_date)[0:10])

    data_point_dict = {
        "tor": "TurnoverRatio",
        "ter": "NetExpenseRatio",
        "ogc": "OngoingCharge",
        "performance_fee": "PerformanceFee"
    }
    total_data_list = []
    # sort=False keeps the documents in order of first appearance.
    for doc_id, doc_data_df in tqdm(data_df.groupby("doc_id", sort=False)):
        # A document missing from the mapping sheet gets an empty date
        # instead of crashing the whole run.
        doc_date = doc_date_by_id.get(doc_id, "")
        total_data_list.extend(
            _merge_document_rows(doc_id, doc_data_df, doc_date, data_point_dict)
        )

    total_data_df = pd.DataFrame(total_data_list)
    total_data_df.fillna("", inplace=True)
    with pd.ExcelWriter(output_data_file_path) as writer:
        total_data_df.to_excel(writer, index=False, sheet_name="total_data")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_mapping_data_info_44_documents_by_text_20241211185546.xlsx"
|
||||||
|
document_mapping_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx"
|
||||||
|
output_data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_merged_data_info.xlsx"
|
||||||
|
merge_output_data(data_file_path, document_mapping_file_path, output_data_file_path)
|
||||||
# batch_initial_document()
|
# batch_initial_document()
|
||||||
batch_run_documents()
|
# batch_run_documents()
|
||||||
|
|
||||||
# new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
|
# new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
|
||||||
# original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"
|
# original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"
|
||||||
|
|
|
||||||
|
|
@ -276,35 +276,39 @@ def statistics_document(
|
||||||
describe_stat_df = pd.concat(describe_stat_df_list)
|
describe_stat_df = pd.concat(describe_stat_df_list)
|
||||||
describe_stat_df.reset_index(drop=True, inplace=True)
|
describe_stat_df.reset_index(drop=True, inplace=True)
|
||||||
|
|
||||||
doc_ar_data = pd.read_excel(doc_ar_data_file_path, sheet_name=ar_data_sheet_name)
|
doc_dp_data_df = None
|
||||||
doc_dp_result = get_document_with_all_4_data_points(None, None, doc_ar_data)
|
if doc_ar_data_file_path is not None and os.path.exists(doc_ar_data_file_path):
|
||||||
doc_dp_data_list = []
|
doc_ar_data = pd.read_excel(doc_ar_data_file_path, sheet_name=ar_data_sheet_name)
|
||||||
for doc_id in doc_id_list:
|
doc_dp_result = get_document_with_all_4_data_points(None, None, doc_ar_data)
|
||||||
doc_id = int(doc_id)
|
doc_dp_data_list = []
|
||||||
doc_dp_data = {"DocumentId": doc_id, "tor": 0, "ter": 0, "ogc": 0, "perf_fee": 0}
|
for doc_id in doc_id_list:
|
||||||
if doc_id in doc_dp_result["tor"]:
|
doc_id = int(doc_id)
|
||||||
doc_dp_data["tor"] = 1
|
doc_dp_data = {"DocumentId": doc_id, "tor": 0, "ter": 0, "ogc": 0, "perf_fee": 0}
|
||||||
if doc_id in doc_dp_result["ter"]:
|
if doc_id in doc_dp_result["tor"]:
|
||||||
doc_dp_data["ter"] = 1
|
doc_dp_data["tor"] = 1
|
||||||
if doc_id in doc_dp_result["ogc"]:
|
if doc_id in doc_dp_result["ter"]:
|
||||||
doc_dp_data["ogc"] = 1
|
doc_dp_data["ter"] = 1
|
||||||
if doc_id in doc_dp_result["perf_fee"]:
|
if doc_id in doc_dp_result["ogc"]:
|
||||||
doc_dp_data["perf_fee"] = 1
|
doc_dp_data["ogc"] = 1
|
||||||
doc_dp_data_list.append(doc_dp_data)
|
if doc_id in doc_dp_result["perf_fee"]:
|
||||||
doc_dp_data_df = pd.DataFrame(doc_dp_data_list)
|
doc_dp_data["perf_fee"] = 1
|
||||||
doc_dp_data_df = doc_dp_data_df.sort_values(by="DocumentId", ascending=True)
|
doc_dp_data_list.append(doc_dp_data)
|
||||||
doc_dp_data_df.reset_index(drop=True, inplace=True)
|
doc_dp_data_df = pd.DataFrame(doc_dp_data_list)
|
||||||
|
doc_dp_data_df = doc_dp_data_df.sort_values(by="DocumentId", ascending=True)
|
||||||
|
doc_dp_data_df.reset_index(drop=True, inplace=True)
|
||||||
|
|
||||||
# set all of DocumentId in DataFrame objects to be string type
|
# set all of DocumentId in DataFrame objects to be string type
|
||||||
doc_page_num_df["DocumentId"] = doc_page_num_df["DocumentId"].astype(str)
|
doc_page_num_df["DocumentId"] = doc_page_num_df["DocumentId"].astype(str)
|
||||||
doc_fund_count["DocumentId"] = doc_fund_count["DocumentId"].astype(str)
|
doc_fund_count["DocumentId"] = doc_fund_count["DocumentId"].astype(str)
|
||||||
doc_share_class_count["DocumentId"] = doc_share_class_count["DocumentId"].astype(str)
|
doc_share_class_count["DocumentId"] = doc_share_class_count["DocumentId"].astype(str)
|
||||||
doc_dp_data_df["DocumentId"] = doc_dp_data_df["DocumentId"].astype(str)
|
if doc_dp_data_df is not None:
|
||||||
|
doc_dp_data_df["DocumentId"] = doc_dp_data_df["DocumentId"].astype(str)
|
||||||
|
|
||||||
# merge statistics data for doc_page_num_df, doc_dp_data_df, doc_fund_count, doc_share_class_count based on DocumentId
|
# merge statistics data for doc_page_num_df, doc_dp_data_df, doc_fund_count, doc_share_class_count based on DocumentId
|
||||||
doc_page_num_df = doc_page_num_df.merge(doc_fund_count, on="DocumentId", how="left")
|
doc_page_num_df = doc_page_num_df.merge(doc_fund_count, on="DocumentId", how="left")
|
||||||
doc_page_num_df = doc_page_num_df.merge(doc_share_class_count, on="DocumentId", how="left")
|
doc_page_num_df = doc_page_num_df.merge(doc_share_class_count, on="DocumentId", how="left")
|
||||||
doc_page_num_df = doc_page_num_df.merge(doc_dp_data_df, on="DocumentId", how="left")
|
if doc_dp_data_df is not None:
|
||||||
|
doc_page_num_df = doc_page_num_df.merge(doc_dp_data_df, on="DocumentId", how="left")
|
||||||
|
|
||||||
# save statistics data to excel
|
# save statistics data to excel
|
||||||
with pd.ExcelWriter(stat_file) as writer:
|
with pd.ExcelWriter(stat_file) as writer:
|
||||||
|
|
@ -1395,7 +1399,10 @@ if __name__ == "__main__":
|
||||||
# sheet_name="Sheet1",
|
# sheet_name="Sheet1",
|
||||||
# doc_id_column="Document Id",
|
# doc_id_column="Document Id",
|
||||||
# pdf_path=pdf_folder)
|
# pdf_path=pdf_folder)
|
||||||
# output_pdf_page_text(pdf_folder, output_folder)
|
|
||||||
|
pdf_folder = r"/data/illume/japan_prospectus/pdf/"
|
||||||
|
output_folder = r"/data/illume/japan_prospectus/pdf_txt/"
|
||||||
|
output_pdf_page_text(pdf_folder, output_folder)
|
||||||
|
|
||||||
# extract_pdf_table(pdf_folder, output_folder)
|
# extract_pdf_table(pdf_folder, output_folder)
|
||||||
# analyze_json_error()
|
# analyze_json_error()
|
||||||
|
|
@ -1409,13 +1416,20 @@ if __name__ == "__main__":
|
||||||
doc_ar_data_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_ar_data_12_11.xlsx"
|
doc_ar_data_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_ar_data_12_11.xlsx"
|
||||||
doc_mapping_data_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx"
|
doc_mapping_data_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx"
|
||||||
output_data_folder = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/"
|
output_data_folder = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/"
|
||||||
statistics_document(pdf_folder=pdf_folder,
|
output_file="doc_ar_data_sample_documents_12_11_statistics.xlsx"
|
||||||
doc_mapping_file_path=doc_mapping_data_file_path,
|
|
||||||
doc_ar_data_file_path=doc_ar_data_file_path,
|
pdf_folder = r"/data/illume/japan_prospectus/pdf/"
|
||||||
mapping_sheet_name="Sheet1",
|
doc_ar_data_file_path = None
|
||||||
ar_data_sheet_name="doc_ar_data_in_db",
|
doc_mapping_data_file_path = r"/data/illume/japan_prospectus/materials/document_mapping.xlsx"
|
||||||
output_folder=output_data_folder,
|
output_data_folder = r"/data/illume/japan_prospectus/materials/"
|
||||||
output_file="doc_ar_data_sample_documents_12_11_statistics.xlsx")
|
output_file = "japan_prospectus_statistics.xlsx"
|
||||||
|
# statistics_document(pdf_folder=pdf_folder,
|
||||||
|
# doc_mapping_file_path=doc_mapping_data_file_path,
|
||||||
|
# doc_ar_data_file_path=doc_ar_data_file_path,
|
||||||
|
# mapping_sheet_name="Sheet1",
|
||||||
|
# ar_data_sheet_name="doc_ar_data_in_db",
|
||||||
|
# output_folder=output_data_folder,
|
||||||
|
# output_file=output_file)
|
||||||
# get_document_extracted_share_diff_by_db()
|
# get_document_extracted_share_diff_by_db()
|
||||||
# statistics_provider_mapping(
|
# statistics_provider_mapping(
|
||||||
# provider_mapping_data_file=provider_mapping_data_file,
|
# provider_mapping_data_file=provider_mapping_data_file,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue