update for more statistics methods

parent: 81a424b00d
commit: 0349033eaf

main.py (71 lines changed)
@@ -922,13 +922,78 @@ if __name__ == "__main__":
                                    "539087870",
                                    "536343790"
                                    ]

    # document samples 2024-11-06
    # check_db_mapping_doc_id_list = ["546483469",
    #                                 "546375582",
    #                                 "546375575",
    #                                 "546375576",
    #                                 "546375577",
    #                                 "546375568",
    #                                 "546371033",
    #                                 "546632761",
    #                                 "546632544",
    #                                 "546632464",
    #                                 "546724583",
    #                                 "546724552",
    #                                 "546694677",
    #                                 "546660422",
    #                                 "546638908",
    #                                 "546632845",
    #                                 "546105299",
    #                                 "546085481",
    #                                 "546078693",
    #                                 "546078650",
    #                                 "546289930",
    #                                 "546289910",
    #                                 "542967371",
    #                                 "542798238",
    #                                 "546048730",
    #                                 "546048143",
    #                                 "546047619",
    #                                 "546047528",
    #                                 "546046730",
    #                                 "546919329"]

    # document samples: 30 documents, all with 4 data points
    check_db_mapping_doc_id_list = ["479742284",
                                    "501380497",
                                    "501380553",
                                    "501380775",
                                    "501380801",
                                    "501600428",
                                    "501600429",
                                    "501600541",
                                    "501600549",
                                    "503659548",
                                    "506326520",
                                    "507720522",
                                    "507928179",
                                    "508981020",
                                    "509133771",
                                    "509743502",
                                    "514636951",
                                    "514636952",
                                    "514636953",
                                    "514636954",
                                    "514636955",
                                    "514636957",
                                    "514636958",
                                    "514636959",
                                    "514636985",
                                    "514636988",
                                    "514636990",
                                    "514636993",
                                    "514636994",
                                    "539794746"
                                    ]
    special_doc_id_list = check_db_mapping_doc_id_list
    special_doc_id_list = ["532422720"]
    # special_doc_id_list = []
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
    re_run_extract_data = False
    re_run_mapping_data = True
    force_save_total_data = False
    re_run_mapping_data = False
    force_save_total_data = True
    calculate_metrics = False

    extract_ways = ["text"]
prepare_data.py (119 lines changed)
@@ -249,34 +249,54 @@ def statistics_document(
    doc_id_list = [str(docid) for docid in doc_mapping_data["DocumentId"].unique().tolist()]
    # statistics document page number
    pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
    logger.info("statistics document page number")
    doc_page_num_list = []
    for pdf_file in tqdm(pdf_files):
        pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
        if pdf_base_name not in doc_id_list:
            continue
        docid = os.path.basename(pdf_file).split(".")[0]
        doc = fitz.open(pdf_file)
        page_num = doc.page_count
        doc_page_num_list.append({"docid": docid, "page_num": page_num})
        doc.close()
    doc_page_num_df = pd.DataFrame(doc_page_num_list)
    # order by page_num in descending order
    doc_page_num_df = doc_page_num_df.sort_values(by="page_num", ascending=False)
    # statistics page_num by describe and transform to DataFrame
    doc_page_num_stat_df = get_describe_stat(
        doc_page_num_df, "page_num", "doc_page_num"
    )
    describe_stat_df_list.append(doc_page_num_stat_df)
    # pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
    # logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
    # logger.info("statistics document page number")
    # doc_page_num_list = []
    # for pdf_file in tqdm(pdf_files):
    #     pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
    #     if pdf_base_name not in doc_id_list:
    #         continue
    #     docid = os.path.basename(pdf_file).split(".")[0]
    #     doc = fitz.open(pdf_file)
    #     page_num = doc.page_count
    #     doc_page_num_list.append({"docid": docid, "page_num": page_num})
    #     doc.close()
    # doc_page_num_df = pd.DataFrame(doc_page_num_list)
    # # order by page_num in descending order
    # doc_page_num_df = doc_page_num_df.sort_values(by="page_num", ascending=False)
    # # statistics page_num by describe and transform to DataFrame
    # doc_page_num_stat_df = get_describe_stat(
    #     doc_page_num_df, "page_num", "doc_page_num"
    # )
    # describe_stat_df_list.append(doc_page_num_stat_df)

    describe_stat_df = pd.concat(describe_stat_df_list)
    describe_stat_df.reset_index(drop=True, inplace=True)

    doc_dp_result = get_document_with_all_4_data_points(None, None, doc_mapping_data)
    doc_dp_data_list = []
    for doc_id in doc_id_list:
        doc_id = int(doc_id)
        doc_dp_data = {"DocumentId": doc_id, "tor": 0, "ter": 0, "ogc": 0, "perf_fee": 0}
        if doc_id in doc_dp_result["tor"]:
            doc_dp_data["tor"] = 1
        if doc_id in doc_dp_result["ter"]:
            doc_dp_data["ter"] = 1
        if doc_id in doc_dp_result["ogc"]:
            doc_dp_data["ogc"] = 1
        if doc_id in doc_dp_result["perf_fee"]:
            doc_dp_data["perf_fee"] = 1
        doc_dp_data_list.append(doc_dp_data)
    doc_dp_data_df = pd.DataFrame(doc_dp_data_list)
    doc_dp_data_df = doc_dp_data_df.sort_values(by="DocumentId", ascending=True)
    doc_dp_data_df.reset_index(drop=True, inplace=True)

    # save statistics data to excel
    with pd.ExcelWriter(stat_file) as writer:
        doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False)
        # doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False)
        doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False)
        doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
        doc_share_class_count.to_excel(
            writer, sheet_name="doc_share_class_count", index=False
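Note: get_describe_stat is called in the hunk above but its definition is not part of this commit. A minimal sketch of a compatible helper, assuming it summarizes one numeric column with pandas describe() and labels the resulting one-row frame so several summaries can later be concatenated (the signature, the "name" column, and the exact layout are assumptions, not taken from this diff):

    import pandas as pd

    def get_describe_stat(df: pd.DataFrame, column: str, stat_name: str) -> pd.DataFrame:
        # count / mean / std / min / quartiles / max of the chosen column, as one row.
        stat_df = df[column].describe().to_frame().T
        # Label the row so the caller can pd.concat() several of these summaries.
        stat_df.insert(0, "name", stat_name)
        return stat_df.reset_index(drop=True)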
@@ -1289,12 +1309,49 @@ def calc_typical_doc_metrics_v1():
        final_data_df.to_excel(
            writer, sheet_name="metrics", index=False
        )


def get_document_with_all_4_data_points(folder: str, file_name: str, data: pd.DataFrame):
    if data is None:
        file_path = os.path.join(folder, file_name)
        if os.path.exists(file_path):
            data = pd.read_excel(file_path, sheet_name="doc_ar_data_in_db")
        else:
            logger.error(f"Invalid file path: {file_path}")
            return
    # get document id list which noTor is 0
    noTor_0_doc_id_list = data[data["noTor"] == 0]["DocumentId"].unique().tolist()

    # get document id list which share_noTer is 0
    share_noTer_0_doc_id_list = data[data["share_noTer"] == 0]["DocumentId"].unique().tolist()

    # get document id list which share_noOgc is 0
    share_noOgc_0_doc_id_list = data[data["share_noOgc"] == 0]["DocumentId"].unique().tolist()

    # get document id list which share_noPerfFee is 0
    share_noPerfFee_0_doc_id_list = data[data["share_noPerfFee"] == 0]["DocumentId"].unique().tolist()

    logger.info(f"noTor_0_doc_id_list: {len(noTor_0_doc_id_list)}")
    logger.info(f"share_noTer_0_doc_id_list: {len(share_noTer_0_doc_id_list)}")
    logger.info(f"share_noOgc_0_doc_id_list: {len(share_noOgc_0_doc_id_list)}")
    logger.info(f"share_noPerfFee_0_doc_id_list: {len(share_noPerfFee_0_doc_id_list)}")

    all_4_data_points_doc_id_list = list(
        set(noTor_0_doc_id_list)
        & set(share_noTer_0_doc_id_list)
        & set(share_noOgc_0_doc_id_list)
        & set(share_noPerfFee_0_doc_id_list)
    )

    logger.info(f"all_4_data_points_doc_id_list: {len(all_4_data_points_doc_id_list)}")
    result = {"tor": noTor_0_doc_id_list,
              "ter": share_noTer_0_doc_id_list,
              "ogc": share_noOgc_0_doc_id_list,
              "perf_fee": share_noPerfFee_0_doc_id_list}
    return result


if __name__ == "__main__":
    folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
    file_name = "doc_ar_data_for_emea_11_06.xlsx"
    # get_document_with_all_4_data_points(folder, file_name, None)
    # calc_typical_doc_metrics_v1()
    calc_typical_doc_metrics_v2()
    # calc_typical_doc_metrics_v2()

    doc_provider_file_path = (
        r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
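Note: the intersection of the four ID lists is only logged inside get_document_with_all_4_data_points, not returned. A minimal usage sketch showing how a caller could recover that "all 4 data points" set from the returned dict (the folder and file_name values are the ones defined in the hunk above; everything else is an assumption, not part of this commit):

    # Returns {"tor": [...], "ter": [...], "ogc": [...], "perf_fee": [...]} or None on a bad path.
    result = get_document_with_all_4_data_points(folder, file_name, None)
    if result is not None:
        docs_with_all_4_dp = (
            set(result["tor"]) & set(result["ter"]) & set(result["ogc"]) & set(result["perf_fee"])
        )
        logger.info(f"documents with all 4 data points: {len(docs_with_all_4_dp)}")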
@@ -1335,13 +1392,13 @@ if __name__ == "__main__":
    # pdf_folder)

    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/doc_ar_data_for_final_list_emea_documents.xlsx"
    output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/"
    # statistics_document(pdf_folder=pdf_folder,
    #                     doc_mapping_file_path=doc_mapping_file_path,
    #                     sheet_name="doc_ar_data_in_db",
    #                     output_folder=output_data_folder,
    #                     output_file="doc_ar_data_statistics.xlsx")
    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_doc_with_all_4_dp/doc_ar_data_with_all_4_dp.xlsx"
    output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_doc_with_all_4_dp/"
    statistics_document(pdf_folder=pdf_folder,
                        doc_mapping_file_path=doc_mapping_file_path,
                        sheet_name="doc_ar_data_in_db",
                        output_folder=output_data_folder,
                        output_file="doc_ar_data_with_all_4_dp_statistics.xlsx")
    # get_document_extracted_share_diff_by_db()
    # statistics_provider_mapping(
    #     provider_mapping_data_file=provider_mapping_data_file,