From f9ef4cec960b46b978670e4e61c3685da339919e Mon Sep 17 00:00:00 2001 From: Blade He Date: Fri, 31 Jan 2025 10:59:54 -0600 Subject: [PATCH] update sql_query cache file store location Cached files are kept for at most 5 days, then cleaned from local disk. --- app_emea_ar.py | 6 ++++++ main.py | 18 +++++++++++++----- utils/sql_query_util.py | 26 ++++++++++++++++---------- 3 files changed, 35 insertions(+), 15 deletions(-) diff --git a/app_emea_ar.py b/app_emea_ar.py index 8aab6d4..3a281f0 100644 --- a/app_emea_ar.py +++ b/app_emea_ar.py @@ -44,6 +44,8 @@ def emea_ar_data_extract(): output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/" output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/" drilldown_folder = r"./data/emea_ar/output/drilldown/" + db_mapping_document_folder = r"./data/emea_ar/output/db_mapping/document/" + db_mapping_provider_folder = r"./data/emea_ar/output/db_mapping/provider/" extract_way = "text" os.makedirs(pdf_folder, exist_ok=True) @@ -51,12 +53,16 @@ def emea_ar_data_extract(): os.makedirs(output_extract_data_folder, exist_ok=True) os.makedirs(output_mapping_data_folder, exist_ok=True) os.makedirs(drilldown_folder, exist_ok=True) + os.makedirs(db_mapping_document_folder, exist_ok=True) + os.makedirs(db_mapping_provider_folder, exist_ok=True) clean_folder(pdf_folder) clean_folder(output_pdf_text_folder) clean_folder(output_extract_data_folder) clean_folder(output_mapping_data_folder) clean_folder(drilldown_folder) + clean_folder(db_mapping_document_folder) + clean_folder(db_mapping_provider_folder) re_run_extract_data = False re_run_mapping_data = False diff --git a/main.py b/main.py index e8e0543..1d852e6 100644 --- a/main.py +++ b/main.py @@ -254,6 +254,14 @@ class EMEA_AR_Parsing: except Exception as e: logger.error(f"Error: {e}") annotation_list = annotation_list_df.to_dict(orient="records") + try: + drilldown_json_file = os.path.join( + drilldown_data_folder, f"{doc_id}_drilldown.json" + ) + with open(drilldown_json_file, 
"w", encoding="utf-8") as f: + json.dump(annotation_list, f, ensure_ascii=False, indent=4) + except Exception as e: + logger.error(f"Error: {e}") return annotation_list def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list: @@ -1032,9 +1040,9 @@ def batch_run_documents( page_filter_ground_truth_file = ( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" ) - re_run_extract_data = True - re_run_mapping_data = True - force_save_total_data = False + re_run_extract_data = False + re_run_mapping_data = False + force_save_total_data = True calculate_metrics = False extract_way = "text" @@ -1375,7 +1383,7 @@ if __name__ == "__main__": # special_doc_id_list = ["553242411"] - doc_source = "aus_prospectus" + doc_source = "emea_ar" if doc_source == "aus_prospectus": document_sample_file = ( r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt" @@ -1480,7 +1488,7 @@ if __name__ == "__main__": "520879048", "529925114", ] - special_doc_id_list = ["471641628"] + # special_doc_id_list = ["532438210"] batch_run_documents( doc_source=doc_source, special_doc_id_list=special_doc_id_list ) diff --git a/utils/sql_query_util.py b/utils/sql_query_util.py index 1ed64c1..4e1cb6e 100644 --- a/utils/sql_query_util.py +++ b/utils/sql_query_util.py @@ -8,7 +8,7 @@ import dotenv dotenv.load_dotenv() -def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_ar/output/mapping/document/"): +def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"./data/emea_ar/output/db_mapping/document/"): count = 1 while True: try: @@ -27,10 +27,13 @@ def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_a by=["FundName", "ShareClassName"] ).reset_index(drop=True) if output_folder is not None and len(output_folder) > 0: - os.makedirs(output_folder, exist_ok=True) - output_file = os.path.join(output_folder, f"{doc_id}.xlsx") - with pd.ExcelWriter(output_file) as writer: - 
document_mapping_info_df.to_excel(writer, index=False) + try: + os.makedirs(output_folder, exist_ok=True) + output_file = os.path.join(output_folder, f"{doc_id}.xlsx") + with pd.ExcelWriter(output_file) as writer: + document_mapping_info_df.to_excel(writer, index=False) + except: + pass return document_mapping_info_df except Exception as e: print(e) @@ -40,7 +43,7 @@ def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_a count += 1 -def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"/data/emea_ar/output/mapping/provider/"): +def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"./data/emea_ar/output/db_mapping/provider/"): count = 1 while True: try: @@ -59,10 +62,13 @@ def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"/d .sort_values(by=['FundName', 'ShareClassName']) \ .reset_index(drop=True) if output_folder is not None and len(output_folder) > 0: - os.makedirs(output_folder, exist_ok=True) - output_file = os.path.join(output_folder, f"{company_id}.xlsx") - with pd.ExcelWriter(output_file) as writer: - investment_by_provider_df.to_excel(writer, index=False) + try: + os.makedirs(output_folder, exist_ok=True) + output_file = os.path.join(output_folder, f"{company_id}.xlsx") + with pd.ExcelWriter(output_file) as writer: + investment_by_provider_df.to_excel(writer, index=False) + except: + pass return investment_by_provider_df except Exception as e: print(e)