Update sql_query cache file storage location

Cache files are kept for at most 5 days, then cleaned from local disk.
This commit is contained in:
Blade He 2025-01-31 10:59:54 -06:00
parent 7f37f3532f
commit f9ef4cec96
3 changed files with 35 additions and 15 deletions

View File

@ -44,6 +44,8 @@ def emea_ar_data_extract():
output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/" output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/"
output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/" output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/"
drilldown_folder = r"./data/emea_ar/output/drilldown/" drilldown_folder = r"./data/emea_ar/output/drilldown/"
db_mapping_document_folder = r"./data/emea_ar/output/db_mapping/document/"
db_mapping_provider_folder = r"./data/emea_ar/output/db_mapping/provider/"
extract_way = "text" extract_way = "text"
os.makedirs(pdf_folder, exist_ok=True) os.makedirs(pdf_folder, exist_ok=True)
@ -51,12 +53,16 @@ def emea_ar_data_extract():
os.makedirs(output_extract_data_folder, exist_ok=True) os.makedirs(output_extract_data_folder, exist_ok=True)
os.makedirs(output_mapping_data_folder, exist_ok=True) os.makedirs(output_mapping_data_folder, exist_ok=True)
os.makedirs(drilldown_folder, exist_ok=True) os.makedirs(drilldown_folder, exist_ok=True)
os.makedirs(db_mapping_document_folder, exist_ok=True)
os.makedirs(db_mapping_provider_folder, exist_ok=True)
clean_folder(pdf_folder) clean_folder(pdf_folder)
clean_folder(output_pdf_text_folder) clean_folder(output_pdf_text_folder)
clean_folder(output_extract_data_folder) clean_folder(output_extract_data_folder)
clean_folder(output_mapping_data_folder) clean_folder(output_mapping_data_folder)
clean_folder(drilldown_folder) clean_folder(drilldown_folder)
clean_folder(db_mapping_document_folder)
clean_folder(db_mapping_provider_folder)
re_run_extract_data = False re_run_extract_data = False
re_run_mapping_data = False re_run_mapping_data = False

18
main.py
View File

@ -254,6 +254,14 @@ class EMEA_AR_Parsing:
except Exception as e: except Exception as e:
logger.error(f"Error: {e}") logger.error(f"Error: {e}")
annotation_list = annotation_list_df.to_dict(orient="records") annotation_list = annotation_list_df.to_dict(orient="records")
try:
drilldown_json_file = os.path.join(
drilldown_data_folder, f"{doc_id}_drilldown.json"
)
with open(drilldown_json_file, "w", encoding="utf-8") as f:
json.dump(annotation_list, f, ensure_ascii=False, indent=4)
except Exception as e:
logger.error(f"Error: {e}")
return annotation_list return annotation_list
def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list: def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
@ -1032,9 +1040,9 @@ def batch_run_documents(
page_filter_ground_truth_file = ( page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
) )
re_run_extract_data = True re_run_extract_data = False
re_run_mapping_data = True re_run_mapping_data = False
force_save_total_data = False force_save_total_data = True
calculate_metrics = False calculate_metrics = False
extract_way = "text" extract_way = "text"
@ -1375,7 +1383,7 @@ if __name__ == "__main__":
# special_doc_id_list = ["553242411"] # special_doc_id_list = ["553242411"]
doc_source = "aus_prospectus" doc_source = "emea_ar"
if doc_source == "aus_prospectus": if doc_source == "aus_prospectus":
document_sample_file = ( document_sample_file = (
r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt" r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
@ -1480,7 +1488,7 @@ if __name__ == "__main__":
"520879048", "520879048",
"529925114", "529925114",
] ]
special_doc_id_list = ["471641628"] # special_doc_id_list = ["532438210"]
batch_run_documents( batch_run_documents(
doc_source=doc_source, special_doc_id_list=special_doc_id_list doc_source=doc_source, special_doc_id_list=special_doc_id_list
) )

View File

@ -8,7 +8,7 @@ import dotenv
dotenv.load_dotenv() dotenv.load_dotenv()
def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_ar/output/mapping/document/"): def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"./data/emea_ar/output/db_mapping/document/"):
count = 1 count = 1
while True: while True:
try: try:
@ -27,10 +27,13 @@ def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_a
by=["FundName", "ShareClassName"] by=["FundName", "ShareClassName"]
).reset_index(drop=True) ).reset_index(drop=True)
if output_folder is not None and len(output_folder) > 0: if output_folder is not None and len(output_folder) > 0:
os.makedirs(output_folder, exist_ok=True) try:
output_file = os.path.join(output_folder, f"{doc_id}.xlsx") os.makedirs(output_folder, exist_ok=True)
with pd.ExcelWriter(output_file) as writer: output_file = os.path.join(output_folder, f"{doc_id}.xlsx")
document_mapping_info_df.to_excel(writer, index=False) with pd.ExcelWriter(output_file) as writer:
document_mapping_info_df.to_excel(writer, index=False)
except:
pass
return document_mapping_info_df return document_mapping_info_df
except Exception as e: except Exception as e:
print(e) print(e)
@ -40,7 +43,7 @@ def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_a
count += 1 count += 1
def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"/data/emea_ar/output/mapping/provider/"): def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"./data/emea_ar/output/db_mapping/provider/"):
count = 1 count = 1
while True: while True:
try: try:
@ -59,10 +62,13 @@ def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"/d
.sort_values(by=['FundName', 'ShareClassName']) \ .sort_values(by=['FundName', 'ShareClassName']) \
.reset_index(drop=True) .reset_index(drop=True)
if output_folder is not None and len(output_folder) > 0: if output_folder is not None and len(output_folder) > 0:
os.makedirs(output_folder, exist_ok=True) try:
output_file = os.path.join(output_folder, f"{company_id}.xlsx") os.makedirs(output_folder, exist_ok=True)
with pd.ExcelWriter(output_file) as writer: output_file = os.path.join(output_folder, f"{company_id}.xlsx")
investment_by_provider_df.to_excel(writer, index=False) with pd.ExcelWriter(output_file) as writer:
investment_by_provider_df.to_excel(writer, index=False)
except:
pass
return investment_by_provider_df return investment_by_provider_df
except Exception as e: except Exception as e:
print(e) print(e)