Update the storage location of the sql_query cache files
Cache files are kept for at most 5 days, then cleaned from the local disk.
This commit is contained in:
parent
7f37f3532f
commit
f9ef4cec96
|
|
@ -44,6 +44,8 @@ def emea_ar_data_extract():
|
|||
output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/"
|
||||
output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/"
|
||||
drilldown_folder = r"./data/emea_ar/output/drilldown/"
|
||||
db_mapping_document_folder = r"./data/emea_ar/output/db_mapping/document/"
|
||||
db_mapping_provider_folder = r"./data/emea_ar/output/db_mapping/provider/"
|
||||
extract_way = "text"
|
||||
|
||||
os.makedirs(pdf_folder, exist_ok=True)
|
||||
|
|
@ -51,12 +53,16 @@ def emea_ar_data_extract():
|
|||
os.makedirs(output_extract_data_folder, exist_ok=True)
|
||||
os.makedirs(output_mapping_data_folder, exist_ok=True)
|
||||
os.makedirs(drilldown_folder, exist_ok=True)
|
||||
os.makedirs(db_mapping_document_folder, exist_ok=True)
|
||||
os.makedirs(db_mapping_provider_folder, exist_ok=True)
|
||||
|
||||
clean_folder(pdf_folder)
|
||||
clean_folder(output_pdf_text_folder)
|
||||
clean_folder(output_extract_data_folder)
|
||||
clean_folder(output_mapping_data_folder)
|
||||
clean_folder(drilldown_folder)
|
||||
clean_folder(db_mapping_document_folder)
|
||||
clean_folder(db_mapping_provider_folder)
|
||||
|
||||
re_run_extract_data = False
|
||||
re_run_mapping_data = False
|
||||
|
|
|
|||
18
main.py
18
main.py
|
|
@ -254,6 +254,14 @@ class EMEA_AR_Parsing:
|
|||
except Exception as e:
|
||||
logger.error(f"Error: {e}")
|
||||
annotation_list = annotation_list_df.to_dict(orient="records")
|
||||
try:
|
||||
drilldown_json_file = os.path.join(
|
||||
drilldown_data_folder, f"{doc_id}_drilldown.json"
|
||||
)
|
||||
with open(drilldown_json_file, "w", encoding="utf-8") as f:
|
||||
json.dump(annotation_list, f, ensure_ascii=False, indent=4)
|
||||
except Exception as e:
|
||||
logger.error(f"Error: {e}")
|
||||
return annotation_list
|
||||
|
||||
def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
|
||||
|
|
@ -1032,9 +1040,9 @@ def batch_run_documents(
|
|||
page_filter_ground_truth_file = (
|
||||
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
||||
)
|
||||
re_run_extract_data = True
|
||||
re_run_mapping_data = True
|
||||
force_save_total_data = False
|
||||
re_run_extract_data = False
|
||||
re_run_mapping_data = False
|
||||
force_save_total_data = True
|
||||
calculate_metrics = False
|
||||
|
||||
extract_way = "text"
|
||||
|
|
@ -1375,7 +1383,7 @@ if __name__ == "__main__":
|
|||
|
||||
# special_doc_id_list = ["553242411"]
|
||||
|
||||
doc_source = "aus_prospectus"
|
||||
doc_source = "emea_ar"
|
||||
if doc_source == "aus_prospectus":
|
||||
document_sample_file = (
|
||||
r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
|
||||
|
|
@ -1480,7 +1488,7 @@ if __name__ == "__main__":
|
|||
"520879048",
|
||||
"529925114",
|
||||
]
|
||||
special_doc_id_list = ["471641628"]
|
||||
# special_doc_id_list = ["532438210"]
|
||||
batch_run_documents(
|
||||
doc_source=doc_source, special_doc_id_list=special_doc_id_list
|
||||
)
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ import dotenv
|
|||
dotenv.load_dotenv()
|
||||
|
||||
|
||||
def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_ar/output/mapping/document/"):
|
||||
def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"./data/emea_ar/output/db_mapping/document/"):
|
||||
count = 1
|
||||
while True:
|
||||
try:
|
||||
|
|
@ -27,10 +27,13 @@ def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_a
|
|||
by=["FundName", "ShareClassName"]
|
||||
).reset_index(drop=True)
|
||||
if output_folder is not None and len(output_folder) > 0:
|
||||
os.makedirs(output_folder, exist_ok=True)
|
||||
output_file = os.path.join(output_folder, f"{doc_id}.xlsx")
|
||||
with pd.ExcelWriter(output_file) as writer:
|
||||
document_mapping_info_df.to_excel(writer, index=False)
|
||||
try:
|
||||
os.makedirs(output_folder, exist_ok=True)
|
||||
output_file = os.path.join(output_folder, f"{doc_id}.xlsx")
|
||||
with pd.ExcelWriter(output_file) as writer:
|
||||
document_mapping_info_df.to_excel(writer, index=False)
|
||||
except:
|
||||
pass
|
||||
return document_mapping_info_df
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
|
@ -40,7 +43,7 @@ def query_document_fund_mapping(doc_id, rerun=True, output_folder=r"/data/emea_a
|
|||
count += 1
|
||||
|
||||
|
||||
def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"/data/emea_ar/output/mapping/provider/"):
|
||||
def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"./data/emea_ar/output/db_mapping/provider/"):
|
||||
count = 1
|
||||
while True:
|
||||
try:
|
||||
|
|
@ -59,10 +62,13 @@ def query_investment_by_provider(company_id: str, rerun=True, output_folder=r"/d
|
|||
.sort_values(by=['FundName', 'ShareClassName']) \
|
||||
.reset_index(drop=True)
|
||||
if output_folder is not None and len(output_folder) > 0:
|
||||
os.makedirs(output_folder, exist_ok=True)
|
||||
output_file = os.path.join(output_folder, f"{company_id}.xlsx")
|
||||
with pd.ExcelWriter(output_file) as writer:
|
||||
investment_by_provider_df.to_excel(writer, index=False)
|
||||
try:
|
||||
os.makedirs(output_folder, exist_ok=True)
|
||||
output_file = os.path.join(output_folder, f"{company_id}.xlsx")
|
||||
with pd.ExcelWriter(output_file) as writer:
|
||||
investment_by_provider_df.to_excel(writer, index=False)
|
||||
except:
|
||||
pass
|
||||
return investment_by_provider_df
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
|
|
|||
Loading…
Reference in New Issue