fix issue for post actions

This commit is contained in:
Blade He 2025-03-31 22:04:31 -05:00
parent 50e51e0894
commit 4cee95db9a
5 changed files with 75 additions and 96 deletions

View File

@@ -114,25 +114,6 @@ def calc_metrics(ground_truth_file: str, prediction_file: str):
print(f"Accuracy TOR: {accuracy_tor}") print(f"Accuracy TOR: {accuracy_tor}")
def transform_pdf_2_image(
    folder: str = r"/Users/bhe/OneDrive - MORNINGSTAR INC/Personal Document/US_Life/pay/",
    pdf_file: str = r"Pay_Date_2025-02-14.pdf",
    dpi: int = 300,
) -> None:
    """
    Transform pdf to image.

    Every page of the PDF is rendered and saved next to the source file as
    "<pdf name without extension>_<page index>.png" (page index starts at 0).

    Args:
        folder: Directory that contains the PDF; the PNG files are written
            into the same directory.
        pdf_file: File name of the PDF inside ``folder``.
        dpi: Resolution passed to ``Page.get_pixmap``.
    """
    # Local import: PyMuPDF is only needed by this helper.
    import fitz

    pdf_path = os.path.join(folder, pdf_file)
    # splitext removes only the trailing extension; str.replace(".pdf", "")
    # would also strip any ".pdf" that occurs in the middle of the name.
    pdf_file_pure_name = os.path.splitext(pdf_file)[0]
    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_path) as pdf_doc:
        for page_num in range(pdf_doc.page_count):
            page = pdf_doc.load_page(page_num)
            image = page.get_pixmap(dpi=dpi)
            image_path = os.path.join(folder, f"{pdf_file_pure_name}_{page_num}.png")
            image.save(image_path)
def invoke_api_demo(doc_id: str = "407881493"): def invoke_api_demo(doc_id: str = "407881493"):
headers = {"connection": "keep-alive", "content-type": "application/json"} headers = {"connection": "keep-alive", "content-type": "application/json"}
data = { data = {
@@ -1432,7 +1413,7 @@ def merge_inference_data():
if __name__ == "__main__": if __name__ == "__main__":
merge_inference_data() # merge_inference_data()
# adjust_column_order() # adjust_column_order()
# set_mapping_to_data_side_documents_data() # set_mapping_to_data_side_documents_data()
@@ -1481,7 +1462,6 @@ if __name__ == "__main__":
# calculate_metrics_based_audit_file(is_strict=False) # calculate_metrics_based_audit_file(is_strict=False)
# remove_ter_ogc_performance_fee_annotation() # remove_ter_ogc_performance_fee_annotation()
# batch_run_documents() # batch_run_documents()
# transform_pdf_2_image()
# ground_truth_file = "./test_metrics/ground_truth.xlsx" # ground_truth_file = "./test_metrics/ground_truth.xlsx"
# prediction_file = "./test_metrics/prediction.xlsx" # prediction_file = "./test_metrics/prediction.xlsx"
# calc_metrics(ground_truth_file, prediction_file) # calc_metrics(ground_truth_file, prediction_file)

View File

@@ -295,31 +295,25 @@ class DataExtraction:
def post_supplement_data(self, data_list: list) -> list: def post_supplement_data(self, data_list: list) -> list:
""" """
data_dict = {"doc_id": self.doc_id} Post supplement data for the extracted data
data_dict["page_index"] = page_num
data_dict["datapoints"] = ", ".join(page_datapoints)
data_dict["page_text"] = page_text
data_dict["instructions"] = instructions
data_dict["raw_answer"] = response
data_dict["extract_data"] = data
data_dict["extract_way"] = original_way
data_dict["prompt_token"] = result.get("prompt_token", 0)
data_dict["completion_token"] = result.get("completion_token", 0)
data_dict["total_token"] = result.get("total_token", 0)
""" """
data_list = self.check_benchmark(data_list) try:
data_list = self.supplement_ttr_pension(data_list) data_list = self.check_benchmark(data_list)
data_list = self.align_fund_share_name(data_list) data_list = self.supplement_ttr_pension(data_list)
data_list = self.supplement_minimum_initial_investment(data_list) data_list = self.align_fund_share_name(data_list)
data_list = self.check_total_annual_dollar_based_charges(data_list) data_list = self.supplement_minimum_initial_investment(data_list)
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list) data_list = self.check_total_annual_dollar_based_charges(data_list)
data_list = self.remove_duplicate_data(data_list) data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name: data_list = self.remove_duplicate_data(data_list)
data_list, adjust = self.post_management_fee_exclude_performance_fee(data_list) if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name:
if not adjust: data_list, adjust = self.post_management_fee_exclude_performance_fee(data_list)
data_list = self.post_adjust_management_fee_costs(data_list) if not adjust:
data_list = self.post_adjust_management_fee_costs(data_list)
data_list = self.check_administration_fees(data_list)
data_list = self.check_administration_fees(data_list)
except Exception as e:
logger.error(f"Post supplement data error: {e}")
print_exc()
return data_list return data_list
def check_benchmark(self, data_list: list): def check_benchmark(self, data_list: list):
@@ -347,6 +341,8 @@ class DataExtraction:
if "benchmark_name" not in keys: if "benchmark_name" not in keys:
continue continue
benchmark_name = data_item.get("benchmark_name", "") benchmark_name = data_item.get("benchmark_name", "")
if len(benchmark_name) == 0:
continue
if benchmark_name.startswith("A range") or benchmark_name.startswith("The fund") or \ if benchmark_name.startswith("A range") or benchmark_name.startswith("The fund") or \
benchmark_name.startswith("CPI "): benchmark_name.startswith("CPI "):
data_item.pop("benchmark_name") data_item.pop("benchmark_name")

View File

@@ -87,6 +87,9 @@
"Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nBalanced Indexed 0.00% 0.00% 0.00% 0.00%\n", "Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nBalanced Indexed 0.00% 0.00% 0.00% 0.00%\n",
"---Example End---", "---Example End---",
"For this example, as \"Investment fees and costs (including (B) performance fees)\" and \"Performance fees (pa)\" mentioned as 0.00% so return 0 as datapoint values.", "For this example, as \"Investment fees and costs (including (B) performance fees)\" and \"Performance fees (pa)\" mentioned as 0.00% so return 0 as datapoint values.",
"The fund name prefix is \"Retirement account\", the investment option is \"Balanced - Indexed\", so fund name and share name should be: \"Retirement account Balanced - Indexed\".",
"The output should be:",
"{\"data\": [{\"fund name\": \"Retirement account Balanced - Indexed\", \"share name\": \"Retirement account Balanced - Indexed\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"performance_fee_costs\": 0}]}",
"7. If for data point value specifically Nil is written in the value then return NULL('') for the same" "7. If for data point value specifically Nil is written in the value then return NULL('') for the same"
], ],
"investment_level": { "investment_level": {

View File

@@ -1538,7 +1538,7 @@ if __name__ == "__main__":
with open(document_sample_file, "r", encoding="utf-8") as f: with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines() special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
if len(doc_id.strip()) > 0] if len(doc_id.strip()) > 0]
# special_doc_id_list = ["573372424", "455235248", "462780211"] # special_doc_id_list = ["384508026"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (

File diff suppressed because one or more lines are too long