Fix post actions: guard post-supplement steps with error handling and skip empty benchmark names

This commit is contained in:
Blade He 2025-03-31 22:04:31 -05:00
parent 50e51e0894
commit 4cee95db9a
5 changed files with 75 additions and 96 deletions

View File

@ -114,25 +114,6 @@ def calc_metrics(ground_truth_file: str, prediction_file: str):
print(f"Accuracy TOR: {accuracy_tor}")
def transform_pdf_2_image(
    folder: str = r"/Users/bhe/OneDrive - MORNINGSTAR INC/Personal Document/US_Life/pay/",
    pdf_file: str = r"Pay_Date_2025-02-14.pdf",
    dpi: int = 300,
):
    """
    Transform pdf to image.

    Every page of the PDF is saved as a PNG file in ``folder``, named
    "<pdf file name without the trailing .pdf>_<page number>.png", where the
    page number starts at 0.

    Args:
        folder: Directory that contains the PDF and receives the PNG files.
        pdf_file: File name of the PDF inside ``folder``.
        dpi: Resolution passed to ``get_pixmap`` for each page.
    """
    import fitz

    pdf_path = os.path.join(folder, pdf_file)

    # Drop only a trailing ".pdf"; str.replace would also remove a ".pdf"
    # that happens to appear in the middle of the file name.
    if pdf_file.endswith(".pdf"):
        pdf_file_pure_name = pdf_file[: -len(".pdf")]
    else:
        pdf_file_pure_name = pdf_file

    pdf_doc = fitz.open(pdf_path)
    try:
        for page_num in range(pdf_doc.page_count):
            page = pdf_doc.load_page(page_num)
            image = page.get_pixmap(dpi=dpi)
            image_path = os.path.join(folder, f"{pdf_file_pure_name}_{page_num}.png")
            image.save(image_path)
    finally:
        # Release the file handle even when rendering or saving a page fails.
        pdf_doc.close()
def invoke_api_demo(doc_id: str = "407881493"):
headers = {"connection": "keep-alive", "content-type": "application/json"}
data = {
@ -1432,7 +1413,7 @@ def merge_inference_data():
if __name__ == "__main__":
merge_inference_data()
# merge_inference_data()
# adjust_column_order()
# set_mapping_to_data_side_documents_data()
@ -1481,7 +1462,6 @@ if __name__ == "__main__":
# calculate_metrics_based_audit_file(is_strict=False)
# remove_ter_ogc_performance_fee_annotation()
# batch_run_documents()
# transform_pdf_2_image()
# ground_truth_file = "./test_metrics/ground_truth.xlsx"
# prediction_file = "./test_metrics/prediction.xlsx"
# calc_metrics(ground_truth_file, prediction_file)

View File

@ -295,31 +295,25 @@ class DataExtraction:
def post_supplement_data(self, data_list: list) -> list:
    """
    Post supplement data for the extracted data.

    The post-processing steps run exactly once, in a fixed order, inside a
    single try block. If any step raises, the error is logged, the traceback
    is printed, and the list as it stood after the last successful step is
    returned.

    Args:
        data_list: Extracted data items to post-process.

    Returns:
        The post-processed data list.
    """
    try:
        data_list = self.check_benchmark(data_list)
        data_list = self.supplement_ttr_pension(data_list)
        data_list = self.align_fund_share_name(data_list)
        data_list = self.supplement_minimum_initial_investment(data_list)
        data_list = self.check_total_annual_dollar_based_charges(data_list)
        data_list, datapoint_list_with_production_name = (
            self.post_adjust_for_value_with_production_name(data_list)
        )
        data_list = self.remove_duplicate_data(data_list)

        # Management-fee adjustments are applied only when neither management
        # fee datapoint is in the list returned by the production-name step.
        if (
            "management_fee" not in datapoint_list_with_production_name
            and "management_fee_and_costs" not in datapoint_list_with_production_name
        ):
            data_list, adjust = self.post_management_fee_exclude_performance_fee(data_list)
            if not adjust:
                data_list = self.post_adjust_management_fee_costs(data_list)

        data_list = self.check_administration_fees(data_list)
    except Exception as e:
        logger.error(f"Post supplement data error: {e}")
        print_exc()
    return data_list
def check_benchmark(self, data_list: list):
@ -347,6 +341,8 @@ class DataExtraction:
if "benchmark_name" not in keys:
continue
benchmark_name = data_item.get("benchmark_name", "")
if len(benchmark_name) == 0:
continue
if benchmark_name.startswith("A range") or benchmark_name.startswith("The fund") or \
benchmark_name.startswith("CPI "):
data_item.pop("benchmark_name")

View File

@ -87,6 +87,9 @@
"Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nBalanced Indexed 0.00% 0.00% 0.00% 0.00%\n",
"---Example End---",
"For this example, as \"Investment fees and costs (including (B) performance fees)\" and \"Performance fees (pa)\" mentioned as 0.00% so return 0 as datapoint values.",
"The fund name prefix is \"Retirement account\", the investment option is \"Balanced - Indexed\", so fund name and share name should be: \"Retirement account Balanced - Indexed\".",
"The output should be:",
"{\"data\": [{\"fund name\": \"Retirement account Balanced - Indexed\", \"share name\": \"Retirement account Balanced - Indexed\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"performance_fee_costs\": 0}]}",
"7. If for data point value specifically Nil is written in the value then return NULL('') for the same"
],
"investment_level": {

View File

@ -1538,7 +1538,7 @@ if __name__ == "__main__":
with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
if len(doc_id.strip()) > 0]
# special_doc_id_list = ["573372424", "455235248", "462780211"]
# special_doc_id_list = ["384508026"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (

File diff suppressed because one or more lines are too long