fix issue for post actions
This commit is contained in:
parent
50e51e0894
commit
4cee95db9a
|
|
@ -114,25 +114,6 @@ def calc_metrics(ground_truth_file: str, prediction_file: str):
|
|||
print(f"Accuracy TOR: {accuracy_tor}")
|
||||
|
||||
|
||||
def transform_pdf_2_image():
|
||||
"""
|
||||
Transform pdf to image.
|
||||
"""
|
||||
import fitz
|
||||
|
||||
folder = r"/Users/bhe/OneDrive - MORNINGSTAR INC/Personal Document/US_Life/pay/"
|
||||
pdf_file = r"Pay_Date_2025-02-14.pdf"
|
||||
pdf_path = os.path.join(folder, pdf_file)
|
||||
pdf_doc = fitz.open(pdf_path)
|
||||
|
||||
pdf_file_pure_name = pdf_file.replace(".pdf", "")
|
||||
for page_num in range(pdf_doc.page_count):
|
||||
page = pdf_doc.load_page(page_num)
|
||||
image = page.get_pixmap(dpi=300)
|
||||
image_path = os.path.join(folder, f"{pdf_file_pure_name}_{page_num}.png")
|
||||
image.save(image_path)
|
||||
|
||||
|
||||
def invoke_api_demo(doc_id: str = "407881493"):
|
||||
headers = {"connection": "keep-alive", "content-type": "application/json"}
|
||||
data = {
|
||||
|
|
@ -1432,7 +1413,7 @@ def merge_inference_data():
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
merge_inference_data()
|
||||
# merge_inference_data()
|
||||
# adjust_column_order()
|
||||
# set_mapping_to_data_side_documents_data()
|
||||
|
||||
|
|
@ -1481,7 +1462,6 @@ if __name__ == "__main__":
|
|||
# calculate_metrics_based_audit_file(is_strict=False)
|
||||
# remove_ter_ogc_performance_fee_annotation()
|
||||
# batch_run_documents()
|
||||
# transform_pdf_2_image()
|
||||
# ground_truth_file = "./test_metrics/ground_truth.xlsx"
|
||||
# prediction_file = "./test_metrics/prediction.xlsx"
|
||||
# calc_metrics(ground_truth_file, prediction_file)
|
||||
|
|
|
|||
|
|
@ -295,31 +295,25 @@ class DataExtraction:
|
|||
|
||||
def post_supplement_data(self, data_list: list) -> list:
|
||||
"""
|
||||
data_dict = {"doc_id": self.doc_id}
|
||||
data_dict["page_index"] = page_num
|
||||
data_dict["datapoints"] = ", ".join(page_datapoints)
|
||||
data_dict["page_text"] = page_text
|
||||
data_dict["instructions"] = instructions
|
||||
data_dict["raw_answer"] = response
|
||||
data_dict["extract_data"] = data
|
||||
data_dict["extract_way"] = original_way
|
||||
data_dict["prompt_token"] = result.get("prompt_token", 0)
|
||||
data_dict["completion_token"] = result.get("completion_token", 0)
|
||||
data_dict["total_token"] = result.get("total_token", 0)
|
||||
Post supplement data for the extracted data
|
||||
"""
|
||||
data_list = self.check_benchmark(data_list)
|
||||
data_list = self.supplement_ttr_pension(data_list)
|
||||
data_list = self.align_fund_share_name(data_list)
|
||||
data_list = self.supplement_minimum_initial_investment(data_list)
|
||||
data_list = self.check_total_annual_dollar_based_charges(data_list)
|
||||
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
|
||||
data_list = self.remove_duplicate_data(data_list)
|
||||
if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name:
|
||||
data_list, adjust = self.post_management_fee_exclude_performance_fee(data_list)
|
||||
if not adjust:
|
||||
data_list = self.post_adjust_management_fee_costs(data_list)
|
||||
|
||||
data_list = self.check_administration_fees(data_list)
|
||||
try:
|
||||
data_list = self.check_benchmark(data_list)
|
||||
data_list = self.supplement_ttr_pension(data_list)
|
||||
data_list = self.align_fund_share_name(data_list)
|
||||
data_list = self.supplement_minimum_initial_investment(data_list)
|
||||
data_list = self.check_total_annual_dollar_based_charges(data_list)
|
||||
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
|
||||
data_list = self.remove_duplicate_data(data_list)
|
||||
if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name:
|
||||
data_list, adjust = self.post_management_fee_exclude_performance_fee(data_list)
|
||||
if not adjust:
|
||||
data_list = self.post_adjust_management_fee_costs(data_list)
|
||||
|
||||
data_list = self.check_administration_fees(data_list)
|
||||
except Exception as e:
|
||||
logger.error(f"Post supplement data error: {e}")
|
||||
print_exc()
|
||||
return data_list
|
||||
|
||||
def check_benchmark(self, data_list: list):
|
||||
|
|
@ -347,6 +341,8 @@ class DataExtraction:
|
|||
if "benchmark_name" not in keys:
|
||||
continue
|
||||
benchmark_name = data_item.get("benchmark_name", "")
|
||||
if len(benchmark_name) == 0:
|
||||
continue
|
||||
if benchmark_name.startswith("A range") or benchmark_name.startswith("The fund") or \
|
||||
benchmark_name.startswith("CPI "):
|
||||
data_item.pop("benchmark_name")
|
||||
|
|
|
|||
|
|
@ -87,6 +87,9 @@
|
|||
"Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nBalanced – Indexed 0.00% 0.00% 0.00% 0.00%\n",
|
||||
"---Example End---",
|
||||
"For this example, as \"Investment fees and costs (including (B) performance fees)\" and \"Performance fees (pa)\" mentioned as 0.00% so return 0 as datapoint values.",
|
||||
"The fund name prefix is \"Retirement account\", the investment option is \"Balanced - Indexed\", so fund name and share name should be: \"Retirement account Balanced - Indexed\".",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Retirement account Balanced - Indexed\", \"share name\": \"Retirement account Balanced - Indexed\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"performance_fee_costs\": 0}]}",
|
||||
"7. If for data point value specifically Nil is written in the value then return NULL('') for the same"
|
||||
],
|
||||
"investment_level": {
|
||||
|
|
|
|||
2
main.py
2
main.py
|
|
@ -1538,7 +1538,7 @@ if __name__ == "__main__":
|
|||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
|
||||
if len(doc_id.strip()) > 0]
|
||||
# special_doc_id_list = ["573372424", "455235248", "462780211"]
|
||||
# special_doc_id_list = ["384508026"]
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue