fix issue for post actions
This commit is contained in:
parent
50e51e0894
commit
4cee95db9a
|
|
@ -114,25 +114,6 @@ def calc_metrics(ground_truth_file: str, prediction_file: str):
|
||||||
print(f"Accuracy TOR: {accuracy_tor}")
|
print(f"Accuracy TOR: {accuracy_tor}")
|
||||||
|
|
||||||
|
|
||||||
def transform_pdf_2_image():
|
|
||||||
"""
|
|
||||||
Transform pdf to image.
|
|
||||||
"""
|
|
||||||
import fitz
|
|
||||||
|
|
||||||
folder = r"/Users/bhe/OneDrive - MORNINGSTAR INC/Personal Document/US_Life/pay/"
|
|
||||||
pdf_file = r"Pay_Date_2025-02-14.pdf"
|
|
||||||
pdf_path = os.path.join(folder, pdf_file)
|
|
||||||
pdf_doc = fitz.open(pdf_path)
|
|
||||||
|
|
||||||
pdf_file_pure_name = pdf_file.replace(".pdf", "")
|
|
||||||
for page_num in range(pdf_doc.page_count):
|
|
||||||
page = pdf_doc.load_page(page_num)
|
|
||||||
image = page.get_pixmap(dpi=300)
|
|
||||||
image_path = os.path.join(folder, f"{pdf_file_pure_name}_{page_num}.png")
|
|
||||||
image.save(image_path)
|
|
||||||
|
|
||||||
|
|
||||||
def invoke_api_demo(doc_id: str = "407881493"):
|
def invoke_api_demo(doc_id: str = "407881493"):
|
||||||
headers = {"connection": "keep-alive", "content-type": "application/json"}
|
headers = {"connection": "keep-alive", "content-type": "application/json"}
|
||||||
data = {
|
data = {
|
||||||
|
|
@ -1432,7 +1413,7 @@ def merge_inference_data():
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
merge_inference_data()
|
# merge_inference_data()
|
||||||
# adjust_column_order()
|
# adjust_column_order()
|
||||||
# set_mapping_to_data_side_documents_data()
|
# set_mapping_to_data_side_documents_data()
|
||||||
|
|
||||||
|
|
@ -1481,7 +1462,6 @@ if __name__ == "__main__":
|
||||||
# calculate_metrics_based_audit_file(is_strict=False)
|
# calculate_metrics_based_audit_file(is_strict=False)
|
||||||
# remove_ter_ogc_performance_fee_annotation()
|
# remove_ter_ogc_performance_fee_annotation()
|
||||||
# batch_run_documents()
|
# batch_run_documents()
|
||||||
# transform_pdf_2_image()
|
|
||||||
# ground_truth_file = "./test_metrics/ground_truth.xlsx"
|
# ground_truth_file = "./test_metrics/ground_truth.xlsx"
|
||||||
# prediction_file = "./test_metrics/prediction.xlsx"
|
# prediction_file = "./test_metrics/prediction.xlsx"
|
||||||
# calc_metrics(ground_truth_file, prediction_file)
|
# calc_metrics(ground_truth_file, prediction_file)
|
||||||
|
|
|
||||||
|
|
@ -295,18 +295,9 @@ class DataExtraction:
|
||||||
|
|
||||||
def post_supplement_data(self, data_list: list) -> list:
|
def post_supplement_data(self, data_list: list) -> list:
|
||||||
"""
|
"""
|
||||||
data_dict = {"doc_id": self.doc_id}
|
Post supplement data for the extracted data
|
||||||
data_dict["page_index"] = page_num
|
|
||||||
data_dict["datapoints"] = ", ".join(page_datapoints)
|
|
||||||
data_dict["page_text"] = page_text
|
|
||||||
data_dict["instructions"] = instructions
|
|
||||||
data_dict["raw_answer"] = response
|
|
||||||
data_dict["extract_data"] = data
|
|
||||||
data_dict["extract_way"] = original_way
|
|
||||||
data_dict["prompt_token"] = result.get("prompt_token", 0)
|
|
||||||
data_dict["completion_token"] = result.get("completion_token", 0)
|
|
||||||
data_dict["total_token"] = result.get("total_token", 0)
|
|
||||||
"""
|
"""
|
||||||
|
try:
|
||||||
data_list = self.check_benchmark(data_list)
|
data_list = self.check_benchmark(data_list)
|
||||||
data_list = self.supplement_ttr_pension(data_list)
|
data_list = self.supplement_ttr_pension(data_list)
|
||||||
data_list = self.align_fund_share_name(data_list)
|
data_list = self.align_fund_share_name(data_list)
|
||||||
|
|
@ -320,6 +311,9 @@ class DataExtraction:
|
||||||
data_list = self.post_adjust_management_fee_costs(data_list)
|
data_list = self.post_adjust_management_fee_costs(data_list)
|
||||||
|
|
||||||
data_list = self.check_administration_fees(data_list)
|
data_list = self.check_administration_fees(data_list)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Post supplement data error: {e}")
|
||||||
|
print_exc()
|
||||||
return data_list
|
return data_list
|
||||||
|
|
||||||
def check_benchmark(self, data_list: list):
|
def check_benchmark(self, data_list: list):
|
||||||
|
|
@ -347,6 +341,8 @@ class DataExtraction:
|
||||||
if "benchmark_name" not in keys:
|
if "benchmark_name" not in keys:
|
||||||
continue
|
continue
|
||||||
benchmark_name = data_item.get("benchmark_name", "")
|
benchmark_name = data_item.get("benchmark_name", "")
|
||||||
|
if len(benchmark_name) == 0:
|
||||||
|
continue
|
||||||
if benchmark_name.startswith("A range") or benchmark_name.startswith("The fund") or \
|
if benchmark_name.startswith("A range") or benchmark_name.startswith("The fund") or \
|
||||||
benchmark_name.startswith("CPI "):
|
benchmark_name.startswith("CPI "):
|
||||||
data_item.pop("benchmark_name")
|
data_item.pop("benchmark_name")
|
||||||
|
|
|
||||||
|
|
@ -87,6 +87,9 @@
|
||||||
"Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nBalanced – Indexed 0.00% 0.00% 0.00% 0.00%\n",
|
"Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nBalanced – Indexed 0.00% 0.00% 0.00% 0.00%\n",
|
||||||
"---Example End---",
|
"---Example End---",
|
||||||
"For this example, as \"Investment fees and costs (including (B) performance fees)\" and \"Performance fees (pa)\" mentioned as 0.00% so return 0 as datapoint values.",
|
"For this example, as \"Investment fees and costs (including (B) performance fees)\" and \"Performance fees (pa)\" mentioned as 0.00% so return 0 as datapoint values.",
|
||||||
|
"The fund name prefix is \"Retirement account\", the investment option is \"Balanced - Indexed\", so fund name and share name should be: \"Retirement account Balanced - Indexed\".",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"Retirement account Balanced - Indexed\", \"share name\": \"Retirement account Balanced - Indexed\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"performance_fee_costs\": 0}]}",
|
||||||
"7. If for data point value specifically Nil is written in the value then return NULL('') for the same"
|
"7. If for data point value specifically Nil is written in the value then return NULL('') for the same"
|
||||||
],
|
],
|
||||||
"investment_level": {
|
"investment_level": {
|
||||||
|
|
|
||||||
2
main.py
2
main.py
|
|
@ -1538,7 +1538,7 @@ if __name__ == "__main__":
|
||||||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
|
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
|
||||||
if len(doc_id.strip()) > 0]
|
if len(doc_id.strip()) > 0]
|
||||||
# special_doc_id_list = ["573372424", "455235248", "462780211"]
|
# special_doc_id_list = ["384508026"]
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
|
||||||
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue