Split share class names when multiple share classes appear in the same line

Blade He 2024-12-06 16:31:42 -06:00
parent d79b05885d
commit d96f77fe00
3 changed files with 136 additions and 22 deletions
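The new logic keys off a simple heuristic: a header that contains three or more comma-separated uppercase tokens is treated as a combined share-class line. A minimal sketch of that check, using the regex added to data_extraction.py below and the sample header quoted in the docstring for document 481482392:

import re

# Regex added in data_extraction.py: three or more uppercase tokens,
# each followed by ", ", signal a combined share-class header.
multi_over_3_share_regex = r"([A-Z]{1,}\,\s){3,}"

print(bool(re.search(multi_over_3_share_regex, "Class A, B, E, M, N, P, R, U")))  # True
print(bool(re.search(multi_over_3_share_regex, "Class A")))                       # False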

View File

@@ -582,29 +582,74 @@ class DataExtraction:
             # update "fund name" to be "fund_name"
             # update "share name" to be "share_name"
             new_data_list = []
+            multi_over_3_share_regex = r"([A-Z]{1,}\,\s){3,}"
+            exist_multi_over_3_share = False
             for data in data_list:
-                new_data = {}
-                fund_name = data.get("fund name", "")
-                if fund_name != "":
-                    new_data["fund_name"] = fund_name
-                share_name = data.get("share name", "")
-                if share_name != "":
-                    new_data["share_name"] = share_name
-                ter = data.get("ter", None)
-                if ter is not None:
-                    new_data["ter"] = ter
-                performance_fee = data.get("performance fees", None)
-                if performance_fee is not None:
-                    new_data["performance_fee"] = performance_fee
-                for key, value in data.items():
-                    if key not in ["fund name", "share name", "ter", "performance fees"]:
-                        new_data[key] = value
-                new_data_list.append(new_data)
+                fund_name = data.get("fund name", "").strip()
+                if len(fund_name) == 0:
+                    continue
+                raw_share_name = data.get("share name", "")
+                if not exist_multi_over_3_share:
+                    multi_over_3_share_search = re.search(multi_over_3_share_regex, raw_share_name)
+                    if multi_over_3_share_search is not None:
+                        exist_multi_over_3_share = True
+                if exist_multi_over_3_share:
+                    share_name_list = self.split_multi_share_name(raw_share_name)
+                else:
+                    share_name_list = [raw_share_name]
+                if len(share_name_list) > 0:
+                    for share_name in share_name_list:
+                        new_data = {}
+                        new_data["fund_name"] = fund_name
+                        if share_name != "":
+                            new_data["share_name"] = share_name
+                        ter = data.get("ter", None)
+                        if ter is not None:
+                            new_data["ter"] = ter
+                        performance_fee = data.get("performance fees", None)
+                        if performance_fee is not None:
+                            new_data["performance_fee"] = performance_fee
+                        for key, value in data.items():
+                            if key not in ["fund name", "share name", "ter", "performance fees"]:
+                                new_data[key] = value
+                        new_data_list.append(new_data)
             extract_data_info["data"] = new_data_list
         return extract_data_info
+
+    def split_multi_share_name(self, raw_share_name: str) -> list:
+        """
+        Some documents, e.g. 481482392, use a combined share-class table header
+        such as "Class A, B, E, M, N, P, R, U". In that case the share name must
+        be split into ["Class A", "Class B", "Class E", "Class M", "Class N",
+        "Class P", "Class R", "Class U"].
+        """
+        multi_over_2_share_regex = r"([A-Z]{1,}\,\s){2,}"
+        multi_over_2_share_search = re.search(multi_over_2_share_regex, raw_share_name)
+        # Default: no combined header detected, keep the raw share name as-is.
+        share_name_list = [raw_share_name]
+        if multi_over_2_share_search is not None:
+            multi_share_splits = [share_name.strip() for share_name in raw_share_name.split(",")
+                                  if len(share_name.strip()) > 0]
+            first_share_name = multi_share_splits[0]
+            first_share_name_split = first_share_name.split()
+            share_name_prefix = None
+            if len(first_share_name_split) == 2:
+                share_name_prefix = first_share_name_split[0]
+            if share_name_prefix is not None and len(share_name_prefix) > 0:
+                new_share_name_list = []
+                for split in multi_share_splits:
+                    if split == first_share_name:
+                        new_share_name_list.append(split)
+                    else:
+                        new_share_name_list.append(f"{share_name_prefix} {split}")
+                share_name_list = new_share_name_list
+            else:
+                share_name_list = multi_share_splits
+        return share_name_list
+
     def get_fund_name(self, fund_name: str, fund_feature: str):
         if not fund_name.endswith(fund_feature):
             return fund_name
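For reference, a standalone sketch of the splitting behaviour introduced above, re-implemented outside the class purely for illustration (the committed method is DataExtraction.split_multi_share_name):

import re

def split_multi_share_name(raw_share_name: str) -> list:
    """Illustrative re-implementation of the helper added in this commit."""
    # Two or more uppercase tokens followed by ", " indicate a combined header.
    if re.search(r"([A-Z]{1,}\,\s){2,}", raw_share_name) is None:
        return [raw_share_name]
    splits = [part.strip() for part in raw_share_name.split(",") if part.strip()]
    first = splits[0]
    first_words = first.split()
    # If the first entry looks like "<prefix> <class>", propagate the prefix.
    if len(first_words) == 2:
        prefix = first_words[0]
        return [first] + [f"{prefix} {s}" for s in splits[1:]]
    return splits

print(split_multi_share_name("Class A, B, E, M, N, P, R, U"))
# ['Class A', 'Class B', 'Class E', 'Class M', 'Class N', 'Class P', 'Class R', 'Class U']

The prefix of the first entry ("Class") is propagated to the bare class letters, reproducing the list given in the method's docstring.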

View File

@@ -1150,7 +1150,7 @@ def batch_run_documents():
         "534535767"
     ]
     special_doc_id_list = check_db_mapping_doc_id_list
-    # special_doc_id_list = ["407275419", "425595958", "451063582", "451878128"]
+    special_doc_id_list = ["481482392"]
     pdf_folder = r"/data/emea_ar/pdf/"
     page_filter_ground_truth_file = (
         r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
@@ -1159,9 +1159,9 @@ def batch_run_documents():
     output_extract_data_total_folder = r"/data/emea_ar/output/extract_data/total/"
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
-    re_run_extract_data = False
-    re_run_mapping_data = False
-    force_save_total_data = True
+    re_run_extract_data = True
+    re_run_mapping_data = True
+    force_save_total_data = False
     calculate_metrics = False
     extract_ways = ["text"]

View File

@@ -0,0 +1,69 @@
import os
import json
import pandas as pd
from glob import glob
from tqdm import tqdm
from utils.logger import logger
from utils.sql_query_util import query_document_fund_mapping
from core.page_filter import FilterPages
from core.data_extraction import DataExtraction


def test_validate_extraction_data():
    document_id = "481482392"
    pdf_file = f"/data/emea_ar/pdf/481482392.pdf"
    output_extract_data_child_folder = r"/data/emea_ar/output/extract_data/docs/"
    output_extract_data_total_folder = r"/data/emea_ar/output/extract_data/total/"
    document_mapping_info_df = query_document_fund_mapping(document_id, rerun=False)
    filter_pages = FilterPages(
        document_id, pdf_file, document_mapping_info_df
    )
    page_text_dict = filter_pages.page_text_dict
    datapoint_page_info, result_details = get_datapoint_page_info(filter_pages)
    datapoints = get_datapoints_from_datapoint_page_info(datapoint_page_info)
    data_extraction = DataExtraction(
        doc_id=document_id,
        pdf_file=pdf_file,
        output_data_folder=output_extract_data_child_folder,
        page_text_dict=page_text_dict,
        datapoint_page_info=datapoint_page_info,
        datapoints=datapoints,
        document_mapping_info_df=document_mapping_info_df,
        extract_way="text",
        output_image_folder=None
    )
    output_data_json_folder = os.path.join(
        r"/data/emea_ar/output/extract_data/docs/by_text/", "json/"
    )
    os.makedirs(output_data_json_folder, exist_ok=True)
    json_file = os.path.join(output_data_json_folder, f"{document_id}.json")
    data_from_gpt = None
    if os.path.exists(json_file):
        logger.info(
            f"The document: {document_id} has been parsed, loading data from {json_file}"
        )
        with open(json_file, "r", encoding="utf-8") as f:
            data_from_gpt = json.load(f)
    for extract_data in data_from_gpt:
        page_index = extract_data["page_index"]
        if page_index == 451:
            logger.info(f"Page index: {page_index}")
            raw_answer = extract_data["raw_answer"]
            raw_answer_json = json.loads(raw_answer)
            extract_data_info = data_extraction.validate_data(raw_answer_json)
            print(extract_data_info)


def get_datapoint_page_info(filter_pages) -> tuple:
    datapoint_page_info, result_details = filter_pages.start_job()
    return datapoint_page_info, result_details


def get_datapoints_from_datapoint_page_info(datapoint_page_info) -> list:
    datapoints = list(datapoint_page_info.keys())
    if "doc_id" in datapoints:
        datapoints.remove("doc_id")
    return datapoints


if __name__ == "__main__":
    test_validate_extraction_data()