import os
import json
import re

import fitz
import json_repair
import pandas as pd

from utils.gpt_utils import chat
from utils.pdf_util import PDFUtil
from utils.sql_query_util import query_document_fund_mapping
from utils.logger import logger
from utils.biz_utils import add_slash_to_text_as_regex, clean_text

class DataExtraction:
    def __init__(
        self,
        doc_id: str,
        pdf_file: str,
        output_data_folder: str,
        page_text_dict: dict,
        datapoint_page_info: dict,
        datapoints: list,
        document_mapping_info_df: pd.DataFrame,
    ) -> None:
        self.doc_id = doc_id
        self.pdf_file = pdf_file

        if output_data_folder is None or len(output_data_folder) == 0:
            output_data_folder = r"/data/emea_ar/output/extract_data/docs/"
        os.makedirs(output_data_folder, exist_ok=True)

        self.output_data_json_folder = os.path.join(output_data_folder, "json/")
        os.makedirs(self.output_data_json_folder, exist_ok=True)

        self.output_data_excel_folder = os.path.join(output_data_folder, "excel/")
        os.makedirs(self.output_data_excel_folder, exist_ok=True)

        if page_text_dict is None or len(page_text_dict.keys()) == 0:
            self.page_text_dict = self.get_pdf_page_text_dict()
        else:
            self.page_text_dict = page_text_dict

        if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
            self.document_mapping_info_df = query_document_fund_mapping(doc_id)
        else:
            self.document_mapping_info_df = document_mapping_info_df

        self.datapoint_page_info = datapoint_page_info
        self.page_nums_with_datapoints = self.get_page_nums_from_datapoint_page_info()
        self.datapoints = datapoints
        self.instructions_config = self.get_instructions_config()
        self.datapoint_level_config = self.get_datapoint_level()
        self.datapoint_name_config = self.get_datapoint_name()
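    # Illustrative input shapes (a sketch; the exact keys and values below are
    # assumptions drawn from how the attributes are used in this class, not from a
    # real document):
    #   page_text_dict       -> {0: "text of page 0", 1: "text of page 1", ...}
    #   datapoint_page_info  -> {"doc_id": "<doc id>", "ter": [12, 13], "ogc": [12]}
    #                           (page lists per datapoint; the "doc_id" entry is skipped)
    #   datapoints           -> ["ter", "ogc"]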
    def get_instructions_config(self) -> dict:
        instructions_config_file = r"./instructions/data_extraction_prompts_config.json"
        with open(instructions_config_file, "r", encoding="utf-8") as f:
            instructions_config = json.load(f)
        return instructions_config
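    # A minimal sketch of the prompt-config layout this class expects. The section
    # names come from get_instructions_by_datapoints below; the example values are
    # placeholders, not the real prompt text.
    # {
    #     "summary": "Extract {} from the context ...",
    #     "reported_name": {"ter": "TER may be reported as ..."},
    #     "data_business_features": {
    #         "common": ["..."],
    #         "investment_level": {"ter": "..."},
    #         "data_value_range": {"ter": "..."},
    #         "special_rule": {"ter": ["..."]}
    #     },
    #     "special_cases": {"common": [{"title": "...", "contents": ["..."]}], "ter": []},
    #     "output_requirement": {"common": ["..."], "fund_level": [], "share_level": {}},
    #     "end": ["..."]
    # }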
    def get_datapoint_level(self) -> dict:
        datapoint_level_file = r"./configuration/datapoint_level.json"
        with open(datapoint_level_file, "r", encoding="utf-8") as f:
            datapoint_level = json.load(f)
        return datapoint_level
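    # Hypothetical datapoint_level.json content: each datapoint maps to either
    # "fund_level" or "share_level" (the only two values checked in this class).
    # The datapoint keys below are illustrative assumptions:
    # {"ter": "share_level", "ogc": "share_level", "performance fees": "share_level"}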
    def get_datapoint_name(self) -> dict:
        datapoint_name_file = r"./configuration/datapoint_name.json"
        with open(datapoint_name_file, "r", encoding="utf-8") as f:
            datapoint_name = json.load(f)
        return datapoint_name
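    # Hypothetical datapoint_name.json content: maps a datapoint id to the display
    # name injected into the prompt summary, e.g.
    # {"ter": "Total Expense Ratio (TER)", "ogc": "Ongoing Charges (OGC)"}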
    def get_pdf_page_text_dict(self) -> dict:
        pdf_util = PDFUtil(self.pdf_file)
        success, text, page_text_dict = pdf_util.extract_text()
        return page_text_dict
    def get_page_nums_from_datapoint_page_info(self) -> list:
        page_nums_with_datapoints = []
        for datapoint, page_nums in self.datapoint_page_info.items():
            if datapoint == "doc_id":
                continue
            page_nums_with_datapoints.extend(page_nums)
        page_nums_with_datapoints = list(set(page_nums_with_datapoints))
        # sort the page numbers
        page_nums_with_datapoints.sort()
        return page_nums_with_datapoints
    def extract_data(self) -> list:
        """
        keys are
        doc_id, page_index, datapoint, value, raw_fund_name, fund_id, fund_name, raw_share_name, share_id, share_name
        """
        data_list = []
        pdf_page_count = len(self.page_text_dict.keys())
        handled_page_num_list = []
        for page_num, page_text in self.page_text_dict.items():
            if page_num in handled_page_num_list:
                continue
            page_datapoints = self.get_datapoints_by_page_num(page_num)
            if len(page_datapoints) == 0:
                continue
            extract_data = self.extract_data_by_page(
                page_num,
                page_text,
                page_datapoints,
                need_exclude=False,
                exclude_data=None,
            )
            data_list.append(extract_data)

            page_data_list = extract_data.get("extract_data", {}).get("data", [])

            current_page_data_count = len(page_data_list)
            if current_page_data_count > 0:
                count = 1
                # some pdf documents spread the same table over multiple pages,
                # and the following page may lack a table header with datapoint keywords.
                # The purpose here is to also try to get data from the next page(s).
                current_text = page_text

                while count < 3:
                    try:
                        next_page_num = page_num + count
                        if next_page_num >= pdf_page_count:
                            break
                        next_datapoints = page_datapoints
                        if next_page_num in self.page_nums_with_datapoints:
                            should_continue = False
                            next_datapoints = self.get_datapoints_by_page_num(next_page_num)
                            if len(next_datapoints) == 0:
                                should_continue = True
                            else:
                                for next_datapoint in next_datapoints:
                                    if next_datapoint not in page_datapoints:
                                        should_continue = True
                                        break
                                next_datapoints.extend(page_datapoints)
                                # remove duplicate datapoints
                                next_datapoints = list(set(next_datapoints))
                            if not should_continue:
                                break
                        next_page_text = self.page_text_dict.get(next_page_num, "")
                        target_text = current_text + next_page_text
                        # try to get data by the current page_datapoints
                        next_page_extract_data = self.extract_data_by_page(
                            next_page_num,
                            target_text,
                            next_datapoints,
                            need_exclude=True,
                            exclude_data=page_data_list,
                        )
                        next_page_data_list = next_page_extract_data.get(
                            "extract_data", {}
                        ).get("data", [])

                        if next_page_data_list is not None and len(next_page_data_list) > 0:
                            for current_page_data in page_data_list:
                                if current_page_data in next_page_data_list:
                                    next_page_data_list.remove(current_page_data)
                            next_page_extract_data["extract_data"]["data"] = next_page_data_list
                            data_list.append(next_page_extract_data)
                            handled_page_num_list.append(next_page_num)
                            exist_current_page_datapoint = False
                            for next_page_data in next_page_data_list:
                                for page_datapoint in page_datapoints:
                                    if page_datapoint in list(next_page_data.keys()):
                                        exist_current_page_datapoint = True
                                        break
                                if exist_current_page_datapoint:
                                    break
                            if not exist_current_page_datapoint:
                                break
                        else:
                            break
                        count += 1
                    except Exception as e:
                        logger.error(f"Error in extracting data from next page: {e}")
                        break

        json_data_file = os.path.join(
            self.output_data_json_folder, f"{self.doc_id}.json"
        )
        with open(json_data_file, "w", encoding="utf-8") as f:
            json.dump(data_list, f, ensure_ascii=False, indent=4)

        data_df = pd.DataFrame(data_list)
        data_df.reset_index(drop=True, inplace=True)
        excel_data_file = os.path.join(
            self.output_data_excel_folder, f"{self.doc_id}.xlsx"
        )
        with pd.ExcelWriter(excel_data_file) as writer:
            data_df.to_excel(writer, sheet_name="extract_data", index=False)

        return data_list
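    # A sketch of one element of the returned data_list (field names mirror the
    # data_dict built in extract_data_by_page; the values are placeholders):
    # {
    #     "doc_id": "123456789",
    #     "page_index": 12,
    #     "datapoints": "ter, ogc",
    #     "page_text": "...",
    #     "instructions": "Context:\n...",
    #     "raw_answer": "{\"data\": [...]}",
    #     "extract_data": {"data": [{"fund_name": "...", "share_name": "...", "ter": "0.85%"}]}
    # }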
    def extract_data_by_page(
        self,
        page_num: int,
        page_text: str,
        page_datapoints: list,
        need_exclude: bool = False,
        exclude_data: list = None,
    ) -> dict:
        """
        keys are
        doc_id, page_index, datapoint, value, raw_fund_name, fund_id, fund_name, raw_share_name, share_id, share_name
        """
        logger.info(f"Extracting data from page {page_num}")
        instructions = self.get_instructions_by_datapoints(
            page_text, page_datapoints, need_exclude, exclude_data
        )
        response, with_error = chat(
            instructions, response_format={"type": "json_object"}
        )
        if with_error:
            logger.error(f"Error in extracting data from page {page_num}")
            data_dict = {"doc_id": self.doc_id}
            data_dict["page_index"] = page_num
            data_dict["datapoints"] = ", ".join(page_datapoints)
            data_dict["page_text"] = page_text
            data_dict["instructions"] = instructions
            data_dict["raw_answer"] = response
            data_dict["extract_data"] = {"data": []}
            return data_dict

        try:
            data = json.loads(response)
        except:
            try:
                # if an error occurs, the output length is probably over 4K tokens:
                # split the context into two parts and try to get data from each part
                data = self.chat_by_split_context(
                    page_text, page_datapoints, need_exclude, exclude_data
                )
                if len(data.get("data", [])) == 0:
                    data = json_repair.loads(response)
            except:
                data = {"data": []}

        data = self.validate_data(data)

        data_dict = {"doc_id": self.doc_id}
        data_dict["page_index"] = page_num
        data_dict["datapoints"] = ", ".join(page_datapoints)
        data_dict["page_text"] = page_text
        data_dict["instructions"] = instructions
        data_dict["raw_answer"] = response
        data_dict["extract_data"] = data
        return data_dict
    def chat_by_split_context(self,
                              page_text: str,
                              page_datapoints: list,
                              need_exclude: bool,
                              exclude_data: list) -> dict:
        """
        If an error occurs, split the context into two parts and try to get data from each part.
        Relevant document: 503194284
        """
        try:
            logger.info("Split context to get data, to fix the issue where the output length is over 4K tokens")
            split_context = re.split(r"\n", page_text)
            split_context = [text.strip() for text in split_context
                             if len(text.strip()) > 0]
            split_context_len = len(split_context)
            top_10_context = split_context[:10]
            rest_context = split_context[10:]
            header = "\n".join(top_10_context)
            half_len = split_context_len // 2
            # the line at the split point should not start with a number:
            # iterate the first half_len lines of rest_context in reverse
            half_len_list = [i for i in range(half_len)]
            for index in reversed(half_len_list):
                first_letter = rest_context[index].strip()[0]
                if not first_letter.isnumeric() and first_letter not in [".", "(", ")", "-"]:
                    half_len = index
                    break

            logger.info(f"Split first part from 0 to {half_len}")
            first_part = "\n".join(split_context[:half_len])
            first_instructions = self.get_instructions_by_datapoints(
                first_part, page_datapoints, need_exclude, exclude_data
            )
            response, with_error = chat(
                first_instructions, response_format={"type": "json_object"}
            )
            first_part_data = {"data": []}
            if not with_error:
                try:
                    first_part_data = json.loads(response)
                except:
                    first_part_data = json_repair.loads(response)

            logger.info(f"Split second part from {half_len} to {split_context_len}")
            split_second_part = "\n".join(split_context[half_len:])
            second_part = header + '\n' + split_second_part
            second_instructions = self.get_instructions_by_datapoints(
                second_part, page_datapoints, need_exclude, exclude_data
            )
            response, with_error = chat(
                second_instructions, response_format={"type": "json_object"}
            )
            second_part_data = {"data": []}
            if not with_error:
                try:
                    second_part_data = json.loads(response)
                except:
                    second_part_data = json_repair.loads(response)

            first_part_data_list = first_part_data.get("data", [])
            logger.info(f"First part data count: {len(first_part_data_list)}")
            second_part_data_list = second_part_data.get("data", [])
            logger.info(f"Second part data count: {len(second_part_data_list)}")
            for first_data in first_part_data_list:
                if first_data in second_part_data_list:
                    second_part_data_list.remove(first_data)

            data_list = first_part_data_list + second_part_data_list
            extract_data = {"data": data_list}
            return extract_data
        except Exception as e:
            logger.error(f"Error in split context: {e}")
            return {"data": []}
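    # Illustrative split (a sketch with made-up rows): for a long fee table such as
    #
    #   Fund            Share Class   TER      <- leading lines kept as a shared header
    #   Fund A          Class I       0.85%
    #   ...                                    <- first half sent as prompt 1
    #   Fund M          Class R       1.10%
    #   ...                                    <- second half sent as prompt 2, prefixed
    #                                             with the header lines again
    #
    # the split point is nudged backwards until it lands on a line that does not
    # start with a digit, so a value row is not separated from its labels.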
    def validate_data(self, extract_data_info: dict) -> dict:
        """
        Validate data by the rules:
        1. Each record must have a fund name.
        2. A share-level record must also have a share name.
        """
        data_list = extract_data_info.get("data", [])
        if len(data_list) == 0:
            return extract_data_info
        remove_list = []
        for data in data_list:
            if data.get("fund name", "") == "":
                remove_list.append(data)
            keys = list(data.keys())
            for key in keys:
                if self.datapoint_level_config.get(key, "") == "share_level":
                    if data.get("share name", "") == "":
                        remove_list.append(data)
                        break
        for remove_data in remove_list:
            if remove_data in data_list:
                data_list.remove(remove_data)

        # rename "fund name" to "fund_name"
        # rename "share name" to "share_name"
        new_data_list = []
        for data in data_list:
            new_data = {}
            fund_name = data.get("fund name", "")
            if fund_name != "":
                new_data["fund_name"] = fund_name
            share_name = data.get("share name", "")
            if share_name != "":
                new_data["share_name"] = share_name
            ter = data.get("ter", None)
            if ter is not None:
                new_data["ter"] = ter
            performance_fee = data.get("performance fees", None)
            if performance_fee is not None:
                new_data["performance_fee"] = performance_fee

            for key, value in data.items():
                if key not in ["fund name", "share name", "ter", "performance fees"]:
                    new_data[key] = value
            new_data_list.append(new_data)

        extract_data_info["data"] = new_data_list
        return extract_data_info
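    # Example of the key normalisation performed above (values are placeholders):
    #   in:  {"fund name": "Fund A", "share name": "Class I", "ter": "0.85%", "performance fees": "0.10%"}
    #   out: {"fund_name": "Fund A", "share_name": "Class I", "ter": "0.85%", "performance_fee": "0.10%"}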
    def get_datapoints_by_page_num(self, page_num: int) -> list:
        datapoints = []
        for datapoint in self.datapoints:
            if page_num in self.datapoint_page_info[datapoint]:
                datapoints.append(datapoint)
        return datapoints
    def get_instructions_by_datapoints(
        self,
        page_text: str,
        datapoints: list,
        need_exclude: bool = False,
        exclude_data: list = None,
    ) -> str:
        """
        Get instructions to extract data from the page by the datapoints.

        Below are the instruction sections:
        summary: string
        reported_name by datapoints: dict
        data_business_features: dict
            common: list
            investment_level by datapoints: dict
            data_value_range by datapoints: dict
            special_rule by datapoints: dict
        special_cases: dict
            common: list
                title
                contents
            special_case by datapoints: list
                title
                contents
        output_requirement
            common: list
            fund_level: list
            share_level: dict
                fund_name: list
                share_name: list
                ogc_value: list
                ter_value: list
                performance_fee_value: list
        end
        """
        instructions = [f"Context:\n{page_text}\n\nInstructions:\n"]
        datapoint_name_list = []
        for datapoint in datapoints:
            datapoint_name = self.datapoint_name_config.get(datapoint, "")
            datapoint_name_list.append(datapoint_name)

        summary = self.instructions_config.get("summary", "\n")
        instructions.append(summary.format(", ".join(datapoint_name_list)))
        instructions.append("\n")

        instructions.append("Datapoints Reported name:\n")
        reported_name_info = self.instructions_config.get("reported_name", {})
        for datapoint in datapoints:
            reported_name = reported_name_info.get(datapoint, "")
            instructions.append(reported_name)
            instructions.append("\n")
        instructions.append("\n")

        instructions.append("Data business features:\n")
        data_business_features = self.instructions_config.get(
            "data_business_features", {}
        )
        common = "\n".join(data_business_features.get("common", []))
        instructions.append(common)
        instructions.append("\n")

        instructions.append("Datapoints investment level:\n")
        investment_level_info = data_business_features.get("investment_level", {})
        for datapoint in datapoints:
            investment_level = investment_level_info.get(datapoint, "")
            instructions.append(investment_level)
            instructions.append("\n")
        instructions.append("\n")

        instructions.append("Datapoints value range:\n")
        data_value_range_info = data_business_features.get("data_value_range", {})
        for datapoint in datapoints:
            data_value_range = data_value_range_info.get(datapoint, "")
            instructions.append(data_value_range)
            instructions.append("\n")
        instructions.append("\n")

        special_rule_info = data_business_features.get("special_rule", {})
        with_special_rule_title = False
        for datapoint in datapoints:
            special_rule_list = special_rule_info.get(datapoint, [])
            if len(special_rule_list) > 0:
                if not with_special_rule_title:
                    instructions.append("Special rule:\n")
                    with_special_rule_title = True
                special_rule = "\n".join(special_rule_list)
                instructions.append(special_rule)
                instructions.append("\n\n")
        instructions.append("\n")

        instructions.append("Special cases:\n")
        special_cases = self.instructions_config.get("special_cases", {})
        special_cases_common_list = special_cases.get("common", [])
        for special_cases_common in special_cases_common_list:
            title = special_cases_common.get("title", "")
            instructions.append(title)
            instructions.append("\n")
            contents_list = special_cases_common.get("contents", [])
            contents = "\n".join(contents_list)
            instructions.append(contents)
            instructions.append("\n\n")

        for datapoint in datapoints:
            special_case_list = special_cases.get(datapoint, [])
            for special_case in special_case_list:
                title = special_case.get("title", "")
                instructions.append(title)
                instructions.append("\n")
                contents_list = special_case.get("contents", [])
                contents = "\n".join(contents_list)
                instructions.append(contents)
                instructions.append("\n\n")
        instructions.append("\n")

        instructions.append("Output requirement:\n")
        output_requirement = self.instructions_config.get("output_requirement", {})
        output_requirement_common_list = output_requirement.get("common", [])
        instructions.append("\n".join(output_requirement_common_list))
        instructions.append("\n")

        share_datapoint_value_example = {}
        share_level_config = output_requirement.get("share_level", {})

        example_list = []
        for datapoint in datapoints:
            investment_level = self.datapoint_level_config.get(datapoint, "")
            if investment_level == "fund_level":
                fund_level_example_list = output_requirement.get("fund_level", [])
                for example in fund_level_example_list:
                    try:
                        sub_example_list = json.loads(example)
                    except:
                        sub_example_list = json_repair.loads(example)
                    example_list.extend(sub_example_list)
            elif investment_level == "share_level":
                share_datapoint_value_example[datapoint] = share_level_config.get(
                    f"{datapoint}_value", []
                )

        share_datapoint_list = list(share_datapoint_value_example.keys())
        instructions.append("Example:\n")
        if len(share_datapoint_list) > 0:
            fund_name_example_list = share_level_config.get("fund_name", [])
            share_name_example_list = share_level_config.get("share_name", [])

            for index in range(len(fund_name_example_list)):
                example_dict = {
                    "fund name": fund_name_example_list[index],
                    "share name": share_name_example_list[index],
                }
                for share_datapoint in share_datapoint_list:
                    share_datapoint_values = share_datapoint_value_example[
                        share_datapoint
                    ]
                    if index < len(share_datapoint_values):
                        example_dict[share_datapoint] = share_datapoint_values[index]
                example_list.append(example_dict)
        example_data = {"data": example_list}
        instructions.append(json.dumps(example_data, ensure_ascii=False, indent=4))
        instructions.append("\n")
        instructions.append("\n")

        end_list = self.instructions_config.get("end", [])
        instructions.append("\n".join(end_list))
        instructions.append("\n")

        if need_exclude and exclude_data is not None and isinstance(exclude_data, list):
            instructions.append("Please exclude below data from output:\n")
            instructions.append(json.dumps(exclude_data, ensure_ascii=False, indent=4))
            instructions.append("\n")
            instructions.append("\n")
        instructions.append("Answer:\n")

        instructions_text = "".join(instructions)
        return instructions_text
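
# A minimal usage sketch (an illustration, not part of the production pipeline).
# The document id, PDF path, page text, and datapoint hints below are made-up
# assumptions; extract_data() will call the GPT chat endpoint and, because the
# mapping DataFrame is empty, query the document/fund mapping for the doc_id.
if __name__ == "__main__":
    sample_page_text_dict = {
        0: "Fund A\nClass I\nTotal Expense Ratio 0.85%",  # hypothetical page text
    }
    sample_datapoint_page_info = {"doc_id": "123456789", "ter": [0]}  # hypothetical hints
    extractor = DataExtraction(
        doc_id="123456789",                             # hypothetical document id
        pdf_file="/data/emea_ar/input/123456789.pdf",   # hypothetical path
        output_data_folder="",                          # empty -> default output folder is used
        page_text_dict=sample_page_text_dict,
        datapoint_page_info=sample_datapoint_page_info,
        datapoints=["ter"],
        document_mapping_info_df=pd.DataFrame(),        # empty -> mapping queried by doc_id
    )
    results = extractor.extract_data()
    logger.info(f"Extracted {len(results)} page-level records for doc {extractor.doc_id}")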