1. Support fetching data from garbled-text ("messy code") pages via the ChatGPT-4o Vision function.
2. Add multilingual share-name feature configuration.
This commit is contained in:
parent d96f77fe00
commit 75ea5e70de
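For orientation, below is a minimal sketch (not the repository's exact code) of the fallback flow these hunks add: when a page's extracted text layer is garbled ("messy code"), the page image is sent to the ChatGPT-4o Vision endpoint to recover plain text, and that text is pushed through the existing text-extraction prompt, tagged with extract_way="image". The methods get_image_text and extract_data_by_page_text (with its new original_way parameter) appear in the diff; the wrapper name extract_page and the predicate page_looks_messy are hypothetical.

# Hedged sketch only; relies on the DataExtraction methods shown in the diff below.
def extract_page(self, page_num: int, page_text: str, page_datapoints: list) -> dict:
    if page_looks_messy(page_text):                 # hypothetical predicate (see the heuristic hunk)
        page_text = self.get_image_text(page_num)   # GPT-4o Vision OCR added in this commit
        if not page_text:
            # Nothing recoverable from the image: return an empty, image-tagged result.
            return {"doc_id": self.doc_id, "page_index": page_num,
                    "extract_data": {"data": []}, "extract_way": "image"}
        return self.extract_data_by_page_text(
            page_num=page_num, page_text=page_text,
            page_datapoints=page_datapoints, original_way="image")
    return self.extract_data_by_page_text(
        page_num=page_num, page_text=page_text,
        page_datapoints=page_datapoints, original_way="text")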
@@ -58,19 +58,26 @@ def us_ar_data_extract():
     re_run_extract_data = False
     re_run_mapping_data = False
 
-    emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
-                                      pdf_folder=pdf_folder,
-                                      output_extract_data_folder=output_extract_data_folder,
-                                      output_mapping_data_folder=output_mapping_data_folder,
-                                      extract_way=extract_way,
-                                      drilldown_folder=drilldown_folder)
-    doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
-    doc_mapping_data = emea_ar_parsing.mapping_data(
-        data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
-    )
-    results = {"extract_data": doc_mapping_data,
-               "annotation_data": annotation_list}
-    return jsonify(results)
+    try:
+        emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
+                                          pdf_folder=pdf_folder,
+                                          output_extract_data_folder=output_extract_data_folder,
+                                          output_mapping_data_folder=output_mapping_data_folder,
+                                          extract_way=extract_way,
+                                          drilldown_folder=drilldown_folder)
+        doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
+        doc_mapping_data = emea_ar_parsing.mapping_data(
+            data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
+        )
+        results = {"extract_data": doc_mapping_data,
+                   "annotation_data": annotation_list}
+        return jsonify(results)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        results = {"extract_data": [],
+                   "annotation_data": [],
+                   "error": str(e)}
+        return jsonify(results)
 
 
 if __name__ == '__main__':
@@ -171,6 +171,8 @@ class DataExtraction:
         previous_page_datapoints = []
         previous_page_fund_name = None
         for page_num, page_text in self.page_text_dict.items():
+            if page_num > 160:
+                break
             if page_num in handled_page_num_list:
                 continue
             page_datapoints = self.get_datapoints_by_page_num(page_num)
@@ -184,7 +186,9 @@ class DataExtraction:
                 # example document: 431073795, page index 1727 to 1728
                 logger.info(f"Transfer previous page fund name: {previous_page_fund_name} to be the pre-fix of page text")
                 page_text = f"\nThe last fund name of previous PDF page: {previous_page_fund_name}\n{page_text}"
+            else:
+                previous_page_fund_name = None
 
             extract_data = self.extract_data_by_page(
                 page_num,
                 page_text,
@@ -201,16 +205,19 @@ class DataExtraction:
                 previous_page_num = page_num
                 previous_page_fund_name = page_data_list[-1].get("fund_name", "")
                 previous_page_datapoints = page_datapoints
+            extract_way = extract_data.get("extract_way", "text")
             current_page_data_count = len(page_data_list)
             if current_page_data_count > 0:
                 count = 1
                 # some pdf documents have multiple pages for the same data
                 # and the next page may without table header with data point keywords.
                 # the purpose is try to get data from the next page
-                current_text = page_text
+                if extract_way == "text":
+                    current_text = page_text
+                else:
+                    current_text = ""
 
-                while count < 3:
+                while True:
                     try:
                         next_page_num = page_num + count
                         if next_page_num >= pdf_page_count:
@@ -231,9 +238,13 @@ class DataExtraction:
                         next_datapoints = list(set(next_datapoints))
                         if not should_continue:
                             break
-                        next_page_text = self.page_text_dict.get(next_page_num, "")
-                        target_text = current_text + next_page_text
+                        if extract_way == "text":
+                            next_page_text = self.page_text_dict.get(next_page_num, "")
+                            target_text = current_text + next_page_text
+                        else:
+                            target_text = ""
                         # try to get data by current page_datapoints
+                        logger.info(f"Try to get data from next page {next_page_num}")
                         next_page_extract_data = self.extract_data_by_page(
                             next_page_num,
                             target_text,
@@ -245,7 +256,8 @@ class DataExtraction:
                         next_page_data_list = next_page_extract_data.get(
                             "extract_data", {}
                         ).get("data", [])
+                        # print(f"next_page_data_list: {next_page_num}")
+                        # print(next_page_data_list)
                         if next_page_data_list is not None and len(next_page_data_list) > 0:
                             for current_page_data in page_data_list:
                                 if current_page_data in next_page_data_list:
@@ -254,6 +266,8 @@ class DataExtraction:
                                         "data"
                                     ] = next_page_data_list
                                     data_list.append(next_page_extract_data)
+                                    previous_page_num = next_page_num
+                                    previous_page_fund_name = next_page_data_list[-1].get("fund_name", "")
                                     handled_page_num_list.append(next_page_num)
                                     exist_current_page_datapoint = False
                                     for next_page_data in next_page_data_list:
@@ -379,11 +393,19 @@ class DataExtraction:
                              previous_page_last_fund: str = None) -> dict:
         # If can't find numberic value, e.g. 1.25 or 3,88
         # apply Vision ChatGPT to extract data
+        special_code_regex = r"\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x09|\x0a|\x0b|\x0c|\x0d|\x0e|\x0f|\x10|\x11|\x12|\x13|\x14|\x15|\x16|\x17|\x18|\x19|\x1a|\x1b|\x1c|\x1d|\x1e|\x1f"
+        special_code_all = [code for code in re.findall(special_code_regex, page_text)
+                            if code != "\n"]
+        page_text_line_count = len(page_text.split("\n"))
         numeric_regex = r"\d+(\.|\,)\d+"
-        if not re.search(numeric_regex, page_text):
+        if not re.search(numeric_regex, page_text) or page_text_line_count < 3 or len(special_code_all) > 100:
             logger.info(f"Can't find numberic value in page {page_num}, apply Vision ChatGPT to extract data")
             return self.extract_data_by_page_image(
-                page_num, page_datapoints, need_exclude, exclude_data)
+                page_num=page_num,
+                page_datapoints=page_datapoints,
+                need_exclude=False,
+                exclude_data=None,
+                previous_page_last_fund=previous_page_last_fund)
         else:
             return self.extract_data_by_page_text(
                 page_num=page_num,
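The widened trigger condition above can be read as a standalone predicate. A rough sketch, assuming it receives the raw extracted page text; the thresholds of 3 lines and 100 control characters are the ones introduced in this hunk, and the character class below is a compact equivalent of the long alternation in the diff.

import re

def should_use_vision(page_text: str) -> bool:
    # Fall back to the Vision path when the text layer has no decimal numbers,
    # has fewer than 3 lines, or is dominated by control characters
    # (the typical signature of a "messy code" PDF page).
    special_code_all = [c for c in re.findall(r"[\x00-\x1f]", page_text) if c != "\n"]
    page_text_line_count = len(page_text.split("\n"))
    has_numeric = re.search(r"\d+(\.|\,)\d+", page_text) is not None
    return (not has_numeric) or page_text_line_count < 3 or len(special_code_all) > 100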
@@ -401,7 +423,8 @@ class DataExtraction:
                                   page_datapoints: list,
                                   need_exclude: bool = False,
                                   exclude_data: list = None,
-                                  previous_page_last_fund: str = None
+                                  previous_page_last_fund: str = None,
+                                  original_way: str = "text"
                                   ) -> dict:
         """
         keys are
@@ -427,7 +450,7 @@ class DataExtraction:
             data_dict["instructions"] = instructions
             data_dict["raw_answer"] = response
             data_dict["extract_data"] = {"data": []}
-            data_dict["extract_way"] = "text"
+            data_dict["extract_way"] = original_way
             return data_dict
         try:
             data = json.loads(response)
@@ -444,7 +467,9 @@ class DataExtraction:
                 data = json_repair.loads(response)
             except:
                 data = {"data": []}
-        data = self.validate_data(data, previous_page_last_fund)
+        data = self.validate_data(extract_data_info=data,
+                                  page_text=page_text,
+                                  previous_page_last_fund=previous_page_last_fund)
 
         data_dict = {"doc_id": self.doc_id}
         data_dict["page_index"] = page_num
@@ -453,7 +478,7 @@ class DataExtraction:
         data_dict["instructions"] = instructions
         data_dict["raw_answer"] = response
         data_dict["extract_data"] = data
-        data_dict["extract_way"] = "text"
+        data_dict["extract_way"] = original_way
         return data_dict
 
     def extract_data_by_page_image(
@@ -462,54 +487,63 @@ class DataExtraction:
             page_datapoints: list,
             need_exclude: bool = False,
             exclude_data: list = None,
+            previous_page_last_fund: str = None
     ) -> dict:
         """
         keys are
         doc_id, page_index, datapoint, value, raw_fund_name, fund_id, fund_name, raw_share_name, share_id, share_name
         """
         logger.info(f"Extracting data from page {page_num}")
-        image_base64 = self.get_pdf_image_base64(page_num)
-        instructions = self.get_instructions_by_datapoints(
-            "",
-            page_datapoints,
-            need_exclude=need_exclude,
-            exclude_data=exclude_data,
-            extract_way="image"
-        )
-        response, with_error = chat(
-            instructions, response_format={"type": "json_object"}, image_base64=image_base64
-        )
-        if with_error:
-            logger.error(f"Error in extracting tables from page")
+        # image_base64 = self.get_pdf_image_base64(page_num)
+        page_text = self.get_image_text(page_num)
+        if page_text is None or len(page_text) == 0:
             data_dict = {"doc_id": self.doc_id}
             data_dict["page_index"] = page_num
             data_dict["datapoints"] = ", ".join(page_datapoints)
             data_dict["page_text"] = ""
-            data_dict["instructions"] = instructions
-            data_dict["raw_answer"] = response
+            data_dict["instructions"] = ""
+            data_dict["raw_answer"] = ""
             data_dict["extract_data"] = {"data": []}
             data_dict["extract_way"] = "image"
             return data_dict
+        else:
+            if previous_page_last_fund is not None and len(previous_page_last_fund) > 0:
+                logger.info(f"Transfer previous page fund name: {previous_page_last_fund} to be the pre-fix of page text")
+                page_text = f"\nThe last fund name of previous PDF page: {previous_page_last_fund}\n{page_text}"
+            return self.extract_data_by_page_text(
+                page_num=page_num,
+                page_text=page_text,
+                page_datapoints=page_datapoints,
+                need_exclude=need_exclude,
+                exclude_data=exclude_data,
+                previous_page_last_fund=previous_page_last_fund,
+                original_way="image"
+            )
+
+    def get_image_text(self, page_num: int) -> str:
+        image_base64 = self.get_pdf_image_base64(page_num)
+        instructions = self.instructions_config.get("get_image_text", "\n")
+        logger.info(f"Get text from image of page {page_num}")
+        response, with_error = chat(
+            instructions, response_format={"type": "json_object"}, image_base64=image_base64
+        )
+        text = ""
+        if with_error:
+            logger.error(f"Can't get text from current image")
         try:
             data = json.loads(response)
         except:
             try:
                 data = json_repair.loads(response)
             except:
-                data = {"data": []}
-        data = self.validate_data(data, None)
-
-        data_dict = {"doc_id": self.doc_id}
-        data_dict["page_index"] = page_num
-        data_dict["datapoints"] = ", ".join(page_datapoints)
-        data_dict["page_text"] = ""
-        data_dict["instructions"] = instructions
-        data_dict["raw_answer"] = response
-        data_dict["extract_data"] = data
-        data_dict["extract_way"] = "image"
-        return data_dict
-
-    def validate_data(self, extract_data_info: dict, previous_page_last_fund: str=None) -> dict:
+                pass
+        text = data.get("text", "")
+        return text
+
+    def validate_data(self,
+                      extract_data_info: dict,
+                      page_text: str,
+                      previous_page_last_fund: str=None) -> dict:
 
         """
         Validate data by the rules
         1. Each data should be with fund name
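The new get_image_text helper expects the Vision call to return a JSON object of the form {"text": "..."} (the prompt is added to the instructions config further down). A small hedged sketch of the same parse-with-repair pattern, initialising the result so a doubly failed parse still yields an empty string:

import json
import json_repair

def parse_image_text_response(response: str) -> str:
    # Strict JSON first, then json_repair, then give up with an empty result.
    data = {"text": ""}
    try:
        data = json.loads(response)
    except Exception:
        try:
            data = json_repair.loads(response)
        except Exception:
            pass
    return data.get("text", "") if isinstance(data, dict) else ""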
@@ -519,11 +553,29 @@ class DataExtraction:
         if len(data_list) == 0:
             return extract_data_info
         remove_list = []
+        performance_fee_regex = r"Amount\s+of\s+the\s+performance\s+fees|Performance\s+Fees\s+amounts|Performance\s+fees\s+amounts|Commissioni\s+di\s+performance|Performance\s+Fee\s+"
+        nav_regex = r"based\s+on\s+(the\s+)?NAV|on\s+the\s+Share\s+Class\s+NAV|NAV\s+of\s+performance\s+fee|of\s+the\s+average\s+Net\s+Asset\s+Value|Attivi\s+in\s+gestione|Performance\s+Fee\s+of\s+NAV\s+in"
+        if page_text is not None and len(page_text) > 0:
+            performance_fee_search = re.search(performance_fee_regex, page_text)
+            nav_search = re.search(nav_regex, page_text)
+        else:
+            performance_fee_search = None
+            nav_search = None
         for data in data_list:
+            if (data.get("performance_fee", None) is not None and
+                    performance_fee_search is not None and
+                    nav_search is not None):
+                data.pop("performance_fee")
+                keys = [key for key in list(data.keys())
+                        if key not in ["fund name", "share name"]]
+                if len(keys) == 0:
+                    remove_list.append(data)
+                    continue
             fund_name = data.get("fund name", "").strip()
             if fund_name == "":
                 remove_list.append(data)
+                continue
 
             # Clean fund name start
             if previous_page_last_fund is not None and len(previous_page_last_fund) > 0:
                 previous_page_last_fund = previous_page_last_fund.strip()
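A hedged illustration of the filter added above: when the surrounding page text shows that the reported performance fee is really a rate applied to NAV, the value is dropped, and a row left with only fund/share names would then be discarded. The two regexes are copied from the hunk; the sample page text and data row are invented for the example.

import re

performance_fee_regex = r"Amount\s+of\s+the\s+performance\s+fees|Performance\s+Fees\s+amounts|Performance\s+fees\s+amounts|Commissioni\s+di\s+performance|Performance\s+Fee\s+"
nav_regex = r"based\s+on\s+(the\s+)?NAV|on\s+the\s+Share\s+Class\s+NAV|NAV\s+of\s+performance\s+fee|of\s+the\s+average\s+Net\s+Asset\s+Value|Attivi\s+in\s+gestione|Performance\s+Fee\s+of\s+NAV\s+in"

page_text = "Performance Fee of 20% based on the NAV at the end of the period"   # made-up sample
data = {"fund name": "Sample Fund", "share name": "A", "performance_fee": "20%"}  # made-up row

if (data.get("performance_fee", None) is not None
        and re.search(performance_fee_regex, page_text) is not None
        and re.search(nav_regex, page_text) is not None):
    data.pop("performance_fee")

print(data)  # {'fund name': 'Sample Fund', 'share name': 'A'} -> only names remain, so the row is removed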
@@ -534,10 +586,10 @@ class DataExtraction:
             fund_name = self.get_fund_name(fund_name, "Fund")
             fund_name = self.get_fund_name(fund_name, "Bond")
 
-            remove_list = ["Market Specific Equity Sub-Funds",
+            remove_prefix_list = ["Market Specific Equity Sub-Funds",
                            "International and Regional Equity Sub-Funds",
                            "Equity Sub-Funds"]
-            for remove_item in remove_list:
+            for remove_item in remove_prefix_list:
                 if fund_name.startswith(remove_item):
                     fund_name = fund_name.replace(remove_item, "").strip()
 
@@ -617,7 +669,7 @@ class DataExtraction:
 
         extract_data_info["data"] = new_data_list
         return extract_data_info
 
     def split_multi_share_name(self, raw_share_name: str) -> list:
         """
         Some document, e.g. 481482392
@@ -739,6 +791,8 @@ class DataExtraction:
             summary = self.instructions_config.get("summary", "\n")
         elif extract_way == "image":
             summary = self.instructions_config.get("summary_image", "\n")
+            if page_text is not None and len(page_text) > 0:
+                summary += f"\nThe last fund name of previous PDF page: {page_text}\n"
         else:
             summary = self.instructions_config.get("summary", "\n")
 
@@ -1,6 +1,7 @@
 {
     "summary": "Read the context carefully.\nMaybe exists {} data in the context.\n",
     "summary_image": "Read the image carefully.\nMaybe exists {} data in the image.\n",
+    "get_image_text": "Instructions: Please extract the text from the image. output the result as a JSON, the JSON format is like below example(s): {\"text\": \"Text from image\"} \n\nAnswer:\n",
     "image_features":
     [
         "1. Identify the text in the PDF page image.",
@@ -93,7 +94,8 @@
             "- With \"Ongoing Charges inkl. Performance-Fee in % **)\" and \"Ongoing Charges inkl. Performance-Fee in % (inkl. Zielfonds)\", pick up the values from \"Ongoing Charges inkl. Performance-Fee in % **)\"."
         ],
         "performance_fee": [
-            "The performance fees should not be the presence of the rates at which the performance fees are calculated."
+            "The performance fees should not be the presence of the rates at which the performance fees are calculated.",
+            "The reported of performance fees should not be \"% based on the NAV at the end of the accounting period\""
         ]
     }
 },
main.py (90 changed lines)
@@ -131,7 +131,11 @@ class EMEA_AR_Parsing:
             data_from_gpt = {"data": []}
 
         # Drilldown data to relevant PDF document
-        annotation_list = self.drilldown_pdf_document(data_from_gpt)
+        annotation_list = []
+        try:
+            annotation_list = self.drilldown_pdf_document(data_from_gpt)
+        except Exception as e:
+            logger.error(f"Error: {e}")
         return data_from_gpt, annotation_list
 
     def drilldown_pdf_document(self, data_from_gpt: list) -> list:
@@ -195,8 +199,12 @@ class EMEA_AR_Parsing:
         annotation_list_df = pd.DataFrame(annotation_list)
         # set drilldown_result_df column order as doc_id, pdf_file, page_index,
         # data_point, value, matching_val_area, normalized_bbox
-        annotation_list_df = annotation_list_df[["doc_id", "pdf_file", "page_index",
-                                                 "data_point", "value", "matching_val_area", "normalized_bbox"]]
+        try:
+            annotation_list_df = annotation_list_df[["doc_id", "pdf_file", "page_index",
+                                                     "data_point", "value", "matching_val_area",
+                                                     "normalized_bbox"]]
+        except Exception as e:
+            logger.error(f"Error: {e}")
         logger.info(f"Writing drilldown data to {drilldown_file}")
         try:
             with pd.ExcelWriter(drilldown_file) as writer:
@@ -353,6 +361,7 @@ def batch_start_job(
     re_run_mapping_data: bool = False,
     force_save_total_data: bool = False,
     calculate_metrics: bool = False,
+    total_data_prefix: str = None
 ):
     pdf_files = glob(pdf_folder + "*.pdf")
     doc_list = []
@@ -376,17 +385,21 @@ def batch_start_job(
     result_extract_data_list = []
     result_mapping_data_list = []
     for doc_id in tqdm(doc_list):
-        doc_data_from_gpt, annotation_list, doc_mapping_data_list = mapping_data(
-            doc_id=doc_id,
-            pdf_folder=pdf_folder,
-            output_extract_data_folder=output_extract_data_child_folder,
-            output_mapping_folder=output_mapping_child_folder,
-            extract_way=extract_way,
-            re_run_extract_data=re_run_extract_data,
-            re_run_mapping_data=re_run_mapping_data,
-        )
-        result_extract_data_list.extend(doc_data_from_gpt)
-        result_mapping_data_list.extend(doc_mapping_data_list)
+        try:
+            doc_data_from_gpt, annotation_list, doc_mapping_data_list = mapping_data(
+                doc_id=doc_id,
+                pdf_folder=pdf_folder,
+                output_extract_data_folder=output_extract_data_child_folder,
+                output_mapping_folder=output_mapping_child_folder,
+                extract_way=extract_way,
+                re_run_extract_data=re_run_extract_data,
+                re_run_mapping_data=re_run_mapping_data,
+            )
+            result_extract_data_list.extend(doc_data_from_gpt)
+            result_mapping_data_list.extend(doc_mapping_data_list)
+        except Exception as e:
+            logger.error(f"Document: {doc_id} met error: {e}")
+            print_exc()
 
     if force_save_total_data or (
         special_doc_id_list is None or len(special_doc_id_list) == 0
@@ -401,10 +414,10 @@ def batch_start_job(
         unique_doc_ids = result_extract_data_df["doc_id"].unique().tolist()
         os.makedirs(output_extract_data_total_folder, exist_ok=True)
         time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
-        output_file = os.path.join(
-            output_extract_data_total_folder,
-            f"extract_data_info_{len(unique_doc_ids)}_documents_by_{extract_way}_{time_stamp}.xlsx",
-        )
+        file_name = f"extract_data_info_{len(unique_doc_ids)}_documents_by_{extract_way}_{time_stamp}.xlsx"
+        if total_data_prefix is not None and len(total_data_prefix) > 0:
+            file_name = f"{total_data_prefix}_{file_name}"
+        output_file = os.path.join(output_extract_data_total_folder, file_name)
         with pd.ExcelWriter(output_file) as writer:
             result_extract_data_df.to_excel(
                 writer, index=False, sheet_name="extract_data_info"
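The renaming above only prepends an optional batch prefix to the timestamped workbook name; a small sketch of the resulting path with illustrative values (the folder below is an example, not taken from the repository configuration):

import os
import time

total_data_prefix = "complex_doc_from_word"   # example prefix, as set later in the diff
unique_doc_ids = ["431073795", "435128656"]   # sample ids
extract_way = "text"
time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())

file_name = f"extract_data_info_{len(unique_doc_ids)}_documents_by_{extract_way}_{time_stamp}.xlsx"
if total_data_prefix is not None and len(total_data_prefix) > 0:
    file_name = f"{total_data_prefix}_{file_name}"
output_file = os.path.join("/data/emea_ar/output", file_name)  # illustrative folder
# e.g. /data/emea_ar/output/complex_doc_from_word_extract_data_info_2_documents_by_text_20250101093000.xlsx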
@@ -414,10 +427,10 @@ def batch_start_job(
         unique_doc_ids = result_mappingdata_df["doc_id"].unique().tolist()
         os.makedirs(output_mapping_total_folder, exist_ok=True)
         time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
-        output_file = os.path.join(
-            output_mapping_total_folder,
-            f"mapping_data_info_{len(unique_doc_ids)}_documents_by_{extract_way}_{time_stamp}.xlsx",
-        )
+        file_name = f"mapping_data_info_{len(unique_doc_ids)}_documents_by_{extract_way}_{time_stamp}.xlsx"
+        if total_data_prefix is not None and len(total_data_prefix) > 0:
+            file_name = f"{total_data_prefix}_{file_name}"
+        output_file = os.path.join(output_mapping_total_folder, file_name)
 
         doc_mapping_data_in_db = only_output_mapping_data_in_db(result_mappingdata_df)
         with pd.ExcelWriter(output_file) as writer:
@@ -431,7 +444,6 @@ def batch_start_job(
                 writer, index=False, sheet_name="extract_data"
             )
 
-
     if calculate_metrics:
         prediction_sheet_name = "total_mapping_data"
         ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
@@ -1149,8 +1161,37 @@ def batch_run_documents():
         "527525440",
         "534535767"
     ]
+    # total_data_prefix = "complex_doc_"
+    # documents from EMEA Case 1.docx
+    check_db_mapping_doc_id_list = [
+        "510483464",
+        "525412280",
+        "528208797",
+        "435128656",
+        "425480144",
+        "466528487",
+        "434902020",
+        "440029306",
+        "431073795",
+        "430240853",
+        "427637151",
+        "434924914",
+        "467595142",
+        "466859621",
+        "429564034",
+        "424976833",
+        "466860852",
+        "466371135",
+        "470515549",
+        "434851173",
+        "434710819",
+        "429950833",
+        "467788879"
+    ]
+    total_data_prefix = "complex_doc_from_word"
     special_doc_id_list = check_db_mapping_doc_id_list
-    special_doc_id_list = ["481482392"]
+    special_doc_id_list = ["334584772", "435128656"]
+    special_doc_id_list = ["514213638"]
     pdf_folder = r"/data/emea_ar/pdf/"
     page_filter_ground_truth_file = (
         r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
@@ -1179,6 +1220,7 @@ def batch_run_documents():
         re_run_mapping_data,
         force_save_total_data=force_save_total_data,
         calculate_metrics=calculate_metrics,
+        total_data_prefix=total_data_prefix
     )
 
@@ -380,7 +380,8 @@ def replace_share_name_for_multilingual(text: str, share_name: str):
                                 "Kategorie Anteile", "Kategorie anteile",
                                 "Clase de participaciones", "Aandelenklasse",
                                 "aandelenklasse", "Anteilklasse", "anteilklasse",
-                                "Aktien", "Aktienklasse", "aktien", "aktienklasse"]
+                                "Aktien", "Aktienklasse", "aktien", "aktienklasse",
+                                "Klasse"]
     for multilingual_share in multilingual_share_list:
         if multilingual_share in text:
             text = text.replace(multilingual_share, "Class")
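The extended token list simply folds more localized share-class labels into the English word "Class" before matching. A short hedged illustration (the function below is a simplification of replace_share_name_for_multilingual, using only a subset of the list above):

multilingual_share_list = ["Aandelenklasse", "Anteilklasse",
                           "Aktienklasse", "Klasse"]  # subset of the full list above

def normalize_share_label(text: str) -> str:
    # Every localized share-class word collapses to "Class".
    for token in multilingual_share_list:
        if token in text:
            text = text.replace(token, "Class")
    return text

print(normalize_share_label("Klasse A EUR"))        # -> "Class A EUR"
print(normalize_share_label("Aktienklasse B USD"))  # -> "Class B USD"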
@@ -879,11 +880,16 @@ def replace_abbrevation(text: str):
     text_splits = text.split()
     new_text_splits = []
     for split in text_splits:
-        if split.lower() in ['acc', 'acc.', 'accumulating']:
+        if split.lower() in ['acc', 'acc.', 'accumulating',
+                             'thesaurierende', 'thes.', 'accumulazione',
+                             'akkumulation', 'acumulación',
+                             'accumulatie']:
             new_text_splits.append('Accumulation')
         elif split.lower() in ['inc', 'inc.']:
             new_text_splits.append('Income')
-        elif split.lower() in ['dist', 'dist.', 'dis', 'dis.', "distributing"]:
+        elif split.lower() in ['dist', 'dist.', 'dis',
+                               'dis.', 'distributing', 'ausschüttende',
+                               'aussch.', 'distribuzione']:
             new_text_splits.append('Distribution')
         elif split.lower() in ['inv', 'inv.']:
             new_text_splits.append('Investor')
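With the German, Italian, Spanish and Dutch abbreviations added above, accumulation and distribution markers in localized share names now normalize to the same canonical words as their English forms. A reduced, hedged sketch of replace_abbrevation limited to those two branches:

def expand_share_abbreviations(text: str) -> str:
    # Reduced sketch: only the Accumulation/Distribution branches shown above.
    acc = ['acc', 'acc.', 'accumulating', 'thesaurierende', 'thes.',
           'accumulazione', 'akkumulation', 'acumulación', 'accumulatie']
    dist = ['dist', 'dist.', 'dis', 'dis.', 'distributing',
            'ausschüttende', 'aussch.', 'distribuzione']
    new_text_splits = []
    for split in text.split():
        if split.lower() in acc:
            new_text_splits.append('Accumulation')
        elif split.lower() in dist:
            new_text_splits.append('Distribution')
        else:
            new_text_splits.append(split)
    return " ".join(new_text_splits)

print(expand_share_abbreviations("Klasse A thes."))   # -> "Klasse A Accumulation"
print(expand_share_abbreviations("Class B aussch."))  # -> "Class B Distribution"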
@@ -107,7 +107,7 @@ def chat(
 
     count = 0
     error = ""
-    request_timeout = 120
+    request_timeout = 600
     while count < 8:
         try:
             if count > 0: