Support fetching data from Prospectus
parent e230a5bf15
commit a89aa9c4de
@@ -1,7 +1,6 @@
 {
     "539790009": [39, 40, 45, 46, 47],
     "542300403": [12],
-    "542301117": [17, 18],
     "542306317": [4, 15, 16, 17, 18],
     "547567013": [12, 13, 14, 15, 16, 17, 33, 34, 35, 39, 40, 42, 43, 44, 45],
     "552505237": [16, 17, 18, 19, 25, 26, 27],
@@ -0,0 +1,7 @@
+{
+    "objective_strategy":
+    {
+        "start": "\\n[0-9\\W\\s]*(investment\\s*objective|objective|fund\\s*objective|investment\\s*objective(s)?\\s*(and|\\&)\\s*(policy|policies|investment)|Investment\\s*(Policy|policies)\\s*and\\s*Objective(s)?\\s*of\\s*the\\s*Trust|investment\\s*objective(s)?\\s*(and|\\&)\\s*policy\\W*and\\s*investment\\s*restriction|Investment\\s*Objective\\s*and\\s*Investment\\s*Policy\\s*and\\s*Strategy|What\\s*the\\s*Fund\\s*Aims\\s*to Deliver\\s*(\\WFund\\s*Objective\\W)?)(s)?(\\W)*\\s*\\n",
+        "end": "\\n[0-9\\W\\s]*(uk\\s*ucits\\s*investment\\s*and\\s*borrowing\\s*powers|risk\\s*consideration|risk\\s*factor|fund\\s*risk|investor(s)?\\s*profile|final\\s*accounting\\s*date|dealing\\s*cut\\s*off\\s*point|cut\\s*off\\s* point|class(es)?\\s*of\\s*share(s)?\\s*available|class(es)?\\s*of\\s*share(s)?\\s*which\\s*may\\s*be\\s*issue(d)?|manager.*charge|investment\\s*style|profile\\s*of\\s*the\\s*typical\\s*investor|typical\\s*investor(s)?\\s*profile|accounting\\s*reference\\s*date.*|specific\\s*fund\\s*risk\\s*factor|change(s)?\\s*to\\s*the\\s*investment\\s*objective\\s*and(\\/or)?\\s*investment\\s*policy|accounting\\s*and\\s*record\\s*date|share\\s*class(es)?\\s*established\\s*as\\s*at\\s*the\\s*date\\s*of\\s*this\\s*prospectus|isa|class(es)?\\s*for\\s*investment\\s*in\\s*the\\s*catholic\\s*investment\\s*fund|fund\\s*detail|derivative(s)?\\s*and\\s*technique|investment\\s*(restriction|approach)|Tracking\\s*Error|Characteristics\\s*of\\s*the\\s*Trust|investment\\s*style|Limit\\s*on\\s*investment\\s*in\\s*other\\s*collective\\s*investment\\s*scheme|Participation\\s*in\\s*the\\s*Fund|Initial\\s*Charge|other|Additional\\s*Information)(s)?(\\W)*\\s*\\n"
+    }
+}
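Note: the "start" and "end" values above are heading regexes (matched case-insensitively against newline-delimited page text) that mark where an investment objective section begins and ends. The new get_investment_objective_pages() further down only applies the "start" pattern to flag pages; a minimal sketch of how both patterns could bound the section text on a page looks like this (slice_objective_text is illustrative and not part of this commit):

    import json
    import re

    def slice_objective_text(page_text: str, config_file: str) -> str:
        # Load the start/end heading patterns from the config shown above.
        with open(config_file, "r", encoding="utf-8") as f:
            regex_config = json.load(f)["objective_strategy"]
        start_match = re.search(regex_config["start"], page_text, re.I)
        if start_match is None:
            return ""
        # Look for the end heading after the start heading; if it is missing,
        # keep everything up to the end of the page.
        end_match = re.search(regex_config["end"], page_text[start_match.end():], re.I)
        end = start_match.end() + end_match.start() if end_match else len(page_text)
        return page_text[start_match.start():end].strip()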
@@ -44,6 +44,15 @@ class DataExtraction:
             self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
         else:
             self.document_mapping_info_df = document_mapping_info_df
+
+        self.fund_name_list = self.document_mapping_info_df["FundName"].unique().tolist()
+
+        # get document type by DocumentType in self.document_mapping_info_df
+        self.document_type = int(self.document_mapping_info_df["DocumentType"].iloc[0])
+        self.investment_objective_pages = []
+        if self.document_type == 1:
+            self.investment_objective_pages = self.get_investment_objective_pages()
+
         self.provider_mapping_df = self.get_provider_mapping()
         if len(self.provider_mapping_df) == 0:
             self.provider_fund_name_list = []
@@ -62,6 +71,23 @@ class DataExtraction:
         self.extract_way = extract_way
         self.output_image_folder = output_image_folder
 
+
+    def get_investment_objective_pages(self):
+        investment_objective_pages = []
+        if self.document_type == 1:
+            objective_strategy_regex_config_file = r"./configuration/objective_strategy_regex.json"
+            with open(objective_strategy_regex_config_file, "r", encoding="utf-8") as f:
+                objective_strategy_regex_config = json.load(f)
+            objective_start_regex = objective_strategy_regex_config.get("objective_strategy", {}).get("start", "")
+
+            if objective_start_regex is not None and len(objective_start_regex) > 0:
+                for page_index, text in self.page_text_dict.items():
+                    if re.search(objective_start_regex, text, re.I):
+                        investment_objective_pages.append(page_index)
+        if len(investment_objective_pages) > 0:
+            investment_objective_pages.sort()
+        return investment_objective_pages
+
     def get_datapoint_reported_name(self):
         language_config_file = r"./configuration/language.json"
         self.language_config = {}
@@ -222,6 +248,8 @@ class DataExtraction:
             next_page_num = page_num + count
             if next_page_num >= pdf_page_count:
                 break
+            if self.document_type == 1 and next_page_num in self.investment_objective_pages:
+                break
             next_datapoints = page_datapoints
             if next_page_num in self.page_nums_with_datapoints:
                 should_continue = False
@@ -434,6 +462,25 @@ class DataExtraction:
         doc_id, page_index, datapoint, value, raw_fund_name, fund_id, fund_name, raw_share_name, share_id, share_name
         """
         logger.info(f"Extracting data from page {page_num}")
+        if self.document_type == 1:
+            pre_context = f"The document type is prospectus. \nThe fund names in this document are {', '.join(self.fund_name_list)}."
+            if pre_context in page_text:
+                page_text = page_text.replace(pre_context, "\n").strip()
+
+            if len(self.investment_objective_pages) > 0:
+                # Find the most recent investment objective page at or before the current page.
+                diff_pages = [page_num - investment_objective_page for investment_objective_page
+                              in self.investment_objective_pages
+                              if investment_objective_page <= page_num]
+                if len(diff_pages) > 0 and diff_pages[-1] < 5:
+                    top_nearest_investment_objective_page = self.investment_objective_pages[len(diff_pages) - 1]
+                    top_nearest_investment_objective_text = self.page_text_dict.get(top_nearest_investment_objective_page, "")
+                    if top_nearest_investment_objective_text in page_text:
+                        page_text = page_text.replace(top_nearest_investment_objective_text, "").strip()
+                    pre_context = f"\nThe most recent investment objective page text, which may include the fund name, is: \n{top_nearest_investment_objective_text}.\n"
+            # If no previous investment objective text was found, the fund-name pre-context is kept as the prefix of the page text.
+            page_text = f"{pre_context}\n{page_text}"
+
         instructions = self.get_instructions_by_datapoints(
             page_text,
             page_datapoints,
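Note: a small worked example of the nearest-objective-page lookup above (the numbers are made up; page indices are the keys of self.page_text_dict):

    # investment_objective_pages = [4, 10, 30], current page_num = 12
    # diff_pages = [12 - 4, 12 - 10] = [8, 2]   (objective pages after page 12 are filtered out)
    # diff_pages[-1] == 2 < 5, so the nearest objective page is
    # investment_objective_pages[len(diff_pages) - 1] == 10,
    # and that page's text becomes the pre_context prepended to page 12's text.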
@@ -7,14 +7,16 @@ from utils.pdf_util import PDFUtil
 from utils.sql_query_util import query_document_fund_mapping
 from utils.logger import logger
 from utils.biz_utils import add_slash_to_text_as_regex, clean_text
+from utils.pdf_util import get_pdf_pages_by_html
 
 
 class FilterPages:
     def __init__(
-        self, doc_id: str, pdf_file: str, document_mapping_info_df: pd.DataFrame
+        self, doc_id: str, pdf_file: str, document_mapping_info_df: pd.DataFrame, apply_pdf2html: bool = False
     ) -> None:
         self.doc_id = doc_id
         self.pdf_file = pdf_file
+        self.apply_pdf2html = apply_pdf2html
         self.page_text_dict = self.get_pdf_page_text_dict()
         if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
             self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
@@ -35,6 +37,10 @@ class FilterPages:
         self.document_dp_pages = self.document_dp_pages_config.get(self.doc_id, [])
 
     def get_pdf_page_text_dict(self) -> dict:
+        page_text_dict = {}
+        if self.apply_pdf2html:
+            page_text_dict = get_pdf_pages_by_html(self.pdf_file, pdf_info_type="pdf_path")
+        else:
             pdf_util = PDFUtil(self.pdf_file)
             success, text, page_text_dict = pdf_util.extract_text()
         return page_text_dict
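Note: a minimal usage sketch of the new flag (the doc id and path are placeholders; per the constructor above, passing None for document_mapping_info_df falls back to query_document_fund_mapping):

    # Per-page text via the pdf2html service
    filter_pages = FilterPages("539790009", "/data/aus_prospectus/pdf/539790009.pdf",
                               None, apply_pdf2html=True)
    page_text_dict = filter_pages.page_text_dict   # {page_index: page_text, ...}

    # apply_pdf2html defaults to False, which keeps the existing PyMuPDF extraction path.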
main.py
@@ -24,9 +24,11 @@ class EMEA_AR_Parsing:
         self,
         doc_id: str,
         pdf_folder: str = r"/data/emea_ar/pdf/",
+        output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
         output_extract_data_folder: str = r"/data/emea_ar/output/extract_data/docs/",
         output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
         extract_way: str = "text",
+        apply_pdf2html: bool = False,
         drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
     ) -> None:
         self.doc_id = doc_id
@@ -68,9 +70,24 @@ class EMEA_AR_Parsing:
         os.makedirs(self.output_mapping_data_folder, exist_ok=True)
 
         self.filter_pages = FilterPages(
-            self.doc_id, self.pdf_file, self.document_mapping_info_df
+            self.doc_id, self.pdf_file, self.document_mapping_info_df, apply_pdf2html
         )
         self.page_text_dict = self.filter_pages.page_text_dict
+        try:
+            os.makedirs(output_pdf_text_folder, exist_ok=True)
+            if apply_pdf2html:
+                output_pdf_text_folder = os.path.join(output_pdf_text_folder, "pdf2html/")
+            else:
+                output_pdf_text_folder = os.path.join(output_pdf_text_folder, "pymupdf/")
+            os.makedirs(output_pdf_text_folder, exist_ok=True)
+            self.page_text_file = os.path.join(
+                output_pdf_text_folder, f"{self.doc_id}_page_text.json"
+            )
+            with open(self.page_text_file, "w", encoding="utf-8") as f:
+                json.dump(self.page_text_dict, f, ensure_ascii=False, indent=4)
+        except Exception as e:
+            logger.error(f"Error: {e}")
+
         self.datapoint_page_info, self.result_details = self.get_datapoint_page_info()
         self.datapoints = self.get_datapoints_from_datapoint_page_info()
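Note: with the defaults above, the per-page text is persisted as JSON under a subfolder named after the extraction backend, e.g. (doc id illustrative):

    /data/emea_ar/output/pdf_text/pdf2html/539790009_page_text.json   # apply_pdf2html=True
    /data/emea_ar/output/pdf_text/pymupdf/539790009_page_text.json    # apply_pdf2html=False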
@@ -274,9 +291,11 @@ def extract_data(
 def mapping_data(
     doc_id: str,
     pdf_folder: str,
+    output_pdf_text_folder: str,
     output_extract_data_folder: str,
     output_mapping_folder: str,
     extract_way: str = "text",
+    apply_pdf2html: bool = False,
     drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
     re_run_extract_data: bool = False,
     re_run_mapping_data: bool = False,
@@ -285,9 +304,11 @@ def mapping_data(
     emea_ar_parsing = EMEA_AR_Parsing(
         doc_id,
         pdf_folder,
+        output_pdf_text_folder=output_pdf_text_folder,
         output_extract_data_folder=output_extract_data_folder,
         output_mapping_data_folder=output_mapping_folder,
         extract_way=extract_way,
+        apply_pdf2html=apply_pdf2html,
         drilldown_folder=drilldown_folder,
     )
     doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
@@ -352,12 +373,14 @@ def batch_extract_data(
 
 def batch_start_job(
     pdf_folder: str,
+    output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
     doc_data_excel_file: str = None,
     output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
     output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
     output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/",
     output_mapping_total_folder: str = r"/data/emea_ar/output/mapping_data/total/",
     extract_way: str = "text",
+    apply_pdf2html: bool = False,
     drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
     special_doc_id_list: list = None,
     re_run_extract_data: bool = False,
@@ -392,9 +415,11 @@ def batch_start_job(
         doc_data_from_gpt, annotation_list, doc_mapping_data_list = mapping_data(
             doc_id=doc_id,
             pdf_folder=pdf_folder,
+            output_pdf_text_folder=output_pdf_text_folder,
             output_extract_data_folder=output_extract_data_child_folder,
             output_mapping_folder=output_mapping_child_folder,
             extract_way=extract_way,
+            apply_pdf2html=apply_pdf2html,
             drilldown_folder=drilldown_folder,
             re_run_extract_data=re_run_extract_data,
             re_run_mapping_data=re_run_mapping_data,
@@ -875,11 +900,13 @@ def replace_rerun_data(new_data_file: str, original_data_file: str):
 
 def batch_run_documents(special_doc_id_list: list = None,
                         pdf_folder:str = r"/data/emea_ar/pdf/",
+                        output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
                         output_extract_data_child_folder:str = r"/data/emea_ar/output/extract_data/docs/",
                         output_extract_data_total_folder:str = r"/data/emea_ar/output/extract_data/total/",
                         output_mapping_child_folder:str = r"/data/emea_ar/output/mapping_data/docs/",
                         output_mapping_total_folder:str = r"/data/emea_ar/output/mapping_data/total/",
-                        drilldown_folder: str = r"/data/emea_ar/output/drilldown/"):
+                        drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
+                        apply_pdf2html: bool = False):
     sample_document_list_folder = r'./sample_documents/'
     document_list_files = glob(sample_document_list_folder + "*.txt")
     page_filter_ground_truth_file = (
@@ -887,7 +914,7 @@ def batch_run_documents(special_doc_id_list: list = None,
     )
     re_run_extract_data = True
     re_run_mapping_data = True
-    force_save_total_data = True
+    force_save_total_data = False
     calculate_metrics = False
 
     extract_way = "text"
@@ -906,12 +933,14 @@ def batch_run_documents(special_doc_id_list: list = None,
         doc_id_list = [doc_id.strip() for doc_id in doc_id_list]
         batch_start_job(
             pdf_folder,
+            output_pdf_text_folder,
             page_filter_ground_truth_file,
             output_extract_data_child_folder,
             output_mapping_child_folder,
             output_extract_data_total_folder,
             output_mapping_total_folder,
             extract_way,
+            apply_pdf2html,
             drilldown_folder,
             doc_id_list,
             re_run_extract_data,
@@ -923,12 +952,14 @@ def batch_run_documents(special_doc_id_list: list = None,
     else:
         batch_start_job(
             pdf_folder,
+            output_pdf_text_folder,
             page_filter_ground_truth_file,
             output_extract_data_child_folder,
             output_mapping_child_folder,
             output_extract_data_total_folder,
             output_mapping_total_folder,
             extract_way,
+            apply_pdf2html,
             drilldown_folder,
             special_doc_id_list,
             re_run_extract_data,
@@ -1048,31 +1079,37 @@ if __name__ == "__main__":
 
 
     # special_doc_id_list = ["553242411"]
-    special_doc_id_list: list = ["539790009",
-                                 "542300403",
-                                 "542301117",
-                                 "542306317",
-                                 "547567013",
-                                 "552505237",
-                                 "552505278",
-                                 "554431052",
-                                 "554851189",
-                                 "555377021",
-                                 "555654388"]
-    # special_doc_id_list: list = ["552505278"]
+    # special_doc_id_list: list = ["539790009",
+    #                              "542300403",
+    #                              "542301117",
+    #                              "542306317",
+    #                              "547567013",
+    #                              "552505237",
+    #                              "552505278",
+    #                              "554431052",
+    #                              "554851189",
+    #                              "555377021",
+    #                              "555654388"]
+    special_doc_id_list: list = ["539790009", "542301117"]
+    special_doc_id_list: list = ["539790009"]
     pdf_folder:str = r"/data/aus_prospectus/pdf/"
+    output_pdf_text_folder:str = r"/data/aus_prospectus/output/pdf_text/"
     output_extract_data_child_folder:str = r"/data/aus_prospectus/output/extract_data/docs/"
     output_extract_data_total_folder:str = r"/data/aus_prospectus/output/extract_data/total/"
     output_mapping_child_folder:str = r"/data/aus_prospectus/output/mapping_data/docs/"
     output_mapping_total_folder:str = r"/data/aus_prospectus/output/mapping_data/total/"
     drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
+    apply_pdf2html = True
     batch_run_documents(special_doc_id_list=special_doc_id_list,
                         pdf_folder=pdf_folder,
+                        output_pdf_text_folder=output_pdf_text_folder,
                         output_extract_data_child_folder=output_extract_data_child_folder,
                         output_extract_data_total_folder=output_extract_data_total_folder,
                         output_mapping_child_folder=output_mapping_child_folder,
                         output_mapping_total_folder=output_mapping_total_folder,
-                        drilldown_folder=drilldown_folder)
+                        drilldown_folder=drilldown_folder,
+                        apply_pdf2html=apply_pdf2html
+                        )
 
     # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
     # original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"
@@ -1387,6 +1387,10 @@ def merge_aus_document_prospectus_data():
     )
 
 
+def get_pdf_2_html():
+    pass
+
+
 if __name__ == "__main__":
     # merge_aus_document_prospectus_data()
@@ -11,3 +11,4 @@ pandas==2.2.3
 openpyxl==3.1.2
 XlsxWriter==3.1.2
 tiktoken==0.7.0
+beautifulsoup4==4.12.3
@@ -14,6 +14,11 @@ from utils.similarity import Similarity
 from utils.biz_utils import total_currency_list
 
 from utils.logger import logger
+import requests
+from bs4 import BeautifulSoup
+import dotenv
+# loads .env file with your OPENAI_API_KEY
+dotenv.load_dotenv()
 
 
 class PDFUtil:
@@ -1667,3 +1672,58 @@ class PDFUtil:
             action=action,
         )
         return data_list
+
+
+def pdf_to_html_with_docid(doc_id, para):
+    headers = {
+        'user': 'visitor',
+        'Accept': 'application/json',
+    }
+
+    args = {
+        'docId': doc_id,
+        'parameters': json.dumps(para)
+    }
+
+    pdf2html_url = os.getenv("pdf2html_url")
+    response = requests.post(pdf2html_url, data=args, headers=headers)
+    response.encoding = 'utf-8'
+    text = response.text
+    return text
+
+
+def pdf_to_html(pdf_path, para):
+    headers = {
+        "user": "visitor",
+        "Accept": "application/json",
+    }
+    args = {
+        "parameters": json.dumps(para)
+    }
+
+    with open(pdf_path, mode='rb') as f:
+        file_bytes = f.read()
+
+    files = {"file": ("tempName.pdf", file_bytes)}
+
+    pdf2html_url = os.getenv("pdf2html_url")
+    response = requests.post(pdf2html_url, data=args, files=files, headers=headers)
+    response.encoding = 'utf-8'
+    text = response.text
+    return text
+
+
+def get_pdf_pages_by_html(pdf_info: str, pdf_info_type: str="doc_id"):
+    # Convert pdf to html
+    para = {
+        "detectTable": True
+    }
+    if pdf_info_type == "doc_id":
+        html = pdf_to_html_with_docid(pdf_info, para)
+    else:
+        html = pdf_to_html(pdf_info, para)
+    html = BeautifulSoup(html, 'html.parser')
+    pages = html.find_all('div', attrs={'page-idx': True})
+    page_text_dict = {}
+    for index, page in enumerate(pages):
+        page_text_dict[index] = page.get_text().strip()
+    return page_text_dict
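Note: get_pdf_pages_by_html() posts the PDF (by docId, or as an uploaded file) to the service named by the pdf2html_url environment variable and splits the returned HTML into per-page text keyed by 0-based page index, using the div elements that carry a page-idx attribute. A minimal usage sketch, assuming the conversion service is reachable (the URL and path below are placeholders):

    import os

    os.environ["pdf2html_url"] = "http://pdf2html.example/convert"  # placeholder endpoint

    # By document id (routes through pdf_to_html_with_docid)
    page_text_dict = get_pdf_pages_by_html("539790009")

    # By local file path, as FilterPages.get_pdf_page_text_dict() now does
    page_text_dict = get_pdf_pages_by_html("/data/aus_prospectus/pdf/539790009.pdf",
                                           pdf_info_type="pdf_path")

    for page_index, text in sorted(page_text_dict.items()):
        print(page_index, text[:80])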