Support fetching data from Prospectus

Blade He 2025-01-14 16:21:48 -06:00
parent e230a5bf15
commit a89aa9c4de
8 changed files with 183 additions and 22 deletions


@@ -1,7 +1,6 @@
 {
     "539790009": [39, 40, 45, 46, 47],
     "542300403": [12],
-    "542301117": [17, 18],
     "542306317": [4, 15, 16, 17, 18],
     "547567013": [12, 13, 14, 15, 16, 17, 33, 34, 35, 39, 40, 42, 43, 44, 45],
     "552505237": [16, 17, 18, 19, 25, 26, 27],

configuration/objective_strategy_regex.json

@@ -0,0 +1,7 @@
+{
+    "objective_strategy":
+    {
+        "start": "\\n[0-9\\W\\s]*(investment\\s*objective|objective|fund\\s*objective|investment\\s*objective(s)?\\s*(and|\\&)\\s*(policy|policies|investment)|Investment\\s*(Policy|policies)\\s*and\\s*Objective(s)?\\s*of\\s*the\\s*Trust|investment\\s*objective(s)?\\s*(and|\\&)\\s*policy\\W*and\\s*investment\\s*restriction|Investment\\s*Objective\\s*and\\s*Investment\\s*Policy\\s*and\\s*Strategy|What\\s*the\\s*Fund\\s*Aims\\s*to Deliver\\s*(\\WFund\\s*Objective\\W)?)(s)?(\\W)*\\s*\\n",
+        "end": "\\n[0-9\\W\\s]*(uk\\s*ucits\\s*investment\\s*and\\s*borrowing\\s*powers|risk\\s*consideration|risk\\s*factor|fund\\s*risk|investor(s)?\\s*profile|final\\s*accounting\\s*date|dealing\\s*cut\\s*off\\s*point|cut\\s*off\\s* point|class(es)?\\s*of\\s*share(s)?\\s*available|class(es)?\\s*of\\s*share(s)?\\s*which\\s*may\\s*be\\s*issue(d)?|manager.*charge|investment\\s*style|profile\\s*of\\s*the\\s*typical\\s*investor|typical\\s*investor(s)?\\s*profile|accounting\\s*reference\\s*date.*|specific\\s*fund\\s*risk\\s*factor|change(s)?\\s*to\\s*the\\s*investment\\s*objective\\s*and(\\/or)?\\s*investment\\s*policy|accounting\\s*and\\s*record\\s*date|share\\s*class(es)?\\s*established\\s*as\\s*at\\s*the\\s*date\\s*of\\s*this\\s*prospectus|isa|class(es)?\\s*for\\s*investment\\s*in\\s*the\\s*catholic\\s*investment\\s*fund|fund\\s*detail|derivative(s)?\\s*and\\s*technique|investment\\s*(restriction|approach)|Tracking\\s*Error|Characteristics\\s*of\\s*the\\s*Trust|investment\\s*style|Limit\\s*on\\s*investment\\s*in\\s*other\\s*collective\\s*investment\\s*scheme|Participation\\s*in\\s*the\\s*Fund|Initial\\s*Charge|other|Additional\\s*Information)(s)?(\\W)*\\s*\\n"
+    }
+}
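A minimal sketch of how this start/end pattern pair can be consumed, assuming the goal is to slice the objective/strategy section out of a page's text; the commit itself only applies the "start" pattern (see get_investment_objective_pages below):

import json
import re

# Load the regex pair; json.load unescapes the doubled backslashes.
with open("./configuration/objective_strategy_regex.json", "r", encoding="utf-8") as f:
    config = json.load(f)["objective_strategy"]

# Hypothetical page text with a heading matched by "start" and one matched by "end".
page_text = "\nInvestment Objective\nThe Fund aims to deliver long-term growth.\nRisk Factors\nMarket risk applies.\n"
start = re.search(config["start"], page_text, re.I)
if start:
    end = re.search(config["end"], page_text[start.end():], re.I)
    section = page_text[start.start():start.end() + end.start()] if end else page_text[start.start():]
    print(section.strip())  # heading plus the objective body, cut before "Risk Factors"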


@@ -44,6 +44,15 @@ class DataExtraction:
             self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
         else:
             self.document_mapping_info_df = document_mapping_info_df
+        self.fund_name_list = self.document_mapping_info_df["FundName"].unique().tolist()
+        # Get the document type from the DocumentType column of the mapping info.
+        self.document_type = int(self.document_mapping_info_df["DocumentType"].iloc[0])
+        self.investment_objective_pages = []
+        if self.document_type == 1:
+            self.investment_objective_pages = self.get_investment_objective_pages()
         self.provider_mapping_df = self.get_provider_mapping()
         if len(self.provider_mapping_df) == 0:
             self.provider_fund_name_list = []
@@ -61,7 +70,24 @@ class DataExtraction:
         self.get_datapoint_reported_name()
         self.extract_way = extract_way
         self.output_image_folder = output_image_folder
+
+    def get_investment_objective_pages(self):
+        investment_objective_pages = []
+        if self.document_type == 1:
+            objective_strategy_regex_config_file = r"./configuration/objective_strategy_regex.json"
+            with open(objective_strategy_regex_config_file, "r", encoding="utf-8") as f:
+                objective_strategy_regex_config = json.load(f)
+            objective_start_regex = objective_strategy_regex_config.get("objective_strategy", {}).get("start", "")
+            if objective_start_regex is not None and len(objective_start_regex) > 0:
+                for page_index, text in self.page_text_dict.items():
+                    if re.search(objective_start_regex, text, re.I):
+                        investment_objective_pages.append(page_index)
+        if len(investment_objective_pages) > 0:
+            investment_objective_pages.sort()
+        return investment_objective_pages
+
     def get_datapoint_reported_name(self):
         language_config_file = r"./configuration/language.json"
         self.language_config = {}
@@ -222,6 +248,8 @@ class DataExtraction:
             next_page_num = page_num + count
             if next_page_num >= pdf_page_count:
                 break
+            if self.document_type == 1 and next_page_num in self.investment_objective_pages:
+                break
             next_datapoints = page_datapoints
             if next_page_num in self.page_nums_with_datapoints:
                 should_continue = False
@@ -434,6 +462,25 @@ class DataExtraction:
         doc_id, page_index, datapoint, value, raw_fund_name, fund_id, fund_name, raw_share_name, share_id, share_name
         """
         logger.info(f"Extracting data from page {page_num}")
+        if self.document_type == 1:
+            pre_context = f"The document type is prospectus. \nThe fund names in this document are {', '.join(self.fund_name_list)}."
+            if pre_context in page_text:
+                page_text = page_text.replace(pre_context, "\n").strip()
+            if len(self.investment_objective_pages) > 0:
+                # Find the nearest investment objective page at or before the current page.
+                diff_pages = [page_num - investment_objective_page for investment_objective_page
+                              in self.investment_objective_pages
+                              if investment_objective_page <= page_num]
+                if len(diff_pages) > 0 and diff_pages[-1] < 5:
+                    top_nearest_investment_objective_page = self.investment_objective_pages[len(diff_pages) - 1]
+                    top_nearest_investment_objective_text = self.page_text_dict.get(top_nearest_investment_objective_page, "")
+                    if top_nearest_investment_objective_text in page_text:
+                        page_text = page_text.replace(top_nearest_investment_objective_text, "").strip()
+                    pre_context = f"\nThe most recent investment objective page text, which may contain the fund name, is: \n{top_nearest_investment_objective_text}.\n"
+            # If no preceding investment objective text is found, the fund names remain as the prefix of the page text.
+            page_text = f"{pre_context}\n{page_text}"
         instructions = self.get_instructions_by_datapoints(
             page_text,
             page_datapoints,
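As a concrete illustration of the nearest-page arithmetic above (page numbers hypothetical):

# investment_objective_pages is sorted ascending, so the gaps shrink toward the
# end of diff_pages and diff_pages[-1] is the smallest one.
investment_objective_pages = [4, 15]
page_num = 17
diff_pages = [page_num - p for p in investment_objective_pages if p <= page_num]
assert diff_pages == [13, 2]
# diff_pages[-1] == 2 < 5, so an objective page lies within the previous 5 pages:
nearest = investment_objective_pages[len(diff_pages) - 1]
assert nearest == 15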


@@ -7,14 +7,16 @@ from utils.pdf_util import PDFUtil
 from utils.sql_query_util import query_document_fund_mapping
 from utils.logger import logger
 from utils.biz_utils import add_slash_to_text_as_regex, clean_text
+from utils.pdf_util import get_pdf_pages_by_html


 class FilterPages:
     def __init__(
-        self, doc_id: str, pdf_file: str, document_mapping_info_df: pd.DataFrame
+        self, doc_id: str, pdf_file: str, document_mapping_info_df: pd.DataFrame, apply_pdf2html: bool = False
     ) -> None:
         self.doc_id = doc_id
         self.pdf_file = pdf_file
+        self.apply_pdf2html = apply_pdf2html
         self.page_text_dict = self.get_pdf_page_text_dict()
         if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
             self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
@@ -35,8 +37,12 @@ class FilterPages:
         self.document_dp_pages = self.document_dp_pages_config.get(self.doc_id, [])

     def get_pdf_page_text_dict(self) -> dict:
-        pdf_util = PDFUtil(self.pdf_file)
-        success, text, page_text_dict = pdf_util.extract_text()
+        page_text_dict = {}
+        if self.apply_pdf2html:
+            page_text_dict = get_pdf_pages_by_html(self.pdf_file, pdf_info_type="pdf_path")
+        else:
+            pdf_util = PDFUtil(self.pdf_file)
+            success, text, page_text_dict = pdf_util.extract_text()
         return page_text_dict

     def get_configuration_from_file(self) -> dict:
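A minimal usage sketch for the new flag (the document id and PDF path are illustrative; passing document_mapping_info_df=None falls back to query_document_fund_mapping):

filter_pages = FilterPages(
    doc_id="539790009",
    pdf_file="/data/aus_prospectus/pdf/539790009.pdf",
    document_mapping_info_df=None,  # None/empty triggers query_document_fund_mapping(doc_id)
    apply_pdf2html=True,            # route page-text extraction through the pdf2html service
)
print(sorted(filter_pages.page_text_dict)[:3])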

main.py

@@ -24,9 +24,11 @@ class EMEA_AR_Parsing:
         self,
         doc_id: str,
         pdf_folder: str = r"/data/emea_ar/pdf/",
+        output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
         output_extract_data_folder: str = r"/data/emea_ar/output/extract_data/docs/",
         output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
         extract_way: str = "text",
+        apply_pdf2html: bool = False,
         drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
     ) -> None:
         self.doc_id = doc_id
@@ -68,9 +70,24 @@ class EMEA_AR_Parsing:
         os.makedirs(self.output_mapping_data_folder, exist_ok=True)
         self.filter_pages = FilterPages(
-            self.doc_id, self.pdf_file, self.document_mapping_info_df
+            self.doc_id, self.pdf_file, self.document_mapping_info_df, apply_pdf2html
         )
         self.page_text_dict = self.filter_pages.page_text_dict
+        try:
+            os.makedirs(output_pdf_text_folder, exist_ok=True)
+            if apply_pdf2html:
+                output_pdf_text_folder = os.path.join(output_pdf_text_folder, "pdf2html/")
+            else:
+                output_pdf_text_folder = os.path.join(output_pdf_text_folder, "pymupdf/")
+            os.makedirs(output_pdf_text_folder, exist_ok=True)
+            self.page_text_file = os.path.join(
+                output_pdf_text_folder, f"{self.doc_id}_page_text.json"
+            )
+            with open(self.page_text_file, "w", encoding="utf-8") as f:
+                json.dump(self.page_text_dict, f, ensure_ascii=False, indent=4)
+        except Exception as e:
+            logger.error(f"Error: {e}")
         self.datapoint_page_info, self.result_details = self.get_datapoint_page_info()
         self.datapoints = self.get_datapoints_from_datapoint_page_info()
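With the defaults above and apply_pdf2html=True, the cached page text would land at a path like this (illustrative):

import os
output_pdf_text_folder = r"/data/emea_ar/output/pdf_text/"  # default from the signature above
doc_id = "539790009"
print(os.path.join(output_pdf_text_folder, "pdf2html/", f"{doc_id}_page_text.json"))
# /data/emea_ar/output/pdf_text/pdf2html/539790009_page_text.json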
@@ -274,9 +291,11 @@ def extract_data(
 def mapping_data(
     doc_id: str,
     pdf_folder: str,
+    output_pdf_text_folder: str,
     output_extract_data_folder: str,
     output_mapping_folder: str,
     extract_way: str = "text",
+    apply_pdf2html: bool = False,
     drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
     re_run_extract_data: bool = False,
     re_run_mapping_data: bool = False,
@@ -285,9 +304,11 @@ def mapping_data(
     emea_ar_parsing = EMEA_AR_Parsing(
         doc_id,
         pdf_folder,
+        output_pdf_text_folder=output_pdf_text_folder,
         output_extract_data_folder=output_extract_data_folder,
         output_mapping_data_folder=output_mapping_folder,
         extract_way=extract_way,
+        apply_pdf2html=apply_pdf2html,
         drilldown_folder=drilldown_folder,
     )
     doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
@@ -352,12 +373,14 @@ def batch_extract_data(
 def batch_start_job(
     pdf_folder: str,
+    output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
     doc_data_excel_file: str = None,
     output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
     output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
     output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/",
     output_mapping_total_folder: str = r"/data/emea_ar/output/mapping_data/total/",
     extract_way: str = "text",
+    apply_pdf2html: bool = False,
     drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
     special_doc_id_list: list = None,
     re_run_extract_data: bool = False,
@@ -392,9 +415,11 @@ def batch_start_job(
             doc_data_from_gpt, annotation_list, doc_mapping_data_list = mapping_data(
                 doc_id=doc_id,
                 pdf_folder=pdf_folder,
+                output_pdf_text_folder=output_pdf_text_folder,
                 output_extract_data_folder=output_extract_data_child_folder,
                 output_mapping_folder=output_mapping_child_folder,
                 extract_way=extract_way,
+                apply_pdf2html=apply_pdf2html,
                 drilldown_folder=drilldown_folder,
                 re_run_extract_data=re_run_extract_data,
                 re_run_mapping_data=re_run_mapping_data,
@@ -875,11 +900,13 @@ def replace_rerun_data(new_data_file: str, original_data_file: str):
 def batch_run_documents(special_doc_id_list: list = None,
                         pdf_folder:str = r"/data/emea_ar/pdf/",
+                        output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
                         output_extract_data_child_folder:str = r"/data/emea_ar/output/extract_data/docs/",
                         output_extract_data_total_folder:str = r"/data/emea_ar/output/extract_data/total/",
                         output_mapping_child_folder:str = r"/data/emea_ar/output/mapping_data/docs/",
                         output_mapping_total_folder:str = r"/data/emea_ar/output/mapping_data/total/",
-                        drilldown_folder: str = r"/data/emea_ar/output/drilldown/"):
+                        drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
+                        apply_pdf2html: bool = False):
     sample_document_list_folder = r'./sample_documents/'
     document_list_files = glob(sample_document_list_folder + "*.txt")
     page_filter_ground_truth_file = (
@@ -887,7 +914,7 @@ def batch_run_documents(special_doc_id_list: list = None,
     )
     re_run_extract_data = True
     re_run_mapping_data = True
-    force_save_total_data = True
+    force_save_total_data = False
     calculate_metrics = False
     extract_way = "text"
@@ -906,12 +933,14 @@ def batch_run_documents(special_doc_id_list: list = None,
         doc_id_list = [doc_id.strip() for doc_id in doc_id_list]
         batch_start_job(
             pdf_folder,
+            output_pdf_text_folder,
             page_filter_ground_truth_file,
             output_extract_data_child_folder,
             output_mapping_child_folder,
             output_extract_data_total_folder,
             output_mapping_total_folder,
             extract_way,
+            apply_pdf2html,
             drilldown_folder,
             doc_id_list,
             re_run_extract_data,
@@ -923,12 +952,14 @@ def batch_run_documents(special_doc_id_list: list = None,
     else:
         batch_start_job(
             pdf_folder,
+            output_pdf_text_folder,
             page_filter_ground_truth_file,
             output_extract_data_child_folder,
             output_mapping_child_folder,
             output_extract_data_total_folder,
             output_mapping_total_folder,
             extract_way,
+            apply_pdf2html,
             drilldown_folder,
             special_doc_id_list,
             re_run_extract_data,
@@ -1048,31 +1079,37 @@ if __name__ == "__main__":
     # special_doc_id_list = ["553242411"]
-    special_doc_id_list: list = ["539790009",
-                                 "542300403",
-                                 "542301117",
-                                 "542306317",
-                                 "547567013",
-                                 "552505237",
-                                 "552505278",
-                                 "554431052",
-                                 "554851189",
-                                 "555377021",
-                                 "555654388"]
-    # special_doc_id_list: list = ["552505278"]
-    special_doc_id_list: list = ["539790009"]
+    # special_doc_id_list: list = ["539790009",
+    #                              "542300403",
+    #                              "542301117",
+    #                              "542306317",
+    #                              "547567013",
+    #                              "552505237",
+    #                              "552505278",
+    #                              "554431052",
+    #                              "554851189",
+    #                              "555377021",
+    #                              "555654388"]
+    special_doc_id_list: list = ["539790009", "542301117"]
     pdf_folder:str = r"/data/aus_prospectus/pdf/"
+    output_pdf_text_folder:str = r"/data/aus_prospectus/output/pdf_text/"
     output_extract_data_child_folder:str = r"/data/aus_prospectus/output/extract_data/docs/"
     output_extract_data_total_folder:str = r"/data/aus_prospectus/output/extract_data/total/"
     output_mapping_child_folder:str = r"/data/aus_prospectus/output/mapping_data/docs/"
     output_mapping_total_folder:str = r"/data/aus_prospectus/output/mapping_data/total/"
     drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
+    apply_pdf2html = True
     batch_run_documents(special_doc_id_list=special_doc_id_list,
                         pdf_folder=pdf_folder,
+                        output_pdf_text_folder=output_pdf_text_folder,
                         output_extract_data_child_folder=output_extract_data_child_folder,
                         output_extract_data_total_folder=output_extract_data_total_folder,
                         output_mapping_child_folder=output_mapping_child_folder,
                         output_mapping_total_folder=output_mapping_total_folder,
-                        drilldown_folder=drilldown_folder)
+                        drilldown_folder=drilldown_folder,
+                        apply_pdf2html=apply_pdf2html
+                        )
     # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
     # original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"


@@ -1385,6 +1385,10 @@ def merge_aus_document_prospectus_data():
     aus_document_prospectus_data.to_excel(
         writer, sheet_name="aus_document_prospectus", index=False
     )
+
+
+def get_pdf_2_html():
+    pass

requirements.txt

@@ -10,4 +10,5 @@ scikit-learn==1.5.1
 pandas==2.2.3
 openpyxl==3.1.2
 XlsxWriter==3.1.2
 tiktoken==0.7.0
+beautifulsoup4==4.12.3

utils/pdf_util.py

@@ -14,6 +14,11 @@ from utils.similarity import Similarity
 from utils.biz_utils import total_currency_list
 from utils.logger import logger
+import requests
+from bs4 import BeautifulSoup
+import dotenv
+# Load the .env file, which provides pdf2html_url and other keys such as OPENAI_API_KEY.
+dotenv.load_dotenv()


 class PDFUtil:
@@ -1667,3 +1672,58 @@ class PDFUtil:
             action=action,
         )
         return data_list
+
+
+def pdf_to_html_with_docid(doc_id, para):
+    # Ask the pdf2html service to convert an already-ingested document by its id.
+    headers = {
+        'user': 'visitor',
+        'Accept': 'application/json',
+    }
+    args = {
+        'docId': doc_id,
+        'parameters': json.dumps(para)
+    }
+    pdf2html_url = os.getenv("pdf2html_url")
+    response = requests.post(pdf2html_url, data=args, headers=headers)
+    response.encoding = 'utf-8'
+    text = response.text
+    return text
+
+
+def pdf_to_html(pdf_path, para):
+    # Upload a local PDF file to the pdf2html service and return the HTML response.
+    headers = {
+        "user": "visitor",
+        "Accept": "application/json",
+    }
+    args = {
+        "parameters": json.dumps(para)
+    }
+    with open(pdf_path, mode='rb') as f:
+        file_bytes = f.read()
+    files = {"file": ("tempName.pdf", file_bytes)}
+    pdf2html_url = os.getenv("pdf2html_url")
+    response = requests.post(pdf2html_url, data=args, files=files, headers=headers)
+    response.encoding = 'utf-8'
+    text = response.text
+    return text
+
+
+def get_pdf_pages_by_html(pdf_info: str, pdf_info_type: str = "doc_id"):
+    # Convert the PDF to HTML with table detection enabled.
+    para = {
+        "detectTable": True
+    }
+    if pdf_info_type == "doc_id":
+        html = pdf_to_html_with_docid(pdf_info, para)
+    else:
+        html = pdf_to_html(pdf_info, para)
+    # Each page is rendered as a <div page-idx="..."> element; collect text per page.
+    html = BeautifulSoup(html, 'html.parser')
+    pages = html.find_all('div', attrs={'page-idx': True})
+    page_text_dict = {}
+    for index, page in enumerate(pages):
+        page_text_dict[index] = page.get_text().strip()
+    return page_text_dict
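A usage sketch for the new helpers, assuming a .env file defines pdf2html_url and the service is reachable (the endpoint value is hypothetical):

# .env, loaded by dotenv.load_dotenv() at import time:
# pdf2html_url=http://pdf2html.internal/api/convert
page_text_dict = get_pdf_pages_by_html(
    "/data/aus_prospectus/pdf/539790009.pdf", pdf_info_type="pdf_path"
)
for page_index in sorted(page_text_dict)[:2]:
    print(page_index, page_text_dict[page_index][:100])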