diff --git a/configuration/document_dp_pages.json b/configuration/document_dp_pages.json
index 442a394..4fe058c 100644
--- a/configuration/document_dp_pages.json
+++ b/configuration/document_dp_pages.json
@@ -1,7 +1,6 @@
 {
     "539790009": [39, 40, 45, 46, 47],
     "542300403": [12],
-    "542301117": [17, 18],
     "542306317": [4, 15, 16, 17, 18],
     "547567013": [12, 13, 14, 15, 16, 17, 33, 34, 35, 39, 40, 42, 43, 44, 45],
     "552505237": [16, 17, 18, 19, 25, 26, 27],
diff --git a/configuration/objective_strategy_regex.json b/configuration/objective_strategy_regex.json
new file mode 100644
index 0000000..c2b27ce
--- /dev/null
+++ b/configuration/objective_strategy_regex.json
@@ -0,0 +1,7 @@
+{
+    "objective_strategy":
+    {
+        "start": "\\n[0-9\\W\\s]*(investment\\s*objective|objective|fund\\s*objective|investment\\s*objective(s)?\\s*(and|\\&)\\s*(policy|policies|investment)|Investment\\s*(Policy|policies)\\s*and\\s*Objective(s)?\\s*of\\s*the\\s*Trust|investment\\s*objective(s)?\\s*(and|\\&)\\s*policy\\W*and\\s*investment\\s*restriction|Investment\\s*Objective\\s*and\\s*Investment\\s*Policy\\s*and\\s*Strategy|What\\s*the\\s*Fund\\s*Aims\\s*to\\s*Deliver\\s*(\\WFund\\s*Objective\\W)?)(s)?(\\W)*\\s*\\n",
+        "end": "\\n[0-9\\W\\s]*(uk\\s*ucits\\s*investment\\s*and\\s*borrowing\\s*powers|risk\\s*consideration|risk\\s*factor|fund\\s*risk|investor(s)?\\s*profile|final\\s*accounting\\s*date|dealing\\s*cut\\s*off\\s*point|cut\\s*off\\s*point|class(es)?\\s*of\\s*share(s)?\\s*available|class(es)?\\s*of\\s*share(s)?\\s*which\\s*may\\s*be\\s*issue(d)?|manager.*charge|investment\\s*style|profile\\s*of\\s*the\\s*typical\\s*investor|typical\\s*investor(s)?\\s*profile|accounting\\s*reference\\s*date.*|specific\\s*fund\\s*risk\\s*factor|change(s)?\\s*to\\s*the\\s*investment\\s*objective\\s*and(\\/or)?\\s*investment\\s*policy|accounting\\s*and\\s*record\\s*date|share\\s*class(es)?\\s*established\\s*as\\s*at\\s*the\\s*date\\s*of\\s*this\\s*prospectus|isa|class(es)?\\s*for\\s*investment\\s*in\\s*the\\s*catholic\\s*investment\\s*fund|fund\\s*detail|derivative(s)?\\s*and\\s*technique|investment\\s*(restriction|approach)|Tracking\\s*Error|Characteristics\\s*of\\s*the\\s*Trust|Limit\\s*on\\s*investment\\s*in\\s*other\\s*collective\\s*investment\\s*scheme|Participation\\s*in\\s*the\\s*Fund|Initial\\s*Charge|other|Additional\\s*Information)(s)?(\\W)*\\s*\\n"
+    }
+}
\ No newline at end of file
diff --git a/core/data_extraction.py b/core/data_extraction.py
index c31d617..5175264 100644
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@@ -44,6 +44,15 @@ class DataExtraction:
             self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
         else:
             self.document_mapping_info_df = document_mapping_info_df
+
+        self.fund_name_list = self.document_mapping_info_df["FundName"].unique().tolist()
+
+        # document type comes from the DocumentType column of self.document_mapping_info_df (1 = prospectus)
+        self.document_type = int(self.document_mapping_info_df["DocumentType"].iloc[0])
+        self.investment_objective_pages = []
+        if self.document_type == 1:
+            self.investment_objective_pages = self.get_investment_objective_pages()
+
         self.provider_mapping_df = self.get_provider_mapping()
         if len(self.provider_mapping_df) == 0:
             self.provider_fund_name_list = []
@@ -61,7 +70,24 @@ class DataExtraction:
         self.get_datapoint_reported_name()
         self.extract_way = extract_way
         self.output_image_folder = output_image_folder
-
+
+    def get_investment_objective_pages(self):
+        investment_objective_pages = []
+        if self.document_type == 1:
+            objective_strategy_regex_config_file = r"./configuration/objective_strategy_regex.json"
+            with open(objective_strategy_regex_config_file, "r", encoding="utf-8") as f:
+                objective_strategy_regex_config = json.load(f)
+            objective_start_regex = objective_strategy_regex_config.get(
+                "objective_strategy", {}).get("start", "")
+
+            if objective_start_regex is not None and len(objective_start_regex) > 0:
+                for page_index, text in self.page_text_dict.items():
+                    if re.search(objective_start_regex, text, re.I):
+                        investment_objective_pages.append(page_index)
+        if len(investment_objective_pages) > 0:
+            investment_objective_pages.sort()
+        return investment_objective_pages
+
     def get_datapoint_reported_name(self):
         language_config_file = r"./configuration/language.json"
         self.language_config = {}
@@ -222,6 +248,8 @@ class DataExtraction:
             next_page_num = page_num + count
             if next_page_num >= pdf_page_count:
                 break
+            if self.document_type == 1 and next_page_num in self.investment_objective_pages:
+                break
             next_datapoints = page_datapoints
             if next_page_num in self.page_nums_with_datapoints:
                 should_continue = False
@@ -434,6 +462,25 @@ class DataExtraction:
         doc_id, page_index, datapoint, value, raw_fund_name, fund_id, fund_name, raw_share_name, share_id, share_name
         """
         logger.info(f"Extracting data from page {page_num}")
+        if self.document_type == 1:
+            pre_context = f"The document type is prospectus. \nThe fund names in this document are {', '.join(self.fund_name_list)}."
+            if pre_context in page_text:
+                page_text = page_text.replace(pre_context, "\n").strip()
+
+            if len(self.investment_objective_pages) > 0:
+                # Find the nearest investment objective page at or before the current page.
+                diff_pages = [page_num - investment_objective_page for investment_objective_page
+                              in self.investment_objective_pages
+                              if investment_objective_page <= page_num]
+                if len(diff_pages) > 0 and diff_pages[-1] < 5:
+                    top_nearest_investment_objective_page = self.investment_objective_pages[len(diff_pages) - 1]
+                    top_nearest_investment_objective_text = self.page_text_dict.get(top_nearest_investment_objective_page, "")
+                    if top_nearest_investment_objective_text in page_text:
+                        page_text = page_text.replace(top_nearest_investment_objective_text, "").strip()
+                    pre_context = f"\nThe text of the most recent investment objective page, which may contain the fund name, is: \n{top_nearest_investment_objective_text}.\n"
+            # If no nearby investment objective page was found, pre_context still carries the fund names as a prefix.
+            page_text = f"{pre_context}\n{page_text}"
+
         instructions = self.get_instructions_by_datapoints(
             page_text,
             page_datapoints,
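Note: a minimal sketch of how the new "objective_strategy" start pattern drives page detection, mirroring get_investment_objective_pages above. The sample pages are invented for illustration; in the pipeline, page_text_dict comes from FilterPages.

    import json
    import re

    # Load the start pattern added by this change.
    with open("./configuration/objective_strategy_regex.json", "r", encoding="utf-8") as f:
        config = json.load(f)
    start_regex = config.get("objective_strategy", {}).get("start", "")

    # Hypothetical {page_index: page_text} dict; real text comes from FilterPages.
    page_text_dict = {
        0: "\nFund Overview\n",
        3: "\n1. Investment Objective and Policy\nThe Fund aims to ...\n",
    }

    # Collect every page whose text contains an objective-section heading.
    objective_pages = sorted(
        page_index
        for page_index, text in page_text_dict.items()
        if start_regex and re.search(start_regex, text, re.I)
    )
    print(objective_pages)  # [3]
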
diff --git a/core/page_filter.py b/core/page_filter.py
index 5c9e487..9104813 100644
--- a/core/page_filter.py
+++ b/core/page_filter.py
@@ -7,14 +7,16 @@ from utils.pdf_util import PDFUtil
 from utils.sql_query_util import query_document_fund_mapping
 from utils.logger import logger
 from utils.biz_utils import add_slash_to_text_as_regex, clean_text
+from utils.pdf_util import get_pdf_pages_by_html


 class FilterPages:
     def __init__(
-        self, doc_id: str, pdf_file: str, document_mapping_info_df: pd.DataFrame
+        self, doc_id: str, pdf_file: str, document_mapping_info_df: pd.DataFrame, apply_pdf2html: bool = False
     ) -> None:
         self.doc_id = doc_id
         self.pdf_file = pdf_file
+        self.apply_pdf2html = apply_pdf2html
         self.page_text_dict = self.get_pdf_page_text_dict()
         if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
             self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
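Note: a hypothetical usage sketch of the extended FilterPages API. Whichever branch runs, get_pdf_page_text_dict returns the same {page_index: page_text} shape, so downstream page filtering is unchanged. The doc id is one of the sample ids above; the PDF path is invented.

    from core.page_filter import FilterPages

    fp = FilterPages(
        doc_id="539790009",                                 # sample id from the config above
        pdf_file="/data/aus_prospectus/pdf/539790009.pdf",  # hypothetical path
        document_mapping_info_df=None,                      # None falls back to the DB query
        apply_pdf2html=True,                                # route extraction through pdf2html
    )
    print(len(fp.page_text_dict), "pages extracted")
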
@@ -35,8 +37,12 @@ class FilterPages:
         self.document_dp_pages = self.document_dp_pages_config.get(self.doc_id, [])

     def get_pdf_page_text_dict(self) -> dict:
-        pdf_util = PDFUtil(self.pdf_file)
-        success, text, page_text_dict = pdf_util.extract_text()
+        page_text_dict = {}
+        if self.apply_pdf2html:
+            page_text_dict = get_pdf_pages_by_html(self.pdf_file, pdf_info_type="pdf_path")
+        else:
+            pdf_util = PDFUtil(self.pdf_file)
+            success, text, page_text_dict = pdf_util.extract_text()
         return page_text_dict

     def get_configuration_from_file(self) -> dict:
diff --git a/main.py b/main.py
index c8e4e77..68d9d78 100644
--- a/main.py
+++ b/main.py
@@ -24,9 +24,11 @@ class EMEA_AR_Parsing:
         self,
         doc_id: str,
         pdf_folder: str = r"/data/emea_ar/pdf/",
+        output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
         output_extract_data_folder: str = r"/data/emea_ar/output/extract_data/docs/",
         output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
         extract_way: str = "text",
+        apply_pdf2html: bool = False,
         drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
     ) -> None:
         self.doc_id = doc_id
@@ -68,9 +70,24 @@ class EMEA_AR_Parsing:
         os.makedirs(self.output_mapping_data_folder, exist_ok=True)

         self.filter_pages = FilterPages(
-            self.doc_id, self.pdf_file, self.document_mapping_info_df
+            self.doc_id, self.pdf_file, self.document_mapping_info_df, apply_pdf2html
         )
         self.page_text_dict = self.filter_pages.page_text_dict
+        try:
+            os.makedirs(output_pdf_text_folder, exist_ok=True)
+            if apply_pdf2html:
+                output_pdf_text_folder = os.path.join(output_pdf_text_folder, "pdf2html/")
+            else:
+                output_pdf_text_folder = os.path.join(output_pdf_text_folder, "pymupdf/")
+            os.makedirs(output_pdf_text_folder, exist_ok=True)
+            self.page_text_file = os.path.join(
+                output_pdf_text_folder, f"{self.doc_id}_page_text.json"
+            )
+            with open(self.page_text_file, "w", encoding="utf-8") as f:
+                json.dump(self.page_text_dict, f, ensure_ascii=False, indent=4)
+        except Exception as e:
+            logger.error(f"Failed to save page text for {self.doc_id}: {e}")
+
         self.datapoint_page_info, self.result_details = self.get_datapoint_page_info()
         self.datapoints = self.get_datapoints_from_datapoint_page_info()
@@ -274,9 +291,11 @@ def extract_data(
 def mapping_data(
     doc_id: str,
     pdf_folder: str,
+    output_pdf_text_folder: str,
     output_extract_data_folder: str,
     output_mapping_folder: str,
     extract_way: str = "text",
+    apply_pdf2html: bool = False,
     drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
     re_run_extract_data: bool = False,
     re_run_mapping_data: bool = False,
@@ -285,9 +304,11 @@ def mapping_data(
     emea_ar_parsing = EMEA_AR_Parsing(
         doc_id,
         pdf_folder,
+        output_pdf_text_folder=output_pdf_text_folder,
         output_extract_data_folder=output_extract_data_folder,
         output_mapping_data_folder=output_mapping_folder,
         extract_way=extract_way,
+        apply_pdf2html=apply_pdf2html,
         drilldown_folder=drilldown_folder,
     )
     doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
@@ -352,12 +373,14 @@ def batch_extract_data(
 def batch_start_job(
     pdf_folder: str,
+    output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
     doc_data_excel_file: str = None,
     output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
     output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
     output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/",
     output_mapping_total_folder: str = r"/data/emea_ar/output/mapping_data/total/",
     extract_way: str = "text",
+    apply_pdf2html: bool = False,
     drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
     special_doc_id_list: list = None,
     re_run_extract_data: bool = False,
@@ -392,9 +415,11 @@ def batch_start_job(
             doc_data_from_gpt, annotation_list, doc_mapping_data_list = mapping_data(
                 doc_id=doc_id,
                 pdf_folder=pdf_folder,
+                output_pdf_text_folder=output_pdf_text_folder,
                 output_extract_data_folder=output_extract_data_child_folder,
                 output_mapping_folder=output_mapping_child_folder,
                 extract_way=extract_way,
+                apply_pdf2html=apply_pdf2html,
                 drilldown_folder=drilldown_folder,
                 re_run_extract_data=re_run_extract_data,
                 re_run_mapping_data=re_run_mapping_data,
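Note: per the __init__ hunk above, page text is cached as one JSON file per document, under a subfolder named for the extractor that produced it. A small sketch of the resulting path (the helper name is invented for illustration):

    import os

    def page_text_cache_path(doc_id: str, apply_pdf2html: bool,
                             root: str = "/data/emea_ar/output/pdf_text/") -> str:
        # Mirror the folder choice made in EMEA_AR_Parsing.__init__.
        sub = "pdf2html/" if apply_pdf2html else "pymupdf/"
        return os.path.join(root, sub, f"{doc_id}_page_text.json")

    print(page_text_cache_path("539790009", True))
    # /data/emea_ar/output/pdf_text/pdf2html/539790009_page_text.json
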
drilldown_folder: str = r"/data/emea_ar/output/drilldown/", special_doc_id_list: list = None, re_run_extract_data: bool = False, @@ -392,9 +415,11 @@ def batch_start_job( doc_data_from_gpt, annotation_list, doc_mapping_data_list = mapping_data( doc_id=doc_id, pdf_folder=pdf_folder, + output_pdf_text_folder=output_pdf_text_folder, output_extract_data_folder=output_extract_data_child_folder, output_mapping_folder=output_mapping_child_folder, extract_way=extract_way, + apply_pdf2html=apply_pdf2html, drilldown_folder=drilldown_folder, re_run_extract_data=re_run_extract_data, re_run_mapping_data=re_run_mapping_data, @@ -875,11 +900,13 @@ def replace_rerun_data(new_data_file: str, original_data_file: str): def batch_run_documents(special_doc_id_list: list = None, pdf_folder:str = r"/data/emea_ar/pdf/", + output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/", output_extract_data_child_folder:str = r"/data/emea_ar/output/extract_data/docs/", output_extract_data_total_folder:str = r"/data/emea_ar/output/extract_data/total/", output_mapping_child_folder:str = r"/data/emea_ar/output/mapping_data/docs/", output_mapping_total_folder:str = r"/data/emea_ar/output/mapping_data/total/", - drilldown_folder: str = r"/data/emea_ar/output/drilldown/"): + drilldown_folder: str = r"/data/emea_ar/output/drilldown/", + apply_pdf2html: bool = False): sample_document_list_folder = r'./sample_documents/' document_list_files = glob(sample_document_list_folder + "*.txt") page_filter_ground_truth_file = ( @@ -887,7 +914,7 @@ def batch_run_documents(special_doc_id_list: list = None, ) re_run_extract_data = True re_run_mapping_data = True - force_save_total_data = True + force_save_total_data = False calculate_metrics = False extract_way = "text" @@ -906,12 +933,14 @@ def batch_run_documents(special_doc_id_list: list = None, doc_id_list = [doc_id.strip() for doc_id in doc_id_list] batch_start_job( pdf_folder, + output_pdf_text_folder, page_filter_ground_truth_file, output_extract_data_child_folder, output_mapping_child_folder, output_extract_data_total_folder, output_mapping_total_folder, extract_way, + apply_pdf2html, drilldown_folder, doc_id_list, re_run_extract_data, @@ -923,12 +952,14 @@ def batch_run_documents(special_doc_id_list: list = None, else: batch_start_job( pdf_folder, + output_pdf_text_folder, page_filter_ground_truth_file, output_extract_data_child_folder, output_mapping_child_folder, output_extract_data_total_folder, output_mapping_total_folder, extract_way, + apply_pdf2html, drilldown_folder, special_doc_id_list, re_run_extract_data, @@ -1048,31 +1079,37 @@ if __name__ == "__main__": # special_doc_id_list = ["553242411"] - special_doc_id_list: list = ["539790009", - "542300403", - "542301117", - "542306317", - "547567013", - "552505237", - "552505278", - "554431052", - "554851189", - "555377021", - "555654388"] - # special_doc_id_list: list = ["552505278"] + # special_doc_id_list: list = ["539790009", + # "542300403", + # "542301117", + # "542306317", + # "547567013", + # "552505237", + # "552505278", + # "554431052", + # "554851189", + # "555377021", + # "555654388"] + special_doc_id_list: list = ["539790009", "542301117"] + special_doc_id_list: list = ["539790009"] pdf_folder:str = r"/data/aus_prospectus/pdf/" + output_pdf_text_folder:str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder:str = r"/data/aus_prospectus/output/extract_data/docs/" output_extract_data_total_folder:str = r"/data/aus_prospectus/output/extract_data/total/" output_mapping_child_folder:str = 
r"/data/aus_prospectus/output/mapping_data/docs/" output_mapping_total_folder:str = r"/data/aus_prospectus/output/mapping_data/total/" drilldown_folder = r"/data/aus_prospectus/output/drilldown/" + apply_pdf2html = True batch_run_documents(special_doc_id_list=special_doc_id_list, pdf_folder=pdf_folder, + output_pdf_text_folder=output_pdf_text_folder, output_extract_data_child_folder=output_extract_data_child_folder, output_extract_data_total_folder=output_extract_data_total_folder, output_mapping_child_folder=output_mapping_child_folder, output_mapping_total_folder=output_mapping_total_folder, - drilldown_folder=drilldown_folder) + drilldown_folder=drilldown_folder, + apply_pdf2html=apply_pdf2html + ) # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx" # original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx" diff --git a/prepare_data.py b/prepare_data.py index 78134f3..31b11f3 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -1385,6 +1385,10 @@ def merge_aus_document_prospectus_data(): aus_document_prospectus_data.to_excel( writer, sheet_name="aus_document_prospectus", index=False ) + + +def get_pdf_2_html(): + pass diff --git a/requirements.txt b/requirements.txt index 42f1a3f..f3839f2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ scikit-learn==1.5.1 pandas==2.2.3 openpyxl==3.1.2 XlsxWriter==3.1.2 -tiktoken==0.7.0 \ No newline at end of file +tiktoken==0.7.0 +beautifulsoup4==4.12.3 \ No newline at end of file diff --git a/utils/pdf_util.py b/utils/pdf_util.py index 6745754..008c71a 100644 --- a/utils/pdf_util.py +++ b/utils/pdf_util.py @@ -14,6 +14,11 @@ from utils.similarity import Similarity from utils.biz_utils import total_currency_list from utils.logger import logger +import requests +from bs4 import BeautifulSoup +import dotenv +# loads .env file with your OPENAI_API_KEY +dotenv.load_dotenv() class PDFUtil: @@ -1667,3 +1672,58 @@ class PDFUtil: action=action, ) return data_list + + +def pdf_to_html_with_docid(doc_id, para): + headers = { + 'user': 'visitor', + 'Accept': 'application/json', + } + + args = { + 'docId': doc_id, + 'parameters': json.dumps(para) + } + + pdf2html_url = os.getenv("pdf2html_url") + response = requests.post(pdf2html_url, data=args, headers=headers) + response.encoding = 'utf-8' + text = response.text + return text + +def pdf_to_html(pdf_path, para): + headers = { + "user": "visitor", + "Accept": "application/json", + } + args = { + "parameters": json.dumps(para) + } + + with open(pdf_path, mode='rb') as f: + file_bytes = f.read() + + files = {"file": ("tempName.pdf", file_bytes)} + + pdf2html_url = os.getenv("pdf2html_url") + response = requests.post(pdf2html_url, data=args, files=files, headers=headers) + response.encoding = 'utf-8' + text = response.text + return text + + +def get_pdf_pages_by_html(pdf_info: str, pdf_info_type: str="doc_id"): + # Convert pdf to html + para = { + "detectTable": True + } + if pdf_info_type == "doc_id": + html = pdf_to_html_with_docid(pdf_info, para) + else: + html = pdf_to_html(pdf_info, para) + html = BeautifulSoup(html, 'html.parser') + pages = html.find_all('div', attrs={'page-idx': True}) + page_text_dict = {} + for index, page in enumerate(pages): + page_text_dict[index] = page.get_text().strip() + return page_text_dict \ No newline at end of file