diff --git a/configuration/document_dp_pages.json b/configuration/document_dp_pages.json
index 442a394..4fe058c 100644
--- a/configuration/document_dp_pages.json
+++ b/configuration/document_dp_pages.json
@@ -1,7 +1,6 @@
 {
     "539790009": [39, 40, 45, 46, 47],
     "542300403": [12],
-    "542301117": [17, 18],
     "542306317": [4, 15, 16, 17, 18],
     "547567013": [12, 13, 14, 15, 16, 17, 33, 34, 35, 39, 40, 42, 43, 44, 45],
     "552505237": [16, 17, 18, 19, 25, 26, 27],
diff --git a/configuration/objective_strategy_regex.json b/configuration/objective_strategy_regex.json
new file mode 100644
index 0000000..c2b27ce
--- /dev/null
+++ b/configuration/objective_strategy_regex.json
@@ -0,0 +1,7 @@
+{
+    "objective_strategy":
+    {
+        "start": "\\n[0-9\\W\\s]*(investment\\s*objective|objective|fund\\s*objective|investment\\s*objective(s)?\\s*(and|\\&)\\s*(policy|policies|investment)|Investment\\s*(Policy|policies)\\s*and\\s*Objective(s)?\\s*of\\s*the\\s*Trust|investment\\s*objective(s)?\\s*(and|\\&)\\s*policy\\W*and\\s*investment\\s*restriction|Investment\\s*Objective\\s*and\\s*Investment\\s*Policy\\s*and\\s*Strategy|What\\s*the\\s*Fund\\s*Aims\\s*to\\s*Deliver\\s*(\\WFund\\s*Objective\\W)?)(s)?(\\W)*\\s*\\n",
+        "end": "\\n[0-9\\W\\s]*(uk\\s*ucits\\s*investment\\s*and\\s*borrowing\\s*powers|risk\\s*consideration|risk\\s*factor|fund\\s*risk|investor(s)?\\s*profile|final\\s*accounting\\s*date|dealing\\s*cut\\s*off\\s*point|cut\\s*off\\s*point|class(es)?\\s*of\\s*share(s)?\\s*available|class(es)?\\s*of\\s*share(s)?\\s*which\\s*may\\s*be\\s*issue(d)?|manager.*charge|investment\\s*style|profile\\s*of\\s*the\\s*typical\\s*investor|typical\\s*investor(s)?\\s*profile|accounting\\s*reference\\s*date.*|specific\\s*fund\\s*risk\\s*factor|change(s)?\\s*to\\s*the\\s*investment\\s*objective\\s*and(\\/or)?\\s*investment\\s*policy|accounting\\s*and\\s*record\\s*date|share\\s*class(es)?\\s*established\\s*as\\s*at\\s*the\\s*date\\s*of\\s*this\\s*prospectus|isa|class(es)?\\s*for\\s*investment\\s*in\\s*the\\s*catholic\\s*investment\\s*fund|fund\\s*detail|derivative(s)?\\s*and\\s*technique|investment\\s*(restriction|approach)|Tracking\\s*Error|Characteristics\\s*of\\s*the\\s*Trust|Limit\\s*on\\s*investment\\s*in\\s*other\\s*collective\\s*investment\\s*scheme|Participation\\s*in\\s*the\\s*Fund|Initial\\s*Charge|other|Additional\\s*Information)(s)?(\\W)*\\s*\\n"
+    }
+}
\ No newline at end of file
diff --git a/core/data_extraction.py b/core/data_extraction.py
index c31d617..5175264 100644
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@@ -44,6 +44,15 @@ class DataExtraction:
             self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
         else:
             self.document_mapping_info_df = document_mapping_info_df
+
+        self.fund_name_list = self.document_mapping_info_df["FundName"].unique().tolist()
+
+        # document type comes from the DocumentType column of self.document_mapping_info_df (1 = prospectus)
+        self.document_type = int(self.document_mapping_info_df["DocumentType"].iloc[0])
+        self.investment_objective_pages = []
+        if self.document_type == 1:
+            self.investment_objective_pages = self.get_investment_objective_pages()
+
         self.provider_mapping_df = self.get_provider_mapping()
         if len(self.provider_mapping_df) == 0:
             self.provider_fund_name_list = []
@@ -61,7 +70,24 @@ class DataExtraction:
         self.get_datapoint_reported_name()
         self.extract_way = extract_way
         self.output_image_folder = output_image_folder
-
+
+    def get_investment_objective_pages(self):
+        investment_objective_pages = []
+        if self.document_type == 1:
+            objective_strategy_regex_config_file = r"./configuration/objective_strategy_regex.json"
+            with open(objective_strategy_regex_config_file, "r", encoding="utf-8") as f:
+                objective_strategy_regex_config = json.load(f)
+            objective_start_regex = objective_strategy_regex_config.get(
+                "objective_strategy", {}).get("start", "")
+
+            if objective_start_regex is not None and len(objective_start_regex) > 0:
+                for page_index, text in self.page_text_dict.items():
+                    if re.search(objective_start_regex, text, re.I):
+                        investment_objective_pages.append(page_index)
+        if len(investment_objective_pages) > 0:
+            investment_objective_pages.sort()
+        return investment_objective_pages
+
     def get_datapoint_reported_name(self):
         language_config_file = r"./configuration/language.json"
         self.language_config = {}
@@ -222,6 +248,8 @@ class DataExtraction:
             next_page_num = page_num + count
             if next_page_num >= pdf_page_count:
                 break
+            if self.document_type == 1 and next_page_num in self.investment_objective_pages:
+                break
             next_datapoints = page_datapoints
             if next_page_num in self.page_nums_with_datapoints:
                 should_continue = False
@@ -434,6 +462,25 @@ class DataExtraction:
         doc_id, page_index, datapoint, value, raw_fund_name, fund_id, fund_name, raw_share_name, share_id, share_name
         """
         logger.info(f"Extracting data from page {page_num}")
+        if self.document_type == 1:
+            pre_context = f"The document type is prospectus. \nThe fund names in this document are {', '.join(self.fund_name_list)}."
+            if pre_context in page_text:
+                page_text = page_text.replace(pre_context, "\n").strip()
+
+            if len(self.investment_objective_pages) > 0:
+                # Find the nearest investment objective page at or before the current page.
+                diff_pages = [page_num - investment_objective_page for investment_objective_page
+                              in self.investment_objective_pages
+                              if investment_objective_page <= page_num]
+                if len(diff_pages) > 0 and diff_pages[-1] < 5:
+                    top_nearest_investment_objective_page = self.investment_objective_pages[len(diff_pages) - 1]
+                    top_nearest_investment_objective_text = self.page_text_dict.get(top_nearest_investment_objective_page, "")
+                    if top_nearest_investment_objective_text in page_text:
+                        page_text = page_text.replace(top_nearest_investment_objective_text, "").strip()
+                    pre_context = f"\nThe text of the most recent investment objective page, which may contain the fund name, is: \n{top_nearest_investment_objective_text}.\n"
+            # If no nearby investment objective page was found, pre_context still carries the fund names as a prefix.
+            page_text = f"{pre_context}\n{page_text}"
+
         instructions = self.get_instructions_by_datapoints(
             page_text,
             page_datapoints,
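Note: a minimal sketch of how the new "objective_strategy" start pattern drives page detection, mirroring get_investment_objective_pages above. The sample pages are invented for illustration; in the pipeline, page_text_dict comes from FilterPages.

    import json
    import re

    # Load the start pattern added by this change.
    with open("./configuration/objective_strategy_regex.json", "r", encoding="utf-8") as f:
        config = json.load(f)
    start_regex = config.get("objective_strategy", {}).get("start", "")

    # Hypothetical {page_index: page_text} dict; real text comes from FilterPages.
    page_text_dict = {
        0: "\nFund Overview\n",
        3: "\n1. Investment Objective and Policy\nThe Fund aims to ...\n",
    }

    # Collect every page whose text contains an objective-section heading.
    objective_pages = sorted(
        page_index
        for page_index, text in page_text_dict.items()
        if start_regex and re.search(start_regex, text, re.I)
    )
    print(objective_pages)  # [3]
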
diff --git a/core/page_filter.py b/core/page_filter.py
index 5c9e487..9104813 100644
--- a/core/page_filter.py
+++ b/core/page_filter.py
@@ -7,14 +7,16 @@ from utils.pdf_util import PDFUtil
 from utils.sql_query_util import query_document_fund_mapping
 from utils.logger import logger
 from utils.biz_utils import add_slash_to_text_as_regex, clean_text
+from utils.pdf_util import get_pdf_pages_by_html


 class FilterPages:
     def __init__(
-        self, doc_id: str, pdf_file: str, document_mapping_info_df: pd.DataFrame
+        self, doc_id: str, pdf_file: str, document_mapping_info_df: pd.DataFrame, apply_pdf2html: bool = False
     ) -> None:
         self.doc_id = doc_id
         self.pdf_file = pdf_file
+        self.apply_pdf2html = apply_pdf2html
         self.page_text_dict = self.get_pdf_page_text_dict()
         if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
             self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
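Note: a hypothetical usage sketch of the extended FilterPages API. Whichever branch runs, get_pdf_page_text_dict returns the same {page_index: page_text} shape, so downstream page filtering is unchanged. The doc id is one of the sample ids above; the PDF path is invented.

    from core.page_filter import FilterPages

    fp = FilterPages(
        doc_id="539790009",                                 # sample id from the config above
        pdf_file="/data/aus_prospectus/pdf/539790009.pdf",  # hypothetical path
        document_mapping_info_df=None,                      # None falls back to the DB query
        apply_pdf2html=True,                                # route extraction through pdf2html
    )
    print(len(fp.page_text_dict), "pages extracted")
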
@@ -35,8 +37,12 @@ class FilterPages:
         self.document_dp_pages = self.document_dp_pages_config.get(self.doc_id, [])

     def get_pdf_page_text_dict(self) -> dict:
-        pdf_util = PDFUtil(self.pdf_file)
-        success, text, page_text_dict = pdf_util.extract_text()
+        page_text_dict = {}
+        if self.apply_pdf2html:
+            page_text_dict = get_pdf_pages_by_html(self.pdf_file, pdf_info_type="pdf_path")
+        else:
+            pdf_util = PDFUtil(self.pdf_file)
+            success, text, page_text_dict = pdf_util.extract_text()
         return page_text_dict

     def get_configuration_from_file(self) -> dict:
diff --git a/main.py b/main.py
index c8e4e77..68d9d78 100644
--- a/main.py
+++ b/main.py
@@ -24,9 +24,11 @@ class EMEA_AR_Parsing:
         self,
         doc_id: str,
         pdf_folder: str = r"/data/emea_ar/pdf/",
+        output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
         output_extract_data_folder: str = r"/data/emea_ar/output/extract_data/docs/",
         output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
         extract_way: str = "text",
+        apply_pdf2html: bool = False,
         drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
     ) -> None:
         self.doc_id = doc_id
@@ -68,9 +70,24 @@ class EMEA_AR_Parsing:
         os.makedirs(self.output_mapping_data_folder, exist_ok=True)

         self.filter_pages = FilterPages(
-            self.doc_id, self.pdf_file, self.document_mapping_info_df
+            self.doc_id, self.pdf_file, self.document_mapping_info_df, apply_pdf2html
         )
         self.page_text_dict = self.filter_pages.page_text_dict
+        try:
+            os.makedirs(output_pdf_text_folder, exist_ok=True)
+            if apply_pdf2html:
+                output_pdf_text_folder = os.path.join(output_pdf_text_folder, "pdf2html/")
+            else:
+                output_pdf_text_folder = os.path.join(output_pdf_text_folder, "pymupdf/")
+            os.makedirs(output_pdf_text_folder, exist_ok=True)
+            self.page_text_file = os.path.join(
+                output_pdf_text_folder, f"{self.doc_id}_page_text.json"
+            )
+            with open(self.page_text_file, "w", encoding="utf-8") as f:
+                json.dump(self.page_text_dict, f, ensure_ascii=False, indent=4)
+        except Exception as e:
+            logger.error(f"Failed to save page text for {self.doc_id}: {e}")
+
         self.datapoint_page_info, self.result_details = self.get_datapoint_page_info()
         self.datapoints = self.get_datapoints_from_datapoint_page_info()
@@ -274,9 +291,11 @@ def extract_data(
 def mapping_data(
     doc_id: str,
     pdf_folder: str,
+    output_pdf_text_folder: str,
     output_extract_data_folder: str,
     output_mapping_folder: str,
     extract_way: str = "text",
+    apply_pdf2html: bool = False,
     drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
     re_run_extract_data: bool = False,
     re_run_mapping_data: bool = False,
@@ -285,9 +304,11 @@ def mapping_data(
     emea_ar_parsing = EMEA_AR_Parsing(
         doc_id,
         pdf_folder,
+        output_pdf_text_folder=output_pdf_text_folder,
         output_extract_data_folder=output_extract_data_folder,
         output_mapping_data_folder=output_mapping_folder,
         extract_way=extract_way,
+        apply_pdf2html=apply_pdf2html,
         drilldown_folder=drilldown_folder,
     )
     doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
@@ -352,12 +373,14 @@ def batch_extract_data(
 def batch_start_job(
     pdf_folder: str,
+    output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
     doc_data_excel_file: str = None,
     output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
     output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
     output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/",
     output_mapping_total_folder: str = r"/data/emea_ar/output/mapping_data/total/",
     extract_way: str = "text",
+    apply_pdf2html: bool = False,
     drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
     special_doc_id_list: list = None,
     re_run_extract_data: bool = False,
@@ -392,9 +415,11 @@ def batch_start_job(
             doc_data_from_gpt, annotation_list, doc_mapping_data_list = mapping_data(
                 doc_id=doc_id,
                 pdf_folder=pdf_folder,
+                output_pdf_text_folder=output_pdf_text_folder,
                 output_extract_data_folder=output_extract_data_child_folder,
                 output_mapping_folder=output_mapping_child_folder,
                 extract_way=extract_way,
+                apply_pdf2html=apply_pdf2html,
                 drilldown_folder=drilldown_folder,
                 re_run_extract_data=re_run_extract_data,
                 re_run_mapping_data=re_run_mapping_data,
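Note: per the __init__ hunk above, page text is cached as one JSON file per document, under a subfolder named for the extractor that produced it. A small sketch of the resulting path (the helper name is invented for illustration):

    import os

    def page_text_cache_path(doc_id: str, apply_pdf2html: bool,
                             root: str = "/data/emea_ar/output/pdf_text/") -> str:
        # Mirror the folder choice made in EMEA_AR_Parsing.__init__.
        sub = "pdf2html/" if apply_pdf2html else "pymupdf/"
        return os.path.join(root, sub, f"{doc_id}_page_text.json")

    print(page_text_cache_path("539790009", True))
    # /data/emea_ar/output/pdf_text/pdf2html/539790009_page_text.json
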
drilldown_folder: str = r"/data/emea_ar/output/drilldown/", special_doc_id_list: list = None, re_run_extract_data: bool = False, @@ -392,9 +415,11 @@ def batch_start_job( doc_data_from_gpt, annotation_list, doc_mapping_data_list = mapping_data( doc_id=doc_id, pdf_folder=pdf_folder, + output_pdf_text_folder=output_pdf_text_folder, output_extract_data_folder=output_extract_data_child_folder, output_mapping_folder=output_mapping_child_folder, extract_way=extract_way, + apply_pdf2html=apply_pdf2html, drilldown_folder=drilldown_folder, re_run_extract_data=re_run_extract_data, re_run_mapping_data=re_run_mapping_data, @@ -875,11 +900,13 @@ def replace_rerun_data(new_data_file: str, original_data_file: str): def batch_run_documents(special_doc_id_list: list = None, pdf_folder:str = r"/data/emea_ar/pdf/", + output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/", output_extract_data_child_folder:str = r"/data/emea_ar/output/extract_data/docs/", output_extract_data_total_folder:str = r"/data/emea_ar/output/extract_data/total/", output_mapping_child_folder:str = r"/data/emea_ar/output/mapping_data/docs/", output_mapping_total_folder:str = r"/data/emea_ar/output/mapping_data/total/", - drilldown_folder: str = r"/data/emea_ar/output/drilldown/"): + drilldown_folder: str = r"/data/emea_ar/output/drilldown/", + apply_pdf2html: bool = False): sample_document_list_folder = r'./sample_documents/' document_list_files = glob(sample_document_list_folder + "*.txt") page_filter_ground_truth_file = ( @@ -887,7 +914,7 @@ def batch_run_documents(special_doc_id_list: list = None, ) re_run_extract_data = True re_run_mapping_data = True - force_save_total_data = True + force_save_total_data = False calculate_metrics = False extract_way = "text" @@ -906,12 +933,14 @@ def batch_run_documents(special_doc_id_list: list = None, doc_id_list = [doc_id.strip() for doc_id in doc_id_list] batch_start_job( pdf_folder, + output_pdf_text_folder, page_filter_ground_truth_file, output_extract_data_child_folder, output_mapping_child_folder, output_extract_data_total_folder, output_mapping_total_folder, extract_way, + apply_pdf2html, drilldown_folder, doc_id_list, re_run_extract_data, @@ -923,12 +952,14 @@ def batch_run_documents(special_doc_id_list: list = None, else: batch_start_job( pdf_folder, + output_pdf_text_folder, page_filter_ground_truth_file, output_extract_data_child_folder, output_mapping_child_folder, output_extract_data_total_folder, output_mapping_total_folder, extract_way, + apply_pdf2html, drilldown_folder, special_doc_id_list, re_run_extract_data, @@ -1048,31 +1079,37 @@ if __name__ == "__main__": # special_doc_id_list = ["553242411"] - special_doc_id_list: list = ["539790009", - "542300403", - "542301117", - "542306317", - "547567013", - "552505237", - "552505278", - "554431052", - "554851189", - "555377021", - "555654388"] - # special_doc_id_list: list = ["552505278"] + # special_doc_id_list: list = ["539790009", + # "542300403", + # "542301117", + # "542306317", + # "547567013", + # "552505237", + # "552505278", + # "554431052", + # "554851189", + # "555377021", + # "555654388"] + special_doc_id_list: list = ["539790009", "542301117"] + special_doc_id_list: list = ["539790009"] pdf_folder:str = r"/data/aus_prospectus/pdf/" + output_pdf_text_folder:str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder:str = r"/data/aus_prospectus/output/extract_data/docs/" output_extract_data_total_folder:str = r"/data/aus_prospectus/output/extract_data/total/" output_mapping_child_folder:str = 
r"/data/aus_prospectus/output/mapping_data/docs/" output_mapping_total_folder:str = r"/data/aus_prospectus/output/mapping_data/total/" drilldown_folder = r"/data/aus_prospectus/output/drilldown/" + apply_pdf2html = True batch_run_documents(special_doc_id_list=special_doc_id_list, pdf_folder=pdf_folder, + output_pdf_text_folder=output_pdf_text_folder, output_extract_data_child_folder=output_extract_data_child_folder, output_extract_data_total_folder=output_extract_data_total_folder, output_mapping_child_folder=output_mapping_child_folder, output_mapping_total_folder=output_mapping_total_folder, - drilldown_folder=drilldown_folder) + drilldown_folder=drilldown_folder, + apply_pdf2html=apply_pdf2html + ) # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx" # original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx" diff --git a/prepare_data.py b/prepare_data.py index 78134f3..31b11f3 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -1385,6 +1385,10 @@ def merge_aus_document_prospectus_data(): aus_document_prospectus_data.to_excel( writer, sheet_name="aus_document_prospectus", index=False ) + + +def get_pdf_2_html(): + pass diff --git a/requirements.txt b/requirements.txt index 42f1a3f..f3839f2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ scikit-learn==1.5.1 pandas==2.2.3 openpyxl==3.1.2 XlsxWriter==3.1.2 -tiktoken==0.7.0 \ No newline at end of file +tiktoken==0.7.0 +beautifulsoup4==4.12.3 \ No newline at end of file diff --git a/utils/pdf_util.py b/utils/pdf_util.py index 6745754..008c71a 100644 --- a/utils/pdf_util.py +++ b/utils/pdf_util.py @@ -14,6 +14,11 @@ from utils.similarity import Similarity from utils.biz_utils import total_currency_list from utils.logger import logger +import requests +from bs4 import BeautifulSoup +import dotenv +# loads .env file with your OPENAI_API_KEY +dotenv.load_dotenv() class PDFUtil: @@ -1667,3 +1672,58 @@ class PDFUtil: action=action, ) return data_list + + +def pdf_to_html_with_docid(doc_id, para): + headers = { + 'user': 'visitor', + 'Accept': 'application/json', + } + + args = { + 'docId': doc_id, + 'parameters': json.dumps(para) + } + + pdf2html_url = os.getenv("pdf2html_url") + response = requests.post(pdf2html_url, data=args, headers=headers) + response.encoding = 'utf-8' + text = response.text + return text + +def pdf_to_html(pdf_path, para): + headers = { + "user": "visitor", + "Accept": "application/json", + } + args = { + "parameters": json.dumps(para) + } + + with open(pdf_path, mode='rb') as f: + file_bytes = f.read() + + files = {"file": ("tempName.pdf", file_bytes)} + + pdf2html_url = os.getenv("pdf2html_url") + response = requests.post(pdf2html_url, data=args, files=files, headers=headers) + response.encoding = 'utf-8' + text = response.text + return text + + +def get_pdf_pages_by_html(pdf_info: str, pdf_info_type: str="doc_id"): + # Convert pdf to html + para = { + "detectTable": True + } + if pdf_info_type == "doc_id": + html = pdf_to_html_with_docid(pdf_info, para) + else: + html = pdf_to_html(pdf_info, para) + html = BeautifulSoup(html, 'html.parser') + pages = html.find_all('div', attrs={'page-idx': True}) + page_text_dict = {} + for index, page in enumerate(pages): + page_text_dict[index] = page.get_text().strip() + return page_text_dict \ No newline at end of file