diff --git a/core/data_extraction.py b/core/data_extraction.py
index 587fbc9..f258f1f 100644
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@@ -8,7 +8,7 @@ from utils.gpt_utils import chat
 from utils.pdf_util import PDFUtil
 from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider
 from utils.logger import logger
-from utils.biz_utils import add_slash_to_text_as_regex, clean_text, get_most_similar_name
+from utils.biz_utils import add_slash_to_text_as_regex, clean_text, get_most_similar_name, remove_abundant_data
 
 
 class DataExtraction:
@@ -149,11 +149,14 @@
     def extract_data(self) -> dict:
         logger.info(f"Extracting data from document {self.doc_id}, extract way: {self.extract_way}")
         if self.extract_way == "text":
-            return self.extract_data_by_text()
+            data_list = self.extract_data_by_text()
         elif self.extract_way == "image":
-            return self.extract_data_by_image()
+            data_list = self.extract_data_by_image()
         else:
-            return self.extract_data_by_text()
+            data_list = self.extract_data_by_text()
+        data_list = remove_abundant_data(data_list)
+        self.output_data_to_file(data_list)
+        return data_list
 
     def extract_data_by_text(self) -> dict:
         """
@@ -267,8 +270,7 @@
                 logger.error(f"Error in extracting data from next page: {e}")
                 break
 
-        self.output_data_to_file(data_list)
-
+        # self.output_data_to_file(data_list)
         return data_list
 
     def extract_data_by_image(self) -> dict:
@@ -346,7 +348,7 @@
                 logger.error(f"Error in extracting data from next page: {e}")
                 break
 
-        self.output_data_to_file(data_list)
+        # self.output_data_to_file(data_list)
         return data_list
 
 
@@ -512,6 +514,8 @@
             fund_name = data.get("fund name", "")
             if fund_name == "":
                 remove_list.append(data)
+            fund_name = self.get_fund_name(fund_name, "Fund")
+            data["fund name"] = fund_name
             keys = list(data.keys())
             for key in keys:
                 if self.datapoint_level_config.get(key, "") == "share_level":
@@ -572,6 +576,17 @@
 
         extract_data_info["data"] = new_data_list
         return extract_data_info
+
+    def get_fund_name(self, fund_name: str, fund_feature: str):
+        if not fund_name.endswith(fund_feature):
+            return fund_name
+        fund_name_split = fund_name.split(fund_feature)
+        if len(fund_name_split) > 1:
+            last_fund = fund_name_split[-1].strip()
+            if len(last_fund) == 0:
+                last_fund = fund_name_split[-2].strip()
+            fund_name = f"{last_fund} {fund_feature}"
+        return fund_name
 
     def check_fund_name_as_share(self, fund_name: str) -> bool:
         """
diff --git a/main.py b/main.py
index ac34f78..14adec9 100644
--- a/main.py
+++ b/main.py
@@ -1197,12 +1197,12 @@ if __name__ == "__main__":
         "534535767"
     ]
     special_doc_id_list = check_db_mapping_doc_id_list
-    # special_doc_id_list = ["337293427"]
+    special_doc_id_list = ["451878128"]
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
-    re_run_extract_data = False
+    re_run_extract_data = True
     re_run_mapping_data = True
-    force_save_total_data = True
+    force_save_total_data = False
     calculate_metrics = False
 
     extract_ways = ["text"]
diff --git a/playground.ipynb b/playground.ipynb
index 3504fa4..883c4ad 100644
--- a/playground.ipynb
+++ b/playground.ipynb
@@ -613,6 +613,42 @@
     "pathlib.Path(r\"./data/emea_ar/output/markdown/501380553.md\").write_bytes(md_text.encode())"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_fund_name(fund_name: str, fund_feature: str):\n",
+    "    fund_name_split = fund_name.split(fund_feature)\n",
+    "    if len(fund_name_split) > 1:\n",
+    "        last_fund = fund_name_split[-1].strip()\n",
+    "        if len(last_fund) == 0:\n",
+    "            last_fund = fund_name_split[-2].strip()\n",
+    "        fund_name = f\"{last_fund} {fund_feature}\"\n",
+    "    return fund_name"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'C Fund'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "get_fund_name(\"A Fund B Fund C Fund\", \"Fund\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/prepare_data.py b/prepare_data.py
index 3306c09..43c1102 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -249,27 +249,27 @@ def statistics_document(
     doc_id_list = [str(docid) for docid in doc_mapping_data["DocumentId"].unique().tolist()]
 
     # statistics document page number
-    # pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
-    # logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
-    # logger.info("statistics document page number")
-    # doc_page_num_list = []
-    # for pdf_file in tqdm(pdf_files):
-    #     pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
-    #     if pdf_base_name not in doc_id_list:
-    #         continue
-    #     docid = os.path.basename(pdf_file).split(".")[0]
-    #     doc = fitz.open(pdf_file)
-    #     page_num = doc.page_count
-    #     doc_page_num_list.append({"docid": docid, "page_num": page_num})
-    #     doc.close()
-    # doc_page_num_df = pd.DataFrame(doc_page_num_list)
-    # # order by page_num in descending order
-    # doc_page_num_df = doc_page_num_df.sort_values(by="page_num", ascending=False)
-    # # statistics page_num by describe and transform to DataFrame
-    # doc_page_num_stat_df = get_describe_stat(
-    #     doc_page_num_df, "page_num", "doc_page_num"
-    # )
-    # describe_stat_df_list.append(doc_page_num_stat_df)
+    pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
+    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
+    logger.info("statistics document page number")
+    doc_page_num_list = []
+    for pdf_file in tqdm(pdf_files):
+        pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
+        if pdf_base_name not in doc_id_list:
+            continue
+        docid = os.path.basename(pdf_file).split(".")[0]
+        doc = fitz.open(pdf_file)
+        page_num = doc.page_count
+        doc_page_num_list.append({"docid": docid, "page_num": page_num})
+        doc.close()
+    doc_page_num_df = pd.DataFrame(doc_page_num_list)
+    # order by page_num in descending order
+    doc_page_num_df = doc_page_num_df.sort_values(by="page_num", ascending=False)
+    # statistics page_num by describe and transform to DataFrame
+    doc_page_num_stat_df = get_describe_stat(
+        doc_page_num_df, "page_num", "doc_page_num"
+    )
+    describe_stat_df_list.append(doc_page_num_stat_df)
 
     describe_stat_df = pd.concat(describe_stat_df_list)
     describe_stat_df.reset_index(drop=True, inplace=True)
@@ -295,7 +295,7 @@
 
     # save statistics data to excel
     with pd.ExcelWriter(stat_file) as writer:
-        # doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False)
+        doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False)
         doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False)
         doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
         doc_share_class_count.to_excel(
@@ -1392,13 +1392,13 @@ if __name__ == "__main__":
    # pdf_folder)
 
-    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_doc_with_all_4_dp/doc_ar_data_with_all_4_dp.xlsx"
-    output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_doc_with_all_4_dp/"
+    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/doc_ar_data_for_emea_sample_documents.xlsx"
+    output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/"
     statistics_document(pdf_folder=pdf_folder,
                         doc_mapping_file_path=doc_mapping_file_path,
                         sheet_name="doc_ar_data_in_db",
                         output_folder=output_data_folder,
-                        output_file="doc_ar_data_with_all_4_dp_statistics.xlsx")
+                        output_file="doc_ar_data_sample_documents_statistics.xlsx")
 
     # get_document_extracted_share_diff_by_db()
     # statistics_provider_mapping(
     #     provider_mapping_data_file=provider_mapping_data_file,
diff --git a/utils/biz_utils.py b/utils/biz_utils.py
index 7199f7e..fb48edc 100644
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@@ -968,4 +968,57 @@ def clean_folder(folder_path: str, expired_days: int = 5):
             try:
                 os.remove(file_path)
             except:
-                pass
\ No newline at end of file
+                pass
+
+
+def remove_abundant_data(data_list: list):
+    exist_data_list = []
+
+    # remove abundant data, only keep the first one with value
+    for data in data_list:
+        extract_data = data.get("extract_data", {})
+        data_detail_list = extract_data.get("data", [])
+        data_detail_list = remove_abundant_data_detail(data_detail_list,
+                                                       exist_data_list)
+        data["extract_data"]["data"] = data_detail_list
+    return data_list
+
+
+def remove_abundant_data_detail(data_detail_list: list,
+                                exist_data_list: list):
+    regular_attributes = ["fund_name", "share_name"]
+    remove_list = []
+    for data_detail in data_detail_list:
+        fund_name = data_detail.get("fund_name", "")
+        share_name = data_detail.get("share_name", "")
+        is_exist_data = False
+        for exist_data in exist_data_list:
+            if fund_name == exist_data["fund_name"] and share_name == exist_data["share_name"]:
+                is_exist_data = True
+                break
+        if not is_exist_data:
+            exist_data_list.append({"fund_name": fund_name, "share_name": share_name})
+        pop_keys = []
+        for data_key, data_value in data_detail.items():
+            if data_key in regular_attributes:
+                continue
+            for exist_data in exist_data_list:
+                if fund_name == exist_data["fund_name"] and share_name == exist_data["share_name"]:
+                    if data_key in exist_data.keys():
+                        # fund_data remove fund_key
+                        exist_data_value = exist_data.get(data_key, None)
+                        if exist_data_value is not None:
+                            pop_keys.append(data_key)
+                    else:
+                        exist_data[data_key] = data_value
+        if len(pop_keys) > 0:
+            for pop_key in pop_keys:
+                data_detail.pop(pop_key)
+        value_keys = [value_key for value_key in list(data_detail.keys())
+                      if value_key not in regular_attributes]
+        if len(value_keys) == 0:
+            remove_list.append(data_detail)
+    for remove_data in remove_list:
+        if remove_data in data_detail_list:
+            data_detail_list.remove(remove_data)
+    return data_detail_list
\ No newline at end of file