remove_abundant_data

Blade He 2024-12-02 17:16:56 -06:00
parent c146497052
commit bc32860f87
5 changed files with 140 additions and 36 deletions

View File

@@ -8,7 +8,7 @@ from utils.gpt_utils import chat
 from utils.pdf_util import PDFUtil
 from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider
 from utils.logger import logger
-from utils.biz_utils import add_slash_to_text_as_regex, clean_text, get_most_similar_name
+from utils.biz_utils import add_slash_to_text_as_regex, clean_text, get_most_similar_name, remove_abundant_data
 
 
 class DataExtraction:
@@ -149,11 +149,14 @@ class DataExtraction:
     def extract_data(self) -> dict:
         logger.info(f"Extracting data from document {self.doc_id}, extract way: {self.extract_way}")
         if self.extract_way == "text":
-            return self.extract_data_by_text()
+            data_list = self.extract_data_by_text()
         elif self.extract_way == "image":
-            return self.extract_data_by_image()
+            data_list = self.extract_data_by_image()
         else:
-            return self.extract_data_by_text()
+            data_list = self.extract_data_by_text()
+        data_list = remove_abundant_data(data_list)
+        self.output_data_to_file(data_list)
+        return data_list
 
     def extract_data_by_text(self) -> dict:
         """
@@ -267,8 +270,7 @@ class DataExtraction:
                 logger.error(f"Error in extracting data from next page: {e}")
                 break
-        self.output_data_to_file(data_list)
+        # self.output_data_to_file(data_list)
         return data_list
 
     def extract_data_by_image(self) -> dict:
@@ -346,7 +348,7 @@ class DataExtraction:
                 logger.error(f"Error in extracting data from next page: {e}")
                 break
-        self.output_data_to_file(data_list)
+        # self.output_data_to_file(data_list)
         return data_list
@@ -512,6 +514,8 @@ class DataExtraction:
             fund_name = data.get("fund name", "")
             if fund_name == "":
                 remove_list.append(data)
+            fund_name = self.get_fund_name(fund_name, "Fund")
+            data["fund name"] = fund_name
             keys = list(data.keys())
             for key in keys:
                 if self.datapoint_level_config.get(key, "") == "share_level":
@@ -573,6 +577,17 @@ class DataExtraction:
         extract_data_info["data"] = new_data_list
         return extract_data_info
 
+    def get_fund_name(self, fund_name: str, fund_feature: str):
+        if not fund_name.endswith(fund_feature):
+            return fund_name
+        fund_name_split = fund_name.split(fund_feature)
+        if len(fund_name_split) > 1:
+            last_fund = fund_name_split[-1].strip()
+            if len(last_fund) == 0:
+                last_fund = fund_name_split[-2].strip()
+            fund_name = f"{last_fund} {fund_feature}"
+        return fund_name
+
     def check_fund_name_as_share(self, fund_name: str) -> bool:
         """
         Check if the fund name is the same as share name
View File

@@ -1197,12 +1197,12 @@ if __name__ == "__main__":
         "534535767"
     ]
     special_doc_id_list = check_db_mapping_doc_id_list
-    # special_doc_id_list = ["337293427"]
+    special_doc_id_list = ["451878128"]
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
-    re_run_extract_data = False
+    re_run_extract_data = True
     re_run_mapping_data = True
-    force_save_total_data = True
+    force_save_total_data = False
     calculate_metrics = False
     extract_ways = ["text"]

View File

@@ -613,6 +613,42 @@
    "pathlib.Path(r\"./data/emea_ar/output/markdown/501380553.md\").write_bytes(md_text.encode())"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": 1,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "def get_fund_name(fund_name: str, fund_feature: str):\n",
+   "    fund_name_split = fund_name.split(fund_feature)\n",
+   "    if len(fund_name_split) > 1:\n",
+   "        last_fund = fund_name_split[-1].strip()\n",
+   "        if len(last_fund) == 0:\n",
+   "            last_fund = fund_name_split[-2].strip()\n",
+   "        fund_name = f\"{last_fund} {fund_feature}\"\n",
+   "    return fund_name"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 2,
+  "metadata": {},
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "'C Fund'"
+     ]
+    },
+    "execution_count": 2,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "get_fund_name(\"A Fund B Fund C Fund\", \"Fund\")"
+  ]
+ },
 {
  "cell_type": "code",
  "execution_count": null,

View File

@@ -249,27 +249,27 @@ def statistics_document(
     doc_id_list = [str(docid) for docid in doc_mapping_data["DocumentId"].unique().tolist()]
     # statistics document page number
-    # pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
-    # logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
-    # logger.info("statistics document page number")
-    # doc_page_num_list = []
-    # for pdf_file in tqdm(pdf_files):
-    #     pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
-    #     if pdf_base_name not in doc_id_list:
-    #         continue
-    #     docid = os.path.basename(pdf_file).split(".")[0]
-    #     doc = fitz.open(pdf_file)
-    #     page_num = doc.page_count
-    #     doc_page_num_list.append({"docid": docid, "page_num": page_num})
-    #     doc.close()
-    # doc_page_num_df = pd.DataFrame(doc_page_num_list)
-    # # order by page_num in descending order
-    # doc_page_num_df = doc_page_num_df.sort_values(by="page_num", ascending=False)
-    # # statistics page_num by describe and transform to DataFrame
-    # doc_page_num_stat_df = get_describe_stat(
-    #     doc_page_num_df, "page_num", "doc_page_num"
-    # )
-    # describe_stat_df_list.append(doc_page_num_stat_df)
+    pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
+    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
+    logger.info("statistics document page number")
+    doc_page_num_list = []
+    for pdf_file in tqdm(pdf_files):
+        pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
+        if pdf_base_name not in doc_id_list:
+            continue
+        docid = os.path.basename(pdf_file).split(".")[0]
+        doc = fitz.open(pdf_file)
+        page_num = doc.page_count
+        doc_page_num_list.append({"docid": docid, "page_num": page_num})
+        doc.close()
+    doc_page_num_df = pd.DataFrame(doc_page_num_list)
+    # order by page_num in descending order
+    doc_page_num_df = doc_page_num_df.sort_values(by="page_num", ascending=False)
+    # statistics page_num by describe and transform to DataFrame
+    doc_page_num_stat_df = get_describe_stat(
+        doc_page_num_df, "page_num", "doc_page_num"
+    )
+    describe_stat_df_list.append(doc_page_num_stat_df)
 
     describe_stat_df = pd.concat(describe_stat_df_list)
     describe_stat_df.reset_index(drop=True, inplace=True)
@@ -295,7 +295,7 @@ def statistics_document(
     # save statistics data to excel
     with pd.ExcelWriter(stat_file) as writer:
-        # doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False)
+        doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False)
         doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False)
         doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
         doc_share_class_count.to_excel(
@@ -1392,13 +1392,13 @@ if __name__ == "__main__":
     #                     pdf_folder)
-    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_doc_with_all_4_dp/doc_ar_data_with_all_4_dp.xlsx"
-    output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_doc_with_all_4_dp/"
+    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/doc_ar_data_for_emea_sample_documents.xlsx"
+    output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/"
     statistics_document(pdf_folder=pdf_folder,
                         doc_mapping_file_path=doc_mapping_file_path,
                         sheet_name="doc_ar_data_in_db",
                         output_folder=output_data_folder,
-                        output_file="doc_ar_data_with_all_4_dp_statistics.xlsx")
+                        output_file="doc_ar_data_sample_documents_statistics.xlsx")
     # get_document_extracted_share_diff_by_db()
     # statistics_provider_mapping(
     #     provider_mapping_data_file=provider_mapping_data_file,
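
Note: get_describe_stat is not part of this diff; from the call site it takes the DataFrame, the column to summarize, and a label. A plausible sketch of its shape, purely an assumption based on the "statistics page_num by describe and transform to DataFrame" comment, not the repository's actual implementation:

import pandas as pd

def get_describe_stat(df: pd.DataFrame, column: str, stat_name: str) -> pd.DataFrame:
    # Hypothetical reconstruction: summarize one column with describe()
    # (count/mean/std/min/quartiles/max) and tag the row with a label so
    # several such frames can be concatenated into describe_stat_df.
    stat_df = df[column].describe().to_frame().T
    stat_df.insert(0, "stat_name", stat_name)
    return stat_df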

View File

@@ -969,3 +969,56 @@ def clean_folder(folder_path: str, expired_days: int = 5):
                 os.remove(file_path)
         except:
             pass
+
+
+def remove_abundant_data(data_list: list):
+    exist_data_list = []
+    # remove abundant data, only keep the first one with value
+    for data in data_list:
+        extract_data = data.get("extract_data", {})
+        data_detail_list = extract_data.get("data", [])
+        data_detail_list = remove_abundant_data_detail(data_detail_list,
+                                                       exist_data_list)
+        data["extract_data"]["data"] = data_detail_list
+    return data_list
+
+
+def remove_abundant_data_detail(data_detail_list: list,
+                                exist_data_list: list):
+    regular_attributes = ["fund_name", "share_name"]
+    remove_list = []
+    for data_detail in data_detail_list:
+        fund_name = data_detail.get("fund_name", "")
+        share_name = data_detail.get("share_name", "")
+        is_exist_data = False
+        for exist_data in exist_data_list:
+            if fund_name == exist_data["fund_name"] and share_name == exist_data["share_name"]:
+                is_exist_data = True
+                break
+        if not is_exist_data:
+            exist_data_list.append({"fund_name": fund_name, "share_name": share_name})
+        pop_keys = []
+        for data_key, data_value in data_detail.items():
+            if data_key in regular_attributes:
+                continue
+            for exist_data in exist_data_list:
+                if fund_name == exist_data["fund_name"] and share_name == exist_data["share_name"]:
+                    if data_key in exist_data.keys():
+                        # fund_data remove fund_key
+                        exist_data_value = exist_data.get(data_key, None)
+                        if exist_data_value is not None:
+                            pop_keys.append(data_key)
+                    else:
+                        exist_data[data_key] = data_value
+        if len(pop_keys) > 0:
+            for pop_key in pop_keys:
+                data_detail.pop(pop_key)
+        value_keys = [value_key for value_key in list(data_detail.keys())
+                      if value_key not in regular_attributes]
+        if len(value_keys) == 0:
+            remove_list.append(data_detail)
+    for remove_data in remove_list:
+        if remove_data in data_detail_list:
+            data_detail_list.remove(remove_data)
+    return data_detail_list
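
To see what the de-duplication does end to end, here is a small hypothetical round trip. Datapoint names like "tna" and "ogc" are illustrative only; the import is the same one the commit adds at the top of the first file. Note that exist_data_list carries state across pages, which is why remove_abundant_data threads it through every call to remove_abundant_data_detail: the first page to report a value wins.

from utils.biz_utils import remove_abundant_data

pages = [
    {"extract_data": {"data": [
        {"fund_name": "A Fund", "share_name": "Class A", "tna": "100"},
    ]}},
    {"extract_data": {"data": [
        {"fund_name": "A Fund", "share_name": "Class A", "tna": "100"},    # repeats a value already seen
        {"fund_name": "A Fund", "share_name": "Class B", "ogc": "0.85%"},  # new (fund, share) pair
    ]}},
]

cleaned = remove_abundant_data(pages)
# Page 1 keeps its row. On page 2 the repeated "tna" is popped; that row
# then holds only the regular attributes, so it is dropped entirely.
assert cleaned[0]["extract_data"]["data"][0]["tna"] == "100"
assert cleaned[1]["extract_data"]["data"] == [
    {"fund_name": "A Fund", "share_name": "Class B", "ogc": "0.85%"}
]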