remove_abundant_data

parent c146497052
commit bc32860f87
@@ -8,7 +8,7 @@ from utils.gpt_utils import chat
 from utils.pdf_util import PDFUtil
 from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider
 from utils.logger import logger
-from utils.biz_utils import add_slash_to_text_as_regex, clean_text, get_most_similar_name
+from utils.biz_utils import add_slash_to_text_as_regex, clean_text, get_most_similar_name, remove_abundant_data
 
 
 class DataExtraction:
@@ -149,11 +149,14 @@ class DataExtraction:
     def extract_data(self) -> dict:
         logger.info(f"Extracting data from document {self.doc_id}, extract way: {self.extract_way}")
         if self.extract_way == "text":
-            return self.extract_data_by_text()
+            data_list = self.extract_data_by_text()
         elif self.extract_way == "image":
-            return self.extract_data_by_image()
+            data_list = self.extract_data_by_image()
         else:
-            return self.extract_data_by_text()
+            data_list = self.extract_data_by_text()
+        data_list = remove_abundant_data(data_list)
+        self.output_data_to_file(data_list)
+        return data_list
 
     def extract_data_by_text(self) -> dict:
         """
@@ -267,8 +270,7 @@ class DataExtraction:
             logger.error(f"Error in extracting data from next page: {e}")
             break
 
-        self.output_data_to_file(data_list)
-
+        # self.output_data_to_file(data_list)
         return data_list
 
     def extract_data_by_image(self) -> dict:
@@ -346,7 +348,7 @@
             logger.error(f"Error in extracting data from next page: {e}")
             break
 
-        self.output_data_to_file(data_list)
+        # self.output_data_to_file(data_list)
 
         return data_list
 
@@ -512,6 +514,8 @@
             fund_name = data.get("fund name", "")
             if fund_name == "":
                 remove_list.append(data)
+            fund_name = self.get_fund_name(fund_name, "Fund")
+            data["fund name"] = fund_name
             keys = list(data.keys())
             for key in keys:
                 if self.datapoint_level_config.get(key, "") == "share_level":
@@ -572,6 +576,17 @@
 
         extract_data_info["data"] = new_data_list
         return extract_data_info
 
+    def get_fund_name(self, fund_name: str, fund_feature: str):
+        if not fund_name.endswith(fund_feature):
+            return fund_name
+        fund_name_split = fund_name.split(fund_feature)
+        if len(fund_name_split) > 1:
+            last_fund = fund_name_split[-1].strip()
+            if len(last_fund) == 0:
+                last_fund = fund_name_split[-2].strip()
+            fund_name = f"{last_fund} {fund_feature}"
+        return fund_name
+
     def check_fund_name_as_share(self, fund_name: str) -> bool:
         """
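Note on get_fund_name (added above): when the extractor concatenates several fund names into one string, the method keeps only the last segment ending in the feature word, and the endswith guard passes unrelated strings through untouched. A condensed standalone sketch of that behavior, with made-up example names (the notebook prototype further down lacks the guard):

def get_fund_name(fund_name: str, fund_feature: str) -> str:
    if not fund_name.endswith(fund_feature):
        return fund_name
    parts = fund_name.split(fund_feature)
    if len(parts) > 1:
        # fall back to the second-to-last piece: when the name ends with
        # the feature word, the last split piece is the empty string
        last = parts[-1].strip() or parts[-2].strip()
        fund_name = f"{last} {fund_feature}"
    return fund_name

assert get_fund_name("A Fund B Fund C Fund", "Fund") == "C Fund"
assert get_fund_name("Alpha Fund", "Fund") == "Alpha Fund"
assert get_fund_name("Global Equity Portfolio", "Fund") == "Global Equity Portfolio"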
main.py
@@ -1197,12 +1197,12 @@ if __name__ == "__main__":
         "534535767"
     ]
     special_doc_id_list = check_db_mapping_doc_id_list
-    # special_doc_id_list = ["337293427"]
+    special_doc_id_list = ["451878128"]
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
-    re_run_extract_data = False
+    re_run_extract_data = True
     re_run_mapping_data = True
-    force_save_total_data = True
+    force_save_total_data = False
    calculate_metrics = False
 
     extract_ways = ["text"]
@@ -613,6 +613,42 @@
     "pathlib.Path(r\"./data/emea_ar/output/markdown/501380553.md\").write_bytes(md_text.encode())"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_fund_name(fund_name: str, fund_feature: str):\n",
+    "    fund_name_split = fund_name.split(fund_feature)\n",
+    "    if len(fund_name_split) > 1:\n",
+    "        last_fund = fund_name_split[-1].strip()\n",
+    "        if len(last_fund) == 0:\n",
+    "            last_fund = fund_name_split[-2].strip()\n",
+    "        fund_name = f\"{last_fund} {fund_feature}\"\n",
+    "    return fund_name"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'C Fund'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "get_fund_name(\"A Fund B Fund C Fund\", \"Fund\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -249,27 +249,27 @@ def statistics_document(
 
     doc_id_list = [str(docid) for docid in doc_mapping_data["DocumentId"].unique().tolist()]
     # statistics document page number
-    # pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
-    # logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
-    # logger.info("statistics document page number")
-    # doc_page_num_list = []
-    # for pdf_file in tqdm(pdf_files):
-    #     pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
-    #     if pdf_base_name not in doc_id_list:
-    #         continue
-    #     docid = os.path.basename(pdf_file).split(".")[0]
-    #     doc = fitz.open(pdf_file)
-    #     page_num = doc.page_count
-    #     doc_page_num_list.append({"docid": docid, "page_num": page_num})
-    #     doc.close()
-    # doc_page_num_df = pd.DataFrame(doc_page_num_list)
-    # # order by page_num in descending order
-    # doc_page_num_df = doc_page_num_df.sort_values(by="page_num", ascending=False)
-    # # statistics page_num by describe and transform to DataFrame
-    # doc_page_num_stat_df = get_describe_stat(
-    #     doc_page_num_df, "page_num", "doc_page_num"
-    # )
-    # describe_stat_df_list.append(doc_page_num_stat_df)
+    pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
+    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
+    logger.info("statistics document page number")
+    doc_page_num_list = []
+    for pdf_file in tqdm(pdf_files):
+        pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
+        if pdf_base_name not in doc_id_list:
+            continue
+        docid = os.path.basename(pdf_file).split(".")[0]
+        doc = fitz.open(pdf_file)
+        page_num = doc.page_count
+        doc_page_num_list.append({"docid": docid, "page_num": page_num})
+        doc.close()
+    doc_page_num_df = pd.DataFrame(doc_page_num_list)
+    # order by page_num in descending order
+    doc_page_num_df = doc_page_num_df.sort_values(by="page_num", ascending=False)
+    # statistics page_num by describe and transform to DataFrame
+    doc_page_num_stat_df = get_describe_stat(
+        doc_page_num_df, "page_num", "doc_page_num"
+    )
+    describe_stat_df_list.append(doc_page_num_stat_df)
 
     describe_stat_df = pd.concat(describe_stat_df_list)
     describe_stat_df.reset_index(drop=True, inplace=True)
@@ -295,7 +295,7 @@ def statistics_document(
 
     # save statistics data to excel
     with pd.ExcelWriter(stat_file) as writer:
-        # doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False)
+        doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False)
         doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False)
         doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
         doc_share_class_count.to_excel(
@@ -1392,13 +1392,13 @@ if __name__ == "__main__":
     # pdf_folder)
 
 
-    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_doc_with_all_4_dp/doc_ar_data_with_all_4_dp.xlsx"
-    output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_doc_with_all_4_dp/"
+    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/doc_ar_data_for_emea_sample_documents.xlsx"
+    output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/"
     statistics_document(pdf_folder=pdf_folder,
                         doc_mapping_file_path=doc_mapping_file_path,
                         sheet_name="doc_ar_data_in_db",
                         output_folder=output_data_folder,
-                        output_file="doc_ar_data_with_all_4_dp_statistics.xlsx")
+                        output_file="doc_ar_data_sample_documents_statistics.xlsx")
     # get_document_extracted_share_diff_by_db()
     # statistics_provider_mapping(
     #     provider_mapping_data_file=provider_mapping_data_file,
@@ -968,4 +968,57 @@ def clean_folder(folder_path: str, expired_days: int = 5):
         try:
             os.remove(file_path)
         except:
             pass
+
+
+def remove_abundant_data(data_list: list):
+    exist_data_list = []
+
+    # remove abundant data, only keep the first one with value
+    for data in data_list:
+        extract_data = data.get("extract_data", {})
+        data_detail_list = extract_data.get("data", [])
+        data_detail_list = remove_abundant_data_detail(data_detail_list,
+                                                       exist_data_list)
+        data["extract_data"]["data"] = data_detail_list
+    return data_list
+
+
+def remove_abundant_data_detail(data_detail_list: list,
+                                exist_data_list: list):
+    regular_attributes = ["fund_name", "share_name"]
+    remove_list = []
+    for data_detail in data_detail_list:
+        fund_name = data_detail.get("fund_name", "")
+        share_name = data_detail.get("share_name", "")
+        is_exist_data = False
+        for exist_data in exist_data_list:
+            if fund_name == exist_data["fund_name"] and share_name == exist_data["share_name"]:
+                is_exist_data = True
+                break
+        if not is_exist_data:
+            exist_data_list.append({"fund_name": fund_name, "share_name": share_name})
+        pop_keys = []
+        for data_key, data_value in data_detail.items():
+            if data_key in regular_attributes:
+                continue
+            for exist_data in exist_data_list:
+                if fund_name == exist_data["fund_name"] and share_name == exist_data["share_name"]:
+                    if data_key in exist_data.keys():
+                        # fund_data remove fund_key
+                        exist_data_value = exist_data.get(data_key, None)
+                        if exist_data_value is not None:
+                            pop_keys.append(data_key)
+                    else:
+                        exist_data[data_key] = data_value
+        if len(pop_keys) > 0:
+            for pop_key in pop_keys:
+                data_detail.pop(pop_key)
+        value_keys = [value_key for value_key in list(data_detail.keys())
+                      if value_key not in regular_attributes]
+        if len(value_keys) == 0:
+            remove_list.append(data_detail)
+    for remove_data in remove_list:
+        if remove_data in data_detail_list:
+            data_detail_list.remove(remove_data)
+    return data_detail_list
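Note on the two helpers above: remove_abundant_data walks the per-page extraction results in order and, for each (fund_name, share_name) pair, keeps only the first occurrence of every other datapoint; a row left with nothing but the name attributes is dropped entirely. A minimal sketch of the intended effect on a hypothetical two-page result (the page contents and the "ter"/"ocf" keys are illustrative, not from the repo):

from utils.biz_utils import remove_abundant_data

pages = [
    {"extract_data": {"data": [
        {"fund_name": "Alpha Fund", "share_name": "Class A", "ter": "0.85"},
    ]}},
    # page 2 repeats the ter already captured on page 1 and adds ocf
    {"extract_data": {"data": [
        {"fund_name": "Alpha Fund", "share_name": "Class A",
         "ter": "0.85", "ocf": "0.90"},
    ]}},
]

pages = remove_abundant_data(pages)
# page 1 is unchanged; page 2 keeps only the newly seen datapoint
assert pages[0]["extract_data"]["data"] == [
    {"fund_name": "Alpha Fund", "share_name": "Class A", "ter": "0.85"},
]
assert pages[1]["extract_data"]["data"] == [
    {"fund_name": "Alpha Fund", "share_name": "Class A", "ocf": "0.90"},
]

Had page 2 contained only the duplicated "ter", the whole row would have been dropped by the remove_list pass at the end of remove_abundant_data_detail.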