remove_abundant_data

parent c146497052
commit bc32860f87
@@ -8,7 +8,7 @@ from utils.gpt_utils import chat
 from utils.pdf_util import PDFUtil
 from utils.sql_query_util import query_document_fund_mapping, query_investment_by_provider
 from utils.logger import logger
-from utils.biz_utils import add_slash_to_text_as_regex, clean_text, get_most_similar_name
+from utils.biz_utils import add_slash_to_text_as_regex, clean_text, get_most_similar_name, remove_abundant_data


 class DataExtraction:
@@ -149,11 +149,14 @@ class DataExtraction:
     def extract_data(self) -> dict:
         logger.info(f"Extracting data from document {self.doc_id}, extract way: {self.extract_way}")
         if self.extract_way == "text":
-            return self.extract_data_by_text()
+            data_list = self.extract_data_by_text()
         elif self.extract_way == "image":
-            return self.extract_data_by_image()
+            data_list = self.extract_data_by_image()
         else:
-            return self.extract_data_by_text()
+            data_list = self.extract_data_by_text()
+        data_list = remove_abundant_data(data_list)
+        self.output_data_to_file(data_list)
+        return data_list

     def extract_data_by_text(self) -> dict:
         """
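remove_abundant_data itself is added at the bottom of this commit, in the utils module (presumably utils/biz_utils.py, given the import hunk above). A sketch of the per-page shape it assumes data_list to have; every key other than extract_data/data/fund_name/share_name, and all values, are illustrative only:

# Assumed input shape for remove_abundant_data; "page_num" and "ter" are hypothetical stand-ins.
data_list = [
    {
        "page_num": 12,
        "extract_data": {
            "data": [
                {"fund_name": "A Fund", "share_name": "Class A", "ter": "0.50%"},
            ],
        },
    },
]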
@@ -267,8 +270,7 @@ class DataExtraction:
                 logger.error(f"Error in extracting data from next page: {e}")
                 break

-        self.output_data_to_file(data_list)
+        # self.output_data_to_file(data_list)

         return data_list

     def extract_data_by_image(self) -> dict:
@@ -346,7 +348,7 @@ class DataExtraction:
                 logger.error(f"Error in extracting data from next page: {e}")
                 break

-        self.output_data_to_file(data_list)
+        # self.output_data_to_file(data_list)

         return data_list

@@ -512,6 +514,8 @@ class DataExtraction:
             fund_name = data.get("fund name", "")
             if fund_name == "":
                 remove_list.append(data)
+            fund_name = self.get_fund_name(fund_name, "Fund")
+            data["fund name"] = fund_name
             keys = list(data.keys())
             for key in keys:
                 if self.datapoint_level_config.get(key, "") == "share_level":
@@ -572,6 +576,17 @@ class DataExtraction:

         extract_data_info["data"] = new_data_list
         return extract_data_info

+    def get_fund_name(self, fund_name: str, fund_feature: str):
+        if not fund_name.endswith(fund_feature):
+            return fund_name
+        fund_name_split = fund_name.split(fund_feature)
+        if len(fund_name_split) > 1:
+            last_fund = fund_name_split[-1].strip()
+            if len(last_fund) == 0:
+                last_fund = fund_name_split[-2].strip()
+            fund_name = f"{last_fund} {fund_feature}"
+        return fund_name
+
     def check_fund_name_as_share(self, fund_name: str) -> bool:
         """
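The notebook cells added later in this commit exercise a standalone copy of this helper; as a quick reference, its behavior on the notebook's own test string, plus the effect of the endswith guard (which the notebook prototype omits):

# Free-function form, as in the notebook cells below:
get_fund_name("A Fund B Fund C Fund", "Fund")  # -> "C Fund"  (recorded notebook output)
get_fund_name("A Fund B", "Fund")              # guarded version returns "A Fund B" unchanged;
                                               # without the endswith check it would yield "B Fund"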
main.py (6 changes)
@@ -1197,12 +1197,12 @@ if __name__ == "__main__":
         "534535767"
     ]
     special_doc_id_list = check_db_mapping_doc_id_list
-    # special_doc_id_list = ["337293427"]
+    special_doc_id_list = ["451878128"]
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
-    re_run_extract_data = False
+    re_run_extract_data = True
     re_run_mapping_data = True
-    force_save_total_data = True
+    force_save_total_data = False
     calculate_metrics = False

     extract_ways = ["text"]
@@ -613,6 +613,42 @@
     "pathlib.Path(r\"./data/emea_ar/output/markdown/501380553.md\").write_bytes(md_text.encode())"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_fund_name(fund_name: str, fund_feature: str):\n",
+    "    fund_name_split = fund_name.split(fund_feature)\n",
+    "    if len(fund_name_split) > 1:\n",
+    "        last_fund = fund_name_split[-1].strip()\n",
+    "        if len(last_fund) == 0:\n",
+    "            last_fund = fund_name_split[-2].strip()\n",
+    "        fund_name = f\"{last_fund} {fund_feature}\"\n",
+    "    return fund_name"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'C Fund'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "get_fund_name(\"A Fund B Fund C Fund\", \"Fund\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -249,27 +249,27 @@ def statistics_document(

     doc_id_list = [str(docid) for docid in doc_mapping_data["DocumentId"].unique().tolist()]
     # statistics document page number
-    # pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
-    # logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
-    # logger.info("statistics document page number")
-    # doc_page_num_list = []
-    # for pdf_file in tqdm(pdf_files):
-    #     pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
-    #     if pdf_base_name not in doc_id_list:
-    #         continue
-    #     docid = os.path.basename(pdf_file).split(".")[0]
-    #     doc = fitz.open(pdf_file)
-    #     page_num = doc.page_count
-    #     doc_page_num_list.append({"docid": docid, "page_num": page_num})
-    #     doc.close()
-    # doc_page_num_df = pd.DataFrame(doc_page_num_list)
-    # # order by page_num in descending order
-    # doc_page_num_df = doc_page_num_df.sort_values(by="page_num", ascending=False)
-    # # statistics page_num by describe and transform to DataFrame
-    # doc_page_num_stat_df = get_describe_stat(
-    #     doc_page_num_df, "page_num", "doc_page_num"
-    # )
-    # describe_stat_df_list.append(doc_page_num_stat_df)
+    pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
+    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
+    logger.info("statistics document page number")
+    doc_page_num_list = []
+    for pdf_file in tqdm(pdf_files):
+        pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
+        if pdf_base_name not in doc_id_list:
+            continue
+        docid = os.path.basename(pdf_file).split(".")[0]
+        doc = fitz.open(pdf_file)
+        page_num = doc.page_count
+        doc_page_num_list.append({"docid": docid, "page_num": page_num})
+        doc.close()
+    doc_page_num_df = pd.DataFrame(doc_page_num_list)
+    # order by page_num in descending order
+    doc_page_num_df = doc_page_num_df.sort_values(by="page_num", ascending=False)
+    # statistics page_num by describe and transform to DataFrame
+    doc_page_num_stat_df = get_describe_stat(
+        doc_page_num_df, "page_num", "doc_page_num"
+    )
+    describe_stat_df_list.append(doc_page_num_stat_df)

     describe_stat_df = pd.concat(describe_stat_df_list)
     describe_stat_df.reset_index(drop=True, inplace=True)
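get_describe_stat is not part of this diff; a minimal sketch of what a helper with this call signature plausibly does, assuming it wraps pandas describe() and labels the resulting row (hypothetical implementation, not the repo's):

import pandas as pd

def get_describe_stat(df: pd.DataFrame, column: str, stat_name: str) -> pd.DataFrame:
    # describe() gives count/mean/std/min/quartiles/max for the column
    stat = df[column].describe().to_frame().T
    stat.insert(0, "stat_name", stat_name)  # label the row, e.g. "doc_page_num"
    return stat.reset_index(drop=True)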
@@ -295,7 +295,7 @@ def statistics_document(

     # save statistics data to excel
     with pd.ExcelWriter(stat_file) as writer:
-        # doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False)
+        doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False)
         doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False)
         doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
         doc_share_class_count.to_excel(
@@ -1392,13 +1392,13 @@ if __name__ == "__main__":
     # pdf_folder)


-    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_doc_with_all_4_dp/doc_ar_data_with_all_4_dp.xlsx"
-    output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_doc_with_all_4_dp/"
+    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/doc_ar_data_for_emea_sample_documents.xlsx"
+    output_data_folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_sample_documents/"
     statistics_document(pdf_folder=pdf_folder,
                         doc_mapping_file_path=doc_mapping_file_path,
                         sheet_name="doc_ar_data_in_db",
                         output_folder=output_data_folder,
-                        output_file="doc_ar_data_with_all_4_dp_statistics.xlsx")
+                        output_file="doc_ar_data_sample_documents_statistics.xlsx")
     # get_document_extracted_share_diff_by_db()
     # statistics_provider_mapping(
     #     provider_mapping_data_file=provider_mapping_data_file,
@@ -968,4 +968,57 @@ def clean_folder(folder_path: str, expired_days: int = 5):
         try:
             os.remove(file_path)
         except:
             pass
+
+
+def remove_abundant_data(data_list: list):
+    exist_data_list = []
+
+    # remove abundant data, only keep the first one with value
+    for data in data_list:
+        extract_data = data.get("extract_data", {})
+        data_detail_list = extract_data.get("data", [])
+        data_detail_list = remove_abundant_data_detail(data_detail_list,
+                                                       exist_data_list)
+        data["extract_data"]["data"] = data_detail_list
+    return data_list
+
+
+def remove_abundant_data_detail(data_detail_list: list,
+                                exist_data_list: list):
+    regular_attributes = ["fund_name", "share_name"]
+    remove_list = []
+    for data_detail in data_detail_list:
+        fund_name = data_detail.get("fund_name", "")
+        share_name = data_detail.get("share_name", "")
+        is_exist_data = False
+        for exist_data in exist_data_list:
+            if fund_name == exist_data["fund_name"] and share_name == exist_data["share_name"]:
+                is_exist_data = True
+                break
+        if not is_exist_data:
+            exist_data_list.append({"fund_name": fund_name, "share_name": share_name})
+        pop_keys = []
+        for data_key, data_value in data_detail.items():
+            if data_key in regular_attributes:
+                continue
+            for exist_data in exist_data_list:
+                if fund_name == exist_data["fund_name"] and share_name == exist_data["share_name"]:
+                    if data_key in exist_data.keys():
+                        # fund_data remove fund_key
+                        exist_data_value = exist_data.get(data_key, None)
+                        if exist_data_value is not None:
+                            pop_keys.append(data_key)
+                    else:
+                        exist_data[data_key] = data_value
+        if len(pop_keys) > 0:
+            for pop_key in pop_keys:
+                data_detail.pop(pop_key)
+        value_keys = [value_key for value_key in list(data_detail.keys())
+                      if value_key not in regular_attributes]
+        if len(value_keys) == 0:
+            remove_list.append(data_detail)
+    for remove_data in remove_list:
+        if remove_data in data_detail_list:
+            data_detail_list.remove(remove_data)
+    return data_detail_list
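A minimal usage sketch of the two helpers above; "ter" and "ogc" are illustrative stand-ins for whatever datapoints the extractor emits. The first occurrence of a (fund_name, share_name, datapoint) combination wins, later duplicates are stripped, and a row left with no datapoints is dropped:

pages = [
    {"extract_data": {"data": [
        {"fund_name": "A Fund", "share_name": "Class A", "ter": "0.50%"}]}},
    {"extract_data": {"data": [
        {"fund_name": "A Fund", "share_name": "Class A", "ter": "0.50%", "ogc": "0.30%"}]}},
    {"extract_data": {"data": [
        {"fund_name": "A Fund", "share_name": "Class A", "ter": "0.50%"}]}},
]

pages = remove_abundant_data(pages)
# page 1 keeps "ter" (first sighting of that datapoint for A Fund / Class A)
# page 2 keeps only the newly seen "ogc"; its duplicate "ter" is popped
# page 3 loses its only datapoint, so the whole row is removed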