Try to improve page filter precision

This commit is contained in:
Blade He 2024-09-04 17:01:12 -05:00
parent 7198450e53
commit 7c83f9152a
4 changed files with 92 additions and 38 deletions

View File

@ -12,6 +12,6 @@
"english": ["operating expenses paid"] "english": ["operating expenses paid"]
}, },
"performance_fee": { "performance_fee": {
"english": [] "english": ["Performance fees payable"]
} }
} }

View File

@ -200,8 +200,6 @@
"tor": { "tor": {
"english": [ "english": [
"TOR", "TOR",
"Turnover* \\n",
"Turnover \\n",
"Turnover Ratio", "Turnover Ratio",
"Turnover Rate", "Turnover Rate",
"Portfolio Turnover", "Portfolio Turnover",

View File

@ -113,61 +113,65 @@ class Metrics:
for data_point in data_point_list: for data_point in data_point_list:
if data_point == "tor": if data_point == "tor":
precision, recall, f1 = self.get_specific_metrics(tor_true, tor_pred) precision, recall, f1 = self.get_specific_metrics(tor_true, tor_pred)
tor_support = self.get_support_number(tor_true)
metrics_list.append( metrics_list.append(
{ {
"Data_Point": data_point, "Data_Point": data_point,
"Precision": precision, "Precision": precision,
"Recall": recall, "Recall": recall,
"F1": f1, "F1": f1,
"Support": len(tor_true), "Support": tor_support,
} }
) )
logger.info( logger.info(
f"TOR Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(tor_true)}" f"TOR Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {tor_support}"
) )
elif data_point == "ter": elif data_point == "ter":
precision, recall, f1 = self.get_specific_metrics(ter_true, ter_pred) precision, recall, f1 = self.get_specific_metrics(ter_true, ter_pred)
ter_support = self.get_support_number(ter_true)
metrics_list.append( metrics_list.append(
{ {
"Data_Point": data_point, "Data_Point": data_point,
"Precision": precision, "Precision": precision,
"Recall": recall, "Recall": recall,
"F1": f1, "F1": f1,
"Support": len(ter_true), "Support": ter_support,
} }
) )
logger.info( logger.info(
f"TER Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(ter_true)}" f"TER Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {ter_support}"
) )
elif data_point == "ogc": elif data_point == "ogc":
precision, recall, f1 = self.get_specific_metrics(ogc_true, ogc_pred) precision, recall, f1 = self.get_specific_metrics(ogc_true, ogc_pred)
ogc_support = self.get_support_number(ogc_true)
metrics_list.append( metrics_list.append(
{ {
"Data_Point": data_point, "Data_Point": data_point,
"Precision": precision, "Precision": precision,
"Recall": recall, "Recall": recall,
"F1": f1, "F1": f1,
"Support": len(ogc_true), "Support": ogc_support,
} }
) )
logger.info( logger.info(
f"OGC Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(ogc_true)}" f"OGC Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {ogc_support}"
) )
elif data_point == "performance_fee": elif data_point == "performance_fee":
precision, recall, f1 = self.get_specific_metrics( precision, recall, f1 = self.get_specific_metrics(
performance_fee_true, performance_fee_pred performance_fee_true, performance_fee_pred
) )
performance_fee_support = self.get_support_number(performance_fee_true)
metrics_list.append( metrics_list.append(
{ {
"Data_Point": data_point, "Data_Point": data_point,
"Precision": precision, "Precision": precision,
"Recall": recall, "Recall": recall,
"F1": f1, "F1": f1,
"Support": len(performance_fee_true), "Support": performance_fee_support,
} }
) )
logger.info( logger.info(
f"Performance Fee Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(performance_fee_true)}" f"Performance Fee Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {performance_fee_support}"
) )
# get average metrics # get average metrics
@ -185,6 +189,11 @@ class Metrics:
) )
return missing_error_list, metrics_list return missing_error_list, metrics_list
def get_support_number(self, true_data: list):
# get the count which true_data is 1
return sum(true_data)
def get_true_pred_data( def get_true_pred_data(
self, doc_id, ground_truth_data: pd.Series, prediction_data: pd.Series, data_point: str self, doc_id, ground_truth_data: pd.Series, prediction_data: pd.Series, data_point: str
): ):

View File

@ -22,7 +22,10 @@ class FilterPages:
self.document_mapping_info_df = document_mapping_info_df self.document_mapping_info_df = document_mapping_info_df
self.get_configuration_from_file() self.get_configuration_from_file()
self.doc_info = self.get_doc_info() self.doc_info = self.get_doc_info()
self.datapoint_config, self.datapoint_exclude_config = self.get_datapoint_config() self.datapoint_config, self.datapoint_exclude_config = (
self.get_datapoint_config()
)
self.percentage_regex = r"\b\d{1,3}\.\d+\b|\b\d{1,3}(\,\d+)\b%"
def get_pdf_page_text_dict(self) -> dict: def get_pdf_page_text_dict(self) -> dict:
pdf_util = PDFUtil(self.pdf_file) pdf_util = PDFUtil(self.pdf_file)
@ -30,17 +33,24 @@ class FilterPages:
return page_text_dict return page_text_dict
def get_configuration_from_file(self) -> dict: def get_configuration_from_file(self) -> dict:
"""
Remark: remove the
"""
language_config_file = r"./configuration/language.json" language_config_file = r"./configuration/language.json"
domicile_datapoint_config_file = r"./configuration/domicile_datapoints.json" domicile_datapoint_config_file = r"./configuration/domicile_datapoints.json"
datapoint_keywords_config_file = r"./configuration/datapoint_keyword.json" datapoint_keywords_config_file = r"./configuration/datapoint_keyword.json"
datapoint_exclude_keywords_config_file = r"./configuration/datapoint_exclude_keyword.json" datapoint_exclude_keywords_config_file = (
r"./configuration/datapoint_exclude_keyword.json"
)
with open(language_config_file, "r", encoding="utf-8") as file: with open(language_config_file, "r", encoding="utf-8") as file:
self.language_config = json.load(file) self.language_config = json.load(file)
with open(domicile_datapoint_config_file, "r", encoding="utf-8") as file: with open(domicile_datapoint_config_file, "r", encoding="utf-8") as file:
self.domicile_datapoint_config = json.load(file) self.domicile_datapoint_config = json.load(file)
with open(datapoint_keywords_config_file, "r", encoding="utf-8") as file: with open(datapoint_keywords_config_file, "r", encoding="utf-8") as file:
self.datapoint_keywords_config = json.load(file) self.datapoint_keywords_config = json.load(file)
with open(datapoint_exclude_keywords_config_file, "r", encoding="utf-8") as file: with open(
datapoint_exclude_keywords_config_file, "r", encoding="utf-8"
) as file:
self.datapoint_exclude_keywords_config = json.load(file) self.datapoint_exclude_keywords_config = json.load(file)
def get_doc_info(self) -> dict: def get_doc_info(self) -> dict:
@ -79,12 +89,16 @@ class FilterPages:
domicile = "default" domicile = "default"
if self.domicile_datapoint_config[domicile].get(document_type, None) is None: if self.domicile_datapoint_config[domicile].get(document_type, None) is None:
document_type = "ar" document_type = "ar"
datapoint_list = self.domicile_datapoint_config[domicile][document_type] datapoint_list = self.domicile_datapoint_config[domicile][document_type]
datapoint_keywords = self.get_keywords("include", datapoint_list, language) datapoint_keywords = self.get_keywords("include", datapoint_list, language)
datapoint_exclude_keywords = self.get_keywords("exclude", datapoint_list, language) datapoint_exclude_keywords = self.get_keywords(
"exclude", datapoint_list, language
)
return datapoint_keywords, datapoint_exclude_keywords return datapoint_keywords, datapoint_exclude_keywords
def get_keywords(self, keywords_type: str, datapoint_list: list, language: str) -> dict: def get_keywords(
self, keywords_type: str, datapoint_list: list, language: str
) -> dict:
if keywords_type == "include": if keywords_type == "include":
config = self.datapoint_keywords_config config = self.datapoint_keywords_config
elif keywords_type == "exclude": elif keywords_type == "exclude":
@ -129,16 +143,31 @@ class FilterPages:
for datapoint in self.datapoint_config.keys(): for datapoint in self.datapoint_config.keys():
result[datapoint] = [] result[datapoint] = []
for page_num, page_text in self.page_text_dict.items(): for page_num, page_text in self.page_text_dict.items():
text = "\n" + clean_text(page_text) + "\n" if page_num < 2:
continue
page_text = clean_text(page_text)
text_split = [
sentence.strip()
for sentence in page_text.split("\n")
if len(sentence.strip()) > 0
]
text = "\n" + "\n".join(text_split) + "\n"
language = self.doc_info.get("language", None)
if language is None:
language = "english"
if language == "english" and re.search(self.percentage_regex, text) is None:
continue
for datapoint, keywords in self.datapoint_config.items(): for datapoint, keywords in self.datapoint_config.items():
# idx = idx & np.array([re.findall(r'\b' + word + r'\d*\b', page) != [] for page in self.pages_clean])
find_datapoint = False find_datapoint = False
for keyword in keywords: for keyword in keywords:
search_iter = self.search_keyword(text, keyword) search_iter = self.search_keyword(text, keyword)
for search in search_iter: for search in search_iter:
search_text = search.group().strip() search_text = search.group().strip()
exclude_search_list = self.search_exclude_keywords(text, datapoint) exclude_search_list = self.search_exclude_keywords(
if exclude_search_list is not None: text, datapoint
)
if exclude_search_list is not None and len(exclude_search_list) > 0:
need_exclude = False need_exclude = False
for exclude_search_text in exclude_search_list: for exclude_search_text in exclude_search_list:
if search_text in exclude_search_text: if search_text in exclude_search_text:
@ -156,7 +185,7 @@ class FilterPages:
"datapoint": datapoint, "datapoint": datapoint,
"page_num": page_num, "page_num": page_num,
"keyword": keyword, "keyword": keyword,
"text": search_text "text": search_text,
} }
result_details.append(detail) result_details.append(detail)
find_datapoint = True find_datapoint = True
@ -165,14 +194,13 @@ class FilterPages:
break break
return result, result_details return result, result_details
def search_in_sentence_is_valid(self, def search_in_sentence_is_valid(self, search_text: str, text: str):
search_text: str,
text: str):
search_text_regex = add_slash_to_text_as_regex(search_text) search_text_regex = add_slash_to_text_as_regex(search_text)
search_regex = r"\n.*{0}.*\n".format(search_text_regex) search_regex = r"\n.*{0}.*\n".format(search_text_regex)
search_iter = re.finditer(search_regex, text, re.IGNORECASE) search_iter = re.finditer(search_regex, text, re.IGNORECASE)
is_valid = False is_valid = False
lower_word_count_threshold = 7 lower_word_count_threshold = 7
big_number_regex = r"\b\d{1,3}(\,\d{3})+\b"
for search in search_iter: for search in search_iter:
lower_word_count = 0 lower_word_count = 0
if search is not None: if search is not None:
@ -183,8 +211,27 @@ class FilterPages:
if split[0].islower(): if split[0].islower():
lower_word_count += 1 lower_word_count += 1
if lower_word_count < lower_word_count_threshold: if lower_word_count < lower_word_count_threshold:
is_valid = True if re.search(self.percentage_regex, search_text) is not None:
break is_valid = True
break
new_search_text_regex = add_slash_to_text_as_regex(search_text)
new_search_regex = r"\n.*{0}.*\n(?P<next_line>.*)\n(?P<next_2_line>.*)\n".format(
new_search_text_regex
)
new_search = re.search(new_search_regex, text, re.IGNORECASE)
if new_search is not None:
next_line = new_search.group("next_line").strip()
next_2_line = new_search.group("next_2_line").strip()
if re.search(big_number_regex, next_line) is not None or \
re.search(big_number_regex, next_2_line) is not None:
is_valid = False
else:
is_valid = True
break
else:
is_valid = True
break
return is_valid return is_valid
def search_keyword(self, text: str, keyword: str): def search_keyword(self, text: str, keyword: str):