try to improve page filter precision

Blade He 2024-09-04 17:01:12 -05:00
parent 7198450e53
commit 7c83f9152a
4 changed files with 92 additions and 38 deletions

View File

@@ -12,6 +12,6 @@
"english": ["operating expenses paid"]
},
"performance_fee": {
"english": []
"english": ["Performance fees payable"]
}
}
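For context on how an entry like this is consumed: include keywords are interpolated into the word-boundary search regex of FilterPages.search_keyword (last file in this commit). A minimal sketch with invented sample text:

import re

# Invented sample page text; the keyword is the one added above.
keyword = "Performance fees payable"
page_text = "\nPerformance fees payable 1.25%\nOther liabilities 0.40%\n"

# Mirrors FilterPages.search_keyword: keyword at a word boundary, optional
# trailing digits and punctuation, case-insensitive.
pattern = r"\b{0}\d*\W*\s*\b".format(keyword)
for match in re.finditer(pattern, page_text, re.IGNORECASE):
    print(repr(match.group().strip()))  # 'Performance fees payable'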

View File

@@ -200,8 +200,6 @@
"tor": {
"english": [
"TOR",
"Turnover* \\n",
"Turnover \\n",
"Turnover Ratio",
"Turnover Rate",
"Portfolio Turnover",

View File

@@ -113,61 +113,65 @@ class Metrics:
for data_point in data_point_list:
if data_point == "tor":
precision, recall, f1 = self.get_specific_metrics(tor_true, tor_pred)
tor_support = self.get_support_number(tor_true)
metrics_list.append(
{
"Data_Point": data_point,
"Precision": precision,
"Recall": recall,
"F1": f1,
"Support": len(tor_true),
"Support": tor_support,
}
)
logger.info(
f"TOR Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(tor_true)}"
f"TOR Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {tor_support}"
)
elif data_point == "ter":
precision, recall, f1 = self.get_specific_metrics(ter_true, ter_pred)
ter_support = self.get_support_number(ter_true)
metrics_list.append(
{
"Data_Point": data_point,
"Precision": precision,
"Recall": recall,
"F1": f1,
"Support": len(ter_true),
"Support": ter_support,
}
)
logger.info(
f"TER Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(ter_true)}"
f"TER Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {ter_support}"
)
elif data_point == "ogc":
precision, recall, f1 = self.get_specific_metrics(ogc_true, ogc_pred)
ogc_support = self.get_support_number(ogc_true)
metrics_list.append(
{
"Data_Point": data_point,
"Precision": precision,
"Recall": recall,
"F1": f1,
"Support": len(ogc_true),
"Support": ogc_support,
}
)
logger.info(
f"OGC Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(ogc_true)}"
f"OGC Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {ogc_support}"
)
elif data_point == "performance_fee":
precision, recall, f1 = self.get_specific_metrics(
performance_fee_true, performance_fee_pred
)
performance_fee_support = self.get_support_number(performance_fee_true)
metrics_list.append(
{
"Data_Point": data_point,
"Precision": precision,
"Recall": recall,
"F1": f1,
"Support": len(performance_fee_true),
"Support": performance_fee_support,
}
)
logger.info(
f"Performance Fee Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(performance_fee_true)}"
f"Performance Fee Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {performance_fee_support}"
)
# get average metrics
@@ -184,6 +188,11 @@ class Metrics:
}
)
return missing_error_list, metrics_list
def get_support_number(self, true_data: list):
# count of positive (1) labels in true_data
return sum(true_data)
def get_true_pred_data(
self, doc_id, ground_truth_data: pd.Series, prediction_data: pd.Series, data_point: str
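The change in this file swaps Support from len(true) (all evaluated rows) to sum(true) (positive rows only), matching the usual definition of support for the positive class. A toy walk-through with invented 0/1 labels:

# Invented parallel 0/1 labels: 1 = the page carries the datapoint.
tor_true = [1, 0, 1, 1, 0]
tor_pred = [1, 0, 0, 1, 1]

tp = sum(t and p for t, p in zip(tor_true, tor_pred))      # 2
fp = sum(p and not t for t, p in zip(tor_true, tor_pred))  # 1
fn = sum(t and not p for t, p in zip(tor_true, tor_pred))  # 1

precision = tp / (tp + fp)  # 0.667
recall = tp / (tp + fn)     # 0.667
f1 = 2 * precision * recall / (precision + recall)

support = sum(tor_true)  # 3: what the new get_support_number returns
# versus len(tor_true) == 5, which also counted negative rows into Support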

View File

@@ -22,7 +22,10 @@ class FilterPages:
self.document_mapping_info_df = document_mapping_info_df
self.get_configuration_from_file()
self.doc_info = self.get_doc_info()
self.datapoint_config, self.datapoint_exclude_config = self.get_datapoint_config()
self.datapoint_config, self.datapoint_exclude_config = (
self.get_datapoint_config()
)
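# percentage-like figures: bare decimals such as "12.34", or comma decimals
# such as "12,5%" (the trailing % is only required for the comma form)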
self.percentage_regex = r"\b\d{1,3}\.\d+\b|\b\d{1,3}(\,\d+)\b%"
def get_pdf_page_text_dict(self) -> dict:
pdf_util = PDFUtil(self.pdf_file)
@@ -30,17 +33,24 @@
return page_text_dict
def get_configuration_from_file(self) -> dict:
"""
Remark: remove the
"""
language_config_file = r"./configuration/language.json"
domicile_datapoint_config_file = r"./configuration/domicile_datapoints.json"
datapoint_keywords_config_file = r"./configuration/datapoint_keyword.json"
datapoint_exclude_keywords_config_file = r"./configuration/datapoint_exclude_keyword.json"
datapoint_exclude_keywords_config_file = (
r"./configuration/datapoint_exclude_keyword.json"
)
with open(language_config_file, "r", encoding="utf-8") as file:
self.language_config = json.load(file)
with open(domicile_datapoint_config_file, "r", encoding="utf-8") as file:
self.domicile_datapoint_config = json.load(file)
with open(datapoint_keywords_config_file, "r", encoding="utf-8") as file:
self.datapoint_keywords_config = json.load(file)
with open(datapoint_exclude_keywords_config_file, "r", encoding="utf-8") as file:
with open(
datapoint_exclude_keywords_config_file, "r", encoding="utf-8"
) as file:
self.datapoint_exclude_keywords_config = json.load(file)
def get_doc_info(self) -> dict:
@@ -68,7 +78,7 @@ class FilterPages:
"language": language,
"domicile": domicile,
}
def get_datapoint_config(self) -> dict:
domicile = self.doc_info.get("domicile", None)
document_type = self.doc_info.get("document_type", None)
@@ -79,12 +89,16 @@
domicile = "default"
if self.domicile_datapoint_config[domicile].get(document_type, None) is None:
document_type = "ar"
datapoint_list = self.domicile_datapoint_config[domicile][document_type]
datapoint_keywords = self.get_keywords("include", datapoint_list, language)
datapoint_exclude_keywords = self.get_keywords("exclude", datapoint_list, language)
datapoint_exclude_keywords = self.get_keywords(
"exclude", datapoint_list, language
)
return datapoint_keywords, datapoint_exclude_keywords
def get_keywords(self, keywords_type: str, datapoint_list: list, language: str) -> dict:
def get_keywords(
self, keywords_type: str, datapoint_list: list, language: str
) -> dict:
if keywords_type == "include":
config = self.datapoint_keywords_config
elif keywords_type == "exclude":
@@ -92,7 +106,7 @@
else:
config = self.datapoint_keywords_config
datapoint_keywords = {}
for datapoint in datapoint_list:
keywords = config.get(datapoint, {}).get(language, [])
if len(keywords) > 0:
@@ -103,15 +117,15 @@
if len(english_keywords) > 0:
english_keywords = self.optimize_keywords_regex(english_keywords)
datapoint_keywords[datapoint] += english_keywords
return datapoint_keywords
def optimize_keywords_regex(self, keywords: list) -> list:
new_keywords = []
for keyword in keywords:
new_keyword = add_slash_to_text_as_regex(keyword)
new_keywords.append(new_keyword)
return new_keywords
def start_job(self) -> dict:
logger.info(f"Start extracting datapoints from {self.pdf_file}")
"""
@@ -129,16 +143,31 @@ class FilterPages:
for datapoint in self.datapoint_config.keys():
result[datapoint] = []
for page_num, page_text in self.page_text_dict.items():
text = "\n" + clean_text(page_text) + "\n"
if page_num < 2:
continue
page_text = clean_text(page_text)
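# drop empty lines so each remaining line acts as one sentence for the
# "\n"-anchored searches below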
text_split = [
sentence.strip()
for sentence in page_text.split("\n")
if len(sentence.strip()) > 0
]
text = "\n" + "\n".join(text_split) + "\n"
language = self.doc_info.get("language", None)
if language is None:
language = "english"
if language == "english" and re.search(self.percentage_regex, text) is None:
continue
for datapoint, keywords in self.datapoint_config.items():
# idx = idx & np.array([re.findall(r'\b' + word + r'\d*\b', page) != [] for page in self.pages_clean])
find_datapoint = False
for keyword in keywords:
search_iter = self.search_keyword(text, keyword)
for search in search_iter:
search_text = search.group().strip()
exclude_search_list = self.search_exclude_keywords(text, datapoint)
if exclude_search_list is not None:
exclude_search_list = self.search_exclude_keywords(
text, datapoint
)
if exclude_search_list is not None and len(exclude_search_list) > 0:
need_exclude = False
for exclude_search_text in exclude_search_list:
if search_text in exclude_search_text:
@@ -146,7 +175,7 @@ class FilterPages:
break
if need_exclude:
continue
is_valid = self.search_in_sentence_is_valid(search_text, text)
if not is_valid:
continue
@@ -156,7 +185,7 @@ class FilterPages:
"datapoint": datapoint,
"page_num": page_num,
"keyword": keyword,
"text": search_text
"text": search_text,
}
result_details.append(detail)
find_datapoint = True
@@ -164,15 +193,14 @@ class FilterPages:
if find_datapoint:
break
return result, result_details
def search_in_sentence_is_valid(self,
search_text: str,
text: str):
def search_in_sentence_is_valid(self, search_text: str, text: str):
search_text_regex = add_slash_to_text_as_regex(search_text)
search_regex = r"\n.*{0}.*\n".format(search_text_regex)
search_iter = re.finditer(search_regex, text, re.IGNORECASE)
is_valid = False
lower_word_count_threshold = 7
big_number_regex = r"\b\d{1,3}(\,\d{3})+\b"
for search in search_iter:
lower_word_count = 0
if search is not None:
@@ -183,20 +211,39 @@ class FilterPages:
if split[0].islower():
lower_word_count += 1
if lower_word_count < lower_word_count_threshold:
is_valid = True
break
if re.search(self.percentage_regex, search_text) is not None:
is_valid = True
break
new_search_text_regex = add_slash_to_text_as_regex(search_text)
new_search_regex = r"\n.*{0}.*\n(?P<next_line>.*)\n(?P<next_2_line>.*)\n".format(
new_search_text_regex
)
new_search = re.search(new_search_regex, text, re.IGNORECASE)
if new_search is not None:
next_line = new_search.group("next_line").strip()
next_2_line = new_search.group("next_2_line").strip()
if re.search(big_number_regex, next_line) is not None or \
re.search(big_number_regex, next_2_line) is not None:
is_valid = False
else:
is_valid = True
break
else:
is_valid = True
break
return is_valid
def search_keyword(self, text: str, keyword: str):
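# keyword at a word boundary, optionally followed by digits (e.g. footnote
# markers) and punctuation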
search_regex = r"\b{0}\d*\W*\s*\b".format(keyword)
return re.finditer(search_regex, text, re.IGNORECASE)
def search_exclude_keywords(self, text: str, datapoint: str):
exclude_keywords = self.datapoint_exclude_config.get(datapoint, [])
search_list = []
for keyword in exclude_keywords:
search_iter = self.search_keyword(text, keyword)
for search in search_iter:
search_list.append(search.group())
return search_list
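Taken together, the new checks in start_job and search_in_sentence_is_valid keep a keyword hit only if its (English) page shows a percentage-like figure and the two lines after the hit do not read like a statement table of raw amounts. A standalone sketch of the two regexes on invented fragments:

import re

percentage_regex = r"\b\d{1,3}\.\d+\b|\b\d{1,3}(\,\d+)\b%"
big_number_regex = r"\b\d{1,3}(\,\d{3})+\b"

# Invented examples: a ratio disclosure line vs. a statement-style block.
ratio_line = "Portfolio Turnover 45.67%"
statement_block = "Turnover\nProceeds from sales 1,234,567\nPurchases 2,345,678\n"

print(re.search(percentage_regex, ratio_line) is not None)       # True -> hit kept
print(re.search(big_number_regex, statement_block) is not None)  # True -> hit rejected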