try to improve page filter precision

parent 7198450e53
commit 7c83f9152a
@@ -12,6 +12,6 @@
     "english": ["operating expenses paid"]
   },
   "performance_fee": {
-    "english": []
+    "english": ["Performance fees payable"]
   }
 }
@@ -200,8 +200,6 @@
     "tor": {
       "english": [
         "TOR",
         "Turnover* \\n",
         "Turnover \\n",
         "Turnover Ratio",
         "Turnover Rate",
         "Portfolio Turnover",
@@ -113,61 +113,65 @@ class Metrics:
         for data_point in data_point_list:
             if data_point == "tor":
                 precision, recall, f1 = self.get_specific_metrics(tor_true, tor_pred)
+                tor_support = self.get_support_number(tor_true)
                 metrics_list.append(
                     {
                         "Data_Point": data_point,
                         "Precision": precision,
                         "Recall": recall,
                         "F1": f1,
-                        "Support": len(tor_true),
+                        "Support": tor_support,
                     }
                 )
                 logger.info(
-                    f"TOR Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(tor_true)}"
+                    f"TOR Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {tor_support}"
                 )
             elif data_point == "ter":
                 precision, recall, f1 = self.get_specific_metrics(ter_true, ter_pred)
+                ter_support = self.get_support_number(ter_true)
                 metrics_list.append(
                     {
                         "Data_Point": data_point,
                         "Precision": precision,
                         "Recall": recall,
                         "F1": f1,
-                        "Support": len(ter_true),
+                        "Support": ter_support,
                     }
                 )
                 logger.info(
-                    f"TER Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(ter_true)}"
+                    f"TER Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {ter_support}"
                 )
             elif data_point == "ogc":
                 precision, recall, f1 = self.get_specific_metrics(ogc_true, ogc_pred)
+                ogc_support = self.get_support_number(ogc_true)
                 metrics_list.append(
                     {
                         "Data_Point": data_point,
                         "Precision": precision,
                         "Recall": recall,
                         "F1": f1,
-                        "Support": len(ogc_true),
+                        "Support": ogc_support,
                     }
                 )
                 logger.info(
-                    f"OGC Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(ogc_true)}"
+                    f"OGC Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {ogc_support}"
                 )
             elif data_point == "performance_fee":
                 precision, recall, f1 = self.get_specific_metrics(
                     performance_fee_true, performance_fee_pred
                 )
+                performance_fee_support = self.get_support_number(performance_fee_true)
                 metrics_list.append(
                     {
                         "Data_Point": data_point,
                         "Precision": precision,
                         "Recall": recall,
                         "F1": f1,
-                        "Support": len(performance_fee_true),
+                        "Support": performance_fee_support,
                     }
                 )
                 logger.info(
-                    f"Performance Fee Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(performance_fee_true)}"
+                    f"Performance Fee Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {performance_fee_support}"
                 )
 
         # get average metrics
@@ -185,6 +189,11 @@ class Metrics:
         )
         return missing_error_list, metrics_list
 
+    def get_support_number(self, true_data: list):
+        # count the entries in true_data that are 1 (positive labels)
+        return sum(true_data)
+
+
     def get_true_pred_data(
         self, doc_id, ground_truth_data: pd.Series, prediction_data: pd.Series, data_point: str
     ):
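The new helper changes what Support reports: the count of positive labels instead of the number of evaluated examples, which is the usual meaning of support in classification reports. A tiny illustration with hypothetical labels:

# Hypothetical 0/1 ground-truth labels (1 = the datapoint is present).
tor_true = [1, 0, 1, 1, 0]

print(len(tor_true))  # 5 -- old "Support": all evaluated examples
print(sum(tor_true))  # 3 -- new "Support": positive examples only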
@@ -22,7 +22,10 @@ class FilterPages:
         self.document_mapping_info_df = document_mapping_info_df
         self.get_configuration_from_file()
         self.doc_info = self.get_doc_info()
-        self.datapoint_config, self.datapoint_exclude_config = self.get_datapoint_config()
+        self.datapoint_config, self.datapoint_exclude_config = (
+            self.get_datapoint_config()
+        )
+        self.percentage_regex = r"\b\d{1,3}\.\d+\b|\b\d{1,3}(\,\d+)\b%"
 
     def get_pdf_page_text_dict(self) -> dict:
         pdf_util = PDFUtil(self.pdf_file)
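A quick standalone check of what the newly added percentage_regex accepts (pattern copied from the hunk above). Note that the decimal branch matches without a trailing "%"; only the comma branch requires it:

import re

percentage_regex = r"\b\d{1,3}\.\d+\b|\b\d{1,3}(\,\d+)\b%"

for sample in ["turnover ratio 12.34", "performance fee 0,75%", "no figures here"]:
    print(sample, "->", bool(re.search(percentage_regex, sample)))
# turnover ratio 12.34  -> True   (decimal branch, no "%" needed)
# performance fee 0,75% -> True   (comma branch requires the "%")
# no figures here       -> False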
@@ -30,17 +33,24 @@ class FilterPages:
         return page_text_dict
 
     def get_configuration_from_file(self) -> dict:
+        """
+        Remark: remove the
+        """
         language_config_file = r"./configuration/language.json"
         domicile_datapoint_config_file = r"./configuration/domicile_datapoints.json"
         datapoint_keywords_config_file = r"./configuration/datapoint_keyword.json"
-        datapoint_exclude_keywords_config_file = r"./configuration/datapoint_exclude_keyword.json"
+        datapoint_exclude_keywords_config_file = (
+            r"./configuration/datapoint_exclude_keyword.json"
+        )
         with open(language_config_file, "r", encoding="utf-8") as file:
             self.language_config = json.load(file)
         with open(domicile_datapoint_config_file, "r", encoding="utf-8") as file:
             self.domicile_datapoint_config = json.load(file)
         with open(datapoint_keywords_config_file, "r", encoding="utf-8") as file:
             self.datapoint_keywords_config = json.load(file)
-        with open(datapoint_exclude_keywords_config_file, "r", encoding="utf-8") as file:
+        with open(
+            datapoint_exclude_keywords_config_file, "r", encoding="utf-8"
+        ) as file:
             self.datapoint_exclude_keywords_config = json.load(file)
 
     def get_doc_info(self) -> dict:
@@ -81,10 +91,14 @@ class FilterPages:
             document_type = "ar"
         datapoint_list = self.domicile_datapoint_config[domicile][document_type]
         datapoint_keywords = self.get_keywords("include", datapoint_list, language)
-        datapoint_exclude_keywords = self.get_keywords("exclude", datapoint_list, language)
+        datapoint_exclude_keywords = self.get_keywords(
+            "exclude", datapoint_list, language
+        )
         return datapoint_keywords, datapoint_exclude_keywords
 
-    def get_keywords(self, keywords_type: str, datapoint_list: list, language: str) -> dict:
+    def get_keywords(
+        self, keywords_type: str, datapoint_list: list, language: str
+    ) -> dict:
         if keywords_type == "include":
             config = self.datapoint_keywords_config
         elif keywords_type == "exclude":
@@ -129,16 +143,31 @@ class FilterPages:
         for datapoint in self.datapoint_config.keys():
             result[datapoint] = []
         for page_num, page_text in self.page_text_dict.items():
-            text = "\n" + clean_text(page_text) + "\n"
+            if page_num < 2:
+                continue
+            page_text = clean_text(page_text)
+            text_split = [
+                sentence.strip()
+                for sentence in page_text.split("\n")
+                if len(sentence.strip()) > 0
+            ]
+            text = "\n" + "\n".join(text_split) + "\n"
+            language = self.doc_info.get("language", None)
+            if language is None:
+                language = "english"
+            if language == "english" and re.search(self.percentage_regex, text) is None:
+                continue
             for datapoint, keywords in self.datapoint_config.items():
                 # idx = idx & np.array([re.findall(r'\b' + word + r'\d*\b', page) != [] for page in self.pages_clean])
                 find_datapoint = False
 
                 for keyword in keywords:
                     search_iter = self.search_keyword(text, keyword)
                     for search in search_iter:
                         search_text = search.group().strip()
-                        exclude_search_list = self.search_exclude_keywords(text, datapoint)
-                        if exclude_search_list is not None:
+                        exclude_search_list = self.search_exclude_keywords(
+                            text, datapoint
+                        )
+                        if exclude_search_list is not None and len(exclude_search_list) > 0:
                             need_exclude = False
                             for exclude_search_text in exclude_search_list:
                                 if search_text in exclude_search_text:
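Condensed, the gate added ahead of keyword matching: skip the first two pages, rebuild the page text with blank lines stripped, and require English pages to contain a percentage-like figure. A hypothetical standalone version, for illustration only (not the actual method):

import re

def page_passes_prefilter(page_num: int, text: str, language: str | None,
                          percentage_regex: str) -> bool:
    # Hypothetical condensation of the new gating logic above.
    if page_num < 2:  # skip cover / table-of-contents pages
        return False
    if language is None:
        language = "english"
    if language == "english" and re.search(percentage_regex, text) is None:
        return False  # no percentage-like figure, so no fee/turnover data expected
    return True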
@@ -156,7 +185,7 @@ class FilterPages:
                             "datapoint": datapoint,
                             "page_num": page_num,
                             "keyword": keyword,
-                            "text": search_text
+                            "text": search_text,
                         }
                         result_details.append(detail)
                         find_datapoint = True
@@ -165,14 +194,13 @@ class FilterPages:
                 break
         return result, result_details
 
-    def search_in_sentence_is_valid(self,
-                                    search_text: str,
-                                    text: str):
+    def search_in_sentence_is_valid(self, search_text: str, text: str):
         search_text_regex = add_slash_to_text_as_regex(search_text)
         search_regex = r"\n.*{0}.*\n".format(search_text_regex)
         search_iter = re.finditer(search_regex, text, re.IGNORECASE)
         is_valid = False
         lower_word_count_threshold = 7
+        big_number_regex = r"\b\d{1,3}(\,\d{3})+\b"
         for search in search_iter:
             lower_word_count = 0
             if search is not None:
@@ -183,6 +211,25 @@ class FilterPages:
                     if split[0].islower():
                         lower_word_count += 1
                 if lower_word_count < lower_word_count_threshold:
-                    is_valid = True
-                    break
+                    if re.search(self.percentage_regex, search_text) is not None:
+                        is_valid = True
+                        break
+                    new_search_text_regex = add_slash_to_text_as_regex(search_text)
+                    new_search_regex = r"\n.*{0}.*\n(?P<next_line>.*)\n(?P<next_2_line>.*)\n".format(
+                        new_search_text_regex
+                    )
+                    new_search = re.search(new_search_regex, text, re.IGNORECASE)
+                    if new_search is not None:
+                        next_line = new_search.group("next_line").strip()
+                        next_2_line = new_search.group("next_2_line").strip()
+
+                        if re.search(big_number_regex, next_line) is not None or \
+                                re.search(big_number_regex, next_2_line) is not None:
+                            is_valid = False
+                        else:
+                            is_valid = True
+                            break
+                    else:
+                        is_valid = True
+                        break
         return is_valid
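As far as the diff shows, the added branch keeps a keyword hit that lacks its own percentage only when the next two lines do not look like financial-statement figures; big_number_regex flags comma-grouped thousands as such. A quick standalone check of that pattern:

import re

big_number_regex = r"\b\d{1,3}(\,\d{3})+\b"  # e.g. 1,234 or 12,345,678

for line in ["Total net assets 1,234,567", "Turnover Ratio 12.34%", "see note 5"]:
    print(line, "->", bool(re.search(big_number_regex, line)))
# Total net assets 1,234,567 -> True   (statement-style figure: hit rejected)
# Turnover Ratio 12.34%      -> False  (hit kept)
# see note 5                 -> False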