try to improve page filter precision
This commit is contained in:
parent
7198450e53
commit
7c83f9152a
|
|
@ -12,6 +12,6 @@
|
||||||
"english": ["operating expenses paid"]
|
"english": ["operating expenses paid"]
|
||||||
},
|
},
|
||||||
"performance_fee": {
|
"performance_fee": {
|
||||||
"english": []
|
"english": ["Performance fees payable"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -200,8 +200,6 @@
|
||||||
"tor": {
|
"tor": {
|
||||||
"english": [
|
"english": [
|
||||||
"TOR",
|
"TOR",
|
||||||
"Turnover* \\n",
|
|
||||||
"Turnover \\n",
|
|
||||||
"Turnover Ratio",
|
"Turnover Ratio",
|
||||||
"Turnover Rate",
|
"Turnover Rate",
|
||||||
"Portfolio Turnover",
|
"Portfolio Turnover",
|
||||||
|
|
|
||||||
|
|
@ -113,61 +113,65 @@ class Metrics:
|
||||||
for data_point in data_point_list:
|
for data_point in data_point_list:
|
||||||
if data_point == "tor":
|
if data_point == "tor":
|
||||||
precision, recall, f1 = self.get_specific_metrics(tor_true, tor_pred)
|
precision, recall, f1 = self.get_specific_metrics(tor_true, tor_pred)
|
||||||
|
tor_support = self.get_support_number(tor_true)
|
||||||
metrics_list.append(
|
metrics_list.append(
|
||||||
{
|
{
|
||||||
"Data_Point": data_point,
|
"Data_Point": data_point,
|
||||||
"Precision": precision,
|
"Precision": precision,
|
||||||
"Recall": recall,
|
"Recall": recall,
|
||||||
"F1": f1,
|
"F1": f1,
|
||||||
"Support": len(tor_true),
|
"Support": tor_support,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"TOR Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(tor_true)}"
|
f"TOR Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {tor_support}"
|
||||||
)
|
)
|
||||||
elif data_point == "ter":
|
elif data_point == "ter":
|
||||||
precision, recall, f1 = self.get_specific_metrics(ter_true, ter_pred)
|
precision, recall, f1 = self.get_specific_metrics(ter_true, ter_pred)
|
||||||
|
ter_support = self.get_support_number(ter_true)
|
||||||
metrics_list.append(
|
metrics_list.append(
|
||||||
{
|
{
|
||||||
"Data_Point": data_point,
|
"Data_Point": data_point,
|
||||||
"Precision": precision,
|
"Precision": precision,
|
||||||
"Recall": recall,
|
"Recall": recall,
|
||||||
"F1": f1,
|
"F1": f1,
|
||||||
"Support": len(ter_true),
|
"Support": ter_support,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"TER Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(ter_true)}"
|
f"TER Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {ter_support}"
|
||||||
)
|
)
|
||||||
elif data_point == "ogc":
|
elif data_point == "ogc":
|
||||||
precision, recall, f1 = self.get_specific_metrics(ogc_true, ogc_pred)
|
precision, recall, f1 = self.get_specific_metrics(ogc_true, ogc_pred)
|
||||||
|
ogc_support = self.get_support_number(ogc_true)
|
||||||
metrics_list.append(
|
metrics_list.append(
|
||||||
{
|
{
|
||||||
"Data_Point": data_point,
|
"Data_Point": data_point,
|
||||||
"Precision": precision,
|
"Precision": precision,
|
||||||
"Recall": recall,
|
"Recall": recall,
|
||||||
"F1": f1,
|
"F1": f1,
|
||||||
"Support": len(ogc_true),
|
"Support": ogc_support,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"OGC Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(ogc_true)}"
|
f"OGC Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {ogc_support}"
|
||||||
)
|
)
|
||||||
elif data_point == "performance_fee":
|
elif data_point == "performance_fee":
|
||||||
precision, recall, f1 = self.get_specific_metrics(
|
precision, recall, f1 = self.get_specific_metrics(
|
||||||
performance_fee_true, performance_fee_pred
|
performance_fee_true, performance_fee_pred
|
||||||
)
|
)
|
||||||
|
performance_fee_support = self.get_support_number(performance_fee_true)
|
||||||
metrics_list.append(
|
metrics_list.append(
|
||||||
{
|
{
|
||||||
"Data_Point": data_point,
|
"Data_Point": data_point,
|
||||||
"Precision": precision,
|
"Precision": precision,
|
||||||
"Recall": recall,
|
"Recall": recall,
|
||||||
"F1": f1,
|
"F1": f1,
|
||||||
"Support": len(performance_fee_true),
|
"Support": performance_fee_support,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Performance Fee Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(performance_fee_true)}"
|
f"Performance Fee Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {performance_fee_support}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# get average metrics
|
# get average metrics
|
||||||
|
|
@ -185,6 +189,11 @@ class Metrics:
|
||||||
)
|
)
|
||||||
return missing_error_list, metrics_list
|
return missing_error_list, metrics_list
|
||||||
|
|
||||||
|
def get_support_number(self, true_data: list):
|
||||||
|
# count the entries in true_data that are 1
|
||||||
|
return sum(true_data)
|
||||||
|
|
||||||
|
|
||||||
def get_true_pred_data(
|
def get_true_pred_data(
|
||||||
self, doc_id, ground_truth_data: pd.Series, prediction_data: pd.Series, data_point: str
|
self, doc_id, ground_truth_data: pd.Series, prediction_data: pd.Series, data_point: str
|
||||||
):
|
):
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,10 @@ class FilterPages:
|
||||||
self.document_mapping_info_df = document_mapping_info_df
|
self.document_mapping_info_df = document_mapping_info_df
|
||||||
self.get_configuration_from_file()
|
self.get_configuration_from_file()
|
||||||
self.doc_info = self.get_doc_info()
|
self.doc_info = self.get_doc_info()
|
||||||
self.datapoint_config, self.datapoint_exclude_config = self.get_datapoint_config()
|
self.datapoint_config, self.datapoint_exclude_config = (
|
||||||
|
self.get_datapoint_config()
|
||||||
|
)
|
||||||
|
self.percentage_regex = r"\b\d{1,3}\.\d+\b|\b\d{1,3}(\,\d+)\b%"
|
||||||
|
|
||||||
def get_pdf_page_text_dict(self) -> dict:
|
def get_pdf_page_text_dict(self) -> dict:
|
||||||
pdf_util = PDFUtil(self.pdf_file)
|
pdf_util = PDFUtil(self.pdf_file)
|
||||||
|
|
@ -30,17 +33,24 @@ class FilterPages:
|
||||||
return page_text_dict
|
return page_text_dict
|
||||||
|
|
||||||
def get_configuration_from_file(self) -> dict:
|
def get_configuration_from_file(self) -> dict:
|
||||||
|
"""
|
||||||
|
Remark: remove the
|
||||||
|
"""
|
||||||
language_config_file = r"./configuration/language.json"
|
language_config_file = r"./configuration/language.json"
|
||||||
domicile_datapoint_config_file = r"./configuration/domicile_datapoints.json"
|
domicile_datapoint_config_file = r"./configuration/domicile_datapoints.json"
|
||||||
datapoint_keywords_config_file = r"./configuration/datapoint_keyword.json"
|
datapoint_keywords_config_file = r"./configuration/datapoint_keyword.json"
|
||||||
datapoint_exclude_keywords_config_file = r"./configuration/datapoint_exclude_keyword.json"
|
datapoint_exclude_keywords_config_file = (
|
||||||
|
r"./configuration/datapoint_exclude_keyword.json"
|
||||||
|
)
|
||||||
with open(language_config_file, "r", encoding="utf-8") as file:
|
with open(language_config_file, "r", encoding="utf-8") as file:
|
||||||
self.language_config = json.load(file)
|
self.language_config = json.load(file)
|
||||||
with open(domicile_datapoint_config_file, "r", encoding="utf-8") as file:
|
with open(domicile_datapoint_config_file, "r", encoding="utf-8") as file:
|
||||||
self.domicile_datapoint_config = json.load(file)
|
self.domicile_datapoint_config = json.load(file)
|
||||||
with open(datapoint_keywords_config_file, "r", encoding="utf-8") as file:
|
with open(datapoint_keywords_config_file, "r", encoding="utf-8") as file:
|
||||||
self.datapoint_keywords_config = json.load(file)
|
self.datapoint_keywords_config = json.load(file)
|
||||||
with open(datapoint_exclude_keywords_config_file, "r", encoding="utf-8") as file:
|
with open(
|
||||||
|
datapoint_exclude_keywords_config_file, "r", encoding="utf-8"
|
||||||
|
) as file:
|
||||||
self.datapoint_exclude_keywords_config = json.load(file)
|
self.datapoint_exclude_keywords_config = json.load(file)
|
||||||
|
|
||||||
def get_doc_info(self) -> dict:
|
def get_doc_info(self) -> dict:
|
||||||
|
|
@ -79,12 +89,16 @@ class FilterPages:
|
||||||
domicile = "default"
|
domicile = "default"
|
||||||
if self.domicile_datapoint_config[domicile].get(document_type, None) is None:
|
if self.domicile_datapoint_config[domicile].get(document_type, None) is None:
|
||||||
document_type = "ar"
|
document_type = "ar"
|
||||||
datapoint_list = self.domicile_datapoint_config[domicile][document_type]
|
datapoint_list = self.domicile_datapoint_config[domicile][document_type]
|
||||||
datapoint_keywords = self.get_keywords("include", datapoint_list, language)
|
datapoint_keywords = self.get_keywords("include", datapoint_list, language)
|
||||||
datapoint_exclude_keywords = self.get_keywords("exclude", datapoint_list, language)
|
datapoint_exclude_keywords = self.get_keywords(
|
||||||
|
"exclude", datapoint_list, language
|
||||||
|
)
|
||||||
return datapoint_keywords, datapoint_exclude_keywords
|
return datapoint_keywords, datapoint_exclude_keywords
|
||||||
|
|
||||||
def get_keywords(self, keywords_type: str, datapoint_list: list, language: str) -> dict:
|
def get_keywords(
|
||||||
|
self, keywords_type: str, datapoint_list: list, language: str
|
||||||
|
) -> dict:
|
||||||
if keywords_type == "include":
|
if keywords_type == "include":
|
||||||
config = self.datapoint_keywords_config
|
config = self.datapoint_keywords_config
|
||||||
elif keywords_type == "exclude":
|
elif keywords_type == "exclude":
|
||||||
|
|
@ -129,16 +143,31 @@ class FilterPages:
|
||||||
for datapoint in self.datapoint_config.keys():
|
for datapoint in self.datapoint_config.keys():
|
||||||
result[datapoint] = []
|
result[datapoint] = []
|
||||||
for page_num, page_text in self.page_text_dict.items():
|
for page_num, page_text in self.page_text_dict.items():
|
||||||
text = "\n" + clean_text(page_text) + "\n"
|
if page_num < 2:
|
||||||
|
continue
|
||||||
|
page_text = clean_text(page_text)
|
||||||
|
text_split = [
|
||||||
|
sentence.strip()
|
||||||
|
for sentence in page_text.split("\n")
|
||||||
|
if len(sentence.strip()) > 0
|
||||||
|
]
|
||||||
|
text = "\n" + "\n".join(text_split) + "\n"
|
||||||
|
language = self.doc_info.get("language", None)
|
||||||
|
if language is None:
|
||||||
|
language = "english"
|
||||||
|
if language == "english" and re.search(self.percentage_regex, text) is None:
|
||||||
|
continue
|
||||||
for datapoint, keywords in self.datapoint_config.items():
|
for datapoint, keywords in self.datapoint_config.items():
|
||||||
# idx = idx & np.array([re.findall(r'\b' + word + r'\d*\b', page) != [] for page in self.pages_clean])
|
|
||||||
find_datapoint = False
|
find_datapoint = False
|
||||||
|
|
||||||
for keyword in keywords:
|
for keyword in keywords:
|
||||||
search_iter = self.search_keyword(text, keyword)
|
search_iter = self.search_keyword(text, keyword)
|
||||||
for search in search_iter:
|
for search in search_iter:
|
||||||
search_text = search.group().strip()
|
search_text = search.group().strip()
|
||||||
exclude_search_list = self.search_exclude_keywords(text, datapoint)
|
exclude_search_list = self.search_exclude_keywords(
|
||||||
if exclude_search_list is not None:
|
text, datapoint
|
||||||
|
)
|
||||||
|
if exclude_search_list is not None and len(exclude_search_list) > 0:
|
||||||
need_exclude = False
|
need_exclude = False
|
||||||
for exclude_search_text in exclude_search_list:
|
for exclude_search_text in exclude_search_list:
|
||||||
if search_text in exclude_search_text:
|
if search_text in exclude_search_text:
|
||||||
|
|
@ -156,7 +185,7 @@ class FilterPages:
|
||||||
"datapoint": datapoint,
|
"datapoint": datapoint,
|
||||||
"page_num": page_num,
|
"page_num": page_num,
|
||||||
"keyword": keyword,
|
"keyword": keyword,
|
||||||
"text": search_text
|
"text": search_text,
|
||||||
}
|
}
|
||||||
result_details.append(detail)
|
result_details.append(detail)
|
||||||
find_datapoint = True
|
find_datapoint = True
|
||||||
|
|
@ -165,14 +194,13 @@ class FilterPages:
|
||||||
break
|
break
|
||||||
return result, result_details
|
return result, result_details
|
||||||
|
|
||||||
def search_in_sentence_is_valid(self,
|
def search_in_sentence_is_valid(self, search_text: str, text: str):
|
||||||
search_text: str,
|
|
||||||
text: str):
|
|
||||||
search_text_regex = add_slash_to_text_as_regex(search_text)
|
search_text_regex = add_slash_to_text_as_regex(search_text)
|
||||||
search_regex = r"\n.*{0}.*\n".format(search_text_regex)
|
search_regex = r"\n.*{0}.*\n".format(search_text_regex)
|
||||||
search_iter = re.finditer(search_regex, text, re.IGNORECASE)
|
search_iter = re.finditer(search_regex, text, re.IGNORECASE)
|
||||||
is_valid = False
|
is_valid = False
|
||||||
lower_word_count_threshold = 7
|
lower_word_count_threshold = 7
|
||||||
|
big_number_regex = r"\b\d{1,3}(\,\d{3})+\b"
|
||||||
for search in search_iter:
|
for search in search_iter:
|
||||||
lower_word_count = 0
|
lower_word_count = 0
|
||||||
if search is not None:
|
if search is not None:
|
||||||
|
|
@ -183,8 +211,27 @@ class FilterPages:
|
||||||
if split[0].islower():
|
if split[0].islower():
|
||||||
lower_word_count += 1
|
lower_word_count += 1
|
||||||
if lower_word_count < lower_word_count_threshold:
|
if lower_word_count < lower_word_count_threshold:
|
||||||
is_valid = True
|
if re.search(self.percentage_regex, search_text) is not None:
|
||||||
break
|
is_valid = True
|
||||||
|
break
|
||||||
|
new_search_text_regex = add_slash_to_text_as_regex(search_text)
|
||||||
|
new_search_regex = r"\n.*{0}.*\n(?P<next_line>.*)\n(?P<next_2_line>.*)\n".format(
|
||||||
|
new_search_text_regex
|
||||||
|
)
|
||||||
|
new_search = re.search(new_search_regex, text, re.IGNORECASE)
|
||||||
|
if new_search is not None:
|
||||||
|
next_line = new_search.group("next_line").strip()
|
||||||
|
next_2_line = new_search.group("next_2_line").strip()
|
||||||
|
|
||||||
|
if re.search(big_number_regex, next_line) is not None or \
|
||||||
|
re.search(big_number_regex, next_2_line) is not None:
|
||||||
|
is_valid = False
|
||||||
|
else:
|
||||||
|
is_valid = True
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
is_valid = True
|
||||||
|
break
|
||||||
return is_valid
|
return is_valid
|
||||||
|
|
||||||
def search_keyword(self, text: str, keyword: str):
|
def search_keyword(self, text: str, keyword: str):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue