From 7c83f9152a6d1a199f31bd27b48eaca2fab29dc9 Mon Sep 17 00:00:00 2001
From: Blade He
Date: Wed, 4 Sep 2024 17:01:12 -0500
Subject: [PATCH] try to improve page filter precision

---
 configuration/datapoint_exclude_keyword.json |   2 +-
 configuration/datapoint_keyword.json         |   2 --
 core/metrics.py                              |  25 +++--
 core/page_filter.py                          | 101 ++++++++++++++-----
 4 files changed, 92 insertions(+), 38 deletions(-)

diff --git a/configuration/datapoint_exclude_keyword.json b/configuration/datapoint_exclude_keyword.json
index 3da5413..a2ff244 100644
--- a/configuration/datapoint_exclude_keyword.json
+++ b/configuration/datapoint_exclude_keyword.json
@@ -12,6 +12,6 @@
         "english": ["operating expenses paid"]
     },
     "performance_fee": {
-        "english": []
+        "english": ["Performance fees payable"]
     }
 }
\ No newline at end of file
diff --git a/configuration/datapoint_keyword.json b/configuration/datapoint_keyword.json
index d003e0b..c54d312 100644
--- a/configuration/datapoint_keyword.json
+++ b/configuration/datapoint_keyword.json
@@ -200,8 +200,6 @@
     "tor": {
         "english": [
             "TOR",
-            "Turnover* \\n",
-            "Turnover \\n",
             "Turnover Ratio",
             "Turnover Rate",
             "Portfolio Turnover",
diff --git a/core/metrics.py b/core/metrics.py
index f97a7c5..815333c 100644
--- a/core/metrics.py
+++ b/core/metrics.py
@@ -113,61 +113,65 @@ class Metrics:
         for data_point in data_point_list:
             if data_point == "tor":
                 precision, recall, f1 = self.get_specific_metrics(tor_true, tor_pred)
+                tor_support = self.get_support_number(tor_true)
                 metrics_list.append(
                     {
                         "Data_Point": data_point,
                         "Precision": precision,
                         "Recall": recall,
                         "F1": f1,
-                        "Support": len(tor_true),
+                        "Support": tor_support,
                     }
                 )
                 logger.info(
-                    f"TOR Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(tor_true)}"
+                    f"TOR Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {tor_support}"
                 )
             elif data_point == "ter":
                 precision, recall, f1 = self.get_specific_metrics(ter_true, ter_pred)
+                ter_support = self.get_support_number(ter_true)
                 metrics_list.append(
                     {
                         "Data_Point": data_point,
                         "Precision": precision,
                         "Recall": recall,
                         "F1": f1,
-                        "Support": len(ter_true),
+                        "Support": ter_support,
                     }
                 )
                 logger.info(
-                    f"TER Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(ter_true)}"
+                    f"TER Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {ter_support}"
                 )
             elif data_point == "ogc":
                 precision, recall, f1 = self.get_specific_metrics(ogc_true, ogc_pred)
+                ogc_support = self.get_support_number(ogc_true)
                 metrics_list.append(
                     {
                         "Data_Point": data_point,
                         "Precision": precision,
                         "Recall": recall,
                         "F1": f1,
-                        "Support": len(ogc_true),
+                        "Support": ogc_support,
                     }
                 )
                 logger.info(
-                    f"OGC Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(ogc_true)}"
+                    f"OGC Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {ogc_support}"
                 )
             elif data_point == "performance_fee":
                 precision, recall, f1 = self.get_specific_metrics(
                     performance_fee_true, performance_fee_pred
                 )
+                performance_fee_support = self.get_support_number(performance_fee_true)
                 metrics_list.append(
                     {
                         "Data_Point": data_point,
                         "Precision": precision,
                         "Recall": recall,
                         "F1": f1,
-                        "Support": len(performance_fee_true),
+                        "Support": performance_fee_support,
                     }
                 )
                 logger.info(
-                    f"Performance Fee Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {len(performance_fee_true)}"
+                    f"Performance Fee Precision: {precision}, Recall: {recall}, F1: {f1}, Support: {performance_fee_support}"
                 )
         # get average metrics
@@ -184,6 +188,11 @@ class Metrics:
                 }
             )
         return missing_error_list, metrics_list
+
+    def get_support_number(self, true_data: list):
+        # count the entries in true_data that equal 1, i.e. the positive ground-truth labels
+        return sum(true_data)
+
     def get_true_pred_data(
         self, doc_id, ground_truth_data: pd.Series, prediction_data: pd.Series, data_point: str
diff --git a/core/page_filter.py b/core/page_filter.py
index 1a6b653..07328c3 100644
--- a/core/page_filter.py
+++ b/core/page_filter.py
@@ -22,7 +22,10 @@ class FilterPages:
         self.document_mapping_info_df = document_mapping_info_df
         self.get_configuration_from_file()
         self.doc_info = self.get_doc_info()
-        self.datapoint_config, self.datapoint_exclude_config = self.get_datapoint_config()
+        self.datapoint_config, self.datapoint_exclude_config = (
+            self.get_datapoint_config()
+        )
+        self.percentage_regex = r"\b\d{1,3}\.\d+\b|\b\d{1,3}(\,\d+)\b%"

     def get_pdf_page_text_dict(self) -> dict:
         pdf_util = PDFUtil(self.pdf_file)
@@ -30,17 +33,24 @@ class FilterPages:
         page_text_dict = pdf_util.get_page_text_dict()
         return page_text_dict

     def get_configuration_from_file(self) -> dict:
+        """
+        Remark: remove the
+        """
         language_config_file = r"./configuration/language.json"
         domicile_datapoint_config_file = r"./configuration/domicile_datapoints.json"
         datapoint_keywords_config_file = r"./configuration/datapoint_keyword.json"
-        datapoint_exclude_keywords_config_file = r"./configuration/datapoint_exclude_keyword.json"
+        datapoint_exclude_keywords_config_file = (
+            r"./configuration/datapoint_exclude_keyword.json"
+        )
         with open(language_config_file, "r", encoding="utf-8") as file:
             self.language_config = json.load(file)
         with open(domicile_datapoint_config_file, "r", encoding="utf-8") as file:
             self.domicile_datapoint_config = json.load(file)
         with open(datapoint_keywords_config_file, "r", encoding="utf-8") as file:
             self.datapoint_keywords_config = json.load(file)
-        with open(datapoint_exclude_keywords_config_file, "r", encoding="utf-8") as file:
+        with open(
+            datapoint_exclude_keywords_config_file, "r", encoding="utf-8"
+        ) as file:
             self.datapoint_exclude_keywords_config = json.load(file)

     def get_doc_info(self) -> dict:
@@ -68,7 +78,7 @@ class FilterPages:
             "language": language,
             "domicile": domicile,
         }
-
+
     def get_datapoint_config(self) -> dict:
         domicile = self.doc_info.get("domicile", None)
         document_type = self.doc_info.get("document_type", None)
@@ -79,12 +89,16 @@ class FilterPages:
             domicile = "default"
         if self.domicile_datapoint_config[domicile].get(document_type, None) is None:
             document_type = "ar"
-        datapoint_list = self.domicile_datapoint_config[domicile][document_type]
+        datapoint_list = self.domicile_datapoint_config[domicile][document_type]
         datapoint_keywords = self.get_keywords("include", datapoint_list, language)
-        datapoint_exclude_keywords = self.get_keywords("exclude", datapoint_list, language)
+        datapoint_exclude_keywords = self.get_keywords(
+            "exclude", datapoint_list, language
+        )
         return datapoint_keywords, datapoint_exclude_keywords
-
-    def get_keywords(self, keywords_type: str, datapoint_list: list, language: str) -> dict:
+
+    def get_keywords(
+        self, keywords_type: str, datapoint_list: list, language: str
+    ) -> dict:
         if keywords_type == "include":
             config = self.datapoint_keywords_config
         elif keywords_type == "exclude":
@@ -92,7 +106,7 @@ class FilterPages:
         else:
             config = self.datapoint_keywords_config
         datapoint_keywords = {}
-
+
         for datapoint in datapoint_list:
             keywords = config.get(datapoint, {}).get(language, [])
             if len(keywords) > 0:
@@ -103,15 +117,15 @@ class FilterPages:
                 if len(english_keywords) > 0:
                     english_keywords = self.optimize_keywords_regex(english_keywords)
                     datapoint_keywords[datapoint] += english_keywords
-        return datapoint_keywords
-
+        return datapoint_keywords
+
     def optimize_keywords_regex(self, keywords: list) -> list:
         new_keywords = []
         for keyword in keywords:
             new_keyword = add_slash_to_text_as_regex(keyword)
             new_keywords.append(new_keyword)
         return new_keywords
-
+
     def start_job(self) -> dict:
         logger.info(f"Start extracting datapoints from {self.pdf_file}")
         """
@@ -129,16 +143,31 @@ class FilterPages:
         for datapoint in self.datapoint_config.keys():
             result[datapoint] = []
         for page_num, page_text in self.page_text_dict.items():
-            text = "\n" + clean_text(page_text) + "\n"
+            if page_num < 2:
+                continue
+            page_text = clean_text(page_text)
+            text_split = [
+                sentence.strip()
+                for sentence in page_text.split("\n")
+                if len(sentence.strip()) > 0
+            ]
+            text = "\n" + "\n".join(text_split) + "\n"
+            language = self.doc_info.get("language", None)
+            if language is None:
+                language = "english"
+            if language == "english" and re.search(self.percentage_regex, text) is None:
+                continue
             for datapoint, keywords in self.datapoint_config.items():
-                # idx = idx & np.array([re.findall(r'\b' + word + r'\d*\b', page) != [] for page in self.pages_clean])
                 find_datapoint = False
+
                 for keyword in keywords:
                     search_iter = self.search_keyword(text, keyword)
                     for search in search_iter:
                         search_text = search.group().strip()
-                        exclude_search_list = self.search_exclude_keywords(text, datapoint)
-                        if exclude_search_list is not None:
+                        exclude_search_list = self.search_exclude_keywords(
+                            text, datapoint
+                        )
+                        if exclude_search_list is not None and len(exclude_search_list) > 0:
                             need_exclude = False
                             for exclude_search_text in exclude_search_list:
                                 if search_text in exclude_search_text:
@@ -146,7 +175,7 @@ class FilterPages:
                                     break
                             if need_exclude:
                                 continue
-
+
                             is_valid = self.search_in_sentence_is_valid(search_text, text)
                             if not is_valid:
                                 continue
@@ -156,7 +185,7 @@ class FilterPages:
                                 "datapoint": datapoint,
                                 "page_num": page_num,
                                 "keyword": keyword,
-                                "text": search_text
+                                "text": search_text,
                             }
                             result_details.append(detail)
                             find_datapoint = True
@@ -164,15 +193,14 @@ class FilterPages:
             if find_datapoint:
                 break
         return result, result_details
-
-    def search_in_sentence_is_valid(self,
-                                    search_text: str,
-                                    text: str):
+
+    def search_in_sentence_is_valid(self, search_text: str, text: str):
         search_text_regex = add_slash_to_text_as_regex(search_text)
         search_regex = r"\n.*{0}.*\n".format(search_text_regex)
         search_iter = re.finditer(search_regex, text, re.IGNORECASE)
         is_valid = False
         lower_word_count_threshold = 7
+        big_number_regex = r"\b\d{1,3}(\,\d{3})+\b"
         for search in search_iter:
             lower_word_count = 0
             if search is not None:
@@ -183,20 +211,39 @@ class FilterPages:
                     if split[0].islower():
                         lower_word_count += 1
                 if lower_word_count < lower_word_count_threshold:
-                    is_valid = True
-                    break
+                    if re.search(self.percentage_regex, search_text) is not None:
+                        is_valid = True
+                        break
+                    new_search_text_regex = add_slash_to_text_as_regex(search_text)
+                    new_search_regex = r"\n.*{0}.*\n(?P<next_line>.*)\n(?P<next_2_line>.*)\n".format(
+                        new_search_text_regex
+                    )
+                    new_search = re.search(new_search_regex, text, re.IGNORECASE)
+                    if new_search is not None:
+                        next_line = new_search.group("next_line").strip()
+                        next_2_line = new_search.group("next_2_line").strip()
+
+                        if re.search(big_number_regex, next_line) is not None or \
+                           re.search(big_number_regex, next_2_line) is not None:
+                            is_valid = False
+                        else:
+                            is_valid = True
+                            break
+                    else:
+                        is_valid = True
+                        break
         return is_valid

     def search_keyword(self, text: str, keyword: str):
r"\b{0}\d*\W*\s*\b".format(keyword) return re.finditer(search_regex, text, re.IGNORECASE) - + def search_exclude_keywords(self, text: str, datapoint: str): exclude_keywords = self.datapoint_exclude_config.get(datapoint, []) search_list = [] for keyword in exclude_keywords: search_iter = self.search_keyword(text, keyword) - + for search in search_iter: search_list.append(search.group()) - return search_list \ No newline at end of file + return search_list