diff --git a/core/media/__pycache__/truth_social_retriever.cpython-312.pyc b/core/media/__pycache__/truth_social_retriever.cpython-312.pyc index 3636ac0..19652a3 100644 Binary files a/core/media/__pycache__/truth_social_retriever.cpython-312.pyc and b/core/media/__pycache__/truth_social_retriever.cpython-312.pyc differ diff --git a/core/media/truth_social_retriever.py b/core/media/truth_social_retriever.py index 477ce8a..8fddf63 100644 --- a/core/media/truth_social_retriever.py +++ b/core/media/truth_social_retriever.py @@ -31,6 +31,7 @@ class TruthSocialRetriever: self.db_url = f"mysql+pymysql://{mysql_user}:{mysql_password}@{mysql_host}:{mysql_port}/{mysql_database}" self.db_truth_social_content = DBTruthSocialContent(self.db_url) + self.article_ids_txt_file = r"./output/media/truth_social/article_ids.txt" trump_key = WECHAT_CONFIG.get("trump_key", "") if trump_key: @@ -174,6 +175,8 @@ class TruthSocialRetriever: result_df["analysis_result"] = "" result_df["analysis_token"] = 0 result_df = self.send_wechat_message(result_df, user_full_name) + article_ids = result_df["article_id"].tolist() + self.append_article_ids_to_txt(article_ids) result_df = result_df[ [ "article_id", @@ -207,12 +210,34 @@ class TruthSocialRetriever: self.send_wechat_message(result_df) else: logger.info(f"没有数据需要发送企业微信消息") + + def get_article_ids_from_txt(self): + if not os.path.exists(self.article_ids_txt_file): + return [] + with open(self.article_ids_txt_file, "r", encoding="utf-8") as f: + article_ids = f.readlines() + article_ids = [article_id.strip() for article_id in article_ids if article_id.strip()] + return article_ids + + def append_article_ids_to_txt(self, article_ids: list): + if article_ids is not None and len(article_ids) > 0: + if not os.path.exists(self.article_ids_txt_file): + with open(self.article_ids_txt_file, "w", encoding="utf-8") as f: + pass + with open(self.article_ids_txt_file, "a", encoding="utf-8") as f: + for article_id in article_ids: + f.write(article_id + "\n") def remove_duplicate_posts(self, result_df: pd.DataFrame): try: + article_ids = self.get_article_ids_from_txt() duplicate_index_list = [] for index, row in result_df.iterrows(): article_id = row["article_id"] + + if article_id in article_ids: + duplicate_index_list.append(index) + continue exist_data = self.db_truth_social_content.query_data_by_article_id( article_id )