Support removing duplicate articles using local records
This commit is contained in:
parent
d10094f048
commit
13cf259bb0
Binary file not shown.
|
|
@ -31,6 +31,7 @@ class TruthSocialRetriever:
|
||||||
|
|
||||||
self.db_url = f"mysql+pymysql://{mysql_user}:{mysql_password}@{mysql_host}:{mysql_port}/{mysql_database}"
|
self.db_url = f"mysql+pymysql://{mysql_user}:{mysql_password}@{mysql_host}:{mysql_port}/{mysql_database}"
|
||||||
self.db_truth_social_content = DBTruthSocialContent(self.db_url)
|
self.db_truth_social_content = DBTruthSocialContent(self.db_url)
|
||||||
|
self.article_ids_txt_file = r"./output/media/truth_social/article_ids.txt"
|
||||||
|
|
||||||
trump_key = WECHAT_CONFIG.get("trump_key", "")
|
trump_key = WECHAT_CONFIG.get("trump_key", "")
|
||||||
if trump_key:
|
if trump_key:
|
||||||
|
|
@ -174,6 +175,8 @@ class TruthSocialRetriever:
|
||||||
result_df["analysis_result"] = ""
|
result_df["analysis_result"] = ""
|
||||||
result_df["analysis_token"] = 0
|
result_df["analysis_token"] = 0
|
||||||
result_df = self.send_wechat_message(result_df, user_full_name)
|
result_df = self.send_wechat_message(result_df, user_full_name)
|
||||||
|
article_ids = result_df["article_id"].tolist()
|
||||||
|
self.append_article_ids_to_txt(article_ids)
|
||||||
result_df = result_df[
|
result_df = result_df[
|
||||||
[
|
[
|
||||||
"article_id",
|
"article_id",
|
||||||
|
|
@ -207,12 +210,34 @@ class TruthSocialRetriever:
|
||||||
self.send_wechat_message(result_df)
|
self.send_wechat_message(result_df)
|
||||||
else:
|
else:
|
||||||
logger.info(f"没有数据需要发送企业微信消息")
|
logger.info(f"没有数据需要发送企业微信消息")
|
||||||
|
|
||||||
|
def get_article_ids_from_txt(self) -> list:
    """Load the locally recorded article IDs.

    Reads ``self.article_ids_txt_file`` as UTF-8 text, one ID per line.
    Surrounding whitespace is stripped from each line and blank lines are
    skipped.

    Returns:
        A list of ID strings in file order, or an empty list when the
        file does not exist.
    """
    try:
        # EAFP: opening directly avoids the race between an existence
        # check and the open call.
        with open(self.article_ids_txt_file, "r", encoding="utf-8") as f:
            # Strip each line once, then drop the ones that end up empty.
            stripped_lines = (line.strip() for line in f)
            return [article_id for article_id in stripped_lines if article_id]
    except FileNotFoundError:
        # Nothing has been recorded yet.
        return []
|
||||||
|
|
||||||
|
def append_article_ids_to_txt(self, article_ids: list):
    """Append article IDs to the local record file, one ID per line.

    Args:
        article_ids: IDs to record. ``None`` or an empty sequence is a
            no-op and does not create the file. Non-string IDs are
            converted with ``str()`` before being written.

    The file at ``self.article_ids_txt_file`` (and its parent directory,
    if missing) is created on first use.
    """
    if article_ids is None or len(article_ids) == 0:
        return

    # Append mode creates the file but not its directory, so make sure the
    # directory exists before the first write.
    parent_dir = os.path.dirname(self.article_ids_txt_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(self.article_ids_txt_file, "a", encoding="utf-8") as f:
        # f-string formatting tolerates non-string IDs (e.g. ints), where
        # ``article_id + "\n"`` would raise TypeError.
        f.writelines(f"{article_id}\n" for article_id in article_ids)
|
||||||
|
|
||||||
def remove_duplicate_posts(self, result_df: pd.DataFrame):
|
def remove_duplicate_posts(self, result_df: pd.DataFrame):
|
||||||
try:
|
try:
|
||||||
|
article_ids = self.get_article_ids_from_txt()
|
||||||
duplicate_index_list = []
|
duplicate_index_list = []
|
||||||
for index, row in result_df.iterrows():
|
for index, row in result_df.iterrows():
|
||||||
article_id = row["article_id"]
|
article_id = row["article_id"]
|
||||||
|
|
||||||
|
if article_id in article_ids:
|
||||||
|
duplicate_index_list.append(index)
|
||||||
|
continue
|
||||||
exist_data = self.db_truth_social_content.query_data_by_article_id(
|
exist_data = self.db_truth_social_content.query_data_by_article_id(
|
||||||
article_id
|
article_id
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue