From 13cf259bb07871cf6c119f254f02804204ec15ef Mon Sep 17 00:00:00 2001 From: blade <8019068@qq.com> Date: Wed, 4 Feb 2026 16:01:06 +0800 Subject: [PATCH] Support remove duplicate articles by local records --- .../truth_social_retriever.cpython-312.pyc | Bin 21526 -> 23379 bytes core/media/truth_social_retriever.py | 25 ++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/core/media/__pycache__/truth_social_retriever.cpython-312.pyc b/core/media/__pycache__/truth_social_retriever.cpython-312.pyc index 3636ac0eb7df7b63fd0e6adaaeb53af1604c42ca..19652a3b16a6070becd119d5b35e64e84132284f 100644 GIT binary patch delta 4482 zcma)9dr(x@8NcVg_Qh+T>@K^@8+Jinz7_#N@BxSc6q81bkd?dOA_Crf5jDG_9j9vJ z19Lio)I4k^P3^?CICK)5=_F!IVy9_j1BRWAO*3uM{M81vQ`1bR?RV~ikl1!|ckb_= z^PTT}_uTLNzVCed^aV8aDoXfme7u%|tN&#=5ZG}kVLy843Uo&(Kh=r^s+AU!1^Ou4 z$_Q40If`0Y!6vYNu9FtHqg*R5@O~AvRiL33)JLgSjiB*sL81kTPSE*rUyWG@{W$0+ z%=Om^@i2=O5nlym|id|dKPFK4j_*mm(d0EQ!GB* zm@)n&{SrzN3_+@0L%xMvvKg1KPNPwvU!h}?5Fg}4y_}4Fta({nOh!??DuIG|_xpkg zdhzq4vU9%G?ZWBFMnvKM8q*fRe9K2Z*I^!VEYf>bE)m&cE&YzW!#IrTP6)U;@@9R}Mw z13munzo}=5#%J1_>PXg>Q!;(hD(FHku}0%TdyxBfwcc~iB))6S}DI3)T9=? zH!U4?VVw3D+KFe=wxdVJH>Gc;TOwKRnXIaCR@D{FRMy5(&8I1@(F5lyFI0b!QV=O$ zJz77R;|)7XBW2~JxiakVev-FrY*RR|dM2+foL4uMw{f%~;z*lul!YB-m&>OewRkGC zNfV-XqWBfJbzL|=tB_tpQlYSoRhgysa*WP%0SlUWJPrK-tNu6gzNzq6i4Sp~#7g#?NSga{lW zFhpP{mhsdF3Ad-yxKXIvd~!3TpB z#luZf=iLgBj!-fM+1-H)*P0*;b@LE$#*{gAiSD9WDEaJB9hrsauKqGlM4v&m1~&XjGVSdBZO6 z&~##H#ArTVe=A-;oHb*}4;%6?v|RR1$FH5Gkj8ywZ^WMRLjE)P=L(KD++#rUNk#34 zmbMGs*UM%awuc+G-)z`1d~hZ?H=LY%Gubm~X*;e1h$Px4)%JTTn4sAC=N~O?k`-1l z!Rl+`nTd3Bjg6h~8WP}fHJ+=ns1!%wq-=|Om%@@DIx;HXXHgK%XJP^EhQuhIkZHUr z+kua`%?#D8CL~IpCsBs1gEuJ`5J-*njIo3kn2YQp2GJUYG>d8T;4CNi%~?UTj@n^S zC`A)WtqC#$e-K_ZzdYcKm8Bjuet;ss2l!Qw!+h$Hs!_?=Q8c(ft0?kCkrv2BB2r

lQTh~ltLyYKp$k3cm!r1Ec`Fz84?TsXWsr8CP^0o4pR}GWm1>% zl60YbD%X2C{e#@{TSoJU_pEoM@@(b#teK4Ra7OvGvEr85HkvhKUlz768*91Zoi;ZB z1@yM-dm|<*P)Kk4RBs*@W-LWvOVOC|^5FE4zIK+PaxtbD{lD9f&_@=oFJF>NLOF%FJ8wZR*qzKT4Ab&tsK-~=1yPeJ^ih2xGLTrc& zazd;OfTA==@1sNm9@1M3L3R-d(v{RTVKrhm+*b;;>X0g^g5*>6pzdatQu3Ku$IM#@ zA^e3q8Gr0Hp+v%8%zKisKF7uIP#b$a9vg5lc4bjwDU_q6EKx2}_#7CHs)1&r*f%JN zPe2#%%S%H)!O!NoitnEiF{bW^(M5;K0qA-(F?N#kLVOB;mY0)9=o!^2xw2h3y}u6v ziR68f0hLQhgW?cgo?qPdIT=lb_&!X{(q>J0#O|80d&73`lzr9lhDc`iWL#Rrl{@1q z4!eq{Tubjz+Ia6-?JzeiMVx8p9bsqTu=W$9^L$F!xFlk7&zOqBrlK)!+O#raa^Q3M z1-uz%%v$mL`J>q+wSTudZLNWt$gD1a$6E%?IwSK|O2N7W=2`*=ZSmOni6vV!x`oQA z2mg6RBUD-|R;DE>Wz#XRGFMNH`&T}S>~11ZyhWTMULc?Z_Y-)=+g2hzO9aGy9?K$s zDQ12~hJH?9@e6mAx==GdR+_E5Oon2$8-A^HV~*lYLXkONe&Z2y?bA4SRRK+*@kdrU zIkpb8abdg#pQ>D+LzV?cS#L+5;P=q6)er@VM9tg@B~IWkD?P;|TTA3s&G zclsp%%H>PMI^0&<%7=;aCHzM1!{}9Ps>|U&Ac85prtTqLabO9?bxVhqfHbO;WS=}B zwUd>LGLf)KWy%VH4mrxe0j&5r$|?#o#85HO;ye4qJw91nN+ipOL|g0c^@~0^(ATT% z%EH!p3=1Av_%v0lT0Vvw>a)>Jytm!~n{&F}o1tum0Ze{l7oZ#4h&S=)^()ctao5HT zEIK{zZ@h%~SRIQU%^rS$EU6PWH0PL=B!8N$PCX!Zw9A1#e$kJ=*X(k35v5ZAk{V$C zO)>ii@kH|ybPfNt`4{M6{QTx@WW?7tXAi9*bG8ytUV%yLGXf(7NYsmeB0vs~MIo)! ziBA*R1c5gRkR$M?RBW?J`68141a}{^DA`@r2y<{J%;8vVpdzadW zxD~VgjGC)Omzld1dEPPcT*_>ciYpr1dY6L7tRb1RjnS{gU5dL)L34H~KDuQWe9T|A zSkYe0xA?Xz|4fv!R;k9AozilV)sjH zkM2SeS)ytXadOmzD1x9gQlO>{bfq@xLuyDv9Hk;4ia-;oN`LkF!^CY=^-p@wENfF# zuC(8td(JoK-h0lu=id1_dHoV8_$)u)#vJeS=fD0U~n)yzG|w zIrD%?_Q<9=GGLaAWV0d+@v<-{42ZI*NKBR(!y;Q0Yut^_{&z6DmD%lCJABBycG=ih zX4%fPd2*iNA)m?lD9Z)xw}$;3?1wy?T*zz%veV$Xn7n3&go|}ITy(hMIA6@R2j};% z&79}oCA{W>7x_|`Q|1@=>^9P3N~DFF2VOLLTI{*4B_x?kanY5z>|(RLoK^GZq<8=- z!X8p$Y}jJ2)`Ech#MM2 zZ-p;yeS~j)G9Qi$fuQXXuI-83P|*Ge&;CSS9@L9o_=z}3N}*7i&)k%}gdO!SmV=oC z)+Is;q1%2pQbyqEqPhON(VfceS**LFjIg z7B>pr+cM|!-{SjP*r9cCB$klJ6x}pQ$LMy{Sg4{-BonHlo7Lpxgf6LrbaX;f=`+kq zcOyj@k7z-Nvx#C8i7DfG1#!S%vQAfjECm%$Y0B zJ7)4G3-t#0eX-vglg%lg9Nx_Q$xbG1KqC8Cr-kw%+=3lZs?KtU}B2(P7=^!^nDmiVXsN(gHxna|I zBk0~rxa)^=_I27=b=U_6hJ!phboQTobxT*^iNtFd| zC-X$H-02ax8`wh*L#VWmbir)tHWGnfmTE0AG#*Acj-Vh6BXl7o5RwR^2rQT!Jpr+@ z3&i(Yu*$v*=bZ?3&=?$o>(w5(9o&7?Pcg%EKLT!x;w_*D5c&}gBH(&@-lVF~sH|wQ z(J|H7hhAW)h9<|xqKVizvm&_%$!0mG#pp4ZDc@bM>1_^o#_ zawl}?+_&KFx*qEce~jd}KCmlLS5?oa2pp;E|8}I^{IPv|bHr}Gv8795vzvBnq)oc% zE{QZtH)}=IGz+L{GojpG+Ut`(FD~e*kv^{}V)89g(Cd+Id4yhHhOECpPSxTe>JYFD zXbK_CKrf=oc=EeSRGyp|8y$>kN;I3)No1Tvm_YargRqc|_&h4w5x$RbmVqiTFs=&S z$|T*ZDhW9{uBd8kScx8=0e`5Sd#{026Hf84RO4}(7MLd-ggdjz z%rRe9^ZzBZe8;W4D!2`{$E?8p1!pYpcaM zTD$~?8iV$g&*FY;&ux~J^cHuXUtRhjCv$1(q?96iA4;n?Gi;4e<5no3H&&B3fN$Do zFVHQcnnG!EN`(_m<>D+`k)DGyO=UGJNuWkzrqHsIL^{o;VM{hUXF+dTUyr_Zn=(DB zYEh%0Pcwzex>hzzELBFL=xao0IpFWj=PFB?hnL!)spWpN zy^i7aTI-HF)AiQ69U;?4A%V$sE_116kJY}~1WtgkCBmA+KW^ftmHOR}kM8B8WLC84Q$ z!FY_Gh|0;SL{w9zHQEF3ZmY6i#%~e`N%+II-Q+X`JA!sBa^rT)hK$%9?Qrpb<}zwW zsld+g>aL?TxHDZ;5`#%u3G>w$7kqA5+px2*%p+ zOrrB`NnAk1d*F&ZOBNs+@v~*V7a0&gLiIcFMPxsD51M+`i@!s~$8ecVB(k9e2qsG>y zBorFcMw1C6-m7T|J6DgodI%}o)aPKNw~BlMXL|$etP8zu)u)*^)yaSr0W$hiA zy6w0!8XrvzN3~f!Z$(D4(c*u=IREE{%)lQH)Uue{``;(~;8*=s!~uWmuR662m)VP8 zto;f~_YeSK^={$~%QjNXHpK(c83va*YGm3dC48kRf{XpsA2FEYmicO7^YR|z7TgbN zH6pZL+_B8z|I(PrA_Tg~2c~7tB6N`@w^Q(5<$sZPEpM4);!+4cJUHSryra)>iEbZE zQl)+@IXukT%LMEgh-IHu<3$L=`vcy#zcQxIUiO3>Q?uK@%{0cQ=+`K%?DjvF&ErXV Ya!lDuzv1AUfn95cL{jt>hiI_=3qi`$fB*mh diff --git a/core/media/truth_social_retriever.py b/core/media/truth_social_retriever.py index 477ce8a..8fddf63 100644 --- a/core/media/truth_social_retriever.py +++ b/core/media/truth_social_retriever.py @@ -31,6 +31,7 @@ class TruthSocialRetriever: self.db_url = f"mysql+pymysql://{mysql_user}:{mysql_password}@{mysql_host}:{mysql_port}/{mysql_database}" self.db_truth_social_content = DBTruthSocialContent(self.db_url) + self.article_ids_txt_file = r"./output/media/truth_social/article_ids.txt" trump_key = WECHAT_CONFIG.get("trump_key", "") if trump_key: @@ -174,6 +175,8 @@ class TruthSocialRetriever: result_df["analysis_result"] = "" result_df["analysis_token"] = 0 result_df = self.send_wechat_message(result_df, user_full_name) + article_ids = result_df["article_id"].tolist() + self.append_article_ids_to_txt(article_ids) result_df = result_df[ [ "article_id", @@ -207,12 +210,34 @@ class TruthSocialRetriever: self.send_wechat_message(result_df) else: logger.info(f"没有数据需要发送企业微信消息") + + def get_article_ids_from_txt(self): + if not os.path.exists(self.article_ids_txt_file): + return [] + with open(self.article_ids_txt_file, "r", encoding="utf-8") as f: + article_ids = f.readlines() + article_ids = [article_id.strip() for article_id in article_ids if article_id.strip()] + return article_ids + + def append_article_ids_to_txt(self, article_ids: list): + if article_ids is not None and len(article_ids) > 0: + if not os.path.exists(self.article_ids_txt_file): + with open(self.article_ids_txt_file, "w", encoding="utf-8") as f: + pass + with open(self.article_ids_txt_file, "a", encoding="utf-8") as f: + for article_id in article_ids: + f.write(article_id + "\n") def remove_duplicate_posts(self, result_df: pd.DataFrame): try: + article_ids = self.get_article_ids_from_txt() duplicate_index_list = [] for index, row in result_df.iterrows(): article_id = row["article_id"] + + if article_id in article_ids: + duplicate_index_list.append(index) + continue exist_data = self.db_truth_social_content.query_data_by_article_id( article_id )