From 79d1775197230d42760904cf67d490803ce5c8ed Mon Sep 17 00:00:00 2001 From: blade <8019068@qq.com> Date: Mon, 27 Oct 2025 17:29:30 +0800 Subject: [PATCH] support fetch twitter articles --- config.py | 36 ++++-- .../twitter_retriever.cpython-312.pyc | Bin 6170 -> 5324 bytes core/media/twitter_retriever.py | 107 +++++++++--------- 3 files changed, 81 insertions(+), 62 deletions(-) diff --git a/config.py b/config.py index 53eb277..a541f04 100644 --- a/config.py +++ b/config.py @@ -215,21 +215,35 @@ WECHAT_CONFIG = { ITICK_API_KEY = "dfd4bc0caed148d6bc03b960224754ffb5356349e389431f828702b3a27e8a2b" +# TWITTER_CONFIG = { +# "keys": { +# "api_key": "c3l344o8pgVwy7Aw4yxj7CprT", +# "api_secret": "xjh3RVyyhVr9aDVSq5fFq210R1fmwYt36myBZR7ifuv0wYRWcT", +# "bearer_token": "AAAAAAAAAAAAAAAAAAAAAPoL4wEAAAAAXEMHlBpeR66dtTYWBkFSz1Fp3oI%3DHMoLlCMKNRGr1h6c0lBpZnJulx88fQ0JzZE1zm4jI4qNfSxiRZ", +# "access_token": "1658642847975784449-MR79EAOk8MTKx3zIbCEySaQjxDPK3R", +# "access_token_secret": "2H9RwHzBrWhAbt7RUmGHSg6mcfJEf0Aesx74QFFMeMYMn" +# }, +# "user_search_url": "https://api.twitter.com/2/users/by/username/{0}", +# "contents_search_url": "https://api.twitter.com/2/users/{0}/tweets?max_results=100&tweet.fields=text,created_at&exclude=replies,retweets", +# "monitor_accounts": [ +# {"name": "FoxNews", "id": ""}, +# {"name": "WhiteHouse", "id": "1879644163769335808"}, +# {"name": "sama", "id": ""}, +# {"name": "PressSec", "id": ""}, +# ], +# } + TWITTER_CONFIG = { "keys": { - "api_key": "c3l344o8pgVwy7Aw4yxj7CprT", - "api_secret": "xjh3RVyyhVr9aDVSq5fFq210R1fmwYt36myBZR7ifuv0wYRWcT", - "bearer_token": "AAAAAAAAAAAAAAAAAAAAAPoL4wEAAAAAXEMHlBpeR66dtTYWBkFSz1Fp3oI%3DHMoLlCMKNRGr1h6c0lBpZnJulx88fQ0JzZE1zm4jI4qNfSxiRZ", - "access_token": "1658642847975784449-MR79EAOk8MTKx3zIbCEySaQjxDPK3R", - "access_token_secret": "2H9RwHzBrWhAbt7RUmGHSg6mcfJEf0Aesx74QFFMeMYMn" + "api_key": "FRfhlDHnmYc1PCCrVHZdWtqDENr2", }, - "user_search_url": "https://api.twitter.com/2/users/by/username/{0}", - "contents_search_url": "https://api.twitter.com/2/users/{0}/tweets?max_results=100&tweet.fields=text,created_at&exclude=replies,retweets", + "base_url": "https://api.scrapecreators.com/v1/twitter/user-tweets?handle={0}&trim=false", "monitor_accounts": [ - {"name": "FoxNews", "id": ""}, - {"name": "WhiteHouse", "id": "1879644163769335808"}, - {"name": "sama", "id": ""}, - {"name": "PressSec", "id": ""}, + {"name": "realDonaldTrump"}, + {"name": "FoxNews"}, + {"name": "WhiteHouse"}, + {"name": "sama"}, + {"name": "PressSec"}, ], } diff --git a/core/media/__pycache__/twitter_retriever.cpython-312.pyc b/core/media/__pycache__/twitter_retriever.cpython-312.pyc index e7fc113d3e2084be1986cb6d6d4e7493312de67f..01fd632c6347c8196abf43dc200cf3bd045dd9e0 100644 GIT binary patch delta 2701 zcmZuyYitwQ6~1>~9@}F(;Ab8XL&C$5$3hYa0g@IdghG+FWLrp=wOrpxaEKqwj2noY z2Cb0RXd@sUX#=~9aMZ0Zf_8(p>hhCnD|Myzhh4IZHS!OBEOhx7!d5HopZ46b4Mgg- z=J?!m&YXK@&Ue1~@Qr_lod0k*YzS7e@4u&8y!V_oyq}?aSTDLk%pn;KV}gbmf`?hc zTtvehVP$@VA>2i7SRg!DEJQFZ7O;rt#gO$g83%B1HDsDYnwg9Wg6pYdsLY@+)-mh0 zveg|!cbmEDj-~N*5gsDBijI(GuWI_d&Fd!uvD~RvJB3&Hwh@t7U;3B~quMoBEeo;T z!4>`8m3R4xok%b5VpnAJ69;j=^l>YEgTzJZUix^lDuX{lWOarC>CUjG&Vlq~_>a&H z8$6L%t3`9yv8J9^c(y2|vG)-1Ihi+p-l+VRfEu1_Paw@bh3N1^K^!NFC!QjVZ@tFR5 zepkS%Mt&oQQxWa7!6a2fj7QEuDn-@*&7Z&n`j3PSyqQ}c7Y2I2gA^A~*}ow-QuMdy zV?}>&mYb2vfsMJzVqiPyj&kevLf215Ez81H?fUL+6)2gf_B5^rd!!)469-N-6i=-vYBCXGCave%0S@f|he>IFGNH){t$pZxN)BPea$1fMSy7*T30cz=e&1gH z{K=IMo@5t-hM-NzvZk8m;L{uPPcP@6e(+w}Nuxgf{OP@SpL}xf+du!IFF45t+k+v= z5T?hIqocB7aIwTlvI-Q4HU!wkBbp&9^4YYkYO2yq&309bXlXSZO%mDQXtf4AE+-6L zR+J=U(d2VlNKiIHtkOxvHhLFJ(F~c?R5Agr?gE=i+ZPN5dAO7|{jq(=H1q_ie*pnF z6P*+(1z*>~;6h)))4wPVED5M#?aij^O{J!e=RE7SWi3l0a=EX>FU21@*B71ZbG;9p zZCSSLZk*dQyC=uyf4S)H$%-Yr`;oo5Xm8GS{mZ_wK&K@jFtFIinbmZEW zPmbq(3#Y_lXxGCHeTCND_xlQggNxFkB@gm6WPkPz#2vkK^jd#@^P?^O#V!4X(7=85 zLGR!9{&nwz&O-gq7p3DRw_oXi+M`u!02LuCTqcB{V*QU&Q{Abr5t_moTw&QMMt|;X z_5t$&G?-V=7)4WTMi1M9tGEnz0nhBvUQ&_%rs!eD9F$S?9P15am>QW(@ftnNT$H37 z?hsGdyIjo=u;mC(VSNCdIss(}|2OMSJ|hr;h5?W*P*sh)Ew9aog);YD+nwqcpa)Hf z`mD40n2-@EJKuHOseTPGBId7_1*4f^4$YJ}V7?Z#4XJp7!RREy@S7Pu$DTwJIOGII zq-3D~*5#WXQW~Ka6ETJb0HlRuWRmYNfrlq1`Ls6D-V0M9$_apEEHQfU89h717?PY& z(~2C9sL@!=5Ml`eLllyfH8danY$`oe3{cSu!r)9M7 zNyk$mtBGzys*-Y;#G;yEr4$yXWgB)VGd-?_$AN zj7L=3N*IiwLUbS(0AE@eoKwLnjrr@Kdo?=taGICl6arS5_(3<;2G0>Jh zTt#-Fp(B4jf2I)VTa`gddpOD=!G)m*9zd=p%M%QA1d zCCy9up-_4nmC3yBPP{aOBmM=E(``@q| z4ieF1Ef$xd-llsY6-khYYA`8c k9;n2rixzBF@`Jc1o+RmUxnD`^MSq{`G{(MVM5P(@KWF>5X8-^I literal 6170 zcmbVQYitu&7M`)k?Df>J}Np$J+_X=v>TL1|ABULB=_UOlCU9u;MP9>bK;V-oGzr5&a@CP2SN zGn4-Wu|_vMx8HjE-lxC6dpdsS;?=vqK6UTYk8Yp)>|cL4bLZN*rR$&Exfs8F?vs0O zetGZgXSdJ2yS}aM&ZQ6UoK4($_ZPq;3Algp)`eL@Au-hMee)q$Z zNJ&oWwWTk9o0ZzpmM8bQ_O`aWe>{8ptv6didq%~T*}I>A{G-!h_7eh-xD^uoc;4Y6 zgHyx%$OuFhen#JZYM&G^9&<^2W5; zG5MbJSE0*}X_R>unHsmsRIaAZeD3L@Y%y5D4VDS=s_>THmT2zOR zy&knsOtHsh182bwJr)L89D`gy)Q!@SC+WQm6J|s$brfxzLFO$ML4(WSADKM{(Z~jAIzl3B^Jqna zrGq1)QCb%%9f37N%R83Gq#b7Y&_2a1AK;|4QC15I5p=oTk9SgC!;vYttPmMt!sE1` zV~73BRD=tY$0vOuZrC4Y=;7?bRc-LcUH*s&>SQ z1-)yrc0J$TE7a~sE%#zwJHKJCP`3}Ys+UUErb-%xlE%cQ`Ho~sXWY10(U_`e5h_{| z(d+HWitTah-)yTFi(LHbEyYH8Mvy-|$Pc~350X6jD(?&PuSWRN|Wh6<07r9aE1Z(_ZmVs$=WG!tF4spkt2ZN|qeNI0aqD6;#aeK_mb?x}=iVi(`Np z#K2{oaRJl)ub_yQ?ehhKH09>P;J64F+{n_q+#ZAEWTJK?%uM(=QP0rFCux>rnKdv? z5+Isb&c{u%;64;B660(*Bx-}<(NO>k4UIg`Lol_#E$WfXkZ%H99>EUCVk6-YOS9+_ zkXtYw#AcGA3Am%Kr))cK6tS~Mpg4)8twkwojbN=wSv`W)lUSFuZiuTF9nK5g=ey_B ziJ@!5mxmV|U2%Qd4r5M4be z8CM`Q>Xu`U8!RhpY)muAVOpSw);(p91^7KiW&<#bL?8r0kPk4esAxllVxP=<)NDt^ zMkqWQ$rz>kWI9prCRA*Of<@aJHwydeis-x~7Ge(&MX^`@ueH) zd*?g(;%y81?UIqM6|8Gh)@H%loES}7H~rs6+H*s9eTuK>TQL7bHqs}uMT9sKvi(`g z77HWW9&83batOnw@fe=TBBxbAsZb^dMy4R;xPovnkbMAGR+#cL_z74;0epnaNOMV@ zL#I(}nuw`?rirO3tz?Y#Fi$aeS^E?2powWt;%7!UB}a;BC@CAw>_nD5johLDk^(}j z701xVv>@wDOq-D~t&8dQU`KYqdFW&Cg(97d0p!?nE0L*~LD6K487YIrYXZtBQD&HL zk|+x}s48ZPnPV2pd{HxP1!pXP+%c;x568~zA}~&IwHP6fP?lqRzyq;X0 zVTbnWqZq}B6vorXaEqQWQ=QPO_WnXDiFy}CV zBGxL%a;4y*3Q*DFgPH6UAxSnt?Q%yggD7!BeF#*ddWdD1o&L&JwF zMM5+)qZALK*F=PzU`6{;Ixr5CrNobf^id=@`gINfqZ3UxNZ2o}z@W?%4d^Ran!&&| zU<~q@sG$#{8Usq)(MmTHKEb*JA&9KmbWYAQqiY7jZnP#C&9b9qE8Fqd7<57s#xYwF zRfCExP>5<+9@CAeB98HeSeQ6LqNSjfiCSMILWd|(9fWjW6`@4K9#FH7fzZoTqe;aX zjcA4sz6wYwp%rx}d`t)qz+=zEYjbufBo**;%wD8iItdd5?#UpR$+7K7o?Zq((Y|7n ztfY_$iq)9^Oyh6$t803o4>ZibY#Ww+q z&Wrt}lntj{?v$%TaCJPwY7GbRcz?RQZtg{){JD5vy0m6)qfojo-n$6-z=6{T_{x^V zRN@$4-nC%fn(doyP7v2@mu-B*mg{=HeCL9B*R4ud{6N}P8h>N1A#wOO&FL~{s;ogM zYe*Q9W$iNrfb>jvy4bZ?xq6nG+w@*Au~AslmaJSqqmx*hgtDeY-~5SWSx?zT^boUHntFV`x1P@Hw%Ay(VvK3q9O4(}! zJA5Mi)!w-51TEjYFuP-BYtmLVSN*lEcG0!wgZ}sW6V9ZoW5$rS7pLqEg1sSS?-uOc z*I!<+_b%E?FPP4oW&>B%iM}gVzF{-J`FY-T_#6A+5)4_-Tlwmq8;0vY<*N>SWAA^^ zi=&gzBv&uM%@j_{yD~Wyl$oc{FAWsBWcV%=s5m9)6imZ&S(Pd;fT#)u3OWCw)KLhC zGB`q@RI;^#Ih6v|5ZF`&fj-9pp{!s=rBSp&fP%k%+DeSlMd47O1?@@<(bHN8ELuu` zQH8?RAm%X)L`&*lZmyqbuBV3T_jlDF?5ZDZsUPve4?HawMTt}39%Kno%T6}nYaJZ( zg+|AGfvDrDN%hhD|M}sE9~czBJSs+US4hIp@rrstq=Gc9=?>zowHj~&St^coOHI%*9p!|$+FFH^HNF0g(K&W%=IKoHpGqTvZ_>B zqfpkE=t`Dtiks8UhLp2SaDt2OOFDabefceCQ(|}GFz;-e-vBcB&c3f|zN+Or2L$H; z8g{KqyW1XU)J4X)c3F?vOXC*Fbp|{}^d|s?C6W}4BsmeLCWDAJljv7MFgv0nNh<6o zNv07tD8&$FKcWyOO8Cj3bGnRW{LzDY(NRhT%B)$GAfLh>s0fYzq0ymfc-W}ZlrI-QGO;b9r{vLxWBNK`9tP)M=&qhn;`!JKf>rey2j#w!T zQAb;z{wI_9S)+weG7=cg{0c%Dsi+FGnbaQT{1C6ff+TZfUO{bb#22D`EYlB-%yy0; zstFZni;wl)nd92du+T&x!=muItj2NtTWs~W*yg`u8*gHUo0#b)W`hzsCE9VzU_ZC! T^qP~}e-nNjFMWuiQj+{%ilzmE diff --git a/core/media/twitter_retriever.py b/core/media/twitter_retriever.py index 102c8f7..ec1eecd 100644 --- a/core/media/twitter_retriever.py +++ b/core/media/twitter_retriever.py @@ -18,13 +18,10 @@ class TwitterRetriever: 高级版每个月可以获取1000000条推文,5000美元/月 """ def __init__(self): - self.keys = TWITTER_CONFIG["keys"] - self.headers = { - "Authorization": f"Bearer {self.keys['bearer_token']}" - } - self.user_search_url = TWITTER_CONFIG["user_search_url"] - self.contents_search_url = TWITTER_CONFIG["contents_search_url"] + self.api_key = TWITTER_CONFIG["keys"]["api_key"] + self.base_url = TWITTER_CONFIG["base_url"] self.monitor_account_list = TWITTER_CONFIG["monitor_accounts"] + self.limit = 20 mysql_user = COIN_MYSQL_CONFIG.get("user", "xch") mysql_password = COIN_MYSQL_CONFIG.get("password", "") @@ -36,64 +33,72 @@ class TwitterRetriever: self.db_url = f"mysql+pymysql://{mysql_user}:{mysql_password}@{mysql_host}:{mysql_port}/{mysql_database}" self.db_twitter_content = DBTwitterContent(self.db_url) - self.sleep_time = 15 * 60 + 10 - def search_user(self, username): - url = self.user_search_url.format(username) - response = requests.get(url, headers=self.headers) - if response.status_code == 200: - return response.json() - else: - logger.error(f"Failed to search user: {username}") - return None + self.save_path = r"./output/media/twitter/" + os.makedirs(self.save_path, exist_ok=True) - def search_contents(self, username: str, user_id: str): - logger.info(f"Searching contents for user: {user_id}") - url = self.contents_search_url.format(user_id) - response = requests.get(url, headers=self.headers) + def search_contents(self, username: str): + logger.info(f"Searching contents for user: {username}") + + headers = { + "x-api-key": self.api_key, + "Content-Type": "application/json" + } + url = self.base_url.format(username) + response = requests.get(url, headers=headers) if response.status_code == 200: - return response.json() + tweets = response.json() # 假设响应是推文数组 + logger.info(f"获取到 {len(tweets["tweets"])} 条推文") + # for tweet in tweets: + # created_at = self.transform_datetime(tweet.get('created_at', '')) + # logger.info(f"- ID: {tweet.get('id')}") + # logger.info(f" 文本: {tweet.get('text', '')[:100]}...") # 截取前100字符 + # logger.info(f" 时间: {created_at}") + # logger.info(f" 点赞: {tweet.get('likes', 0)}, 转发: {tweet.get('retweets', 0)}") + # if tweet.get('media'): + # logger.info(f" 媒体: {tweet['media'][0].get('url', 'N/A')}") + # logger.info("---") + return tweets else: - logger.error(f"Failed to search contents for user: {user_id}") + logger.error(f"请求失败: {response.status_code} - {response.text}") return None def monitor_accounts(self): for account_dict in self.monitor_account_list: user_name = account_dict["name"] - user_id = account_dict["id"] logger.info(f"Monitoring account: {user_name}") - logger.info(f"Sleeping for {self.sleep_time} seconds") - # time.sleep(self.sleep_time) + result_list = [] - if user_id is None or user_id == "": - user = self.search_user(user_name) - if user is None: - continue - user_id = str(user["data"]["id"]) - contents = self.search_contents(user_name, user_id) + contents = self.search_contents(user_name) if contents is None: continue - twitter_contents = contents["data"] - for content in twitter_contents: - datetime_text = content["created_at"] - datetime_dict = self.transform_datetime(datetime_text) - timestamp_ms = datetime_dict["timestamp_ms"] - beijing_time_str = datetime_dict["beijing_time_str"] - text = content["text"] - result = { - "user_id": user_id, - "user_name": user_name, - "timestamp": timestamp_ms, - "date_time": beijing_time_str, - "text": text - } - result_list.append(result) - if len(result_list) > 0: - result_df = pd.DataFrame(result_list) - self.db_twitter_content.insert_data_to_mysql(result_df) - logger.info(f"Inserted {len(result_df)} rows into twitter_content") - else: - logger.warning(f"No data inserted for account: {user_name}") + + user_twitter_path = os.path.join(self.save_path, user_name) + os.makedirs(user_twitter_path, exist_ok=True) + for content in contents["tweets"]: + content_path = os.path.join(user_twitter_path, f"{content["rest_id"]}.json") + with open(content_path, "w", encoding="utf-8") as f: + json.dump(content, f, ensure_ascii=False, indent=4) + # for content in contents: + # datetime_text = content["created_at"] + # datetime_dict = self.transform_datetime(datetime_text) + # timestamp_ms = datetime_dict["timestamp_ms"] + # beijing_time_str = datetime_dict["beijing_time_str"] + # text = content["text"] + # result = { + # "user_id": content["id"], + # "user_name": user_name, + # "timestamp": timestamp_ms, + # "date_time": beijing_time_str, + # "text": text + # } + # result_list.append(result) + # if len(result_list) > 0: + # result_df = pd.DataFrame(result_list) + # self.db_twitter_content.insert_data_to_mysql(result_df) + # logger.info(f"Inserted {len(result_df)} rows into twitter_content") + # else: + # logger.warning(f"No data inserted for account: {user_name}") def transform_datetime(self, datetime_text: str): utc_time = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=pytz.UTC)