diff --git a/config.py b/config.py index 53eb277..a541f04 100644 --- a/config.py +++ b/config.py @@ -215,21 +215,35 @@ WECHAT_CONFIG = { ITICK_API_KEY = "dfd4bc0caed148d6bc03b960224754ffb5356349e389431f828702b3a27e8a2b" +# TWITTER_CONFIG = { +# "keys": { +# "api_key": "c3l344o8pgVwy7Aw4yxj7CprT", +# "api_secret": "xjh3RVyyhVr9aDVSq5fFq210R1fmwYt36myBZR7ifuv0wYRWcT", +# "bearer_token": "AAAAAAAAAAAAAAAAAAAAAPoL4wEAAAAAXEMHlBpeR66dtTYWBkFSz1Fp3oI%3DHMoLlCMKNRGr1h6c0lBpZnJulx88fQ0JzZE1zm4jI4qNfSxiRZ", +# "access_token": "1658642847975784449-MR79EAOk8MTKx3zIbCEySaQjxDPK3R", +# "access_token_secret": "2H9RwHzBrWhAbt7RUmGHSg6mcfJEf0Aesx74QFFMeMYMn" +# }, +# "user_search_url": "https://api.twitter.com/2/users/by/username/{0}", +# "contents_search_url": "https://api.twitter.com/2/users/{0}/tweets?max_results=100&tweet.fields=text,created_at&exclude=replies,retweets", +# "monitor_accounts": [ +# {"name": "FoxNews", "id": ""}, +# {"name": "WhiteHouse", "id": "1879644163769335808"}, +# {"name": "sama", "id": ""}, +# {"name": "PressSec", "id": ""}, +# ], +# } + TWITTER_CONFIG = { "keys": { - "api_key": "c3l344o8pgVwy7Aw4yxj7CprT", - "api_secret": "xjh3RVyyhVr9aDVSq5fFq210R1fmwYt36myBZR7ifuv0wYRWcT", - "bearer_token": "AAAAAAAAAAAAAAAAAAAAAPoL4wEAAAAAXEMHlBpeR66dtTYWBkFSz1Fp3oI%3DHMoLlCMKNRGr1h6c0lBpZnJulx88fQ0JzZE1zm4jI4qNfSxiRZ", - "access_token": "1658642847975784449-MR79EAOk8MTKx3zIbCEySaQjxDPK3R", - "access_token_secret": "2H9RwHzBrWhAbt7RUmGHSg6mcfJEf0Aesx74QFFMeMYMn" + "api_key": "FRfhlDHnmYc1PCCrVHZdWtqDENr2", }, - "user_search_url": "https://api.twitter.com/2/users/by/username/{0}", - "contents_search_url": "https://api.twitter.com/2/users/{0}/tweets?max_results=100&tweet.fields=text,created_at&exclude=replies,retweets", + "base_url": "https://api.scrapecreators.com/v1/twitter/user-tweets?handle={0}&trim=false", "monitor_accounts": [ - {"name": "FoxNews", "id": ""}, - {"name": "WhiteHouse", "id": "1879644163769335808"}, - {"name": "sama", "id": ""}, - {"name": "PressSec", "id": ""}, + {"name": "realDonaldTrump"}, + {"name": "FoxNews"}, + {"name": "WhiteHouse"}, + {"name": "sama"}, + {"name": "PressSec"}, ], } diff --git a/core/media/__pycache__/twitter_retriever.cpython-312.pyc b/core/media/__pycache__/twitter_retriever.cpython-312.pyc index e7fc113..01fd632 100644 Binary files a/core/media/__pycache__/twitter_retriever.cpython-312.pyc and b/core/media/__pycache__/twitter_retriever.cpython-312.pyc differ diff --git a/core/media/twitter_retriever.py b/core/media/twitter_retriever.py index 102c8f7..ec1eecd 100644 --- a/core/media/twitter_retriever.py +++ b/core/media/twitter_retriever.py @@ -18,13 +18,10 @@ class TwitterRetriever: 高级版每个月可以获取1000000条推文,5000美元/月 """ def __init__(self): - self.keys = TWITTER_CONFIG["keys"] - self.headers = { - "Authorization": f"Bearer {self.keys['bearer_token']}" - } - self.user_search_url = TWITTER_CONFIG["user_search_url"] - self.contents_search_url = TWITTER_CONFIG["contents_search_url"] + self.api_key = TWITTER_CONFIG["keys"]["api_key"] + self.base_url = TWITTER_CONFIG["base_url"] self.monitor_account_list = TWITTER_CONFIG["monitor_accounts"] + self.limit = 20 mysql_user = COIN_MYSQL_CONFIG.get("user", "xch") mysql_password = COIN_MYSQL_CONFIG.get("password", "") @@ -36,64 +33,72 @@ class TwitterRetriever: self.db_url = f"mysql+pymysql://{mysql_user}:{mysql_password}@{mysql_host}:{mysql_port}/{mysql_database}" self.db_twitter_content = DBTwitterContent(self.db_url) - self.sleep_time = 15 * 60 + 10 - def search_user(self, username): - url = self.user_search_url.format(username) - response = requests.get(url, headers=self.headers) - if response.status_code == 200: - return response.json() - else: - logger.error(f"Failed to search user: {username}") - return None + self.save_path = r"./output/media/twitter/" + os.makedirs(self.save_path, exist_ok=True) - def search_contents(self, username: str, user_id: str): - logger.info(f"Searching contents for user: {user_id}") - url = self.contents_search_url.format(user_id) - response = requests.get(url, headers=self.headers) + def search_contents(self, username: str): + logger.info(f"Searching contents for user: {username}") + + headers = { + "x-api-key": self.api_key, + "Content-Type": "application/json" + } + url = self.base_url.format(username) + response = requests.get(url, headers=headers) if response.status_code == 200: - return response.json() + tweets = response.json() # 假设响应是推文数组 + logger.info(f"获取到 {len(tweets["tweets"])} 条推文") + # for tweet in tweets: + # created_at = self.transform_datetime(tweet.get('created_at', '')) + # logger.info(f"- ID: {tweet.get('id')}") + # logger.info(f" 文本: {tweet.get('text', '')[:100]}...") # 截取前100字符 + # logger.info(f" 时间: {created_at}") + # logger.info(f" 点赞: {tweet.get('likes', 0)}, 转发: {tweet.get('retweets', 0)}") + # if tweet.get('media'): + # logger.info(f" 媒体: {tweet['media'][0].get('url', 'N/A')}") + # logger.info("---") + return tweets else: - logger.error(f"Failed to search contents for user: {user_id}") + logger.error(f"请求失败: {response.status_code} - {response.text}") return None def monitor_accounts(self): for account_dict in self.monitor_account_list: user_name = account_dict["name"] - user_id = account_dict["id"] logger.info(f"Monitoring account: {user_name}") - logger.info(f"Sleeping for {self.sleep_time} seconds") - # time.sleep(self.sleep_time) + result_list = [] - if user_id is None or user_id == "": - user = self.search_user(user_name) - if user is None: - continue - user_id = str(user["data"]["id"]) - contents = self.search_contents(user_name, user_id) + contents = self.search_contents(user_name) if contents is None: continue - twitter_contents = contents["data"] - for content in twitter_contents: - datetime_text = content["created_at"] - datetime_dict = self.transform_datetime(datetime_text) - timestamp_ms = datetime_dict["timestamp_ms"] - beijing_time_str = datetime_dict["beijing_time_str"] - text = content["text"] - result = { - "user_id": user_id, - "user_name": user_name, - "timestamp": timestamp_ms, - "date_time": beijing_time_str, - "text": text - } - result_list.append(result) - if len(result_list) > 0: - result_df = pd.DataFrame(result_list) - self.db_twitter_content.insert_data_to_mysql(result_df) - logger.info(f"Inserted {len(result_df)} rows into twitter_content") - else: - logger.warning(f"No data inserted for account: {user_name}") + + user_twitter_path = os.path.join(self.save_path, user_name) + os.makedirs(user_twitter_path, exist_ok=True) + for content in contents["tweets"]: + content_path = os.path.join(user_twitter_path, f"{content["rest_id"]}.json") + with open(content_path, "w", encoding="utf-8") as f: + json.dump(content, f, ensure_ascii=False, indent=4) + # for content in contents: + # datetime_text = content["created_at"] + # datetime_dict = self.transform_datetime(datetime_text) + # timestamp_ms = datetime_dict["timestamp_ms"] + # beijing_time_str = datetime_dict["beijing_time_str"] + # text = content["text"] + # result = { + # "user_id": content["id"], + # "user_name": user_name, + # "timestamp": timestamp_ms, + # "date_time": beijing_time_str, + # "text": text + # } + # result_list.append(result) + # if len(result_list) > 0: + # result_df = pd.DataFrame(result_list) + # self.db_twitter_content.insert_data_to_mysql(result_df) + # logger.info(f"Inserted {len(result_df)} rows into twitter_content") + # else: + # logger.warning(f"No data inserted for account: {user_name}") def transform_datetime(self, datetime_text: str): utc_time = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=pytz.UTC)