support fetch twitter articles
This commit is contained in:
parent
8c5bb34e5a
commit
79d1775197
config.py | 36

@@ -215,21 +215,35 @@ WECHAT_CONFIG = {
ITICK_API_KEY = "dfd4bc0caed148d6bc03b960224754ffb5356349e389431f828702b3a27e8a2b"

# TWITTER_CONFIG = {
#     "keys": {
#         "api_key": "c3l344o8pgVwy7Aw4yxj7CprT",
#         "api_secret": "xjh3RVyyhVr9aDVSq5fFq210R1fmwYt36myBZR7ifuv0wYRWcT",
#         "bearer_token": "AAAAAAAAAAAAAAAAAAAAAPoL4wEAAAAAXEMHlBpeR66dtTYWBkFSz1Fp3oI%3DHMoLlCMKNRGr1h6c0lBpZnJulx88fQ0JzZE1zm4jI4qNfSxiRZ",
#         "access_token": "1658642847975784449-MR79EAOk8MTKx3zIbCEySaQjxDPK3R",
#         "access_token_secret": "2H9RwHzBrWhAbt7RUmGHSg6mcfJEf0Aesx74QFFMeMYMn"
#     },
#     "user_search_url": "https://api.twitter.com/2/users/by/username/{0}",
#     "contents_search_url": "https://api.twitter.com/2/users/{0}/tweets?max_results=100&tweet.fields=text,created_at&exclude=replies,retweets",
#     "monitor_accounts": [
#         {"name": "FoxNews", "id": ""},
#         {"name": "WhiteHouse", "id": "1879644163769335808"},
#         {"name": "sama", "id": ""},
#         {"name": "PressSec", "id": ""},
#     ],
# }

TWITTER_CONFIG = {
    "keys": {
        "api_key": "c3l344o8pgVwy7Aw4yxj7CprT",
        "api_secret": "xjh3RVyyhVr9aDVSq5fFq210R1fmwYt36myBZR7ifuv0wYRWcT",
        "bearer_token": "AAAAAAAAAAAAAAAAAAAAAPoL4wEAAAAAXEMHlBpeR66dtTYWBkFSz1Fp3oI%3DHMoLlCMKNRGr1h6c0lBpZnJulx88fQ0JzZE1zm4jI4qNfSxiRZ",
        "access_token": "1658642847975784449-MR79EAOk8MTKx3zIbCEySaQjxDPK3R",
        "access_token_secret": "2H9RwHzBrWhAbt7RUmGHSg6mcfJEf0Aesx74QFFMeMYMn",
        # Reuses the "api_key" name, so this ScrapeCreators key overrides the Twitter key above
        # and is the value the retriever sends as the x-api-key header.
        "api_key": "FRfhlDHnmYc1PCCrVHZdWtqDENr2",
    },
    "user_search_url": "https://api.twitter.com/2/users/by/username/{0}",
    "contents_search_url": "https://api.twitter.com/2/users/{0}/tweets?max_results=100&tweet.fields=text,created_at&exclude=replies,retweets",
    "base_url": "https://api.scrapecreators.com/v1/twitter/user-tweets?handle={0}&trim=false",
    "monitor_accounts": [
        {"name": "FoxNews", "id": ""},
        {"name": "WhiteHouse", "id": "1879644163769335808"},
        {"name": "sama", "id": ""},
        {"name": "PressSec", "id": ""},
        {"name": "realDonaldTrump"},
        {"name": "FoxNews"},
        {"name": "WhiteHouse"},
        {"name": "sama"},
        {"name": "PressSec"},
    ],
}
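For context, a minimal sketch of how the new ScrapeCreators entries (base_url plus the second api_key) are exercised, mirroring the retriever code further down in this commit. The "config" module name and the "FoxNews" handle are example assumptions; the "tweets", "rest_id", "created_at" and "text" fields come from the retriever code below.

import requests

from config import TWITTER_CONFIG  # assumed module name for config.py

# The second "api_key" entry in the dict wins, i.e. the ScrapeCreators key.
api_key = TWITTER_CONFIG["keys"]["api_key"]
url = TWITTER_CONFIG["base_url"].format("FoxNews")  # example handle from monitor_accounts

resp = requests.get(url, headers={"x-api-key": api_key, "Content-Type": "application/json"})
resp.raise_for_status()

# Print a short preview of each returned tweet.
for tweet in resp.json().get("tweets", []):
    print(tweet.get("rest_id"), tweet.get("created_at"), tweet.get("text", "")[:80])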
Binary file not shown.
@@ -18,13 +18,10 @@ class TwitterRetriever:
    The premium tier can fetch up to 1,000,000 tweets per month, at $5,000/month.
    """
    def __init__(self):
        self.keys = TWITTER_CONFIG["keys"]
        self.headers = {
            "Authorization": f"Bearer {self.keys['bearer_token']}"
        }
        self.user_search_url = TWITTER_CONFIG["user_search_url"]
        self.contents_search_url = TWITTER_CONFIG["contents_search_url"]
        self.api_key = TWITTER_CONFIG["keys"]["api_key"]
        self.base_url = TWITTER_CONFIG["base_url"]
        self.monitor_account_list = TWITTER_CONFIG["monitor_accounts"]
        self.limit = 20

        mysql_user = COIN_MYSQL_CONFIG.get("user", "xch")
        mysql_password = COIN_MYSQL_CONFIG.get("password", "")
@@ -36,64 +33,72 @@ class TwitterRetriever:
self.db_url = f"mysql+pymysql://{mysql_user}:{mysql_password}@{mysql_host}:{mysql_port}/{mysql_database}"
|
||||
self.db_twitter_content = DBTwitterContent(self.db_url)
|
||||
self.sleep_time = 15 * 60 + 10
|
||||
|
||||
def search_user(self, username):
|
||||
url = self.user_search_url.format(username)
|
||||
response = requests.get(url, headers=self.headers)
|
||||
if response.status_code == 200:
|
||||
return response.json()
|
||||
else:
|
||||
logger.error(f"Failed to search user: {username}")
|
||||
return None
|
||||
self.save_path = r"./output/media/twitter/"
|
||||
os.makedirs(self.save_path, exist_ok=True)
|
||||
|
||||
def search_contents(self, username: str, user_id: str):
|
||||
logger.info(f"Searching contents for user: {user_id}")
|
||||
url = self.contents_search_url.format(user_id)
|
||||
response = requests.get(url, headers=self.headers)
|
||||
def search_contents(self, username: str):
|
||||
logger.info(f"Searching contents for user: {username}")
|
||||
|
||||
headers = {
|
||||
"x-api-key": self.api_key,
|
||||
"Content-Type": "application/json"
|
||||
}
|
||||
url = self.base_url.format(username)
|
||||
response = requests.get(url, headers=headers)
|
||||
if response.status_code == 200:
|
||||
return response.json()
|
||||
tweets = response.json() # 假设响应是推文数组
|
||||
logger.info(f"获取到 {len(tweets["tweets"])} 条推文")
|
||||
# for tweet in tweets:
|
||||
# created_at = self.transform_datetime(tweet.get('created_at', ''))
|
||||
# logger.info(f"- ID: {tweet.get('id')}")
|
||||
# logger.info(f" 文本: {tweet.get('text', '')[:100]}...") # 截取前100字符
|
||||
# logger.info(f" 时间: {created_at}")
|
||||
# logger.info(f" 点赞: {tweet.get('likes', 0)}, 转发: {tweet.get('retweets', 0)}")
|
||||
# if tweet.get('media'):
|
||||
# logger.info(f" 媒体: {tweet['media'][0].get('url', 'N/A')}")
|
||||
# logger.info("---")
|
||||
return tweets
|
||||
else:
|
||||
logger.error(f"Failed to search contents for user: {user_id}")
|
||||
logger.error(f"请求失败: {response.status_code} - {response.text}")
|
||||
return None
|
||||
|
||||
def monitor_accounts(self):
|
||||
for account_dict in self.monitor_account_list:
|
||||
user_name = account_dict["name"]
|
||||
user_id = account_dict["id"]
|
||||
logger.info(f"Monitoring account: {user_name}")
|
||||
logger.info(f"Sleeping for {self.sleep_time} seconds")
|
||||
# time.sleep(self.sleep_time)
|
||||
|
||||
result_list = []
|
||||
if user_id is None or user_id == "":
|
||||
user = self.search_user(user_name)
|
||||
if user is None:
|
||||
continue
|
||||
user_id = str(user["data"]["id"])
|
||||
contents = self.search_contents(user_name, user_id)
|
||||
contents = self.search_contents(user_name)
|
||||
if contents is None:
|
||||
continue
|
||||
twitter_contents = contents["data"]
|
||||
for content in twitter_contents:
|
||||
datetime_text = content["created_at"]
|
||||
datetime_dict = self.transform_datetime(datetime_text)
|
||||
timestamp_ms = datetime_dict["timestamp_ms"]
|
||||
beijing_time_str = datetime_dict["beijing_time_str"]
|
||||
text = content["text"]
|
||||
result = {
|
||||
"user_id": user_id,
|
||||
"user_name": user_name,
|
||||
"timestamp": timestamp_ms,
|
||||
"date_time": beijing_time_str,
|
||||
"text": text
|
||||
}
|
||||
result_list.append(result)
|
||||
if len(result_list) > 0:
|
||||
result_df = pd.DataFrame(result_list)
|
||||
self.db_twitter_content.insert_data_to_mysql(result_df)
|
||||
logger.info(f"Inserted {len(result_df)} rows into twitter_content")
|
||||
else:
|
||||
logger.warning(f"No data inserted for account: {user_name}")
|
||||
|
||||
user_twitter_path = os.path.join(self.save_path, user_name)
|
||||
os.makedirs(user_twitter_path, exist_ok=True)
|
||||
for content in contents["tweets"]:
|
||||
content_path = os.path.join(user_twitter_path, f"{content["rest_id"]}.json")
|
||||
with open(content_path, "w", encoding="utf-8") as f:
|
||||
json.dump(content, f, ensure_ascii=False, indent=4)
|
||||
# for content in contents:
|
||||
# datetime_text = content["created_at"]
|
||||
# datetime_dict = self.transform_datetime(datetime_text)
|
||||
# timestamp_ms = datetime_dict["timestamp_ms"]
|
||||
# beijing_time_str = datetime_dict["beijing_time_str"]
|
||||
# text = content["text"]
|
||||
# result = {
|
||||
# "user_id": content["id"],
|
||||
# "user_name": user_name,
|
||||
# "timestamp": timestamp_ms,
|
||||
# "date_time": beijing_time_str,
|
||||
# "text": text
|
||||
# }
|
||||
# result_list.append(result)
|
||||
# if len(result_list) > 0:
|
||||
# result_df = pd.DataFrame(result_list)
|
||||
# self.db_twitter_content.insert_data_to_mysql(result_df)
|
||||
# logger.info(f"Inserted {len(result_df)} rows into twitter_content")
|
||||
# else:
|
||||
# logger.warning(f"No data inserted for account: {user_name}")
|
||||
|
||||
def transform_datetime(self, datetime_text: str):
|
||||
utc_time = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=pytz.UTC)
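The hunk cuts off inside transform_datetime. Below is a minimal sketch of a plausible completion, assuming the method returns a dict with "timestamp_ms" and "beijing_time_str" (the two keys monitor_accounts reads) and that Beijing time is handled via the Asia/Shanghai zone; everything past the strptime line is an assumption, not code from this commit.

from datetime import datetime

import pytz

def transform_datetime(datetime_text: str):
    # Parse the UTC timestamp in the same format the commit uses.
    utc_time = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=pytz.UTC)
    # Hypothetical completion: convert to Beijing time and return the fields monitor_accounts expects.
    beijing_time = utc_time.astimezone(pytz.timezone("Asia/Shanghai"))
    return {
        "timestamp_ms": int(utc_time.timestamp() * 1000),
        "beijing_time_str": beijing_time.strftime("%Y-%m-%d %H:%M:%S"),
    }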