support fetch data from truth social

This commit is contained in:
blade 2025-10-20 18:37:41 +08:00
parent 1e6caf51e0
commit 2dc2d51171
15 changed files with 687 additions and 2 deletions

50
.vscode/README.md vendored Normal file
View File

@ -0,0 +1,50 @@
# Python环境配置说明
## 自动配置Python解释器
本项目已移除硬编码的Python路径,支持在不同电脑上自动检测Python环境。
### 首次使用步骤:
1. **打开Cursor**
2. **选择Python解释器**
- 按 `Ctrl+Shift+P`
- 搜索 "Python: Select Interpreter"
- 选择您系统中的Python解释器
3. **验证配置**
- 打开 `test_autocomplete.py` 文件
- 尝试输入代码,检查是否有自动补全功能
### 支持的Python环境
- **Anaconda/Miniconda**:自动检测conda环境
- **Python官方安装包**:检测系统PATH中的python
- **虚拟环境**:支持venv、virtualenv等
- **Docker容器**:支持远程Python解释器
### 代码提示功能:
**已启用的功能**
- 智能代码补全
- 函数参数提示
- 类型提示
- 自动导入建议
- 悬停信息显示
- 语法错误检测
- 代码格式化
### 故障排除:
如果代码提示不工作:
1. **重启语言服务器**`Ctrl+Shift+P` → "Python: Restart Language Server"
2. **重新加载窗口**`Ctrl+Shift+P` → "Developer: Reload Window"
3. **检查Python扩展**确保Python扩展已安装并启用
4. **验证解释器**确保选择的Python解释器路径正确
### 配置文件说明:
- `.vscode/settings.json`:项目级设置,包含代码提示配置
- `python_settings.json`通用Python配置模板
- 全局设置:已移除硬编码路径,保持代码提示功能

20
.vscode/python_settings.json vendored Normal file
View File

@ -0,0 +1,20 @@
{
"python.analysis.autoImportCompletions": true,
"python.analysis.typeCheckingMode": "basic",
"python.analysis.autoSearchPaths": true,
"python.analysis.diagnosticMode": "workspace",
"python.analysis.indexing": true,
"python.analysis.completeFunctionParens": true,
"python.analysis.inlayHints.functionReturnTypes": true,
"python.analysis.inlayHints.variableTypes": true,
"python.analysis.inlayHints.pytestParameters": true,
"python.linting.enabled": true,
"python.linting.pylintEnabled": false,
"python.linting.flake8Enabled": true,
"python.formatting.provider": "black",
"python.terminal.activateEnvironment": true,
"python.terminal.activateEnvInCurrentTerminal": true,
"files.associations": {
"*.py": "python"
}
}

53
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,53 @@
{
"python.analysis.autoImportCompletions": true,
"python.analysis.typeCheckingMode": "basic",
"python.analysis.autoSearchPaths": true,
"python.analysis.diagnosticMode": "workspace",
"python.analysis.stubPath": "./typings",
"python.analysis.extraPaths": [
"./core",
"./utils"
],
"editor.quickSuggestions": {
"other": true,
"comments": false,
"strings": true
},
"editor.suggestOnTriggerCharacters": true,
"editor.acceptSuggestionOnEnter": "on",
"editor.tabCompletion": "on",
"editor.wordBasedSuggestions": "matchingDocuments",
"editor.parameterHints.enabled": true,
"editor.hover.enabled": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit"
},
"python.linting.enabled": true,
"python.linting.pylintEnabled": false,
"python.linting.flake8Enabled": true,
"python.formatting.provider": "black",
"python.analysis.completeFunctionParens": true,
"python.analysis.inlayHints.functionReturnTypes": true,
"python.analysis.inlayHints.variableTypes": true,
"python.analysis.inlayHints.pytestParameters": true,
"files.associations": {
"*.py": "python"
},
"python.analysis.indexing": true,
"python.terminal.activateEnvironment": true,
"python.terminal.activateEnvInCurrentTerminal": true,
"python.analysis.packageIndexDepths": [
{
"name": "pandas",
"depth": 2
},
{
"name": "numpy",
"depth": 2
},
{
"name": "requests",
"depth": 2
}
]
}

View File

@ -0,0 +1,36 @@
import schedule
import time
from core.utils import get_current_date_time
import core.logger as logging
import subprocess
import os
import sys
logger = logging.logger
# Job executed by the scheduler: run the Truth Social fetch script once.
def run_script():
    """Run truth_social_retriever_main.py once with the current interpreter.

    Appends a heartbeat line to ./output/auto_fetch_truth_social.txt, resolves
    the script path for the supported launch directories, runs it as a child
    process, and logs the wall-clock duration.
    """
    start_time = time.time()
    logger.info(f"Executing script at: {get_current_date_time()}")
    output_file = r'./output/auto_fetch_truth_social.txt'
    # FIX: open(..., 'a') fails with FileNotFoundError if ./output is missing.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'a') as f:
        f.write(f"Task ran at {get_current_date_time()}\n")
    current_dir = os.getcwd()
    python_path = sys.executable
    # Resolve the target script for the three launch locations we support:
    # inside crypto_quant, one level above it, or anywhere else (same dir).
    if current_dir.endswith('crypto_quant'):
        script_path = r'./truth_social_retriever_main.py'
    elif current_dir.endswith(r'python_projects'):
        script_path = f'{current_dir}/crypto_quant/truth_social_retriever_main.py'
    else:
        script_path = f'{current_dir}/truth_social_retriever_main.py'
    result = subprocess.run([python_path, script_path])
    # Surface child failures instead of silently ignoring the return code.
    if result.returncode != 0:
        logger.error(f"Script exited with return code {result.returncode}")
    end_time = time.time()
    logger.info(f"Script execution time: {end_time - start_time} seconds")
# Run the fetch job once per day at midnight (00:00:00, local time).
schedule.every().day.at("00:00:00").do(run_script)
# schedule.every(60).seconds.do(run_script)  # uncomment for a 60-second test cadence
# Keep the process alive so the scheduler can fire pending jobs.
logger.info("Scheduler started. Press Ctrl+C to stop.")
while True:
    schedule.run_pending()
    time.sleep(1)

View File

@ -11,7 +11,7 @@ logger = logging.logger
def run_script(): def run_script():
start_time = time.time() start_time = time.time()
logger.info(f"Executing script at: {get_current_date_time()}") logger.info(f"Executing script at: {get_current_date_time()}")
output_file = r'./output/auto_schedule.txt' output_file = r'./output/auto_update_market_data.txt'
with open(output_file, 'a') as f: with open(output_file, 'a') as f:
f.write(f"Task ran at {get_current_date_time()}\n") f.write(f"Task ran at {get_current_date_time()}\n")
python_path = sys.executable python_path = sys.executable

View File

@ -231,3 +231,6 @@ TWITTER_CONFIG = {
{"name": "PressSec", "id": ""}, {"name": "PressSec", "id": ""},
], ],
} }
# Truth Social scraping API (api.scrapecreators.com) credentials and the
# handle -> numeric-account-id map of users to fetch.
# SECURITY NOTE(review): this API key is hard-coded and committed to source
# control — rotate it and load it from an environment variable or secret store.
TRUTH_SOCIAL_API = {"api_key": "FRfhlDHnmYc1PCCrVHZdWtqDENr2",
                    "user_id": {"realDonaldTrump": "107780257626128497"}}

View File

@ -0,0 +1,294 @@
import pandas as pd
import core.logger as logging
from core.db.db_manager import DBData
from core.utils import get_current_date_time
logger = logging.logger
class DBTruthSocialContent:
    """Data-access layer for the `truth_social_content` MySQL table.

    Wraps core.db.db_manager.DBData, exposing several insert strategies with
    different speed/memory trade-offs plus the SELECT/DELETE helpers the
    application needs. All filtered SELECT helpers order by `timestamp`
    descending (newest first).
    """

    def __init__(self, db_url: str):
        """
        :param db_url: SQLAlchemy database URL, e.g.
                       mysql+pymysql://user:pass@host:3306/db
        """
        self.db_url = db_url
        self.table_name = "truth_social_content"
        # Column order must match the table schema expected by DBData.
        self.columns = [
            "article_id",
            "user_id",
            "user_name",
            "timestamp",
            "date_time",
            "text",
            "media_url",
            "media_type",
            "media_thumbnail"
        ]
        self.db_manager = DBData(db_url, self.table_name, self.columns)

    def _skip_empty(self, df: pd.DataFrame) -> bool:
        """Shared guard for the insert strategies: True when there is nothing to write."""
        if df is None or df.empty:
            logger.warning("DataFrame为空无需写入数据库。")
            return True
        return False

    def _query_newest_first(self, conditions: list, condition_dict: dict):
        """Run SELECT * filtered by *conditions* (ANDed), newest first.

        :param conditions: SQL predicates with named placeholders; may be empty
        :param condition_dict: bound parameters; must contain "limit"
        """
        where_clause = " AND ".join(conditions) if conditions else "1=1"
        sql = f"""
            SELECT * FROM truth_social_content
            WHERE {where_clause}
            ORDER BY timestamp DESC
            LIMIT :limit
        """
        return self.db_manager.query_data(sql, condition_dict, return_multi=True)

    def insert_data_to_mysql(self, df: pd.DataFrame):
        """Insert content rows (strategy 1: default DBData path).

        Speed: fastest / memory: medium. Suited to small/medium batches
        (< ~100k rows).
        :param df: Truth Social content DataFrame
        """
        if self._skip_empty(df):
            return
        self.db_manager.insert_data_to_mysql(df)

    def insert_data_to_mysql_fast(self, df: pd.DataFrame):
        """Insert content rows (strategy 2: executemany batch insert).

        Speed: very fast / memory: low. Suited to medium batches.
        :param df: Truth Social content DataFrame
        """
        if self._skip_empty(df):
            return
        self.db_manager.insert_data_to_mysql_fast(df)

    def insert_data_to_mysql_chunk(self, df: pd.DataFrame, chunk_size: int = 1000):
        """Insert content rows (strategy 3: chunked insert for large batches).

        Speed: medium / memory: lowest. Suited to large batches (> ~100k rows).
        :param df: Truth Social content DataFrame
        :param chunk_size: rows per chunk
        """
        if self._skip_empty(df):
            return
        self.db_manager.insert_data_to_mysql_chunk(df, chunk_size)

    def insert_data_to_mysql_simple(self, df: pd.DataFrame):
        """Insert content rows (strategy 4: plain to_sql).

        Speed: fastest / memory: medium.
        NOTE: raises on duplicate keys; callers must handle that themselves.
        :param df: Truth Social content DataFrame
        """
        if self._skip_empty(df):
            return
        self.db_manager.insert_data_to_mysql_simple(df)

    def query_latest_data(self, user_id: str = None):
        """Return the single newest row, optionally restricted to one user.

        :param user_id: user id; None means across all users
        """
        if user_id:
            sql = """
                SELECT * FROM truth_social_content
                WHERE user_id = :user_id
                ORDER BY timestamp DESC
                LIMIT 1
            """
            condition_dict = {"user_id": user_id}
        else:
            sql = """
                SELECT * FROM truth_social_content
                ORDER BY timestamp DESC
                LIMIT 1
            """
            condition_dict = {}
        return self.db_manager.query_data(sql, condition_dict, return_multi=False)

    def query_data_by_user_id(self, user_id: str, limit: int = 100):
        """Return one user's rows, newest first.

        :param user_id: user id
        :param limit: maximum number of rows
        """
        return self._query_newest_first(
            ["user_id = :user_id"],
            {"user_id": user_id, "limit": limit},
        )

    def query_data_by_timestamp_range(
        self,
        start_timestamp: int = None,
        end_timestamp: int = None,
        user_id: str = None,
        limit: int = 1000
    ):
        """Return rows within an inclusive timestamp range, newest first.

        NOTE: falsy bounds (None or 0) are treated as "no bound", matching the
        original behavior.
        :param start_timestamp: inclusive lower bound (same unit as the column)
        :param end_timestamp: inclusive upper bound
        :param user_id: optional user filter
        :param limit: maximum number of rows
        """
        conditions = []
        condition_dict = {"limit": limit}
        if start_timestamp:
            conditions.append("timestamp >= :start_timestamp")
            condition_dict["start_timestamp"] = start_timestamp
        if end_timestamp:
            conditions.append("timestamp <= :end_timestamp")
            condition_dict["end_timestamp"] = end_timestamp
        if user_id:
            conditions.append("user_id = :user_id")
            condition_dict["user_id"] = user_id
        return self._query_newest_first(conditions, condition_dict)

    def query_data_by_text_search(
        self,
        search_text: str,
        user_id: str = None,
        limit: int = 100
    ):
        """Return rows whose text contains *search_text* (SQL LIKE), newest first.

        :param search_text: substring to match
        :param user_id: optional user filter
        :param limit: maximum number of rows
        """
        conditions = ["text LIKE :search_text"]
        condition_dict = {
            "search_text": f"%{search_text}%",
            "limit": limit
        }
        if user_id:
            conditions.append("user_id = :user_id")
            condition_dict["user_id"] = user_id
        return self._query_newest_first(conditions, condition_dict)

    def query_data_by_date_range(
        self,
        start_date: str = None,
        end_date: str = None,
        user_id: str = None,
        limit: int = 1000
    ):
        """Return rows within an inclusive date range on the `date_time` column.

        :param start_date: start date (YYYY-MM-DD)
        :param end_date: end date (YYYY-MM-DD)
        :param user_id: optional user filter
        :param limit: maximum number of rows
        """
        conditions = []
        condition_dict = {"limit": limit}
        if start_date:
            conditions.append("date_time >= :start_date")
            condition_dict["start_date"] = start_date
        if end_date:
            conditions.append("date_time <= :end_date")
            condition_dict["end_date"] = end_date
        if user_id:
            conditions.append("user_id = :user_id")
            condition_dict["user_id"] = user_id
        return self._query_newest_first(conditions, condition_dict)

    def get_user_list(self, limit: int = 100):
        """Return per-user aggregates: post count and most recent timestamp.

        :param limit: maximum number of users
        """
        sql = """
            SELECT DISTINCT user_id, user_name,
                   COUNT(*) as article_count,
                   MAX(timestamp) as last_time
            FROM truth_social_content
            GROUP BY user_id, user_name
            ORDER BY last_time DESC
            LIMIT :limit
        """
        condition_dict = {"limit": limit}
        return self.db_manager.query_data(sql, condition_dict, return_multi=True)

    def get_statistics(self):
        """Return table-wide statistics: row/user counts, time span, average text length."""
        sql = """
            SELECT
                COUNT(*) as total_articles,
                COUNT(DISTINCT user_id) as total_users,
                MIN(timestamp) as earliest_article,
                MAX(timestamp) as latest_article,
                AVG(LENGTH(text)) as avg_text_length
            FROM truth_social_content
        """
        return self.db_manager.query_data(sql, {}, return_multi=False)

    def delete_old_data(self, days: int = 30):
        """Delete rows older than *days* days.

        FIX: the `timestamp` column stores epoch milliseconds (it is filled
        from TruthSocialRetriever.transform_datetime's timestamp_ms), but the
        cutoff used to be computed in seconds, so the DELETE never matched
        anything. The cutoff is now converted to milliseconds.
        :param days: number of days of data to keep
        """
        current_time = get_current_date_time()
        cutoff_seconds = int(pd.Timestamp(current_time).timestamp()) - (days * 24 * 60 * 60)
        cutoff_timestamp = cutoff_seconds * 1000  # column unit is milliseconds
        sql = """
            DELETE FROM truth_social_content
            WHERE timestamp < :cutoff_timestamp
        """
        condition_dict = {"cutoff_timestamp": cutoff_timestamp}
        return self.db_manager.execute_sql(sql, condition_dict)

    def check_duplicate(self, user_id: str, timestamp: int):
        """Return True when a row with this (user_id, timestamp) already exists.

        :param user_id: user id
        :param timestamp: post timestamp (epoch milliseconds)
        """
        sql = """
            SELECT COUNT(*) as count
            FROM truth_social_content
            WHERE user_id = :user_id AND timestamp = :timestamp
        """
        condition_dict = {"user_id": user_id, "timestamp": timestamp}
        result = self.db_manager.query_data(sql, condition_dict, return_multi=False)
        return result['count'] > 0 if result else False

View File

@ -0,0 +1,156 @@
import core.logger as logging
from core.db.db_truth_social_content import DBTruthSocialContent
from config import TRUTH_SOCIAL_API, COIN_MYSQL_CONFIG
import requests
import json
import os
from bs4 import BeautifulSoup
import time
from datetime import datetime
import pytz
import pandas as pd
logger = logging.logger
class TruthSocialRetriever:
    """Fetch posts from Truth Social via the api.scrapecreators.com API and
    persist them to MySQL, keeping a JSON snapshot of each batch on disk."""

    def __init__(self) -> None:
        self.api_key = TRUTH_SOCIAL_API.get("api_key", "")
        # Mapping of handle -> Truth Social numeric account id.
        self.user_info = TRUTH_SOCIAL_API.get("user_id", {})
        mysql_user = COIN_MYSQL_CONFIG.get("user", "xch")
        mysql_password = COIN_MYSQL_CONFIG.get("password", "")
        if not mysql_password:
            raise ValueError("MySQL password is not set")
        mysql_host = COIN_MYSQL_CONFIG.get("host", "localhost")
        mysql_port = COIN_MYSQL_CONFIG.get("port", 3306)
        mysql_database = COIN_MYSQL_CONFIG.get("database", "okx")
        self.db_url = f"mysql+pymysql://{mysql_user}:{mysql_password}@{mysql_host}:{mysql_port}/{mysql_database}"
        self.db_truth_social_content = DBTruthSocialContent(self.db_url)
        self.save_path = r"./output/media/truth_social/"
        os.makedirs(self.save_path, exist_ok=True)

    def get_user_id_from_page(self, handle='realDonaldTrump'):
        """Scrape a Truth Social profile page for the account's numeric id.

        :param handle: account handle without the leading '@'
        :return: id string, or None when no embedded id could be found
        """
        import re  # hoisted: was re-imported on every matching <script> tag
        url = f'https://truthsocial.com/@{handle}'
        # Browser-like User-Agent to avoid being served a bot page.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        # FIX: add a timeout so a stalled connection cannot hang forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # The page embeds account JSON in <script> tags; pick the first one
        # mentioning the handle and extract its "id" field.
        for script in soup.find_all('script'):
            if script.string and 'id' in script.string and handle in script.string:
                match = re.search(r'"id"\s*:\s*"(\d+)"', script.string)
                if match:
                    return match.group(1)
        return None

    def get_user_posts(self, limit: int = None):
        """Fetch the latest posts for every configured user, write a JSON
        snapshot per batch, and insert the rows into MySQL.

        API quota notes (scrapecreators.com): free tier 100 requests;
        $47 = 25,000 (~86.8 days at one run per 5 minutes);
        $497 = 500,000 (~1736 days at one run per 5 minutes).

        :param limit: maximum posts to keep per user (the API returns ~20 per
                      page by default); None keeps everything returned
        """
        headers = {
            'x-api-key': self.api_key,
            'Content-Type': 'application/json'
        }
        url = 'https://api.scrapecreators.com/v1/truthsocial/user/posts'
        for user_name, user_id in self.user_info.items():
            params = {
                'handle': user_name,   # account handle
                'user_id': user_id,    # optional numeric id
                'next_max_id': None,   # set to previous response's max_id when paging
                'trim': 'false'        # keep full post content
            }
            logger.info(f"Searching contents for user: {user_name}")
            try:
                # FIX: add a timeout so a stalled connection cannot hang forever.
                response = requests.get(url, headers=headers, params=params, timeout=30)
                response.raise_for_status()
                data = response.json()
                posts = data.get('posts', [])
                if limit is not None and isinstance(limit, int):
                    posts = posts[:limit]
                results = []
                if posts:
                    logger.info(f"获取{user_name}帖子: {len(posts)}")
                    for post in posts:
                        datetime_dict = self.transform_datetime(post.get('created_at'))
                        result = {
                            "article_id": post.get('id'),
                            "user_id": user_id,
                            "user_name": user_name,
                            "timestamp": datetime_dict["timestamp_ms"],
                            "date_time": datetime_dict["beijing_time_str"],
                            "text": post.get('text', '无内容'),
                            "media_url": "",
                            "media_type": "",
                            "media_thumbnail": "",
                        }
                        # Only the first media attachment is kept (original behavior).
                        media_attachments = post.get('media_attachments', [])
                        if media_attachments:
                            first = media_attachments[0]
                            result["media_url"] = first.get('url')
                            result["media_type"] = first.get('type')
                            result["media_thumbnail"] = first.get('preview_url')
                        results.append(result)
                else:
                    # was print(); route through the logger like the rest of the class
                    logger.warning("获取帖子失败,请检查 API 密钥或网络。")
                if len(results) > 0:
                    user_path = os.path.join(self.save_path, user_name)
                    os.makedirs(user_path, exist_ok=True)
                    now_date_time = datetime.now().strftime("%Y%m%d%H%M%S")
                    json_file_name = os.path.join(user_path, f"{user_name}_{now_date_time}.json")
                    # Persist the raw batch as JSON for auditing/replay.
                    with open(json_file_name, 'w', encoding='utf-8') as f:
                        json.dump(results, f, ensure_ascii=False, indent=2)
                    logger.info(f"已将{len(results)}条数据保存到: {json_file_name}")
                    result_df = pd.DataFrame(results)
                    self.db_truth_social_content.insert_data_to_mysql(result_df)
            except requests.exceptions.RequestException as e:
                logger.error(f"请求错误: {e}")
            except json.JSONDecodeError as e:
                logger.error(f"JSON 解析错误: {e}")

    def transform_datetime(self, datetime_text: str):
        """Convert an API timestamp (UTC ISO-8601 with trailing 'Z') into a
        millisecond epoch and a Beijing-time ISO-8601 string.

        FIXES: also accepts timestamps without fractional seconds, and uses
        the stdlib instead of pytz (Beijing is fixed UTC+8 with no DST, so a
        static offset is exact for these post dates).

        :param datetime_text: e.g. "2025-10-20T10:30:00.000Z"
        :return: {"timestamp_ms": int,
                  "beijing_time_str": "YYYY-MM-DDTHH:MM:SS+08:00"}
        """
        from datetime import timedelta, timezone  # stdlib replacement for pytz
        try:
            utc_time = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S.%fZ")
        except ValueError:
            # Some payloads omit the fractional seconds.
            utc_time = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%SZ")
        utc_time = utc_time.replace(tzinfo=timezone.utc)
        # Epoch milliseconds.
        timestamp_ms = int(utc_time.timestamp() * 1000)
        beijing_time = utc_time.astimezone(timezone(timedelta(hours=8)))
        # isoformat(timespec="seconds") already emits the "+08:00" offset form.
        beijing_time_str = beijing_time.isoformat(timespec="seconds")
        return {
            "timestamp_ms": timestamp_ms,
            "beijing_time_str": beijing_time_str,
        }

View File

@ -0,0 +1,22 @@
-- Posts fetched from Truth Social; one row per post, keyed by the post id.
CREATE TABLE `truth_social_content` (
`article_id` VARCHAR(50) NOT NULL PRIMARY KEY,  -- Truth Social post id
`user_id` VARCHAR(50) NOT NULL,                 -- numeric account id
`user_name` VARCHAR(100) NOT NULL,              -- account handle
`timestamp` BIGINT NOT NULL,                    -- post time, epoch milliseconds
`date_time` VARCHAR(50) NOT NULL,               -- post time, Beijing-time ISO-8601 string
`text` TEXT NOT NULL,                           -- post body
`media_url` TEXT NULL,                          -- first media attachment URL
`media_type` VARCHAR(50) NULL,                  -- first media attachment type
`media_thumbnail` TEXT NULL,                    -- first media attachment preview URL
`created_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
`updated_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- MySQL versions before 8.0.29 do not support "ADD COLUMN IF NOT EXISTS".
-- To add the media columns to a pre-existing table, run each statement
-- below exactly once (skip them when the table was created as above).
ALTER TABLE `truth_social_content`
ADD COLUMN `media_url` TEXT NULL DEFAULT NULL AFTER `text`;
ALTER TABLE `truth_social_content`
ADD COLUMN `media_type` VARCHAR(50) NULL DEFAULT NULL AFTER `media_url`;
ALTER TABLE `truth_social_content`
ADD COLUMN `media_thumbnail` TEXT NULL DEFAULT NULL AFTER `media_type`;

41
test_autocomplete.py Normal file
View File

@ -0,0 +1,41 @@
# 测试代码提示和自动补全功能
import pandas as pd
import numpy as np
import requests
import json
import os
from datetime import datetime
def test_autocomplete():
    """Exercise common library APIs so IDE completion and hints can be checked by hand."""
    # pandas: typing `frame.` should list DataFrame members
    frame = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    frame.head()
    frame.describe()

    # numpy: typing `vector.` should list ndarray members
    vector = np.array([1, 2, 3, 4, 5])
    vector.mean()
    vector.std()

    # requests: typing `resp.` should list Response members
    resp = requests.get("https://api.github.com")
    resp.status_code
    resp.json()

    # builtins: typing `len(` should show the parameter hint
    n_items = len([1, 2, 3])

    # datetime: typing `now.` should list datetime members
    now = datetime.now()
    now.strftime("%Y-%m-%d")

    return frame, vector, resp, n_items, now


if __name__ == "__main__":
    test_autocomplete()

View File

@ -0,0 +1,10 @@
from core.media.truth_social_retriever import TruthSocialRetriever
def main():
    """Entry point: fetch the latest Truth Social posts and persist them."""
    retriever = TruthSocialRetriever()
    retriever.get_user_posts()


if __name__ == "__main__":
    main()

View File

@ -1,4 +1,4 @@
from core.twitter.twitter_retriever import TwitterRetriever from core.media.twitter_retriever import TwitterRetriever
import core.logger as logging import core.logger as logging
import os import os