Support fetching data from Truth Social
This commit is contained in:
parent
1e6caf51e0
commit
2dc2d51171
|
|
@ -0,0 +1,50 @@
|
||||||
|
# Python环境配置说明
|
||||||
|
|
||||||
|
## 自动配置Python解释器
|
||||||
|
|
||||||
|
本项目已移除硬编码的Python路径,支持在不同电脑上自动检测Python环境。
|
||||||
|
|
||||||
|
### 首次使用步骤:
|
||||||
|
|
||||||
|
1. **打开Cursor**
|
||||||
|
2. **选择Python解释器**:
|
||||||
|
- 按 `Ctrl+Shift+P`
|
||||||
|
- 搜索 "Python: Select Interpreter"
|
||||||
|
- 选择您系统中的Python解释器
|
||||||
|
|
||||||
|
3. **验证配置**:
|
||||||
|
- 打开 `test_autocomplete.py` 文件
|
||||||
|
- 尝试输入代码,检查是否有自动补全功能
|
||||||
|
|
||||||
|
### 支持的Python环境:
|
||||||
|
|
||||||
|
- **Anaconda/Miniconda**:自动检测conda环境
|
||||||
|
- **Python官方安装包**:检测系统PATH中的python
|
||||||
|
- **虚拟环境**:支持venv、virtualenv等
|
||||||
|
- **Docker容器**:支持远程Python解释器
|
||||||
|
|
||||||
|
### 代码提示功能:
|
||||||
|
|
||||||
|
✅ **已启用的功能**:
|
||||||
|
- 智能代码补全
|
||||||
|
- 函数参数提示
|
||||||
|
- 类型提示
|
||||||
|
- 自动导入建议
|
||||||
|
- 悬停信息显示
|
||||||
|
- 语法错误检测
|
||||||
|
- 代码格式化
|
||||||
|
|
||||||
|
### 故障排除:
|
||||||
|
|
||||||
|
如果代码提示不工作:
|
||||||
|
|
||||||
|
1. **重启语言服务器**:`Ctrl+Shift+P` → "Python: Restart Language Server"
|
||||||
|
2. **重新加载窗口**:`Ctrl+Shift+P` → "Developer: Reload Window"
|
||||||
|
3. **检查Python扩展**:确保Python扩展已安装并启用
|
||||||
|
4. **验证解释器**:确保选择的Python解释器路径正确
|
||||||
|
|
||||||
|
### 配置文件说明:
|
||||||
|
|
||||||
|
- `.vscode/settings.json`:项目级设置,包含代码提示配置
|
||||||
|
- `python_settings.json`:通用Python配置模板
|
||||||
|
- 全局设置:已移除硬编码路径,保持代码提示功能
|
||||||
|
|
@ -0,0 +1,20 @@
|
||||||
|
{
|
||||||
|
"python.analysis.autoImportCompletions": true,
|
||||||
|
"python.analysis.typeCheckingMode": "basic",
|
||||||
|
"python.analysis.autoSearchPaths": true,
|
||||||
|
"python.analysis.diagnosticMode": "workspace",
|
||||||
|
"python.analysis.indexing": true,
|
||||||
|
"python.analysis.completeFunctionParens": true,
|
||||||
|
"python.analysis.inlayHints.functionReturnTypes": true,
|
||||||
|
"python.analysis.inlayHints.variableTypes": true,
|
||||||
|
"python.analysis.inlayHints.pytestParameters": true,
|
||||||
|
"python.linting.enabled": true,
|
||||||
|
"python.linting.pylintEnabled": false,
|
||||||
|
"python.linting.flake8Enabled": true,
|
||||||
|
"python.formatting.provider": "black",
|
||||||
|
"python.terminal.activateEnvironment": true,
|
||||||
|
"python.terminal.activateEnvInCurrentTerminal": true,
|
||||||
|
"files.associations": {
|
||||||
|
"*.py": "python"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,53 @@
|
||||||
|
{
|
||||||
|
"python.analysis.autoImportCompletions": true,
|
||||||
|
"python.analysis.typeCheckingMode": "basic",
|
||||||
|
"python.analysis.autoSearchPaths": true,
|
||||||
|
"python.analysis.diagnosticMode": "workspace",
|
||||||
|
"python.analysis.stubPath": "./typings",
|
||||||
|
"python.analysis.extraPaths": [
|
||||||
|
"./core",
|
||||||
|
"./utils"
|
||||||
|
],
|
||||||
|
"editor.quickSuggestions": {
|
||||||
|
"other": true,
|
||||||
|
"comments": false,
|
||||||
|
"strings": true
|
||||||
|
},
|
||||||
|
"editor.suggestOnTriggerCharacters": true,
|
||||||
|
"editor.acceptSuggestionOnEnter": "on",
|
||||||
|
"editor.tabCompletion": "on",
|
||||||
|
"editor.wordBasedSuggestions": "matchingDocuments",
|
||||||
|
"editor.parameterHints.enabled": true,
|
||||||
|
"editor.hover.enabled": true,
|
||||||
|
"editor.codeActionsOnSave": {
|
||||||
|
"source.organizeImports": "explicit"
|
||||||
|
},
|
||||||
|
"python.linting.enabled": true,
|
||||||
|
"python.linting.pylintEnabled": false,
|
||||||
|
"python.linting.flake8Enabled": true,
|
||||||
|
"python.formatting.provider": "black",
|
||||||
|
"python.analysis.completeFunctionParens": true,
|
||||||
|
"python.analysis.inlayHints.functionReturnTypes": true,
|
||||||
|
"python.analysis.inlayHints.variableTypes": true,
|
||||||
|
"python.analysis.inlayHints.pytestParameters": true,
|
||||||
|
"files.associations": {
|
||||||
|
"*.py": "python"
|
||||||
|
},
|
||||||
|
"python.analysis.indexing": true,
|
||||||
|
"python.terminal.activateEnvironment": true,
|
||||||
|
"python.terminal.activateEnvInCurrentTerminal": true,
|
||||||
|
"python.analysis.packageIndexDepths": [
|
||||||
|
{
|
||||||
|
"name": "pandas",
|
||||||
|
"depth": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "numpy",
|
||||||
|
"depth": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "requests",
|
||||||
|
"depth": 2
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,36 @@
|
||||||
|
import schedule
|
||||||
|
import time
|
||||||
|
from core.utils import get_current_date_time
|
||||||
|
import core.logger as logging
|
||||||
|
import subprocess
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
logger = logging.logger
|
||||||
|
# 定义要执行的任务
|
||||||
|
def run_script():
    """Run ``truth_social_retriever_main.py`` once in a child Python process.

    Appends a "Task ran at ..." marker line to
    ``./output/auto_fetch_truth_social.txt``, chooses the script location from
    the current working directory, runs it with the interpreter that is
    executing this scheduler, and logs the elapsed wall-clock time.

    A non-zero exit code of the child process is logged; it is not raised, so
    the scheduler keeps running.
    """
    start_time = time.time()
    logger.info(f"Executing script at: {get_current_date_time()}")

    output_file = r'./output/auto_fetch_truth_social.txt'
    # Create ./output on demand so a fresh checkout does not fail with
    # FileNotFoundError before the retriever is even started.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'a', encoding='utf-8') as f:
        f.write(f"Task ran at {get_current_date_time()}\n")

    # The scheduler may be launched from the project directory, from its
    # "python_projects" parent, or from anywhere else that holds the script.
    current_dir = os.getcwd()
    python_path = sys.executable
    if current_dir.endswith('crypto_quant'):
        script_path = r'./truth_social_retriever_main.py'
    elif current_dir.endswith(r'python_projects'):
        script_path = f'{current_dir}/crypto_quant/truth_social_retriever_main.py'
    else:
        script_path = f'{current_dir}/truth_social_retriever_main.py'

    completed = subprocess.run([python_path, script_path])
    if completed.returncode != 0:
        # Previously a failed run was indistinguishable from a successful one.
        logger.error(
            f"Script exited with code {completed.returncode}: {script_path}"
        )

    end_time = time.time()
    logger.info(f"Script execution time: {end_time - start_time} seconds")
|
||||||
|
# Run the job once a day at midnight (00:00:00).
# For local testing a short interval can be used instead, e.g.
# ``schedule.every(60).seconds.do(run_script)``.
schedule.every().day.at("00:00:00").do(run_script)

# Keep the process alive and poll the scheduler once per second.
logger.info("Scheduler started. Press Ctrl+C to stop.")
try:
    while True:
        schedule.run_pending()
        time.sleep(1)
except KeyboardInterrupt:
    # Ctrl+C is the documented way to stop; exit without a traceback.
    logger.info("Scheduler stopped by user.")
|
||||||
|
|
@ -11,7 +11,7 @@ logger = logging.logger
|
||||||
def run_script():
|
def run_script():
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
logger.info(f"Executing script at: {get_current_date_time()}")
|
logger.info(f"Executing script at: {get_current_date_time()}")
|
||||||
output_file = r'./output/auto_schedule.txt'
|
output_file = r'./output/auto_update_market_data.txt'
|
||||||
with open(output_file, 'a') as f:
|
with open(output_file, 'a') as f:
|
||||||
f.write(f"Task ran at {get_current_date_time()}\n")
|
f.write(f"Task ran at {get_current_date_time()}\n")
|
||||||
python_path = sys.executable
|
python_path = sys.executable
|
||||||
|
|
|
||||||
|
|
@ -231,3 +231,6 @@ TWITTER_CONFIG = {
|
||||||
{"name": "PressSec", "id": ""},
|
{"name": "PressSec", "id": ""},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
import os  # imported here because only this hunk of the config module is visible

# Settings for the Truth Social retriever.
# NOTE(security): the fallback key below was committed in plain text and should
# be treated as leaked. Rotate it and supply the new one through the
# TRUTH_SOCIAL_API_KEY environment variable; the literal is kept only so that
# existing deployments keep working until then.
TRUTH_SOCIAL_API = {
    "api_key": os.environ.get(
        "TRUTH_SOCIAL_API_KEY", "FRfhlDHnmYc1PCCrVHZdWtqDENr2"
    ),
    # Maps a Truth Social handle to its numeric account id.
    "user_id": {"realDonaldTrump": "107780257626128497"},
}
|
||||||
|
|
@ -0,0 +1,294 @@
|
||||||
|
import pandas as pd
|
||||||
|
import core.logger as logging
|
||||||
|
from core.db.db_manager import DBData
|
||||||
|
from core.utils import get_current_date_time
|
||||||
|
|
||||||
|
logger = logging.logger
|
||||||
|
|
||||||
|
|
||||||
|
class DBTruthSocialContent:
    """Data-access wrapper for the ``truth_social_content`` MySQL table.

    All database work is delegated to a ``DBData`` manager created in
    ``__init__``; this class supplies the table name, the column list and the
    SQL text. The ``timestamp`` column is compared as epoch milliseconds,
    which is the unit the Truth Social retriever writes.
    """

    def __init__(self, db_url: str):
        """Create the underlying ``DBData`` manager.

        :param db_url: database connection URL passed through to ``DBData``
        """
        self.db_url = db_url
        self.table_name = "truth_social_content"
        self.columns = [
            "article_id",
            "user_id",
            "user_name",
            "timestamp",
            "date_time",
            "text",
            "media_url",
            "media_type",
            "media_thumbnail",
        ]
        self.db_manager = DBData(db_url, self.table_name, self.columns)

    @staticmethod
    def _is_empty(df) -> bool:
        """Return True (and log a warning) when there is nothing to write."""
        if df is None or df.empty:
            logger.warning("DataFrame为空,无需写入数据库。")
            return True
        return False

    def _select_where(self, conditions, condition_dict):
        """Run the shared filtered SELECT, newest rows first.

        ``conditions`` are SQL fragments joined with AND; an empty list
        selects every row. ``condition_dict`` must contain ``limit``.
        """
        where_clause = " AND ".join(conditions) if conditions else "1=1"
        sql = f"""
            SELECT * FROM truth_social_content
            WHERE {where_clause}
            ORDER BY timestamp DESC
            LIMIT :limit
        """
        return self.db_manager.query_data(sql, condition_dict, return_multi=True)

    def insert_data_to_mysql(self, df: pd.DataFrame):
        """Save posts through ``DBData.insert_data_to_mysql``.

        Author's guidance: the fastest option, medium memory use, intended
        for small to medium batches (under 100,000 rows).

        :param df: Truth Social content rows; ``None`` or empty is a no-op
        """
        if self._is_empty(df):
            return
        self.db_manager.insert_data_to_mysql(df)

    def insert_data_to_mysql_fast(self, df: pd.DataFrame):
        """Save posts through ``DBData.insert_data_to_mysql_fast``.

        Author's guidance: option 2, an ``executemany`` batch insert; very
        fast with low memory use, intended for medium batches.

        :param df: Truth Social content rows; ``None`` or empty is a no-op
        """
        if self._is_empty(df):
            return
        self.db_manager.insert_data_to_mysql_fast(df)

    def insert_data_to_mysql_chunk(self, df: pd.DataFrame, chunk_size: int = 1000):
        """Save posts through ``DBData.insert_data_to_mysql_chunk``.

        Author's guidance: option 3, chunked insert; medium speed with the
        lowest memory use, intended for large batches (over 100,000 rows).

        :param df: Truth Social content rows; ``None`` or empty is a no-op
        :param chunk_size: number of rows per chunk
        """
        if self._is_empty(df):
            return
        self.db_manager.insert_data_to_mysql_chunk(df, chunk_size)

    def insert_data_to_mysql_simple(self, df: pd.DataFrame):
        """Save posts through ``DBData.insert_data_to_mysql_simple``.

        Author's guidance: option 4, a direct ``to_sql`` write. Duplicate
        keys raise an error that the caller has to handle.

        :param df: Truth Social content rows; ``None`` or empty is a no-op
        """
        if self._is_empty(df):
            return
        self.db_manager.insert_data_to_mysql_simple(df)

    def query_latest_data(self, user_id: str = None):
        """Return the most recent row, optionally restricted to one user.

        :param user_id: user id; when falsy the newest row of any user is
            returned
        """
        if user_id:
            sql = """
                SELECT * FROM truth_social_content
                WHERE user_id = :user_id
                ORDER BY timestamp DESC
                LIMIT 1
            """
            condition_dict = {"user_id": user_id}
        else:
            sql = """
                SELECT * FROM truth_social_content
                ORDER BY timestamp DESC
                LIMIT 1
            """
            condition_dict = {}

        return self.db_manager.query_data(sql, condition_dict, return_multi=False)

    def query_data_by_user_id(self, user_id: str, limit: int = 100):
        """Return the newest rows of one user.

        :param user_id: user id
        :param limit: maximum number of rows
        """
        sql = """
            SELECT * FROM truth_social_content
            WHERE user_id = :user_id
            ORDER BY timestamp DESC
            LIMIT :limit
        """
        condition_dict = {"user_id": user_id, "limit": limit}
        return self.db_manager.query_data(sql, condition_dict, return_multi=True)

    def query_data_by_timestamp_range(
        self,
        start_timestamp: int = None,
        end_timestamp: int = None,
        user_id: str = None,
        limit: int = 1000
    ):
        """Return rows whose ``timestamp`` lies in an inclusive range.

        :param start_timestamp: lower bound, ``None`` for no lower bound
        :param end_timestamp: upper bound, ``None`` for no upper bound
        :param user_id: optional user id filter
        :param limit: maximum number of rows
        """
        conditions = []
        condition_dict = {"limit": limit}

        # Compare against None so that an explicit bound of 0 is honoured
        # instead of being dropped by a truthiness check.
        if start_timestamp is not None:
            conditions.append("timestamp >= :start_timestamp")
            condition_dict["start_timestamp"] = start_timestamp

        if end_timestamp is not None:
            conditions.append("timestamp <= :end_timestamp")
            condition_dict["end_timestamp"] = end_timestamp

        if user_id:
            conditions.append("user_id = :user_id")
            condition_dict["user_id"] = user_id

        return self._select_where(conditions, condition_dict)

    def query_data_by_text_search(
        self,
        search_text: str,
        user_id: str = None,
        limit: int = 100
    ):
        """Return rows whose ``text`` contains ``search_text``.

        The value is wrapped in ``%...%`` and matched with ``LIKE``, so any
        ``%`` or ``_`` inside ``search_text`` also acts as a wildcard.

        :param search_text: substring to look for
        :param user_id: optional user id filter
        :param limit: maximum number of rows
        """
        conditions = ["text LIKE :search_text"]
        condition_dict = {
            "search_text": f"%{search_text}%",
            "limit": limit,
        }

        if user_id:
            conditions.append("user_id = :user_id")
            condition_dict["user_id"] = user_id

        return self._select_where(conditions, condition_dict)

    def query_data_by_date_range(
        self,
        start_date: str = None,
        end_date: str = None,
        user_id: str = None,
        limit: int = 1000
    ):
        """Return rows whose ``date_time`` lies in an inclusive range.

        NOTE(review): ``date_time`` is compared as given; the retriever
        stores it as an ISO 8601 string, so a bare ``YYYY-MM-DD`` end date
        presumably excludes posts later on that day - confirm with callers.

        :param start_date: lower bound (YYYY-MM-DD), optional
        :param end_date: upper bound (YYYY-MM-DD), optional
        :param user_id: optional user id filter
        :param limit: maximum number of rows
        """
        conditions = []
        condition_dict = {"limit": limit}

        if start_date:
            conditions.append("date_time >= :start_date")
            condition_dict["start_date"] = start_date

        if end_date:
            conditions.append("date_time <= :end_date")
            condition_dict["end_date"] = end_date

        if user_id:
            conditions.append("user_id = :user_id")
            condition_dict["user_id"] = user_id

        return self._select_where(conditions, condition_dict)

    def get_user_list(self, limit: int = 100):
        """Return users with their post count and latest post timestamp.

        :param limit: maximum number of users, most recently active first
        """
        # GROUP BY already yields one row per (user_id, user_name), so the
        # former DISTINCT was redundant.
        sql = """
            SELECT user_id, user_name,
                   COUNT(*) as article_count,
                   MAX(timestamp) as last_time
            FROM truth_social_content
            GROUP BY user_id, user_name
            ORDER BY last_time DESC
            LIMIT :limit
        """
        condition_dict = {"limit": limit}
        return self.db_manager.query_data(sql, condition_dict, return_multi=True)

    def get_statistics(self):
        """Return table-wide totals: post count, user count, earliest and
        latest timestamp, and average text length."""
        sql = """
            SELECT
                COUNT(*) as total_articles,
                COUNT(DISTINCT user_id) as total_users,
                MIN(timestamp) as earliest_article,
                MAX(timestamp) as latest_article,
                AVG(LENGTH(text)) as avg_text_length
            FROM truth_social_content
        """
        return self.db_manager.query_data(sql, {}, return_multi=False)

    def delete_old_data(self, days: int = 30):
        """Delete rows older than ``days`` days.

        :param days: number of days of data to keep
        """
        current_time = get_current_date_time()
        # NOTE(review): pandas treats a naive value as UTC here; confirm
        # what get_current_date_time() returns.
        cutoff_seconds = int(pd.Timestamp(current_time).timestamp()) - (days * 24 * 60 * 60)
        # The retriever stores ``timestamp`` in epoch milliseconds; a cutoff
        # in seconds is smaller than every stored value and deletes nothing.
        cutoff_timestamp = cutoff_seconds * 1000

        sql = """
            DELETE FROM truth_social_content
            WHERE timestamp < :cutoff_timestamp
        """
        condition_dict = {"cutoff_timestamp": cutoff_timestamp}

        return self.db_manager.execute_sql(sql, condition_dict)

    def check_duplicate(self, user_id: str, timestamp: int):
        """Return True when a row with this user id and timestamp exists.

        :param user_id: user id
        :param timestamp: post timestamp, same unit as the stored column
        """
        sql = """
            SELECT COUNT(*) as count
            FROM truth_social_content
            WHERE user_id = :user_id AND timestamp = :timestamp
        """
        condition_dict = {"user_id": user_id, "timestamp": timestamp}
        result = self.db_manager.query_data(sql, condition_dict, return_multi=False)
        return result['count'] > 0 if result else False
|
||||||
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,156 @@
|
||||||
|
import core.logger as logging
|
||||||
|
from core.db.db_truth_social_content import DBTruthSocialContent
|
||||||
|
from config import TRUTH_SOCIAL_API, COIN_MYSQL_CONFIG
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import time
|
||||||
|
from datetime import datetime
|
||||||
|
import pytz
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
logger = logging.logger
|
||||||
|
|
||||||
|
class TruthSocialRetriever:
    """Fetch Truth Social posts via the ScrapeCreators API and persist them.

    For every configured handle the newest posts are requested, flattened
    into records, written to a JSON file under ``./output/media/truth_social/``
    and inserted into MySQL through ``DBTruthSocialContent``.
    """

    # Seconds to wait for an HTTP response; without a timeout ``requests``
    # can block indefinitely and stall the scheduled job.
    REQUEST_TIMEOUT = 30
    POSTS_URL = 'https://api.scrapecreators.com/v1/truthsocial/user/posts'
    # Accepted ``created_at`` layouts: with and without fractional seconds.
    _CREATED_AT_FORMATS = ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ")

    def __init__(self) -> None:
        """Read API and MySQL settings and prepare the output directory.

        :raises ValueError: if no MySQL password is configured
        """
        self.api_key = TRUTH_SOCIAL_API.get("api_key", "")
        self.user_info = TRUTH_SOCIAL_API.get("user_id", {})

        mysql_user = COIN_MYSQL_CONFIG.get("user", "xch")
        mysql_password = COIN_MYSQL_CONFIG.get("password", "")
        if not mysql_password:
            raise ValueError("MySQL password is not set")
        mysql_host = COIN_MYSQL_CONFIG.get("host", "localhost")
        mysql_port = COIN_MYSQL_CONFIG.get("port", 3306)
        mysql_database = COIN_MYSQL_CONFIG.get("database", "okx")

        # NOTE(review): credentials are interpolated unescaped; a password
        # containing '@', ':' or '/' would need urllib.parse.quote_plus.
        self.db_url = f"mysql+pymysql://{mysql_user}:{mysql_password}@{mysql_host}:{mysql_port}/{mysql_database}"
        self.db_truth_social_content = DBTruthSocialContent(self.db_url)

        self.save_path = r"./output/media/truth_social/"
        os.makedirs(self.save_path, exist_ok=True)

    def get_user_id_from_page(self, handle='realDonaldTrump'):
        """Scrape a profile page and return the first numeric ``"id"`` found.

        Only ``<script>`` tags whose text mentions both ``id`` and the handle
        are inspected. This is a simple heuristic: the first quoted numeric
        id in such a script is returned.

        :param handle: Truth Social handle without the leading ``@``
        :return: the id as a string, or ``None`` when nothing matches
        :raises requests.exceptions.RequestException: on HTTP/network errors
        """
        import re

        url = f'https://truthsocial.com/@{handle}'
        # Present a browser-like User-Agent.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        id_pattern = re.compile(r'"id"\s*:\s*"(\d+)"')
        for script in soup.find_all('script'):
            content = script.string
            if content and 'id' in content and handle in content:
                match = id_pattern.search(content)
                if match:
                    return match.group(1)
        return None

    def get_user_posts(self, limit: int = None):
        """Fetch, save and store the newest posts of every configured user.

        API quota noted by the author: the free tier allows 100 calls; the
        47 USD plan 25,000 calls (about 86.8 days at one run per 5 minutes);
        the 497 USD plan 500,000 calls (about 1,736 days at that rate).

        A failed request or an unparsable post is logged and skipped so the
        remaining users and posts are still processed.

        :param limit: maximum number of posts kept per user; ``None`` keeps
            everything the API returned in one page
        :return: list of the record dicts that were saved, across all users
        """
        headers = {
            'x-api-key': self.api_key,
            'Content-Type': 'application/json'
        }
        all_results = []

        for user_name, user_id in self.user_info.items():
            params = {
                'handle': user_name,
                'user_id': user_id,   # optional for the API
                'next_max_id': None,  # pagination cursor; None is omitted by requests
                'trim': 'false'       # keep the full post content
            }

            logger.info(f"Searching contents for user: {user_name}")
            try:
                response = requests.get(
                    self.POSTS_URL,
                    headers=headers,
                    params=params,
                    timeout=self.REQUEST_TIMEOUT,
                )
                response.raise_for_status()
                data = response.json()
            except requests.exceptions.RequestException as e:
                logger.error(f"请求错误: {e}")
                continue
            except json.JSONDecodeError as e:
                logger.error(f"JSON 解析错误: {e}")
                continue

            # The response is assumed to carry the posts under 'posts'.
            posts = data.get('posts') or []
            if isinstance(limit, int):
                posts = posts[:limit]

            if not posts:
                logger.warning("获取帖子失败,请检查 API 密钥或网络。")
                continue

            logger.info(f"获取{user_name}帖子: {len(posts)}条")
            results = []
            for post in posts:
                try:
                    results.append(self._build_record(post, user_id, user_name))
                except (ValueError, TypeError) as e:
                    # A missing or malformed created_at must not abort the run.
                    logger.warning(f"Skipping post {post.get('id')}: {e}")

            if results:
                self._save_results_to_json(user_name, results)
                self.db_truth_social_content.insert_data_to_mysql(pd.DataFrame(results))
                all_results.extend(results)

        return all_results

    def _build_record(self, post: dict, user_id: str, user_name: str) -> dict:
        """Flatten one API post into a row for ``truth_social_content``."""
        datetime_dict = self.transform_datetime(post.get('created_at'))
        text = post.get('text')
        record = {
            "article_id": post.get('id'),
            "user_id": user_id,
            "user_name": user_name,
            "timestamp": datetime_dict["timestamp_ms"],
            "date_time": datetime_dict["beijing_time_str"],
            # The text column is NOT NULL, so a null text also gets the placeholder.
            "text": '无内容' if text is None else text,
            "media_url": "",
            "media_type": "",
            "media_thumbnail": "",
        }
        media_attachments = post.get('media_attachments') or []
        if media_attachments:
            # Only the first attachment is stored.
            first_media = media_attachments[0]
            record["media_url"] = first_media.get('url')
            record["media_type"] = first_media.get('type')
            record["media_thumbnail"] = first_media.get('preview_url')
        return record

    def _save_results_to_json(self, user_name: str, results: list) -> None:
        """Write ``results`` to a timestamped JSON file in the user's folder."""
        user_path = os.path.join(self.save_path, user_name)
        os.makedirs(user_path, exist_ok=True)
        now_date_time = datetime.now().strftime("%Y%m%d%H%M%S")
        json_file_name = os.path.join(user_path, f"{user_name}_{now_date_time}.json")
        with open(json_file_name, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        logger.info(f"已将{len(results)}条数据保存到: {json_file_name}")

    def transform_datetime(self, datetime_text: str):
        """Convert a UTC ``created_at`` string to epoch ms and Beijing time.

        :param datetime_text: e.g. ``2025-01-01T12:00:00.000Z``; the
            fractional-second part is optional
        :return: dict with ``timestamp_ms`` (int, epoch milliseconds) and
            ``beijing_time_str`` (ISO 8601 with a ``+08:00`` offset)
        :raises ValueError: if the text matches none of the accepted formats
        :raises TypeError: if ``datetime_text`` is not a string
        """
        parsed = None
        for fmt in self._CREATED_AT_FORMATS:
            try:
                parsed = datetime.strptime(datetime_text, fmt)
                break
            except ValueError:
                continue
        if parsed is None:
            raise ValueError(f"Unsupported created_at format: {datetime_text!r}")
        utc_time = parsed.replace(tzinfo=pytz.UTC)

        # 1. Epoch timestamp in milliseconds.
        timestamp_ms = int(utc_time.timestamp() * 1000)
        # 2. Beijing time in ISO 8601; isoformat() already renders the
        #    offset with a colon (+08:00).
        beijing_time = utc_time.astimezone(pytz.timezone("Asia/Shanghai"))
        beijing_time_str = beijing_time.isoformat(timespec="seconds")

        return {
            "timestamp_ms": timestamp_ms,
            "beijing_time_str": beijing_time_str,
        }
|
||||||
|
|
||||||
|
|
@ -0,0 +1,22 @@
|
||||||
|
-- One row per Truth Social post; `article_id` is the post id returned by the API.
-- `timestamp` is the creation time in epoch milliseconds and `date_time` the
-- same instant as an ISO 8601 string in Beijing time (+08:00).
-- IF NOT EXISTS makes the script safe to re-run against an existing database.
CREATE TABLE IF NOT EXISTS `truth_social_content` (
  `article_id` VARCHAR(50) NOT NULL PRIMARY KEY,
  `user_id` VARCHAR(50) NOT NULL,
  `user_name` VARCHAR(100) NOT NULL,
  `timestamp` BIGINT NOT NULL,
  `date_time` VARCHAR(50) NOT NULL,
  `text` TEXT NOT NULL,
  `media_url` TEXT NULL,
  `media_type` VARCHAR(50) NULL,
  `media_thumbnail` TEXT NULL,
  `created_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  `updated_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  -- Serves the "WHERE user_id = ... ORDER BY timestamp DESC" lookups.
  KEY `idx_user_timestamp` (`user_id`, `timestamp`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
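-- MySQL has no "ADD COLUMN IF NOT EXISTS" (that syntax is MariaDB-only), so the
-- statements below are not idempotent.
-- Run them only against an older table that lacks the media columns, one
-- statement at a time and once each; on a table created by the CREATE TABLE
-- statement above they fail with a "Duplicate column name" error.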
|
||||||
|
ALTER TABLE `truth_social_content`
|
||||||
|
ADD COLUMN `media_url` TEXT NULL DEFAULT NULL AFTER `text`;
|
||||||
|
ALTER TABLE `truth_social_content`
|
||||||
|
ADD COLUMN `media_type` VARCHAR(50) NULL DEFAULT NULL AFTER `media_url`;
|
||||||
|
ALTER TABLE `truth_social_content`
|
||||||
|
ADD COLUMN `media_thumbnail` TEXT NULL DEFAULT NULL AFTER `media_type`;
|
||||||
|
|
@ -0,0 +1,41 @@
|
||||||
|
# 测试代码提示和自动补全功能
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
def test_autocomplete():
    """Exercise a few library calls used to check editor auto-completion.

    The comments mark the places where typing the shown prefix should open a
    completion or parameter-hint popup. Running the function performs a real
    HTTP request to ``https://api.github.com``.

    :return: tuple ``(df, arr, response, length, current_time)``
    """
    # pandas: typing "df." should list the DataFrame methods.
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    df.head()
    df.describe()

    # numpy: typing "arr." should list the ndarray methods.
    arr = np.array([1, 2, 3, 4, 5])
    arr.mean()
    arr.std()

    # requests: typing "response." should list the Response members.
    # A timeout keeps the demo from hanging when the network is unreachable.
    response = requests.get("https://api.github.com", timeout=10)
    response.status_code
    response.json()

    # built-ins: typing "len(" should show the parameter hint.
    length = len([1, 2, 3])

    # type inference: typing "current_time." should list the datetime methods.
    current_time = datetime.now()
    current_time.strftime("%Y-%m-%d")

    return df, arr, response, length, current_time
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
test_autocomplete()
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
from core.media.truth_social_retriever import TruthSocialRetriever
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Create a TruthSocialRetriever and run one ``get_user_posts`` pass."""
    TruthSocialRetriever().get_user_posts()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
from core.twitter.twitter_retriever import TwitterRetriever
|
from core.media.twitter_retriever import TwitterRetriever
|
||||||
import core.logger as logging
|
import core.logger as logging
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue