support fetch data from truth social
This commit is contained in:
parent
1e6caf51e0
commit
2dc2d51171
|
|
@ -0,0 +1,50 @@
|
|||
# Python环境配置说明
|
||||
|
||||
## 自动配置Python解释器
|
||||
|
||||
本项目已移除硬编码的Python路径,支持在不同电脑上自动检测Python环境。
|
||||
|
||||
### 首次使用步骤:
|
||||
|
||||
1. **打开Cursor**
|
||||
2. **选择Python解释器**:
|
||||
- 按 `Ctrl+Shift+P`
|
||||
- 搜索 "Python: Select Interpreter"
|
||||
- 选择您系统中的Python解释器
|
||||
|
||||
3. **验证配置**:
|
||||
- 打开 `test_autocomplete.py` 文件
|
||||
- 尝试输入代码,检查是否有自动补全功能
|
||||
|
||||
### 支持的Python环境:
|
||||
|
||||
- **Anaconda/Miniconda**:自动检测conda环境
|
||||
- **Python官方安装包**:检测系统PATH中的python
|
||||
- **虚拟环境**:支持venv、virtualenv等
|
||||
- **Docker容器**:支持远程Python解释器
|
||||
|
||||
### 代码提示功能:
|
||||
|
||||
✅ **已启用的功能**:
|
||||
- 智能代码补全
|
||||
- 函数参数提示
|
||||
- 类型提示
|
||||
- 自动导入建议
|
||||
- 悬停信息显示
|
||||
- 语法错误检测
|
||||
- 代码格式化
|
||||
|
||||
### 故障排除:
|
||||
|
||||
如果代码提示不工作:
|
||||
|
||||
1. **重启语言服务器**:`Ctrl+Shift+P` → "Python: Restart Language Server"
|
||||
2. **重新加载窗口**:`Ctrl+Shift+P` → "Developer: Reload Window"
|
||||
3. **检查Python扩展**:确保Python扩展已安装并启用
|
||||
4. **验证解释器**:确保选择的Python解释器路径正确
|
||||
|
||||
### 配置文件说明:
|
||||
|
||||
- `.vscode/settings.json`:项目级设置,包含代码提示配置
|
||||
- `python_settings.json`:通用Python配置模板
|
||||
- 全局设置:已移除硬编码路径,保持代码提示功能
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
{
|
||||
"python.analysis.autoImportCompletions": true,
|
||||
"python.analysis.typeCheckingMode": "basic",
|
||||
"python.analysis.autoSearchPaths": true,
|
||||
"python.analysis.diagnosticMode": "workspace",
|
||||
"python.analysis.indexing": true,
|
||||
"python.analysis.completeFunctionParens": true,
|
||||
"python.analysis.inlayHints.functionReturnTypes": true,
|
||||
"python.analysis.inlayHints.variableTypes": true,
|
||||
"python.analysis.inlayHints.pytestParameters": true,
|
||||
"python.linting.enabled": true,
|
||||
"python.linting.pylintEnabled": false,
|
||||
"python.linting.flake8Enabled": true,
|
||||
"python.formatting.provider": "black",
|
||||
"python.terminal.activateEnvironment": true,
|
||||
"python.terminal.activateEnvInCurrentTerminal": true,
|
||||
"files.associations": {
|
||||
"*.py": "python"
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,53 @@
|
|||
{
|
||||
"python.analysis.autoImportCompletions": true,
|
||||
"python.analysis.typeCheckingMode": "basic",
|
||||
"python.analysis.autoSearchPaths": true,
|
||||
"python.analysis.diagnosticMode": "workspace",
|
||||
"python.analysis.stubPath": "./typings",
|
||||
"python.analysis.extraPaths": [
|
||||
"./core",
|
||||
"./utils"
|
||||
],
|
||||
"editor.quickSuggestions": {
|
||||
"other": true,
|
||||
"comments": false,
|
||||
"strings": true
|
||||
},
|
||||
"editor.suggestOnTriggerCharacters": true,
|
||||
"editor.acceptSuggestionOnEnter": "on",
|
||||
"editor.tabCompletion": "on",
|
||||
"editor.wordBasedSuggestions": "matchingDocuments",
|
||||
"editor.parameterHints.enabled": true,
|
||||
"editor.hover.enabled": true,
|
||||
"editor.codeActionsOnSave": {
|
||||
"source.organizeImports": "explicit"
|
||||
},
|
||||
"python.linting.enabled": true,
|
||||
"python.linting.pylintEnabled": false,
|
||||
"python.linting.flake8Enabled": true,
|
||||
"python.formatting.provider": "black",
|
||||
"python.analysis.completeFunctionParens": true,
|
||||
"python.analysis.inlayHints.functionReturnTypes": true,
|
||||
"python.analysis.inlayHints.variableTypes": true,
|
||||
"python.analysis.inlayHints.pytestParameters": true,
|
||||
"files.associations": {
|
||||
"*.py": "python"
|
||||
},
|
||||
"python.analysis.indexing": true,
|
||||
"python.terminal.activateEnvironment": true,
|
||||
"python.terminal.activateEnvInCurrentTerminal": true,
|
||||
"python.analysis.packageIndexDepths": [
|
||||
{
|
||||
"name": "pandas",
|
||||
"depth": 2
|
||||
},
|
||||
{
|
||||
"name": "numpy",
|
||||
"depth": 2
|
||||
},
|
||||
{
|
||||
"name": "requests",
|
||||
"depth": 2
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
import schedule
|
||||
import time
|
||||
from core.utils import get_current_date_time
|
||||
import core.logger as logging
|
||||
import subprocess
|
||||
import os
|
||||
import sys
|
||||
|
||||
logger = logging.logger
|
||||
# 定义要执行的任务
|
||||
def run_script():
    """Run ``truth_social_retriever_main.py`` once in a child process.

    Steps:
      1. Append a "Task ran at ..." line to ``./output/auto_fetch_truth_social.txt``.
      2. Pick the script location from the current working directory.
      3. Run the script with the interpreter that is executing this scheduler.
      4. Log the elapsed wall-clock time, and log an error if the child exited
         with a non-zero status.
    """
    start_time = time.time()
    logger.info(f"Executing script at: {get_current_date_time()}")

    output_file = r'./output/auto_fetch_truth_social.txt'
    # open(..., 'a') creates the file but not its parent directory.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'a', encoding='utf-8') as f:
        f.write(f"Task ran at {get_current_date_time()}\n")

    current_dir = os.getcwd()
    python_path = sys.executable
    script_name = 'truth_social_retriever_main.py'
    if current_dir.endswith('crypto_quant'):
        # Already inside the project directory.
        script_path = f'./{script_name}'
    elif current_dir.endswith('python_projects'):
        # Started from the parent folder that contains the project.
        script_path = os.path.join(current_dir, 'crypto_quant', script_name)
    else:
        script_path = os.path.join(current_dir, script_name)

    completed = subprocess.run([python_path, script_path])
    if completed.returncode != 0:
        # Without this check a crashed fetch would look like a successful run.
        logger.error(
            f"Script {script_path} exited with code {completed.returncode}"
        )

    end_time = time.time()
    logger.info(f"Script execution time: {end_time - start_time} seconds")
|
||||
# Register run_script to fire once per day at 00:00:00.
daily_at_midnight = schedule.every().day.at("00:00:00")
daily_at_midnight.do(run_script)
# For debugging, the job can be registered every 60 seconds instead:
# schedule.every(60).seconds.do(run_script)

# Keep the process alive and poll the scheduler once per second.
logger.info("Scheduler started. Press Ctrl+C to stop.")
while True:
    schedule.run_pending()
    time.sleep(1)
|
||||
|
|
@ -11,7 +11,7 @@ logger = logging.logger
|
|||
def run_script():
|
||||
start_time = time.time()
|
||||
logger.info(f"Executing script at: {get_current_date_time()}")
|
||||
output_file = r'./output/auto_schedule.txt'
|
||||
output_file = r'./output/auto_update_market_data.txt'
|
||||
with open(output_file, 'a') as f:
|
||||
f.write(f"Task ran at {get_current_date_time()}\n")
|
||||
python_path = sys.executable
|
||||
|
|
|
|||
|
|
@ -231,3 +231,6 @@ TWITTER_CONFIG = {
|
|||
{"name": "PressSec", "id": ""},
|
||||
],
|
||||
}
|
||||
|
||||
import os  # needed for the environment override below; harmless if already imported

# SECURITY NOTE(review): the literal below is a credential committed to source
# control. It is kept only as a backward-compatible fallback. Set the
# TRUTH_SOCIAL_API_KEY environment variable to override it, then rotate the key
# and remove the literal.
TRUTH_SOCIAL_API = {
    "api_key": os.environ.get(
        "TRUTH_SOCIAL_API_KEY", "FRfhlDHnmYc1PCCrVHZdWtqDENr2"
    ),
    # Truth Social handle -> numeric account id
    "user_id": {"realDonaldTrump": "107780257626128497"},
}
|
||||
|
|
@ -0,0 +1,294 @@
|
|||
import pandas as pd
|
||||
import core.logger as logging
|
||||
from core.db.db_manager import DBData
|
||||
from core.utils import get_current_date_time
|
||||
|
||||
logger = logging.logger
|
||||
|
||||
|
||||
class DBTruthSocialContent:
    """Data-access wrapper for the ``truth_social_content`` MySQL table.

    Every read and write is delegated to a ``DBData`` manager that is built
    from the database URL, the table name and the column list below.
    """

    def __init__(self, db_url: str):
        """Create the underlying ``DBData`` manager.

        :param db_url: database connection URL, passed unchanged to ``DBData``
        """
        self.db_url = db_url
        self.table_name = "truth_social_content"
        self.columns = [
            "article_id",
            "user_id",
            "user_name",
            "timestamp",
            "date_time",
            "text",
            "media_url",
            "media_type",
            "media_thumbnail",
        ]
        self.db_manager = DBData(db_url, self.table_name, self.columns)

    def _is_empty(self, df) -> bool:
        """Return True, after logging a warning, when there is nothing to insert."""
        if df is None or df.empty:
            logger.warning("DataFrame为空,无需写入数据库。")
            return True
        return False

    def insert_data_to_mysql(self, df: pd.DataFrame):
        """Save content rows through ``DBData.insert_data_to_mysql``.

        Original author's notes (not verifiable from this class): fastest
        option, medium memory use, meant for small/medium volumes (<100k rows).

        :param df: Truth Social content rows; ``None`` or empty is a no-op
        """
        if self._is_empty(df):
            return
        self.db_manager.insert_data_to_mysql(df)

    def insert_data_to_mysql_fast(self, df: pd.DataFrame):
        """Save content rows through ``DBData.insert_data_to_mysql_fast``.

        Original author's notes (not verifiable from this class): "plan 2",
        batch insert via executemany; very fast, low memory, medium volumes.

        :param df: Truth Social content rows; ``None`` or empty is a no-op
        """
        if self._is_empty(df):
            return
        self.db_manager.insert_data_to_mysql_fast(df)

    def insert_data_to_mysql_chunk(self, df: pd.DataFrame, chunk_size: int = 1000):
        """Save content rows through ``DBData.insert_data_to_mysql_chunk``.

        Original author's notes (not verifiable from this class): "plan 3",
        chunked insert; medium speed, lowest memory, large volumes (>100k rows).

        :param df: Truth Social content rows; ``None`` or empty is a no-op
        :param chunk_size: chunk size forwarded to the manager
        """
        if self._is_empty(df):
            return
        self.db_manager.insert_data_to_mysql_chunk(df, chunk_size)

    def insert_data_to_mysql_simple(self, df: pd.DataFrame):
        """Save content rows through ``DBData.insert_data_to_mysql_simple``.

        Original author's notes (not verifiable from this class): "plan 4",
        plain ``to_sql``; fastest, medium memory. The original note warns that
        duplicate keys raise an error the caller has to handle.

        :param df: Truth Social content rows; ``None`` or empty is a no-op
        """
        if self._is_empty(df):
            return
        self.db_manager.insert_data_to_mysql_simple(df)

    def query_latest_data(self, user_id: str = None):
        """Return the row with the greatest ``timestamp``.

        :param user_id: restrict to this user; a falsy value means all users
        """
        if user_id:
            sql = """
                SELECT * FROM truth_social_content
                WHERE user_id = :user_id
                ORDER BY timestamp DESC
                LIMIT 1
            """
            condition_dict = {"user_id": user_id}
        else:
            sql = """
                SELECT * FROM truth_social_content
                ORDER BY timestamp DESC
                LIMIT 1
            """
            condition_dict = {}

        return self.db_manager.query_data(sql, condition_dict, return_multi=False)

    def query_data_by_user_id(self, user_id: str, limit: int = 100):
        """Return the newest rows of one user.

        :param user_id: user id to filter on
        :param limit: maximum number of rows
        """
        sql = """
            SELECT * FROM truth_social_content
            WHERE user_id = :user_id
            ORDER BY timestamp DESC
            LIMIT :limit
        """
        condition_dict = {"user_id": user_id, "limit": limit}
        return self.db_manager.query_data(sql, condition_dict, return_multi=True)

    def query_data_by_timestamp_range(
        self,
        start_timestamp: int = None,
        end_timestamp: int = None,
        user_id: str = None,
        limit: int = 1000
    ):
        """Return rows whose ``timestamp`` lies in an inclusive range, newest first.

        :param start_timestamp: lower bound; ``None`` means unbounded
        :param end_timestamp: upper bound; ``None`` means unbounded
        :param user_id: optional user filter; a falsy value means all users
        :param limit: maximum number of rows
        """
        conditions = []
        condition_dict = {"limit": limit}

        # Compare against None so that a bound of 0 is still applied.
        if start_timestamp is not None:
            conditions.append("timestamp >= :start_timestamp")
            condition_dict["start_timestamp"] = start_timestamp

        if end_timestamp is not None:
            conditions.append("timestamp <= :end_timestamp")
            condition_dict["end_timestamp"] = end_timestamp

        if user_id:
            conditions.append("user_id = :user_id")
            condition_dict["user_id"] = user_id

        # "1=1" keeps the WHERE clause valid when no filter was requested.
        where_clause = " AND ".join(conditions) if conditions else "1=1"

        sql = f"""
            SELECT * FROM truth_social_content
            WHERE {where_clause}
            ORDER BY timestamp DESC
            LIMIT :limit
        """

        return self.db_manager.query_data(sql, condition_dict, return_multi=True)

    def query_data_by_text_search(
        self,
        search_text: str,
        user_id: str = None,
        limit: int = 100
    ):
        """Return rows whose ``text`` contains ``search_text``, newest first.

        The text is bound as ``%search_text%`` for a SQL ``LIKE``, so any ``%``
        or ``_`` inside ``search_text`` acts as a wildcard.

        :param search_text: substring to look for
        :param user_id: optional user filter; a falsy value means all users
        :param limit: maximum number of rows
        """
        conditions = ["text LIKE :search_text"]
        condition_dict = {
            "search_text": f"%{search_text}%",
            "limit": limit,
        }

        if user_id:
            conditions.append("user_id = :user_id")
            condition_dict["user_id"] = user_id

        where_clause = " AND ".join(conditions)

        sql = f"""
            SELECT * FROM truth_social_content
            WHERE {where_clause}
            ORDER BY timestamp DESC
            LIMIT :limit
        """

        return self.db_manager.query_data(sql, condition_dict, return_multi=True)

    def query_data_by_date_range(
        self,
        start_date: str = None,
        end_date: str = None,
        user_id: str = None,
        limit: int = 1000
    ):
        """Return rows whose ``date_time`` lies in an inclusive range, newest first.

        The bounds are compared against the ``date_time`` column as given.

        :param start_date: start date (YYYY-MM-DD); falsy means unbounded
        :param end_date: end date (YYYY-MM-DD); falsy means unbounded
        :param user_id: optional user filter; a falsy value means all users
        :param limit: maximum number of rows
        """
        conditions = []
        condition_dict = {"limit": limit}

        if start_date:
            conditions.append("date_time >= :start_date")
            condition_dict["start_date"] = start_date

        if end_date:
            conditions.append("date_time <= :end_date")
            condition_dict["end_date"] = end_date

        if user_id:
            conditions.append("user_id = :user_id")
            condition_dict["user_id"] = user_id

        where_clause = " AND ".join(conditions) if conditions else "1=1"

        sql = f"""
            SELECT * FROM truth_social_content
            WHERE {where_clause}
            ORDER BY timestamp DESC
            LIMIT :limit
        """

        return self.db_manager.query_data(sql, condition_dict, return_multi=True)

    def get_user_list(self, limit: int = 100):
        """Return one row per user with post count and latest timestamp.

        :param limit: maximum number of users, most recently active first
        """
        # GROUP BY already yields one row per (user_id, user_name).
        sql = """
            SELECT user_id, user_name,
                   COUNT(*) as article_count,
                   MAX(timestamp) as last_time
            FROM truth_social_content
            GROUP BY user_id, user_name
            ORDER BY last_time DESC
            LIMIT :limit
        """
        condition_dict = {"limit": limit}
        return self.db_manager.query_data(sql, condition_dict, return_multi=True)

    def get_statistics(self):
        """Return table-wide totals.

        The result carries row count, distinct users, earliest and latest
        ``timestamp``, and the average ``LENGTH(text)``.
        """
        sql = """
            SELECT
                COUNT(*) as total_articles,
                COUNT(DISTINCT user_id) as total_users,
                MIN(timestamp) as earliest_article,
                MAX(timestamp) as latest_article,
                AVG(LENGTH(text)) as avg_text_length
            FROM truth_social_content
        """
        return self.db_manager.query_data(sql, {}, return_multi=False)

    def delete_old_data(self, days: int = 30):
        """Delete rows older than ``days`` days.

        :param days: number of days of data to keep
        """
        current_time = get_current_date_time()
        # NOTE(review): pd.Timestamp treats a naive value as UTC when converting
        # to epoch seconds; confirm what get_current_date_time() returns.
        now_seconds = int(pd.Timestamp(current_time).timestamp())
        # The retriever writes `timestamp` as epoch *milliseconds*, so the
        # cutoff must be in milliseconds too. A cutoff in seconds is smaller
        # than every stored value and would never delete anything.
        cutoff_timestamp = (now_seconds - days * 24 * 60 * 60) * 1000

        sql = """
            DELETE FROM truth_social_content
            WHERE timestamp < :cutoff_timestamp
        """
        condition_dict = {"cutoff_timestamp": cutoff_timestamp}

        return self.db_manager.execute_sql(sql, condition_dict)

    def check_duplicate(self, user_id: str, timestamp: int):
        """Return True when a row with this user id and timestamp already exists.

        :param user_id: user id
        :param timestamp: timestamp to match exactly
        """
        sql = """
            SELECT COUNT(*) as count
            FROM truth_social_content
            WHERE user_id = :user_id AND timestamp = :timestamp
        """
        condition_dict = {"user_id": user_id, "timestamp": timestamp}
        result = self.db_manager.query_data(sql, condition_dict, return_multi=False)
        return result['count'] > 0 if result else False
|
||||
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,156 @@
|
|||
import core.logger as logging
|
||||
from core.db.db_truth_social_content import DBTruthSocialContent
|
||||
from config import TRUTH_SOCIAL_API, COIN_MYSQL_CONFIG
|
||||
|
||||
import requests
|
||||
import json
|
||||
import os
|
||||
from bs4 import BeautifulSoup
|
||||
import time
|
||||
from datetime import datetime
|
||||
import pytz
|
||||
import pandas as pd
|
||||
|
||||
logger = logging.logger
|
||||
|
||||
class TruthSocialRetriever:
    """Fetch Truth Social posts through the ScrapeCreators API and persist them.

    For every configured user the newest posts are requested, flattened to the
    ``truth_social_content`` column layout, dumped to a JSON file under
    ``./output/media/truth_social/<user>/`` and inserted into MySQL.
    """

    # Seconds to wait for an HTTP response. Without a timeout `requests` can
    # block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self) -> None:
        """Read the API and MySQL settings and prepare the output directory.

        :raises ValueError: if no MySQL password is configured
        """
        self.api_key = TRUTH_SOCIAL_API.get("api_key", "")
        self.user_info = TRUTH_SOCIAL_API.get("user_id", {})

        mysql_user = COIN_MYSQL_CONFIG.get("user", "xch")
        mysql_password = COIN_MYSQL_CONFIG.get("password", "")
        if not mysql_password:
            raise ValueError("MySQL password is not set")
        mysql_host = COIN_MYSQL_CONFIG.get("host", "localhost")
        mysql_port = COIN_MYSQL_CONFIG.get("port", 3306)
        mysql_database = COIN_MYSQL_CONFIG.get("database", "okx")

        # NOTE(review): the password is interpolated verbatim; characters such
        # as "@" or "/" would need URL-encoding. Confirm how DBData parses it.
        self.db_url = (
            f"mysql+pymysql://{mysql_user}:{mysql_password}"
            f"@{mysql_host}:{mysql_port}/{mysql_database}"
        )
        self.db_truth_social_content = DBTruthSocialContent(self.db_url)

        self.save_path = r"./output/media/truth_social/"
        os.makedirs(self.save_path, exist_ok=True)

    def get_user_id_from_page(self, handle='realDonaldTrump'):
        """Scrape the numeric account id from a user's public profile page.

        :param handle: Truth Social handle without the leading ``@``
        :return: the first ``"id": "<digits>"`` value found in a ``<script>``
            tag that also mentions the handle, or ``None`` if there is none
        :raises requests.exceptions.RequestException: on HTTP or network errors
        """
        import re

        url = f'https://truthsocial.com/@{handle}'
        # Browser-like User-Agent, as in the original request.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        id_pattern = re.compile(r'"id"\s*:\s*"(\d+)"')
        # Heuristic: takes the first id-looking value, not a parsed JSON field.
        for script in soup.find_all('script'):
            content = script.string
            if content and 'id' in content and handle in content:
                match = id_pattern.search(content)
                if match:
                    return match.group(1)
        return None

    def get_user_posts(self, limit: int = None):
        """Fetch, save and store the latest posts of every configured user.

        One API request is made per configured user. Original author's notes
        on API quota (not verifiable here): free tier 100 requests; USD 47 for
        25,000 requests (about 86.8 days at one request per 5 minutes);
        USD 497 for 500,000 requests (about 1736 days).

        :param limit: keep at most this many posts per user; ``None`` keeps
            everything the API returned
        :return: ``None``. Results are written to a JSON file and to MySQL.
        """
        headers = {
            'x-api-key': self.api_key,
            'Content-Type': 'application/json'
        }
        url = 'https://api.scrapecreators.com/v1/truthsocial/user/posts'

        for user_name, user_id in self.user_info.items():
            params = {
                'handle': user_name,
                'user_id': user_id,
                'next_max_id': None,  # pagination cursor; unused on the first page
                'trim': 'false',      # ask for the untrimmed payload
            }

            logger.info(f"Searching contents for user: {user_name}")
            try:
                response = requests.get(
                    url, headers=headers, params=params,
                    timeout=self.REQUEST_TIMEOUT,
                )
                response.raise_for_status()
                data = response.json()
            except requests.exceptions.RequestException as e:
                logger.error(f"请求错误: {e}")
                continue
            except json.JSONDecodeError as e:
                logger.error(f"JSON 解析错误: {e}")
                continue

            # `or []` also covers an explicit null under "posts".
            posts = data.get('posts') or []
            if limit is not None and isinstance(limit, int):
                posts = posts[:limit]

            if not posts:
                logger.warning("获取帖子失败,请检查 API 密钥或网络。")
                continue

            logger.info(f"获取{user_name}帖子: {len(posts)}条")
            results = []
            for post in posts:
                parsed = self._parse_post(post, user_name, user_id)
                if parsed is not None:
                    results.append(parsed)

            if results:
                self._save_results(user_name, results)

    def _parse_post(self, post: dict, user_name: str, user_id: str):
        """Flatten one API post into a table row; return None if it has no usable date."""
        datetime_text = post.get('created_at')
        if not datetime_text:
            logger.warning(f"Skipping post {post.get('id')}: missing created_at")
            return None
        try:
            datetime_dict = self.transform_datetime(datetime_text)
        except ValueError as e:
            # One malformed post must not abort the whole run.
            logger.warning(f"Skipping post {post.get('id')}: {e}")
            return None

        text = post.get('text', '无内容')
        if text is None:
            # The `text` column is NOT NULL.
            text = '无内容'

        result = {
            "article_id": post.get('id'),
            "user_id": user_id,
            "user_name": user_name,
            "timestamp": datetime_dict["timestamp_ms"],
            "date_time": datetime_dict["beijing_time_str"],
            "text": text,
            "media_url": "",
            "media_type": "",
            "media_thumbnail": "",
        }

        # Only the first attachment is stored.
        media_attachments = post.get('media_attachments', [])
        if media_attachments:
            first_media = media_attachments[0]
            result["media_url"] = first_media.get('url')
            result["media_type"] = first_media.get('type')
            result["media_thumbnail"] = first_media.get('preview_url')
        return result

    def _save_results(self, user_name: str, results: list) -> None:
        """Write the rows to a timestamped JSON file, then insert them into MySQL."""
        user_path = os.path.join(self.save_path, user_name)
        os.makedirs(user_path, exist_ok=True)
        now_date_time = datetime.now().strftime("%Y%m%d%H%M%S")
        json_file_name = os.path.join(user_path, f"{user_name}_{now_date_time}.json")
        with open(json_file_name, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        logger.info(f"已将{len(results)}条数据保存到: {json_file_name}")

        result_df = pd.DataFrame(results)
        self.db_truth_social_content.insert_data_to_mysql(result_df)

    def transform_datetime(self, datetime_text: str):
        """Convert a UTC ``...Z`` timestamp string to epoch ms and Beijing time.

        Accepts the value with or without fractional seconds, for example
        ``2025-01-01T00:00:00.000Z`` or ``2025-01-01T00:00:00Z``.

        :param datetime_text: UTC timestamp ending in ``Z``
        :return: ``{"timestamp_ms": int, "beijing_time_str": str}``, where the
            string is ISO 8601 with a ``+08:00`` offset
        :raises ValueError: if the text matches neither accepted format
        """
        from datetime import timedelta, timezone

        utc_time = None
        for fmt in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ"):
            try:
                utc_time = datetime.strptime(datetime_text, fmt)
            except ValueError:
                continue
            utc_time = utc_time.replace(tzinfo=timezone.utc)
            break
        if utc_time is None:
            raise ValueError(f"Unsupported created_at format: {datetime_text!r}")

        # Integer arithmetic: int(timestamp() * 1000) can truncate 1 ms because
        # of float rounding.
        epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
        timestamp_ms = (utc_time - epoch) // timedelta(milliseconds=1)

        # Beijing time is a fixed UTC+08:00 offset for present-day dates.
        beijing_time = utc_time.astimezone(timezone(timedelta(hours=8)))
        beijing_time_str = beijing_time.isoformat(timespec="seconds")

        return {
            "timestamp_ms": timestamp_ms,
            "beijing_time_str": beijing_time_str,
        }
|
||||
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
CREATE TABLE `truth_social_content` (
|
||||
`article_id` VARCHAR(50) NOT NULL PRIMARY KEY,
|
||||
`user_id` VARCHAR(50) NOT NULL,
|
||||
`user_name` VARCHAR(100) NOT NULL,
|
||||
`timestamp` BIGINT NOT NULL,
|
||||
`date_time` VARCHAR(50) NOT NULL,
|
||||
`text` TEXT NOT NULL,
|
||||
`media_url` TEXT NULL,
|
||||
`media_type` VARCHAR(50) NULL,
|
||||
`media_thumbnail` TEXT NULL,
|
||||
`created_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
`updated_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||
|
||||
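-- NOTE: per the original note, MySQL versions before 8.0.29 do not support "ADD COLUMN IF NOT EXISTS" (verify against your server version before relying on it).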
|
||||
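-- To add these columns to a table that already exists without them, run each statement below separately (each one only needs to run once). A table freshly created by the CREATE TABLE above already has these columns, so the ALTER statements would fail there.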
|
||||
ALTER TABLE `truth_social_content`
|
||||
ADD COLUMN `media_url` TEXT NULL DEFAULT NULL AFTER `text`;
|
||||
ALTER TABLE `truth_social_content`
|
||||
ADD COLUMN `media_type` VARCHAR(50) NULL DEFAULT NULL AFTER `media_url`;
|
||||
ALTER TABLE `truth_social_content`
|
||||
ADD COLUMN `media_thumbnail` TEXT NULL DEFAULT NULL AFTER `media_type`;
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
# 测试代码提示和自动补全功能
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import requests
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
def test_autocomplete():
    """Exercise a few library objects so editor code completion can be checked.

    Each section creates an object; typing ``<name>.`` after it in the editor
    should show the completions mentioned in the comment.

    :return: ``(df, arr, response, length, current_time)``
    """
    # pandas: typing "df." here should list DataFrame methods.
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    df.head()
    df.describe()

    # numpy: typing "arr." here should list ndarray methods.
    arr = np.array([1, 2, 3, 4, 5])
    arr.mean()
    arr.std()

    # requests: typing "response." here should list Response members.
    # The timeout keeps this demo from hanging when the network is unreachable.
    response = requests.get("https://api.github.com", timeout=10)
    response.status_code
    response.json()

    # Built-ins: typing "len(" here should show a parameter hint.
    length = len([1, 2, 3])

    # Type inference: typing "current_time." should list datetime methods.
    current_time = datetime.now()
    current_time.strftime("%Y-%m-%d")

    return df, arr, response, length, current_time


if __name__ == "__main__":
    test_autocomplete()
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
from core.media.truth_social_retriever import TruthSocialRetriever
|
||||
|
||||
|
||||
def main():
    """Build a TruthSocialRetriever and run one fetch of the users' posts."""
    TruthSocialRetriever().get_user_posts()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
from core.twitter.twitter_retriever import TwitterRetriever
|
||||
from core.media.twitter_retriever import TwitterRetriever
|
||||
import core.logger as logging
|
||||
import os
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue