From 2dc2d51171609f1735610e7aaf5dc5c865ff4dcf Mon Sep 17 00:00:00 2001 From: blade <8019068@qq.com> Date: Mon, 20 Oct 2025 18:37:41 +0800 Subject: [PATCH] support fetch data from truth social --- .vscode/README.md | 50 +++ .vscode/python_settings.json | 20 ++ .vscode/settings.json | 53 ++++ auto_fetch_truth_social.py | 36 +++ auto_update_market_data.py | 2 +- config.py | 3 + core/db/db_truth_social_content.py | 294 ++++++++++++++++++ .../truth_social_retriever.cpython-312.pyc | Bin 0 -> 8173 bytes .../twitter_retriever.cpython-312.pyc | Bin 5912 -> 6170 bytes core/media/truth_social_retriever.py | 156 ++++++++++ core/{twitter => media}/twitter_retriever.py | 0 sql/table/truth_social_content.sql | 22 ++ test_autocomplete.py | 41 +++ truth_social_retriever_main.py | 10 + twitter_retriever_main.py | 2 +- 15 files changed, 687 insertions(+), 2 deletions(-) create mode 100644 .vscode/README.md create mode 100644 .vscode/python_settings.json create mode 100644 .vscode/settings.json create mode 100644 auto_fetch_truth_social.py create mode 100644 core/db/db_truth_social_content.py create mode 100644 core/media/__pycache__/truth_social_retriever.cpython-312.pyc rename core/{twitter => media}/__pycache__/twitter_retriever.cpython-312.pyc (78%) create mode 100644 core/media/truth_social_retriever.py rename core/{twitter => media}/twitter_retriever.py (100%) create mode 100644 sql/table/truth_social_content.sql create mode 100644 test_autocomplete.py create mode 100644 truth_social_retriever_main.py diff --git a/.vscode/README.md b/.vscode/README.md new file mode 100644 index 0000000..d1d0994 --- /dev/null +++ b/.vscode/README.md @@ -0,0 +1,50 @@ +# Python环境配置说明 + +## 自动配置Python解释器 + +本项目已移除硬编码的Python路径,支持在不同电脑上自动检测Python环境。 + +### 首次使用步骤: + +1. **打开Cursor** +2. **选择Python解释器**: + - 按 `Ctrl+Shift+P` + - 搜索 "Python: Select Interpreter" + - 选择您系统中的Python解释器 + +3. **验证配置**: + - 打开 `test_autocomplete.py` 文件 + - 尝试输入代码,检查是否有自动补全功能 + +### 支持的Python环境: + +- **Anaconda/Miniconda**:自动检测conda环境 +- **Python官方安装包**:检测系统PATH中的python +- **虚拟环境**:支持venv、virtualenv等 +- **Docker容器**:支持远程Python解释器 + +### 代码提示功能: + +✅ **已启用的功能**: +- 智能代码补全 +- 函数参数提示 +- 类型提示 +- 自动导入建议 +- 悬停信息显示 +- 语法错误检测 +- 代码格式化 + +### 故障排除: + +如果代码提示不工作: + +1. **重启语言服务器**:`Ctrl+Shift+P` → "Python: Restart Language Server" +2. **重新加载窗口**:`Ctrl+Shift+P` → "Developer: Reload Window" +3. **检查Python扩展**:确保Python扩展已安装并启用 +4. **验证解释器**:确保选择的Python解释器路径正确 + +### 配置文件说明: + +- `.vscode/settings.json`:项目级设置,包含代码提示配置 +- `python_settings.json`:通用Python配置模板 +- 全局设置:已移除硬编码路径,保持代码提示功能 diff --git a/.vscode/python_settings.json b/.vscode/python_settings.json new file mode 100644 index 0000000..749538d --- /dev/null +++ b/.vscode/python_settings.json @@ -0,0 +1,20 @@ +{ + "python.analysis.autoImportCompletions": true, + "python.analysis.typeCheckingMode": "basic", + "python.analysis.autoSearchPaths": true, + "python.analysis.diagnosticMode": "workspace", + "python.analysis.indexing": true, + "python.analysis.completeFunctionParens": true, + "python.analysis.inlayHints.functionReturnTypes": true, + "python.analysis.inlayHints.variableTypes": true, + "python.analysis.inlayHints.pytestParameters": true, + "python.linting.enabled": true, + "python.linting.pylintEnabled": false, + "python.linting.flake8Enabled": true, + "python.formatting.provider": "black", + "python.terminal.activateEnvironment": true, + "python.terminal.activateEnvInCurrentTerminal": true, + "files.associations": { + "*.py": "python" + } +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..23e4a0f --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,53 @@ +{ + "python.analysis.autoImportCompletions": true, + "python.analysis.typeCheckingMode": "basic", + "python.analysis.autoSearchPaths": true, + "python.analysis.diagnosticMode": "workspace", + "python.analysis.stubPath": "./typings", + "python.analysis.extraPaths": [ + "./core", + "./utils" + ], + "editor.quickSuggestions": { + "other": true, + "comments": false, + "strings": true + }, + "editor.suggestOnTriggerCharacters": true, + "editor.acceptSuggestionOnEnter": "on", + "editor.tabCompletion": "on", + "editor.wordBasedSuggestions": "matchingDocuments", + "editor.parameterHints.enabled": true, + "editor.hover.enabled": true, + "editor.codeActionsOnSave": { + "source.organizeImports": "explicit" + }, + "python.linting.enabled": true, + "python.linting.pylintEnabled": false, + "python.linting.flake8Enabled": true, + "python.formatting.provider": "black", + "python.analysis.completeFunctionParens": true, + "python.analysis.inlayHints.functionReturnTypes": true, + "python.analysis.inlayHints.variableTypes": true, + "python.analysis.inlayHints.pytestParameters": true, + "files.associations": { + "*.py": "python" + }, + "python.analysis.indexing": true, + "python.terminal.activateEnvironment": true, + "python.terminal.activateEnvInCurrentTerminal": true, + "python.analysis.packageIndexDepths": [ + { + "name": "pandas", + "depth": 2 + }, + { + "name": "numpy", + "depth": 2 + }, + { + "name": "requests", + "depth": 2 + } + ] +} diff --git a/auto_fetch_truth_social.py b/auto_fetch_truth_social.py new file mode 100644 index 0000000..b4be396 --- /dev/null +++ b/auto_fetch_truth_social.py @@ -0,0 +1,36 @@ +import schedule +import time +from core.utils import get_current_date_time +import core.logger as logging +import subprocess +import os +import sys + +logger = logging.logger +# 定义要执行的任务 +def run_script(): + start_time = time.time() + logger.info(f"Executing script at: {get_current_date_time()}") + output_file = r'./output/auto_fetch_truth_social.txt' + with open(output_file, 'a') as f: + f.write(f"Task ran at {get_current_date_time()}\n") + current_dir = os.getcwd() + python_path = sys.executable + if current_dir.endswith('crypto_quant'): + script_path = r'./truth_social_retriever_main.py' + elif current_dir.endswith(r'python_projects'): + script_path = f'{current_dir}/crypto_quant/truth_social_retriever_main.py' + else: + script_path = f'{current_dir}/truth_social_retriever_main.py' + subprocess.run([python_path, script_path]) + end_time = time.time() + logger.info(f"Script execution time: {end_time - start_time} seconds") +# 设置每天凌晨00:00 运行一次 +schedule.every().day.at("00:00:00").do(run_script) +# schedule.every(60).seconds.do(run_script) + +# 保持程序运行并检查调度 +logger.info("Scheduler started. Press Ctrl+C to stop.") +while True: + schedule.run_pending() + time.sleep(1) \ No newline at end of file diff --git a/auto_update_market_data.py b/auto_update_market_data.py index e853bf5..cd95361 100644 --- a/auto_update_market_data.py +++ b/auto_update_market_data.py @@ -11,7 +11,7 @@ logger = logging.logger def run_script(): start_time = time.time() logger.info(f"Executing script at: {get_current_date_time()}") - output_file = r'./output/auto_schedule.txt' + output_file = r'./output/auto_update_market_data.txt' with open(output_file, 'a') as f: f.write(f"Task ran at {get_current_date_time()}\n") python_path = sys.executable diff --git a/config.py b/config.py index ec74865..ef10b04 100644 --- a/config.py +++ b/config.py @@ -231,3 +231,6 @@ TWITTER_CONFIG = { {"name": "PressSec", "id": ""}, ], } + +TRUTH_SOCIAL_API = {"api_key": "FRfhlDHnmYc1PCCrVHZdWtqDENr2", +"user_id": {"realDonaldTrump": "107780257626128497"}} \ No newline at end of file diff --git a/core/db/db_truth_social_content.py b/core/db/db_truth_social_content.py new file mode 100644 index 0000000..6e3b9eb --- /dev/null +++ b/core/db/db_truth_social_content.py @@ -0,0 +1,294 @@ +import pandas as pd +import core.logger as logging +from core.db.db_manager import DBData +from core.utils import get_current_date_time + +logger = logging.logger + + +class DBTruthSocialContent: + def __init__(self, db_url: str): + self.db_url = db_url + self.table_name = "truth_social_content" + self.columns = [ + "article_id", + "user_id", + "user_name", + "timestamp", + "date_time", + "text", + "media_url", + "media_type", + "media_thumbnail" + ] + self.db_manager = DBData(db_url, self.table_name, self.columns) + + def insert_data_to_mysql(self, df: pd.DataFrame): + """ + 将内容数据保存到MySQL的truth_social_content表 + 速度:⭐⭐⭐⭐⭐ 最快 + 内存:⭐⭐⭐⭐ 中等 + 适用场景:中小数据量(<10万条) + :param df: Truth Social内容数据DataFrame + """ + if df is None or df.empty: + logger.warning("DataFrame为空,无需写入数据库。") + return + + self.db_manager.insert_data_to_mysql(df) + + def insert_data_to_mysql_fast(self, df: pd.DataFrame): + """ + 快速插入Truth Social内容数据(方案2:使用executemany批量插入) + 速度:⭐⭐⭐⭐ 很快 + 内存:⭐⭐⭐⭐⭐ 低 + 适用场景:中等数据量 + :param df: Truth Social内容数据DataFrame + """ + if df is None or df.empty: + logger.warning("DataFrame为空,无需写入数据库。") + return + + self.db_manager.insert_data_to_mysql_fast(df) + + def insert_data_to_mysql_chunk(self, df: pd.DataFrame, chunk_size: int = 1000): + """ + 分块插入Truth Social内容数据(方案3:适合大数据量) + 速度:⭐⭐⭐ 中等 + 内存:⭐⭐⭐⭐⭐ 最低 + 适用场景:大数据量(>10万条) + :param df: Twitter内容数据DataFrame + :param chunk_size: 分块大小 + """ + if df is None or df.empty: + logger.warning("DataFrame为空,无需写入数据库。") + return + + self.db_manager.insert_data_to_mysql_chunk(df, chunk_size) + + def insert_data_to_mysql_simple(self, df: pd.DataFrame): + """ + 简单插入Truth Social内容数据(方案4:直接使用to_sql,忽略重复) + 速度:⭐⭐⭐⭐⭐ 最快 + 内存:⭐⭐⭐⭐ 中等 + 注意:会抛出重复键错误,需要额外处理 + """ + if df is None or df.empty: + logger.warning("DataFrame为空,无需写入数据库。") + return + + self.db_manager.insert_data_to_mysql_simple(df) + + def query_latest_data(self, user_id: str = None): + """ + 查询最新数据 + :param user_id: 用户ID,如果为None则查询所有用户的最新数据 + """ + if user_id: + sql = """ + SELECT * FROM truth_social_content + WHERE user_id = :user_id + ORDER BY timestamp DESC + LIMIT 1 + """ + condition_dict = {"user_id": user_id} + else: + sql = """ + SELECT * FROM truth_social_content + ORDER BY timestamp DESC + LIMIT 1 + """ + condition_dict = {} + + return self.db_manager.query_data(sql, condition_dict, return_multi=False) + + def query_data_by_user_id(self, user_id: str, limit: int = 100): + """ + 根据用户ID查询数据 + :param user_id: 用户ID + :param limit: 查询数量 + """ + sql = """ + SELECT * FROM truth_social_content + WHERE user_id = :user_id + ORDER BY timestamp DESC + LIMIT :limit + """ + condition_dict = {"user_id": user_id, "limit": limit} + return self.db_manager.query_data(sql, condition_dict, return_multi=True) + + def query_data_by_timestamp_range( + self, + start_timestamp: int = None, + end_timestamp: int = None, + user_id: str = None, + limit: int = 1000 + ): + """ + 根据时间戳范围查询数据 + :param start_timestamp: 开始时间戳 + :param end_timestamp: 结束时间戳 + :param user_id: 用户ID,可选 + :param limit: 查询数量 + """ + conditions = [] + condition_dict = {"limit": limit} + + if start_timestamp: + conditions.append("timestamp >= :start_timestamp") + condition_dict["start_timestamp"] = start_timestamp + + if end_timestamp: + conditions.append("timestamp <= :end_timestamp") + condition_dict["end_timestamp"] = end_timestamp + + if user_id: + conditions.append("user_id = :user_id") + condition_dict["user_id"] = user_id + + where_clause = " AND ".join(conditions) if conditions else "1=1" + + sql = f""" + SELECT * FROM truth_social_content + WHERE {where_clause} + ORDER BY timestamp DESC + LIMIT :limit + """ + + return self.db_manager.query_data(sql, condition_dict, return_multi=True) + + def query_data_by_text_search( + self, + search_text: str, + user_id: str = None, + limit: int = 100 + ): + """ + 根据文本内容搜索数据 + :param search_text: 搜索文本 + :param user_id: 用户ID,可选 + :param limit: 查询数量 + """ + conditions = ["text LIKE :search_text"] + condition_dict = { + "search_text": f"%{search_text}%", + "limit": limit + } + + if user_id: + conditions.append("user_id = :user_id") + condition_dict["user_id"] = user_id + + where_clause = " AND ".join(conditions) + + sql = f""" + SELECT * FROM truth_social_content + WHERE {where_clause} + ORDER BY timestamp DESC + LIMIT :limit + """ + + return self.db_manager.query_data(sql, condition_dict, return_multi=True) + + def query_data_by_date_range( + self, + start_date: str = None, + end_date: str = None, + user_id: str = None, + limit: int = 1000 + ): + """ + 根据日期范围查询数据 + :param start_date: 开始日期 (YYYY-MM-DD) + :param end_date: 结束日期 (YYYY-MM-DD) + :param user_id: 用户ID,可选 + :param limit: 查询数量 + """ + conditions = [] + condition_dict = {"limit": limit} + + if start_date: + conditions.append("date_time >= :start_date") + condition_dict["start_date"] = start_date + + if end_date: + conditions.append("date_time <= :end_date") + condition_dict["end_date"] = end_date + + if user_id: + conditions.append("user_id = :user_id") + condition_dict["user_id"] = user_id + + where_clause = " AND ".join(conditions) if conditions else "1=1" + + sql = f""" + SELECT * FROM truth_social_content + WHERE {where_clause} + ORDER BY timestamp DESC + LIMIT :limit + """ + + return self.db_manager.query_data(sql, condition_dict, return_multi=True) + + def get_user_list(self, limit: int = 100): + """ + 获取用户列表 + :param limit: 查询数量 + """ + sql = """ + SELECT DISTINCT user_id, user_name, + COUNT(*) as article_count, + MAX(timestamp) as last_time + FROM truth_social_content + GROUP BY user_id, user_name + ORDER BY last_time DESC + LIMIT :limit + """ + condition_dict = {"limit": limit} + return self.db_manager.query_data(sql, condition_dict, return_multi=True) + + def get_statistics(self): + """ + 获取统计信息 + """ + sql = """ + SELECT + COUNT(*) as total_articles, + COUNT(DISTINCT user_id) as total_users, + MIN(timestamp) as earliest_article, + MAX(timestamp) as latest_article, + AVG(LENGTH(text)) as avg_text_length + FROM truth_social_content + """ + return self.db_manager.query_data(sql, {}, return_multi=False) + + def delete_old_data(self, days: int = 30): + """ + 删除指定天数前的旧数据 + :param days: 保留天数 + """ + current_time = get_current_date_time() + cutoff_timestamp = int(pd.Timestamp(current_time).timestamp()) - (days * 24 * 60 * 60) + + sql = """ + DELETE FROM truth_social_content + WHERE timestamp < :cutoff_timestamp + """ + condition_dict = {"cutoff_timestamp": cutoff_timestamp} + + return self.db_manager.execute_sql(sql, condition_dict) + + def check_duplicate(self, user_id: str, timestamp: int): + """ + 检查是否存在重复数据 + :param user_id: 用户ID + :param timestamp: 时间戳 + """ + sql = """ + SELECT COUNT(*) as count + FROM truth_social_content + WHERE user_id = :user_id AND timestamp = :timestamp + """ + condition_dict = {"user_id": user_id, "timestamp": timestamp} + result = self.db_manager.query_data(sql, condition_dict, return_multi=False) + return result['count'] > 0 if result else False diff --git a/core/media/__pycache__/truth_social_retriever.cpython-312.pyc b/core/media/__pycache__/truth_social_retriever.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6953118245c88398b8ebadb76d4c876f59a5862c GIT binary patch literal 8173 zcmbU`ZEzDumNWXYB+Ig}WLfeD4E{tmmTkZW8y|$V0fRphuoDii$z;%su@rqVGvY%T z?Q%ITr_^zWxy{8U35%&Dg(yhyUhP$!3Rhu&9krKKNs)6>Cf+)#f*r`8kdV66C3Uy= zdgPIe<3QDo%iXWLU%!6+y5C1X{d+-y9)s^Y!+(ucm1Ec!s3CuQG3 z+H3R~hmDJUg~NpszG-+BqwpwXR&w<)uE2IFZcyRg?RU9$U7Ja1))PrQ#`}FeT*x|28JmY8xPt5n7|lp7^kse1+AqOr&YsB zT1P8SD&$^jo3_IY$m#l;i2rc%& z3frB)`0Ut9;-PU`A&W)8$gBV#GfU~lmduj3hVeR?j}_Na(Q0{C1b+;!CdXGqAt<LDj&0=p3cwWsvd)DpbN6NYT78^uVZwfpn%dOC?!}0<`E=W2OpFh?u*(nZk9N|rB{`#n|}%GE%%DA$r9+I-C-^NV9A zF-P%$m`4TOuya5;Gl|WT6!1C^|U}KUNP=_gw5tV`GPlQdrliX2nL-3?jcQiGLc!!8T zA>vKpa%WQ@#0NuslaHa@R1=Ssk>n(8Y#O=(3dNjob37S%-eF*okr9nWB%V<}$5#xQ4hr0k6edt=N!QpD3=6 zwN1Asi`%36IeUG|?o8O7vGB}>q#wcbvV0`G=_#hc4pNmt0_;bOy^(b(a zKUcSPru)O*_j{9dJyF%Hw)(bxi^Q!~e>VT+{PFIIfpY_~q5sfscmTV_TvFWhWL_sTPr5leZA8qXfg`m8qlJzH|5>y^UaoKluv5jMVo;(DYSuK|P zFmPd6T53V<$=bCg7#i6QL^Oh?8#{Cu!yI4Ub zdGjJ#LF+-6W!XpTR-T*PflkoD8v6vD$GmJRG-YBp4f#<0(GG zu}HB+70;aDMeQgsNK#%etN@PLe?(NU48RPrCpoQuAS9Z-6<@e{L71n&L4{PCkw%{+%B?CG@fgm(k6=>(Y&;^;#*9ktB>wV z7Z#t5{9R>g)rfQLRJqP(_y&)XC*ZkG+>6**R)sBuHci*K{lAk^r13mhCp#wP-uA^6dwPp zT0X0^0?u1xDzb93Rxk<$a&NAtbw{${AFUMdQY@nL7d;*~0`nqzK{t-GrpL%fFNIP0 ze2KiOpp#_fe`idX1mX$7%r4N|aM_}4= zXAoIM+X1^2VM|b17Alu0tYW)`RUoD9z*mt_w4#J6tRr)*~GL)-5NRgI( z)fj7!*p}SXCc*ZwRX+&wir9W-8&{sOXQ1OaZ<5J)w#iVg{t3N%PG%19XDkG*4dO&3M z-a&+r^9Px-NU23V1#fMamjNy_d}kiRe#*y)dWky*;!$2Spj;HhtG*x!E9kN<(R7$` zdw?+#bwNbSiaM0KlA0{i#}Eo#7AB_VU;3N*ci$IRNv~2M$y2T|A4KRJi&7Y(F4GUO zEIf$O6Q1Njv|`~RJQngD_ET;zi_{8&Wi*x_WSL`b<~VwwhgzXOv)jc(e(QI4fB!aI zoV(ZGTX^%-!o{}FSH+=gZUO&TrI8vp$-4UyoJQU3Ok_rYhSKm2Jt&Ef26RT-TB?td8y(KYqKk zDqU8YUQ?B}t@%c$wi=_m;Av#;$D>_yhN82*FZaevH_zy%Ux?efW)0o<73~Erb7fT* zJI{BaY$=l?VRFox8qy_eCL-q|zxnmJ>b?fE*yHBvYo*gWZ)}`( zY)_XH7kke4OdW_FPL{V!*G{*_%iGWGy!}x3_|Ds<<>PtqZkw}|PV}DZovMx5WBJot zZgkyLf3SC^^ClN3dS@+tX`<#^%v#(usYaQj>lS0C_KD~Y; zKqnJ6N7}g=pcfLhhV;7i*R@x))8)x^+a?R3m0Gtwv2Od#g5AP40CxUbkJbP1nveBpth;^VoS~dVTZto+~}^)}C9H$@Tql zv~IJE-_!z-#MP#@NlkoBbHdj0S=svZrk&qns*2h;(VTAEm1^6YXxn?MHQBarN;CCB zg4meeygRkIH?g_*)~@8{LFg0`#HMs-U#jz|MCViSea|L4$tg{|ZcBn_|BR@)uLD|e z=62$u@w{>B*rztaoW&|hUN_zSsil3cdi~|TOMPz+q^h?js<+MzB&&PJ`{!!vE+4pb z;PS|&k!i5sW)cF)?o?_+pzAASp;vz4Pa z`cz%4Wv2Dks!wgb-z(9`9r>)UnPUg8mdA{7ki)k4-r+d$%UR3F9bgnOc_2x&j_dFsEviQwpBe;>pVz07()erLig*q|xlg6o|Q?9tRK4oh7#MJQPoVg@ru1lEf5JC$e#BB|+9kGpZYg5Y9{E4ag$NNec z_<=)F*B{J1?R`axKN?E=H0nRr?kI=GM;disq3)ynJOme2^lwmq)MV^esy|j*0sh!- z>~Br@h$rx$tkY}DNOi{ zPE!iMz=Z(|TF@@V5De)}CA77Wg@F|2Am-2wgo|qqHrDtWYv`exo{pOSj+#Md&FC=ATTe3E%tQKmYm9Ec%fIqO7c2fE1kM1(5>; z{{;>Z+)U6*ftOW5UOp@;pC0OBNtngH{Cn;Bv2Vc#sIGqB7_ zMF0|&aH5h3uVB(z8JVIGmPG}6c5!u3oW>p~3e_7QRIXB6zNy`ht6QeJ9$=`xKL!KV zY2|kqR2dp*ro$}eqhDp5nV^mRCG?4TsDZM)YosCiu=8hrVaojeq=Ddn)O|$y{Yv_& zDiib_0#ccPO?jBxB&U?Aai z+Hok90&|Dsf5IyMggx<}Slbs^;pbTX=a>$90LYY|W9HjB)7jN8uRf*uUe%+(t>0p( Hl!*Ty;m;DD literal 0 HcmV?d00001 diff --git a/core/twitter/__pycache__/twitter_retriever.cpython-312.pyc b/core/media/__pycache__/twitter_retriever.cpython-312.pyc similarity index 78% rename from core/twitter/__pycache__/twitter_retriever.cpython-312.pyc rename to core/media/__pycache__/twitter_retriever.cpython-312.pyc index 14774207fae2bbc89b830a5a5396aec52dda7385..e7fc113d3e2084be1986cb6d6d4e7493312de67f 100644 GIT binary patch delta 492 zcmbQCH_L$UG%qg~0}$Lcc$MKVypd0svEB~Ioz9TT5XG3n5XF?j7{#2z)WQ(OlEMrm zSyNb2*-}_j+0s~2*jiYk*i+bnq8uq4KoO=CPM`>9C6^}mErF2o%#xDSqM+20qRiB? z)S}YYKx4TSfZ%Cs?~5%vpLcdVo3rNmtd>54pam-;pLo$7t0!-?A`x-%8DnuXJ9q$>HJ;K7dC)w zffx%ib^VjwOQEKk8X%dgV`O0Pd|%(w)@J=@b38gAcC4Q9eAjZE`hnU}OaPe&Hv(e9 z=3>TQOzgiHt$r~&{F+?Ca)U8tvM*~YqsZpXtoN80H8)S}=H+bnm>5+yw{cEjV$|I%&NG*l(R}k(K{*!2 z*vVH!w=r@~?iN#-yhF^A!wBfA7KTqulmChBl*$5finKw5CXo2WVUwGmQks)$S5!3l SqPP`*93!LHCk7A+Rs;a`c0UOK diff --git a/core/media/truth_social_retriever.py b/core/media/truth_social_retriever.py new file mode 100644 index 0000000..a84661e --- /dev/null +++ b/core/media/truth_social_retriever.py @@ -0,0 +1,156 @@ +import core.logger as logging +from core.db.db_truth_social_content import DBTruthSocialContent +from config import TRUTH_SOCIAL_API, COIN_MYSQL_CONFIG + +import requests +import json +import os +from bs4 import BeautifulSoup +import time +from datetime import datetime +import pytz +import pandas as pd + +logger = logging.logger + +class TruthSocialRetriever: + def __init__(self) -> None: + self.api_key = TRUTH_SOCIAL_API.get("api_key", "") + self.user_info = TRUTH_SOCIAL_API.get("user_id", {}) + mysql_user = COIN_MYSQL_CONFIG.get("user", "xch") + mysql_password = COIN_MYSQL_CONFIG.get("password", "") + if not mysql_password: + raise ValueError("MySQL password is not set") + mysql_host = COIN_MYSQL_CONFIG.get("host", "localhost") + mysql_port = COIN_MYSQL_CONFIG.get("port", 3306) + mysql_database = COIN_MYSQL_CONFIG.get("database", "okx") + + self.db_url = f"mysql+pymysql://{mysql_user}:{mysql_password}@{mysql_host}:{mysql_port}/{mysql_database}" + self.db_truth_social_content = DBTruthSocialContent(self.db_url) + + self.save_path = r"./output/media/truth_social/" + os.makedirs(self.save_path, exist_ok=True) + + def get_user_id_from_page(self, handle='realDonaldTrump'): + url = f'https://truthsocial.com/@{handle}' + headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'} # 模拟浏览器 + + response = requests.get(url, headers=headers) + response.raise_for_status() + + soup = BeautifulSoup(response.text, 'html.parser') + # 查找嵌入的 JSON(Truth Social 使用 data 属性或 script 标签) + scripts = soup.find_all('script') + for script in scripts: + if script.string and 'id' in script.string and handle in script.string: + # 简单提取(实际可能需正则匹配 JSON) + import re + match = re.search(r'"id"\s*:\s*"(\d+)"', script.string) + if match: + return match.group(1) + return None + + def get_user_posts(self, limit: int = None): + """ + 获取用户在 Truth Social 的最新帖子。 + 免费版:100次 + 付费版: + 47美元:25,000次,如果5分钟跑一次,则可以跑86.8天 + 497美元:500,000次,如果5分钟跑一次,则可以跑1736天 + 参数: + - limit: 最大帖子数(API 默认返回 20 条,可通过分页获取更多)。 + + 返回: + - 帖子列表(JSON 格式)。 + """ + headers = { + 'x-api-key': self.api_key, + 'Content-Type': 'application/json' + } + + for user_name, user_id in self.user_info.items(): + params = { + 'handle': user_name, # 用户名 + 'user_id': user_id, # 可选,用户 ID + 'next_max_id': None, # 分页时设置为上一次响应的 max_id + 'trim': 'false' # 保留完整内容 + } + + url = 'https://api.scrapecreators.com/v1/truthsocial/user/posts' + logger.info(f"Searching contents for user: {user_name}") + try: + response = requests.get(url, headers=headers, params=params) + response.raise_for_status() # 检查 HTTP 错误 + data = response.json() + + # 提取帖子列表(假设响应中 'posts' 是键,根据实际文档调整) + if limit is not None and isinstance(limit, int): + posts = data.get('posts', [])[:limit] + else: + posts = data.get('posts', []) + + results = [] + if posts: + logger.info(f"获取{user_name}帖子: {len(posts)}条") + for post in posts: + result = {} + result["article_id"] = post.get('id') + result["user_id"] = user_id + result["user_name"] = user_name + datetime_text = post.get('created_at') + datetime_dict = self.transform_datetime(datetime_text) + timestamp_ms = datetime_dict["timestamp_ms"] + result["timestamp"] = timestamp_ms + beijing_time_str = datetime_dict["beijing_time_str"] + result["date_time"] = beijing_time_str + result["text"] = post.get('text', '无内容') + media_attachments = post.get('media_attachments', []) + result["media_url"] = "" + result["media_type"] = "" + result["media_thumbnail"] = "" + if media_attachments: + for media_attachment in media_attachments: + result["media_url"] = media_attachment.get('url') + result["media_type"] = media_attachment.get('type') + result["media_thumbnail"] = media_attachment.get('preview_url') + break + results.append(result) + else: + print("获取帖子失败,请检查 API 密钥或网络。") + + if len(results) > 0: + user_path = os.path.join(self.save_path, user_name) + os.makedirs(user_path, exist_ok=True) + now_date_time = datetime.now().strftime("%Y%m%d%H%M%S") + json_file_name = os.path.join(user_path, f"{user_name}_{now_date_time}.json") + # 将results内容写入json_file_name文件中 + with open(json_file_name, 'w', encoding='utf-8') as f: + json.dump(results, f, ensure_ascii=False, indent=2) + logger.info(f"已将{len(results)}条数据保存到: {json_file_name}") + + result_df = pd.DataFrame(results) + + self.db_truth_social_content.insert_data_to_mysql(result_df) + + except requests.exceptions.RequestException as e: + print(f"请求错误: {e}") + except json.JSONDecodeError as e: + print(f"JSON 解析错误: {e}") + + def transform_datetime(self, datetime_text: str): + utc_time = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=pytz.UTC) + + # 1. 转换为时间戳(毫秒) + timestamp_ms = int(utc_time.timestamp() * 1000) + # 2. 转换为北京时间(ISO 8601 格式,带 +08:00) + beijing_tz = pytz.timezone("Asia/Shanghai") + beijing_time = utc_time.astimezone(beijing_tz) + beijing_time_str = beijing_time.strftime("%Y-%m-%dT%H:%M:%S%z") + # 插入冒号到时区偏移(如 +0800 -> +08:00) + beijing_time_str = beijing_time_str[:-2] + ":" + beijing_time_str[-2:] + result = { + "timestamp_ms": timestamp_ms, + "beijing_time_str": beijing_time_str + } + return result + diff --git a/core/twitter/twitter_retriever.py b/core/media/twitter_retriever.py similarity index 100% rename from core/twitter/twitter_retriever.py rename to core/media/twitter_retriever.py diff --git a/sql/table/truth_social_content.sql b/sql/table/truth_social_content.sql new file mode 100644 index 0000000..aaaf4ba --- /dev/null +++ b/sql/table/truth_social_content.sql @@ -0,0 +1,22 @@ +CREATE TABLE `truth_social_content` ( + `article_id` VARCHAR(50) NOT NULL PRIMARY KEY, + `user_id` VARCHAR(50) NOT NULL, + `user_name` VARCHAR(100) NOT NULL, + `timestamp` BIGINT NOT NULL, + `date_time` VARCHAR(50) NOT NULL, + `text` TEXT NOT NULL, + `media_url` TEXT NULL, + `media_type` VARCHAR(50) NULL, + `media_thumbnail` TEXT NULL, + `created_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + `updated_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + +-- 对于 MySQL 8.0.29 之前的版本不支持 "ADD COLUMN IF NOT EXISTS" +-- 如需在已有表上添加列,请分别执行以下语句(每条仅需执行一次) +ALTER TABLE `truth_social_content` + ADD COLUMN `media_url` TEXT NULL DEFAULT NULL AFTER `text`; +ALTER TABLE `truth_social_content` + ADD COLUMN `media_type` VARCHAR(50) NULL DEFAULT NULL AFTER `media_url`; +ALTER TABLE `truth_social_content` + ADD COLUMN `media_thumbnail` TEXT NULL DEFAULT NULL AFTER `media_type`; diff --git a/test_autocomplete.py b/test_autocomplete.py new file mode 100644 index 0000000..31bcd9d --- /dev/null +++ b/test_autocomplete.py @@ -0,0 +1,41 @@ +# 测试代码提示和自动补全功能 +import pandas as pd +import numpy as np +import requests +import json +import os +from datetime import datetime + +def test_autocomplete(): + """测试自动补全功能""" + # 测试pandas自动补全 + df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + # 在这里输入 df. 应该会显示DataFrame的方法 + df.head() + df.describe() + + # 测试numpy自动补全 + arr = np.array([1, 2, 3, 4, 5]) + # 在这里输入 arr. 应该会显示numpy数组的方法 + arr.mean() + arr.std() + + # 测试requests自动补全 + response = requests.get("https://api.github.com") + # 在这里输入 response. 应该会显示Response对象的方法 + response.status_code + response.json() + + # 测试内置函数自动补全 + # 在这里输入 len( 应该会显示参数提示 + length = len([1, 2, 3]) + + # 测试类型提示 + current_time = datetime.now() + # 在这里输入 current_time. 应该会显示datetime对象的方法 + current_time.strftime("%Y-%m-%d") + + return df, arr, response, length, current_time + +if __name__ == "__main__": + test_autocomplete() diff --git a/truth_social_retriever_main.py b/truth_social_retriever_main.py new file mode 100644 index 0000000..568d5e2 --- /dev/null +++ b/truth_social_retriever_main.py @@ -0,0 +1,10 @@ +from core.media.truth_social_retriever import TruthSocialRetriever + + +def main(): + truth_social_retriever = TruthSocialRetriever() + truth_social_retriever.get_user_posts() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/twitter_retriever_main.py b/twitter_retriever_main.py index 0196c37..96f63a3 100644 --- a/twitter_retriever_main.py +++ b/twitter_retriever_main.py @@ -1,4 +1,4 @@ -from core.twitter.twitter_retriever import TwitterRetriever +from core.media.twitter_retriever import TwitterRetriever import core.logger as logging import os