from core.db_manager import query_market_data_by_symbol_bar
from pandas import DataFrame
import logging
import os
import re
import pandas as pd

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


class Statistics:
    """Rolling-window volume statistics over market data, with Excel export."""

    def __init__(self, output_folder: str = "./output"):
        """Store the export folder and create it if it does not exist yet.

        Args:
            output_folder: Directory that Excel exports are written into.
        """
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)

    def detect_volume_spike(
        self,
        data: DataFrame,
        threshold: float = 2.0,
        window: int = 50,
        check_price: bool = False,
        only_output_huge_volume: bool = False,
        output_excel: bool = False,
    ):
        """Flag rows whose volume is abnormally high for their rolling window.

        The rows are sorted by ``timestamp`` and a trailing window of up to
        ``window`` rows (current row included) is slid over them.  A row is
        flagged when its volume exceeds the window mean plus ``threshold``
        times the window standard deviation.

        Columns added to the returned frame:

        * ``volume_ma`` / ``volume_std``: rolling mean / standard deviation.
        * ``volume_threshold``: ``volume_ma + threshold * volume_std``.
        * ``huge_volume``: 1 when ``volume > volume_threshold``, else 0.
        * ``volume_ratio``: ``volume / volume_ma``.
        * ``spike_intensity``: ``volume_ratio - 1``.

        With ``check_price=True`` these are added as well:

        * ``close_80_percentile`` / ``close_20_percentile``: rolling quantiles
          of ``close``.
        * ``price_high``: 1 when ``close >= close_80_percentile``.
        * ``price_low``: 1 when ``close <= close_20_percentile``.
        * ``volume_price_spike``: 1 when ``huge_volume`` is set and the close
          is at either price extreme.

        Args:
            data: Market data with at least ``timestamp`` and ``volume``
                columns.  The caller's frame is not modified; a sorted copy
                is annotated instead.
            threshold: Number of standard deviations above the rolling mean
                that a volume must exceed to be flagged.
            window: Rolling window size, in rows.
            check_price: Also compute the price-percentile columns.  Needs a
                ``close`` column.
            only_output_huge_volume: Keep only rows with ``huge_volume == 1``.
            output_excel: Write the result to an ``.xlsx`` file inside
                ``self.output_folder``.  Needs ``date_time``, ``symbol`` and
                ``bar`` columns, which are used to build the file name.

        Returns:
            The annotated DataFrame, or ``None`` when ``data`` is ``None`` or
            empty, or lacks the ``volume`` or ``timestamp`` column.
        """
        if data is None or len(data) == 0:
            logging.warning("数据为空,无法进行成交量异常检测")
            return None

        if "volume" not in data.columns:
            logging.error("数据中缺少volume列")
            return None

        # The sort below needs this column; fail the same way as for a
        # missing volume column instead of surfacing a raw KeyError.
        if "timestamp" not in data.columns:
            logging.error("数据中缺少timestamp列")
            return None

        # Work on a chronologically ordered copy so the input stays untouched.
        data = data.sort_values(by="timestamp", ascending=True).copy()
        self._add_volume_columns(data, threshold, window)

        if check_price:
            if "close" not in data.columns:
                logging.error("数据中缺少close列,无法进行价格检查")
                # NOTE(review): this early return is kept from the original
                # implementation; it skips both the huge-volume filter and
                # the Excel export below. Confirm that this is intended.
                return data
            self._add_price_columns(data, window)

        if only_output_huge_volume:
            data = data[data["huge_volume"] == 1]

        if output_excel:
            self._export_to_excel(data)

        return data

    @staticmethod
    def _add_volume_columns(data: DataFrame, threshold: float, window: int) -> None:
        """Add the rolling volume statistics and the spike flag in place."""
        rolling_volume = data["volume"].rolling(window=window, min_periods=1)
        data["volume_ma"] = rolling_volume.mean()
        # Sample standard deviation: undefined (NaN) for a single observation,
        # so the first row gets a NaN threshold and is never flagged.
        data["volume_std"] = rolling_volume.std()

        data["volume_threshold"] = data["volume_ma"] + threshold * data["volume_std"]
        data["huge_volume"] = (data["volume"] > data["volume_threshold"]).astype(int)

        data["volume_ratio"] = data["volume"] / data["volume_ma"]
        data["spike_intensity"] = data["volume_ratio"] - 1

    @staticmethod
    def _add_price_columns(data: DataFrame, window: int) -> None:
        """Add the rolling close-price percentile flags in place."""
        rolling_close = data["close"].rolling(window=window, min_periods=1)
        data["close_80_percentile"] = rolling_close.quantile(0.8)
        data["close_20_percentile"] = rolling_close.quantile(0.2)

        data["price_high"] = (data["close"] >= data["close_80_percentile"]).astype(int)
        data["price_low"] = (data["close"] <= data["close_20_percentile"]).astype(int)

        # Combined signal: abnormal volume while the close is at an extreme.
        data["volume_price_spike"] = (
            (data["huge_volume"] == 1)
            & ((data["price_high"] == 1) | (data["price_low"] == 1))
        ).astype(int)

    def _export_to_excel(self, data: DataFrame) -> None:
        """Write ``data`` to an .xlsx file named after symbol, bar and dates."""
        if len(data) == 0:
            logging.warning("数据为空,无法导出Excel文件")
            return

        # The file name is built from these columns; without them the export
        # is skipped so the computed result is still returned to the caller.
        missing = [
            column
            for column in ("date_time", "symbol", "bar")
            if column not in data.columns
        ]
        if missing:
            logging.error("数据中缺少%s列,无法导出Excel文件", ",".join(missing))
            return

        # Strip colons, dashes and whitespace so the dates are compact.
        start_date = re.sub(r"[\:\-\s]", "", str(data["date_time"].iloc[0]))
        end_date = re.sub(r"[\:\-\s]", "", str(data["date_time"].iloc[-1]))
        symbol = data["symbol"].iloc[0]
        bar = data["bar"].iloc[0]

        file_name = f"volume_spike_{symbol}_{bar}_{start_date}_{end_date}.xlsx"
        with pd.ExcelWriter(os.path.join(self.output_folder, file_name)) as writer:
            data.to_excel(writer, sheet_name="volume_spike", index=False)