import logging
import os
import re
from datetime import datetime
from typing import Optional, List, Dict, Any, Tuple

import pandas as pd
from pandas import DataFrame

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Default (quantile, column-name suffix) pairs used for the price bands.
_DEFAULT_PERCENTILES = ((0.8, "80"), (0.2, "20"), (0.9, "90"), (0.1, "10"))

# Default look-ahead horizons (in rows) for next_periods_rise_or_fall.
_DEFAULT_PERIODS = (3, 5)

# Price-extreme flag columns produced by the default percentile configuration.
_PRICE_FLAG_COLUMNS = (
    "price_80_high",
    "price_20_low",
    "price_90_high",
    "price_10_low",
)


class HugeVolume:
    """Detect abnormally large trading volume and summarise what price does next."""

    def __init__(self, output_folder: str = "./output"):
        """Remember the export folder and create it if it does not exist.

        Args:
            output_folder: Directory that Excel exports are written to.
        """
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)

    def _calculate_percentile_indicators(
        self,
        data: pd.DataFrame,
        window_size: int,
        percentiles: Optional[List[Tuple[float, str]]] = None,
    ) -> pd.DataFrame:
        """Add rolling close-price quantiles and "price at extreme" flags.

        For every ``(quantile, suffix)`` pair a ``close_{suffix}_percentile``
        column is added. Pairs with ``quantile >= 0.5`` additionally get a
        ``price_{suffix}_high`` flag (close >= quantile); the others get a
        ``price_{suffix}_low`` flag (close <= quantile).

        The columns are written onto ``data`` itself, which is also returned.

        Args:
            data: Frame with a ``close`` column.
            window_size: Rolling window length in rows.
            percentiles: ``(quantile, suffix)`` pairs. ``None`` selects the
                80/20/90/10 defaults.

        Returns:
            ``data`` with the new columns attached.
        """
        # A None sentinel avoids sharing one mutable list between calls.
        if percentiles is None:
            percentiles = list(_DEFAULT_PERCENTILES)

        for percentile, suffix in percentiles:
            quantile_column = f"close_{suffix}_percentile"
            data[quantile_column] = (
                data["close"]
                .rolling(window=window_size, min_periods=1)
                .quantile(percentile)
            )

            # Decide high/low from the quantile value rather than from a
            # hard-coded suffix list, so custom bands (e.g. 0.95) are
            # classified correctly.
            if percentile >= 0.5:
                data[f"price_{suffix}_high"] = (
                    data["close"] >= data[quantile_column]
                ).astype(int)
            else:
                data[f"price_{suffix}_low"] = (
                    data["close"] <= data[quantile_column]
                ).astype(int)

        return data

    def _calculate_volume_price_spikes(self, data: pd.DataFrame) -> pd.DataFrame:
        """Flag rows where huge volume coincides with a price extreme.

        Adds ``volume_80_20_price_spike`` and ``volume_90_10_price_spike``
        (0/1) onto ``data`` itself, which is also returned.

        Args:
            data: Frame with ``huge_volume`` and the four
                ``price_{80,90}_high`` / ``price_{20,10}_low`` columns.

        Returns:
            ``data`` with the two spike columns attached.
        """
        is_huge = data["huge_volume"] == 1

        # 80/20 band: huge volume while close is in the top or bottom band.
        data["volume_80_20_price_spike"] = (
            is_huge & ((data["price_80_high"] == 1) | (data["price_20_low"] == 1))
        ).astype(int)

        # 90/10 band: same check against the stricter quantiles.
        data["volume_90_10_price_spike"] = (
            is_huge & ((data["price_90_high"] == 1) | (data["price_10_low"] == 1))
        ).astype(int)

        return data

    def detect_huge_volume(
        self,
        data: DataFrame,
        window_size: int = 50,
        threshold: float = 2.0,
        check_price: bool = False,
        only_output_huge_volume: bool = False,
        output_excel: bool = False,
    ) -> Optional[DataFrame]:
        """Mark rows whose volume exceeds rolling mean + ``threshold`` * std.

        Steps:
            1. Sort by ``timestamp`` and work on a copy of the input.
            2. Compute the rolling volume mean/std over ``window_size`` rows and
               set ``huge_volume`` to 1 where volume is above
               ``mean + threshold * std``.
            3. If ``check_price`` is True, also flag whether ``close`` sits at
               or above the rolling 80%/90% quantile, or at or below the
               rolling 20%/10% quantile, and combine that with ``huge_volume``.

        Args:
            data: Frame with at least ``timestamp`` and ``volume`` columns
                (plus ``close`` when ``check_price`` is True, and
                ``date_time``/``symbol``/``bar`` when ``output_excel`` is True).
            window_size: Rolling window length in rows.
            threshold: Number of standard deviations above the mean.
            check_price: Whether to add the price-quantile columns.
            only_output_huge_volume: Keep only rows with ``huge_volume == 1``.
            output_excel: Write the result to an ``.xlsx`` file in
                ``self.output_folder``.

        Returns:
            The annotated frame, or ``None`` when ``data`` is empty/None or has
            no ``volume`` column. If ``check_price`` is True but ``close`` is
            missing, the frame is returned as computed so far (no price
            columns, no filtering, no ``create_time``, no export).
        """
        if data is None or len(data) == 0:
            logging.warning("数据为空,无法进行成交量异常检测")
            return None

        if "volume" not in data.columns:
            logging.error("数据中缺少volume列")
            return None

        # Sort chronologically; the copy keeps the caller's frame untouched.
        data = data.sort_values(by="timestamp", ascending=True).copy()
        data["window_size"] = window_size

        # Rolling volume statistics.
        rolling_volume = data["volume"].rolling(window=window_size, min_periods=1)
        data["volume_ma"] = rolling_volume.mean()
        data["volume_std"] = rolling_volume.std()

        # Volume threshold: mean + threshold * std.
        data["volume_threshold"] = data["volume_ma"] + threshold * data["volume_std"]

        # 1 where the current volume is above the threshold.
        data["huge_volume"] = (data["volume"] > data["volume_threshold"]).astype(int)

        # Volume relative to its rolling mean, and the excess over that mean.
        data["volume_ratio"] = data["volume"] / data["volume_ma"]
        data["spike_intensity"] = data["volume_ratio"] - 1

        if check_price:
            if "close" not in data.columns:
                logging.error("数据中缺少close列,无法进行价格检查")
                return data

            # Quantile indicators (80/20 and 90/10), then volume/price spikes.
            data = self._calculate_percentile_indicators(data, window_size)
            data = self._calculate_volume_price_spikes(data)

        if only_output_huge_volume:
            # Copy so the column assignment below lands on an independent
            # frame instead of a slice (avoids SettingWithCopyWarning).
            data = data[data["huge_volume"] == 1].copy()

        data["create_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        if output_excel:
            # Nothing to export if the filter removed every row.
            if len(data) == 0:
                logging.warning("数据为空,无法导出Excel文件")
                return data

            # Strip ':', '-' and whitespace so the dates are file-name safe.
            start_date = re.sub(r"[\:\-\s]", "", str(data["date_time"].iloc[0]))
            end_date = re.sub(r"[\:\-\s]", "", str(data["date_time"].iloc[-1]))
            symbol = data["symbol"].iloc[0]
            bar = data["bar"].iloc[0]
            file_name = f"volume_spike_{symbol}_{bar}_{window_size}_{start_date}_{end_date}.xlsx"
            try:
                with pd.ExcelWriter(
                    os.path.join(self.output_folder, file_name)
                ) as writer:
                    data.to_excel(writer, sheet_name="volume_spike", index=False)
            except Exception as e:
                # Export is best-effort: log the failure and still return data.
                logging.error(f"导出Excel文件失败: {e}")

        return data

    @staticmethod
    def _classify_change(change):
        """Map a relative price change to "rise"/"fall"/"draw"; keep NaN as is."""
        if pd.isna(change):
            return change
        if change > 0:
            return "rise"
        if change < 0:
            return "fall"
        return "draw"

    def next_periods_rise_or_fall(
        self,
        data: pd.DataFrame,
        window_size: int = 50,
        periods: Optional[List[int]] = None,
        output_excel: bool = False,
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Label the price move N rows ahead and summarise it for huge-volume rows.

        1. For each period ``p`` in ``periods`` add ``next_{p}_close`` (the
           close ``p`` rows later) and ``next_{p}_result`` ("rise", "fall",
           "draw", or NaN when there is no row ``p`` steps ahead).

           Example with ``periods=[3, 5]``::

               timestamp   close  huge_volume
               1000000000  100    1
               1000000001  101    0
               1000000002  102    1
               1000000003  103    0
               1000000004  100    1
               1000000005  98     0

           For timestamp 1000000000 the close 3 rows later is 103, so
           ``next_3_result`` is "rise"; the close 5 rows later is 98, so
           ``next_5_result`` is "fall".

        2. When any of the ``price_80_high`` / ``price_20_low`` /
           ``price_90_high`` / ``price_10_low`` columns exist, keep only rows
           with ``huge_volume == 1`` AND at least one of those flags set.

        3. For every period and every available price flag with at least one
           matching row, record how many of the rows with ``huge_volume == 1``
           and that flag set were followed by a fall or a rise, and the
           corresponding ratios (count / total).

        Args:
            data: Frame with ``timestamp``, ``close`` and ``huge_volume``
                columns (``symbol`` and ``bar`` are read when statistics are
                produced).
            window_size: Copied into the statistics rows; not used for any
                calculation here.
            periods: Look-ahead distances in rows. ``None`` selects ``[3, 5]``.
            output_excel: Accepted for interface compatibility; currently
                unused by this method.

        Returns:
            ``(filtered_data, statistics)``. ``statistics`` is an empty frame
            when no price flag has any matching row.
        """
        # A None sentinel avoids sharing one mutable list between calls.
        if periods is None:
            periods = list(_DEFAULT_PERIODS)

        data = data.sort_values(by="timestamp", ascending=True)
        data = data.reset_index(drop=True)

        # Future close and the direction of the move relative to the current close.
        for period in periods:
            data[f"next_{period}_close"] = data["close"].shift(-period)
            relative_change = data[f"next_{period}_close"] / data["close"] - 1
            data[f"next_{period}_result"] = relative_change.apply(
                self._classify_change
            )

        # Keep only huge-volume rows that also sit at a price extreme. The
        # conditions are ANDed: ORing them would let rows without huge volume
        # leak into the returned frame.
        available_flags = [
            column for column in _PRICE_FLAG_COLUMNS if column in data.columns
        ]
        if available_flags:
            is_huge = data["huge_volume"] == 1
            at_price_extreme = (data[available_flags] == 1).any(axis=1)
            data = data[is_huge & at_price_extreme]
            data = data.reset_index(drop=True)

        # One boolean mask per price flag, built once and reused for all periods.
        flag_masks = {
            flag: (data["huge_volume"] == 1) & (data[flag] == 1)
            for flag in available_flags
        }
        price_stats = {flag: int(mask.sum()) for flag, mask in flag_masks.items()}

        results = []
        for period in periods:
            result_column = f"next_{period}_result"
            for price_type, count in price_stats.items():
                if count <= 0:
                    continue
                outcomes = data.loc[flag_masks[price_type], result_column]
                fall_count = int((outcomes == "fall").sum())
                rise_count = int((outcomes == "rise").sum())
                results.append(
                    {
                        # count > 0 guarantees at least one row exists.
                        "symbol": data["symbol"].iloc[0],
                        "bar": data["bar"].iloc[0],
                        "window_size": window_size,
                        "huge_volume": 1,
                        "price_type": price_type,
                        "next_period": period,
                        "fall_count": fall_count,
                        "rise_count": rise_count,
                        "fall_ratio": fall_count / count,
                        "rise_ratio": rise_count / count,
                        "total_count": count,
                    }
                )

        result_data = pd.DataFrame(results)
        return data, result_data