2025-07-26 06:41:50 +00:00
|
|
|
|
from pandas import DataFrame
|
|
|
|
|
|
import logging
|
|
|
|
|
|
import os
|
|
|
|
|
|
import re
|
|
|
|
|
|
import pandas as pd
|
2025-07-28 08:14:40 +00:00
|
|
|
|
from datetime import datetime
|
2025-07-30 08:11:34 +00:00
|
|
|
|
from typing import Optional, List, Dict, Any, Tuple
|
2025-07-26 06:41:50 +00:00
|
|
|
|
|
|
|
|
|
|
# Configure the root logger once at import time: INFO threshold, with every
# record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
2025-07-28 08:14:40 +00:00
|
|
|
|
class HugeVolume:
    """Detect abnormally large trading volume and summarise later price moves.

    The class works on market-data frames that carry at least ``timestamp``
    and ``volume`` columns (plus ``close`` for the price-related features).
    Excel exports, when requested, are written into ``output_folder``.
    """

    # Default rolling quantiles as (quantile, column-name suffix) pairs.
    DEFAULT_PERCENTILES: Tuple[Tuple[float, str], ...] = (
        (0.8, "80"),
        (0.2, "20"),
        (0.9, "90"),
        (0.1, "10"),
    )
    # Default look-ahead horizons (in rows) for next_periods_rise_or_fall.
    DEFAULT_PERIODS: Tuple[int, ...] = (3, 5)
    # Price-extreme flag columns examined by next_periods_rise_or_fall.
    _PRICE_TYPES: Tuple[str, ...] = (
        "price_80_high",
        "price_20_low",
        "price_90_high",
        "price_10_low",
    )
    # Column layout of the statistics frame (kept even when it has no rows).
    _STATS_COLUMNS: Tuple[str, ...] = (
        "symbol",
        "bar",
        "huge_volume",
        "price_type",
        "next_period",
        "fall_count",
        "rise_count",
        "fall_ratio",
        "rise_ratio",
        "total_count",
    )

    def __init__(self, output_folder: str = "./output"):
        """Remember ``output_folder`` and create it if it does not exist."""
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)

    def _calculate_percentile_indicators(
        self,
        data: pd.DataFrame,
        window_size: int,
        percentiles: Optional[List[Tuple[float, str]]] = None,
    ) -> pd.DataFrame:
        """Add rolling close-price quantiles and "price at extreme" flags.

        For every ``(quantile, suffix)`` pair this adds
        ``close_{suffix}_percentile`` (rolling quantile of ``close``) and one
        0/1 flag column:

        * ``price_{suffix}_high`` when ``quantile >= 0.5``
          (``close`` is at or above the rolling quantile), or
        * ``price_{suffix}_low`` otherwise
          (``close`` is at or below the rolling quantile).

        The columns are added to ``data`` in place.

        :param data: frame with a ``close`` column.
        :param window_size: rolling window length; ``min_periods=1`` is used,
            so early rows are computed from a shorter window.
        :param percentiles: list of ``(quantile, column suffix)`` pairs;
            ``None`` selects 80/20/90/10.
        :return: ``data`` with the added columns.
        """
        if percentiles is None:
            percentiles = list(HugeVolume.DEFAULT_PERCENTILES)

        rolling_close = data["close"].rolling(window=window_size, min_periods=1)
        for percentile, suffix in percentiles:
            level_col = f"close_{suffix}_percentile"
            data[level_col] = rolling_close.quantile(percentile)

            # Decide high/low from the quantile itself rather than from the
            # suffix text, so custom pairs such as (0.95, "95") are classified
            # correctly.
            if percentile >= 0.5:
                data[f"price_{suffix}_high"] = (
                    data["close"] >= data[level_col]
                ).astype(int)
            else:
                data[f"price_{suffix}_low"] = (
                    data["close"] <= data[level_col]
                ).astype(int)

        return data

    def _calculate_volume_price_spikes(self, data: pd.DataFrame) -> pd.DataFrame:
        """Flag rows where huge volume coincides with a price extreme.

        Requires the ``huge_volume``, ``price_80_high``, ``price_20_low``,
        ``price_90_high`` and ``price_10_low`` columns. Adds the 0/1 columns
        ``volume_80_20_price_spike`` and ``volume_90_10_price_spike`` in place.

        :param data: frame carrying the columns listed above.
        :return: ``data`` with the two spike columns added.
        """
        is_huge = data["huge_volume"] == 1

        # 80/20 volume-price spike.
        data["volume_80_20_price_spike"] = (
            is_huge & ((data["price_80_high"] == 1) | (data["price_20_low"] == 1))
        ).astype(int)

        # 90/10 volume-price spike.
        data["volume_90_10_price_spike"] = (
            is_huge & ((data["price_90_high"] == 1) | (data["price_10_low"] == 1))
        ).astype(int)

        return data

    def _export_volume_spike_excel(self, data: pd.DataFrame) -> None:
        """Best-effort export of ``data`` to an Excel file in ``output_folder``.

        The file name is built from the first row's ``symbol`` and ``bar`` and
        from the first/last ``date_time`` values with ``:``, ``-`` and
        whitespace removed. Any failure (including a missing column) is logged
        and swallowed so the caller still gets its computed frame back.
        """
        try:
            start_date = re.sub(r"[\:\-\s]", "", str(data["date_time"].iloc[0]))
            end_date = re.sub(r"[\:\-\s]", "", str(data["date_time"].iloc[-1]))
            symbol = data["symbol"].iloc[0]
            bar = data["bar"].iloc[0]
            file_name = f"volume_spike_{symbol}_{bar}_{start_date}_{end_date}.xlsx"
            with pd.ExcelWriter(
                os.path.join(self.output_folder, file_name)
            ) as writer:
                data.to_excel(writer, sheet_name="volume_spike", index=False)
        except Exception as e:
            logging.error(f"导出Excel文件失败: {e}")

    def detect_huge_volume(
        self,
        data: DataFrame,
        window_size: int = 50,
        threshold: float = 2.0,
        check_price: bool = False,
        only_output_huge_volume: bool = False,
        output_excel: bool = False,
    ) -> Optional[DataFrame]:
        """Mark rows whose volume is abnormally high for their rolling window.

        Steps:

        1. Sort a copy of ``data`` by ``timestamp`` (the input is not modified).
        2. For each row compute the rolling mean/std of ``volume`` and set
           ``huge_volume`` to 1 when
           ``volume > mean + threshold * std``. ``volume_ratio``
           (``volume / mean``) and ``spike_intensity`` (``volume_ratio - 1``)
           are added as well.
        3. When ``check_price`` is true, also flag whether ``close`` is
           a. at or above the window's 80% quantile,
           b. at or below the window's 20% quantile,
           c. at or above the window's 90% quantile,
           d. at or below the window's 10% quantile,
           and add the combined volume/price spike columns.
        4. Add a ``create_time`` column holding the current local time.

        Args:
            data: frame with ``timestamp`` and ``volume`` columns (``close``
                is needed when ``check_price`` is true; ``date_time``,
                ``symbol`` and ``bar`` are needed for the Excel file name).
            window_size: rolling window length, default 50 rows.
            threshold: standard-deviation multiplier, default 2.0.
            check_price: whether to add the price-quantile features.
            only_output_huge_volume: keep only rows with ``huge_volume == 1``.
            output_excel: write the result to an Excel file in the output
                folder; export failures are logged, not raised.

        Returns:
            The annotated frame, or ``None`` when ``data`` is ``None``/empty or
            lacks the ``volume`` or ``timestamp`` column. If ``check_price``
            is true but ``close`` is missing, the volume-only frame computed
            so far is returned immediately (without ``create_time`` and
            without row filtering).
        """
        if data is None or len(data) == 0:
            logging.warning("数据为空,无法进行成交量异常检测")
            return None

        if "volume" not in data.columns:
            logging.error("数据中缺少volume列")
            return None

        # Sorting needs the timestamp column; report its absence the same way
        # as a missing volume column instead of raising KeyError.
        if "timestamp" not in data.columns:
            logging.error("数据中缺少timestamp列")
            return None

        # Work on a sorted copy so the caller's frame is left untouched.
        data = data.sort_values(by="timestamp", ascending=True).copy()

        # Rolling mean and standard deviation of volume.
        rolling_volume = data["volume"].rolling(window=window_size, min_periods=1)
        data["volume_ma"] = rolling_volume.mean()
        data["volume_std"] = rolling_volume.std()

        # Threshold = mean + threshold * std. The std of a single observation
        # is NaN, so the very first row can never be flagged.
        data["volume_threshold"] = data["volume_ma"] + threshold * data["volume_std"]
        data["huge_volume"] = (data["volume"] > data["volume_threshold"]).astype(int)

        data["volume_ratio"] = data["volume"] / data["volume_ma"]
        data["spike_intensity"] = data["volume_ratio"] - 1

        if check_price:
            if "close" not in data.columns:
                logging.error("数据中缺少close列,无法进行价格检查")
                return data

            # Quantile indicators (80/20 and 90/10), then the combined spikes.
            data = self._calculate_percentile_indicators(data, window_size)
            data = self._calculate_volume_price_spikes(data)

        if only_output_huge_volume:
            # Copy the filtered slice so the column assignment below writes to
            # an independent frame (avoids SettingWithCopy behaviour).
            data = data[data["huge_volume"] == 1].copy()

        data["create_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        if output_excel:
            if len(data) == 0:
                logging.warning("数据为空,无法导出Excel文件")
                return data
            self._export_volume_spike_excel(data)

        return data

    @staticmethod
    def _classify_change(change):
        """Map a relative price change to "rise"/"fall"/"draw"; NaN passes through."""
        if pd.isna(change):
            return change
        if change > 0:
            return "rise"
        if change < 0:
            return "fall"
        return "draw"

    def next_periods_rise_or_fall(
        self,
        data: pd.DataFrame,
        periods: Optional[List[int]] = None,
        output_excel: bool = False,
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Label future price direction and summarise it for huge-volume rows.

        1. For every horizon ``p`` in ``periods`` add ``next_{p}_close`` (the
           close ``p`` rows later) and ``next_{p}_result`` ("rise", "fall",
           "draw", or NaN when there is no row ``p`` steps ahead).

           Example with ``periods=[3, 5]``::

               timestamp   close  huge_volume
               1000000000  100    1
               1000000001  101    0
               1000000002  102    1
               1000000003  103    0
               1000000004  100    1
               1000000005  98     0

           For timestamp 1000000000 the result is::

               next_3_close  next_3_result  next_5_close  next_5_result
               103           rise           98            fall

           because the close 3 rows later is 103 (rise) and the close 5 rows
           later is 98 (fall).

        2. If any of ``price_80_high``, ``price_20_low``, ``price_90_high``,
           ``price_10_low`` exist, keep only rows where ``huge_volume == 1``
           AND at least one of those flags is 1. Without any of these columns
           the rows are left unfiltered.

        3. Build a statistics frame: for each horizon and each available price
           flag, count the rows with ``huge_volume == 1`` and that flag set,
           how many of them subsequently rose / fell, and the corresponding
           ratios (for instance 50 falls out of 100 rows gives
           ``fall_ratio = 0.5``). Flags with zero matching rows are omitted.

        Args:
            data: frame with ``timestamp``, ``close`` and (when price flags
                are present) ``huge_volume`` columns. ``symbol`` / ``bar`` are
                copied into the statistics when available, else ``""``.
            periods: look-ahead horizons in rows; ``None`` selects ``[3, 5]``.
            output_excel: accepted for interface compatibility; this method
                does not currently write any file.

        Returns:
            ``(labelled_data, statistics)``; the input frame is not modified.
        """
        if periods is None:
            periods = list(HugeVolume.DEFAULT_PERIODS)

        data = data.sort_values(by="timestamp", ascending=True).reset_index(drop=True)

        # Future close and direction label for every horizon.
        for period in periods:
            future_close = data["close"].shift(-period)
            data[f"next_{period}_close"] = future_close
            data[f"next_{period}_result"] = (
                future_close / data["close"] - 1
            ).apply(HugeVolume._classify_change)

        # Keep only huge-volume rows that also sit at a price extreme.
        available_types = [
            name for name in HugeVolume._PRICE_TYPES if name in data.columns
        ]
        if available_types:
            is_huge = data["huge_volume"] == 1
            at_extreme = (data[available_types] == 1).any(axis=1)
            data = data[is_huge & at_extreme]

        data = data.reset_index(drop=True)

        has_rows = len(data) > 0
        symbol = data["symbol"].iloc[0] if has_rows and "symbol" in data.columns else ""
        bar = data["bar"].iloc[0] if has_rows and "bar" in data.columns else ""

        results = []
        for period in periods:
            outcome = data[f"next_{period}_result"]
            for price_type in available_types:
                selected = (data["huge_volume"] == 1) & (data[price_type] == 1)
                count = int(selected.sum())
                if count == 0:
                    continue

                fall_count = int((selected & (outcome == "fall")).sum())
                rise_count = int((selected & (outcome == "rise")).sum())
                results.append(
                    {
                        "symbol": symbol,
                        "bar": bar,
                        "huge_volume": 1,
                        "price_type": price_type,
                        "next_period": period,
                        "fall_count": fall_count,
                        "rise_count": rise_count,
                        "fall_ratio": fall_count / count,
                        "rise_ratio": rise_count / count,
                        "total_count": count,
                    }
                )

        # Passing the column list keeps a stable schema even with no rows.
        result_data = pd.DataFrame(results, columns=list(HugeVolume._STATS_COLUMNS))
        return data, result_data
|