# crypto_quant/core/huge_volume.py
#
# 296 lines
# 12 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters
#
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from pandas import DataFrame
import logging
import os
import re
import pandas as pd
from datetime import datetime
from typing import Optional, List, Dict, Any, Tuple
# Root logging setup applied when this module is imported: INFO threshold and
# "<timestamp> - <level> - <message>" formatted records.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
class HugeVolume:
    """Detect volume spikes in bar data and summarise the price moves that follow.

    The detection works on rolling windows: a bar is flagged as "huge volume"
    when its volume exceeds the rolling mean plus a multiple of the rolling
    standard deviation. Optionally the close price is compared with rolling
    quantiles to mark bars that sit at the high or low end of the window.
    """

    # Default rolling-quantile configuration: (quantile, column-name suffix).
    _DEFAULT_PERCENTILES: Tuple[Tuple[float, str], ...] = (
        (0.8, "80"),
        (0.2, "20"),
        (0.9, "90"),
        (0.1, "10"),
    )
    # Default look-ahead horizons (in rows) for next_periods_rise_or_fall.
    _DEFAULT_PERIODS: Tuple[int, ...] = (3, 5)
    # Price-position flag columns, in the order statistics are reported.
    _PRICE_FLAG_COLUMNS: Tuple[str, ...] = (
        "price_80_high",
        "price_20_low",
        "price_90_high",
        "price_10_low",
    )
    # Column schema of the statistics frame, kept stable even when it is empty.
    _RESULT_COLUMNS: Tuple[str, ...] = (
        "symbol",
        "bar",
        "huge_volume",
        "price_type",
        "next_period",
        "fall_count",
        "rise_count",
        "fall_ratio",
        "rise_ratio",
        "total_count",
    )

    def __init__(self, output_folder: str = "./output"):
        """Remember the export folder and create it if it does not exist.

        Args:
            output_folder: Directory that Excel exports are written to.
        """
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)

    def _calculate_percentile_indicators(
        self,
        data: pd.DataFrame,
        window_size: int,
        percentiles: Optional[List[Tuple[float, str]]] = None,
    ) -> pd.DataFrame:
        """Add rolling close-price quantiles and high/low position flags.

        For every ``(quantile, suffix)`` pair a ``close_{suffix}_percentile``
        column is added. Quantiles above 0.5 produce ``price_{suffix}_high``
        (1 when close >= quantile); all others produce ``price_{suffix}_low``
        (1 when close <= quantile).

        Args:
            data: Frame with a ``close`` column. It is modified in place.
            window_size: Rolling window length in rows.
            percentiles: Quantile configuration; defaults to 80/20/90/10.

        Returns:
            The same frame, with the new columns attached.
        """
        if percentiles is None:
            percentiles = list(self._DEFAULT_PERCENTILES)

        close = data["close"]
        for percentile, suffix in percentiles:
            level_column = f"close_{suffix}_percentile"
            data[level_column] = (
                close.rolling(window=window_size, min_periods=1).quantile(percentile)
            )
            # Decide high vs. low from the quantile itself rather than from the
            # suffix text, so custom configurations get the right column.
            if percentile > 0.5:
                data[f"price_{suffix}_high"] = (close >= data[level_column]).astype(int)
            else:
                data[f"price_{suffix}_low"] = (close <= data[level_column]).astype(int)
        return data

    def _calculate_volume_price_spikes(self, data: pd.DataFrame) -> pd.DataFrame:
        """Flag rows that are huge-volume and at a price extreme.

        Adds ``volume_80_20_price_spike`` and ``volume_90_10_price_spike``.

        Args:
            data: Frame holding ``huge_volume`` and the four ``price_*`` flag
                columns. It is modified in place.

        Returns:
            The same frame, with the two spike columns attached.
        """
        is_huge = data["huge_volume"] == 1
        for high, low in (("80", "20"), ("90", "10")):
            at_extreme = (data[f"price_{high}_high"] == 1) | (
                data[f"price_{low}_low"] == 1
            )
            data[f"volume_{high}_{low}_price_spike"] = (is_huge & at_extreme).astype(int)
        return data

    def _export_volume_spike_excel(self, data: pd.DataFrame) -> None:
        """Write the detection result to an ``.xlsx`` file in the output folder.

        The export is best-effort: any failure is logged and swallowed so that
        the caller still receives its DataFrame.
        """
        if len(data) == 0:
            logging.warning("数据为空无法导出Excel文件")
            return
        try:
            # Building the file name is inside the try block on purpose: a
            # missing date_time/symbol/bar column is an export failure too.
            start_date = re.sub(r"[\:\-\s]", "", str(data["date_time"].iloc[0]))
            end_date = re.sub(r"[\:\-\s]", "", str(data["date_time"].iloc[-1]))
            symbol = data["symbol"].iloc[0]
            bar = data["bar"].iloc[0]
            file_name = f"volume_spike_{symbol}_{bar}_{start_date}_{end_date}.xlsx"
            with pd.ExcelWriter(os.path.join(self.output_folder, file_name)) as writer:
                data.to_excel(writer, sheet_name="volume_spike", index=False)
        except Exception as e:
            logging.error("导出Excel文件失败: %s", e)

    def detect_huge_volume(
        self,
        data: DataFrame,
        window_size: int = 50,
        threshold: float = 2.0,
        check_price: bool = False,
        only_output_huge_volume: bool = False,
        output_excel: bool = False,
    ) -> Optional[DataFrame]:
        """Flag bars whose volume is far above the rolling average.

        Steps:
            1. Sort by ``timestamp`` and slide a window over the rows.
            2. Set ``huge_volume`` to 1 when the bar's volume is greater than
               the window mean plus ``threshold`` window standard deviations.
            3. When ``check_price`` is set, also mark whether the close is at
               or above the window's 80%/90% quantile, or at or below its
               20%/10% quantile, and combine that with ``huge_volume``.

        Args:
            data: Frame with ``timestamp`` and ``volume`` columns (plus
                ``close`` for the price check). The input is not modified.
            window_size: Rolling window length in rows.
            threshold: Number of standard deviations above the mean.
            check_price: Whether to add the price-quantile indicators.
            only_output_huge_volume: Keep only rows with ``huge_volume == 1``.
            output_excel: Also write the result to an Excel file; this needs
                ``date_time``, ``symbol`` and ``bar`` columns.

        Returns:
            The annotated frame, or ``None`` when the input is empty or lacks
            the ``volume`` or ``timestamp`` column.
        """
        if data is None or len(data) == 0:
            logging.warning("数据为空,无法进行成交量异常检测")
            return None
        if "volume" not in data.columns:
            logging.error("数据中缺少volume列")
            return None
        if "timestamp" not in data.columns:
            logging.error("数据中缺少timestamp列")
            return None

        # Work on a sorted copy so the caller's frame is left untouched.
        data = data.sort_values(by="timestamp", ascending=True).copy()

        rolling_volume = data["volume"].rolling(window=window_size, min_periods=1)
        data["volume_ma"] = rolling_volume.mean()
        data["volume_std"] = rolling_volume.std()
        data["volume_threshold"] = data["volume_ma"] + threshold * data["volume_std"]
        # A NaN threshold (e.g. the first row, where std is undefined) compares
        # as False, so such rows are never flagged.
        data["huge_volume"] = (data["volume"] > data["volume_threshold"]).astype(int)
        data["volume_ratio"] = data["volume"] / data["volume_ma"]
        data["spike_intensity"] = data["volume_ratio"] - 1

        if check_price:
            if "close" in data.columns:
                data = self._calculate_percentile_indicators(data, window_size)
                data = self._calculate_volume_price_spikes(data)
            else:
                # Skip only the price indicators; the remaining options
                # (filtering, create_time, export) are still honoured.
                logging.error("数据中缺少close列无法进行价格检查")

        if only_output_huge_volume:
            # Copy the selection so the column assignment below does not write
            # into a slice of the unfiltered frame.
            data = data[data["huge_volume"] == 1].copy()

        data["create_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        if output_excel:
            self._export_volume_spike_excel(data)
        return data

    @staticmethod
    def _classify_change(change):
        """Map a relative price change to "rise", "fall" or "draw"; keep NaN as is."""
        if pd.isna(change):
            return change
        if change > 0:
            return "rise"
        if change < 0:
            return "fall"
        return "draw"

    def next_periods_rise_or_fall(
        self,
        data: pd.DataFrame,
        periods: Optional[List[int]] = None,
        output_excel: bool = False,
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Label what the close did N rows later and summarise it for spikes.

        For each ``period`` the close ``period`` rows ahead is stored in
        ``next_{period}_close`` and compared with the current close, giving
        ``next_{period}_result`` of "rise", "fall" or "draw" (NaN when there
        is no row that far ahead). Example with closes
        ``100, 101, 102, 103, 100, 98``: for the first row ``next_3_close`` is
        103 ("rise") and ``next_5_close`` is 98 ("fall").

        When any ``price_*`` flag column is present, the returned rows are
        limited to those with ``huge_volume == 1`` and at least one price flag
        set. The statistics frame then reports, per period and per price flag,
        how many of those rows were followed by a fall or a rise and the
        corresponding ratios.

        Args:
            data: Frame with ``timestamp`` and ``close`` columns;
                ``huge_volume`` is required when price flag columns exist.
                The input is not modified.
            periods: Look-ahead horizons in rows; defaults to ``[3, 5]``.
            output_excel: Accepted for interface compatibility. This method
                does not write any file, so the flag has no effect.

        Returns:
            ``(rows, statistics)`` as two DataFrames.
        """
        if periods is None:
            periods = list(self._DEFAULT_PERIODS)

        data = data.sort_values(by="timestamp", ascending=True).reset_index(drop=True)

        # Look-ahead columns are computed on the full series, before any row
        # filtering, so "N rows later" always refers to real consecutive bars.
        for period in periods:
            future_close = data["close"].shift(-period)
            data[f"next_{period}_close"] = future_close
            data[f"next_{period}_result"] = (future_close / data["close"] - 1).apply(
                self._classify_change
            )

        flag_columns = [
            column for column in self._PRICE_FLAG_COLUMNS if column in data.columns
        ]
        if flag_columns:
            # Keep rows that are huge-volume AND at some price extreme.
            at_extreme = (data[flag_columns] == 1).any(axis=1)
            data = data[(data["huge_volume"] == 1) & at_extreme].reset_index(drop=True)

        has_rows = len(data) > 0
        symbol = data["symbol"].iloc[0] if has_rows and "symbol" in data.columns else ""
        bar = data["bar"].iloc[0] if has_rows and "bar" in data.columns else ""

        # One row-selection mask per price flag, reused for every period.
        selections = {
            price_type: (data["huge_volume"] == 1) & (data[price_type] == 1)
            for price_type in flag_columns
        }

        results = []
        for period in periods:
            outcome = data[f"next_{period}_result"]
            for price_type, selected in selections.items():
                count = int(selected.sum())
                if count == 0:
                    continue
                fall_count = int((selected & (outcome == "fall")).sum())
                rise_count = int((selected & (outcome == "rise")).sum())
                results.append(
                    {
                        "symbol": symbol,
                        "bar": bar,
                        "huge_volume": 1,
                        "price_type": price_type,
                        "next_period": period,
                        "fall_count": fall_count,
                        "rise_count": rise_count,
                        "fall_ratio": fall_count / count,
                        "rise_ratio": rise_count / count,
                        "total_count": count,
                    }
                )

        result_data = pd.DataFrame(results, columns=list(self._RESULT_COLUMNS))
        return data, result_data