# Source: crypto_quant/core/statistics.py (119 lines, 4.7 KiB, Python)


from core.db_manager import query_data_by_symbol_bar
from pandas import DataFrame
import logging
import os
import re
import pandas as pd
# Configure the root logger when this module is imported: INFO level and
# "<time> - <level> - <message>" formatting for every record.
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
class Statistics:
    """Rolling-window volume statistics with optional Excel export."""

    def __init__(self, output_folder: str = "./output") -> None:
        """Remember the export folder and create it if it does not exist.

        Args:
            output_folder: Directory that Excel exports are written to.
        """
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)

    def detect_volume_spike(
        self,
        data: DataFrame,
        threshold: float = 2.0,
        window: int = 50,
        check_price: bool = False,
        only_output_huge_volume: bool = False,
        output_excel: bool = False,
    ):
        """Flag rows whose volume is abnormally high within a rolling window.

        The rows are sorted by ``timestamp`` (on a copy, so the caller's frame
        is not modified) and the following columns are added:

        * ``volume_ma`` / ``volume_std``: rolling mean and standard deviation
          of ``volume`` over the last ``window`` rows (``min_periods=1``).
        * ``volume_threshold``: ``volume_ma + threshold * volume_std``.
        * ``huge_volume``: 1 when ``volume`` is strictly above the threshold,
          otherwise 0.
        * ``volume_ratio``: ``volume / volume_ma``.
        * ``spike_intensity``: ``volume_ratio - 1``.

        When ``check_price`` is true and a ``close`` column exists, these
        columns are added as well:

        * ``close_80_percentile`` / ``close_20_percentile``: rolling quantiles
          of ``close`` over the same window.
        * ``price_high``: 1 when ``close`` is at or above the 80% quantile.
        * ``price_low``: 1 when ``close`` is at or below the 20% quantile.
        * ``volume_price_spike``: 1 when ``huge_volume`` is 1 and the close is
          at either price extreme.

        Args:
            data: Frame with at least ``timestamp`` and ``volume`` columns.
            threshold: Number of standard deviations above the rolling mean
                that the volume must exceed to be flagged.
            window: Size of the rolling window, in rows.
            check_price: Whether to add the price-quantile columns. If the
                ``close`` column is missing, an error is logged and only the
                price step is skipped.
            only_output_huge_volume: Keep only rows where ``huge_volume`` is 1.
            output_excel: Write the resulting rows to an ``.xlsx`` file in
                ``self.output_folder``. This needs ``date_time``, ``symbol``
                and ``bar`` columns; if the result is empty or one of them is
                missing, a message is logged and nothing is written.

        Returns:
            The annotated (and optionally filtered) DataFrame, or ``None`` when
            ``data`` is ``None``/empty or lacks the ``volume`` or ``timestamp``
            column.
        """
        if data is None or len(data) == 0:
            logging.warning("数据为空,无法进行成交量异常检测")
            return None

        # Both columns are required below; report a missing one the same way
        # instead of letting the sort raise KeyError.
        for required in ("volume", "timestamp"):
            if required not in data.columns:
                logging.error("数据中缺少%s列", required)
                return None

        # sort_values already returns a new frame; the explicit copy makes it
        # safe to add columns without touching the caller's data.
        data = data.sort_values(by="timestamp", ascending=True).copy()

        rolling_volume = data["volume"].rolling(window=window, min_periods=1)
        data["volume_ma"] = rolling_volume.mean()
        # With a single observation the std is NaN, so the comparison below is
        # False and the first row is never flagged.
        data["volume_std"] = rolling_volume.std()
        data["volume_threshold"] = data["volume_ma"] + threshold * data["volume_std"]
        data["huge_volume"] = (data["volume"] > data["volume_threshold"]).astype(int)
        data["volume_ratio"] = data["volume"] / data["volume_ma"]
        data["spike_intensity"] = data["volume_ratio"] - 1

        if check_price:
            if "close" in data.columns:
                self._add_price_columns(data, window)
            else:
                # Skip only the price step so the filter and export requested
                # by the caller still run.
                logging.error("数据中缺少close列无法进行价格检查")

        if only_output_huge_volume:
            data = data[data["huge_volume"] == 1]

        if output_excel:
            self._export_to_excel(data)

        return data

    @staticmethod
    def _add_price_columns(data: DataFrame, window: int) -> None:
        """Add rolling close-price quantile flags to ``data`` in place."""
        rolling_close = data["close"].rolling(window=window, min_periods=1)
        data["close_80_percentile"] = rolling_close.quantile(0.8)
        data["close_20_percentile"] = rolling_close.quantile(0.2)
        data["price_high"] = (data["close"] >= data["close_80_percentile"]).astype(int)
        data["price_low"] = (data["close"] <= data["close_20_percentile"]).astype(int)
        # Volume spike that coincides with a close at either price extreme.
        at_price_extreme = (data["price_high"] == 1) | (data["price_low"] == 1)
        data["volume_price_spike"] = (
            (data["huge_volume"] == 1) & at_price_extreme
        ).astype(int)

    def _export_to_excel(self, data: DataFrame) -> None:
        """Write ``data`` to an ``.xlsx`` file named after its symbol, bar and range."""
        if len(data) == 0:
            logging.warning("数据为空无法导出Excel文件")
            return

        missing = [
            column
            for column in ("date_time", "symbol", "bar")
            if column not in data.columns
        ]
        if missing:
            # The file name is built from these columns; without them the
            # export is skipped instead of raising KeyError.
            logging.error("数据中缺少%s列无法导出Excel文件", ", ".join(missing))
            return

        # Strip ':', '-' and whitespace so the dates are compact in the name.
        start_date = re.sub(r"[\:\-\s]", "", str(data["date_time"].iloc[0]))
        end_date = re.sub(r"[\:\-\s]", "", str(data["date_time"].iloc[-1]))
        symbol = data["symbol"].iloc[0]
        bar = data["bar"].iloc[0]
        file_name = f"volume_spike_{symbol}_{bar}_{start_date}_{end_date}.xlsx"
        with pd.ExcelWriter(os.path.join(self.output_folder, file_name)) as writer:
            data.to_excel(writer, sheet_name="volume_spike", index=False)