crypto_quant/core/statistics.py

119 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from core.db_manager import query_market_data_by_symbol_bar
from pandas import DataFrame
import logging
import os
import re
import pandas as pd
# Configure the root logger when this module is imported: records at INFO and
# above are emitted as "<time> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
class Statistics:
    """Rolling-window volume statistics for market-data DataFrames.

    Exported result files are written below ``output_folder``.
    """

    def __init__(self, output_folder: str = "./output"):
        """Store the export directory and create it if it does not exist yet."""
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)

    def detect_volume_spike(
        self,
        data: DataFrame,
        threshold: float = 2.0,
        window: int = 50,
        check_price: bool = False,
        only_output_huge_volume: bool = False,
        output_excel: bool = False,
    ):
        """Flag rows whose volume is unusually high within a trailing window.

        The rows are sorted by ``timestamp`` and, for every row, the trailing
        ``window`` rows (the row itself included) are used to compute the mean
        and sample standard deviation of ``volume``. A row is flagged with
        ``huge_volume == 1`` when its volume is strictly greater than
        ``mean + threshold * std``.

        Columns added to the result: ``volume_ma``, ``volume_std``,
        ``volume_threshold``, ``huge_volume``, ``volume_ratio``
        (``volume / volume_ma``) and ``spike_intensity`` (``volume_ratio - 1``).

        When ``check_price`` is true and a ``close`` column exists, these are
        added as well: ``close_80_percentile``, ``close_20_percentile``,
        ``price_high`` (close at or above the rolling 80% quantile),
        ``price_low`` (close at or below the rolling 20% quantile) and
        ``volume_price_spike`` (huge volume combined with either price flag).

        Args:
            data: Market data with at least ``volume`` and ``timestamp``
                columns. The caller's frame is not modified.
            threshold: Number of standard deviations above the rolling mean a
                volume must exceed to be flagged.
            window: Size of the trailing window, in rows.
            check_price: Whether to add the price-quantile columns. If the
                ``close`` column is missing an error is logged and only the
                price columns are skipped.
            only_output_huge_volume: If true, keep only the rows where
                ``huge_volume == 1``.
            output_excel: If true, write the result to an ``.xlsx`` file in
                ``self.output_folder``. This needs the ``date_time``,
                ``symbol`` and ``bar`` columns; if the result is empty or one
                of them is missing, a message is logged and nothing is written.

        Returns:
            The sorted, annotated (and possibly filtered) DataFrame, or
            ``None`` when ``data`` is ``None``/empty or lacks the ``volume`` or
            ``timestamp`` column.
        """
        if data is None or len(data) == 0:
            logging.warning("数据为空,无法进行成交量异常检测")
            return None
        # Both columns are mandatory: "volume" is analysed and "timestamp"
        # defines the row order the rolling window runs over.
        for column in ("volume", "timestamp"):
            if column not in data.columns:
                logging.error("数据中缺少%s列", column)
                return None

        # Work on a sorted copy so the caller's DataFrame stays untouched.
        data = data.sort_values(by="timestamp", ascending=True).copy()

        # min_periods=1 yields values from the very first row; the sample std
        # of a single observation is NaN, so that row can never be flagged.
        rolling_volume = data["volume"].rolling(window=window, min_periods=1)
        data["volume_ma"] = rolling_volume.mean()
        data["volume_std"] = rolling_volume.std()
        data["volume_threshold"] = data["volume_ma"] + threshold * data["volume_std"]
        data["huge_volume"] = (data["volume"] > data["volume_threshold"]).astype(int)
        data["volume_ratio"] = data["volume"] / data["volume_ma"]
        data["spike_intensity"] = data["volume_ratio"] - 1

        if check_price:
            if "close" in data.columns:
                self._add_price_columns(data, window)
            else:
                # Skip only the price check; filtering and export below must
                # still honour the caller's flags.
                logging.error("数据中缺少close列无法进行价格检查")

        if only_output_huge_volume:
            data = data[data["huge_volume"] == 1]

        if output_excel:
            self._export_to_excel(data)

        return data

    @staticmethod
    def _add_price_columns(data: DataFrame, window: int) -> None:
        """Add rolling close-price quantile columns and flags to ``data`` in place."""
        rolling_close = data["close"].rolling(window=window, min_periods=1)
        data["close_80_percentile"] = rolling_close.quantile(0.8)
        data["close_20_percentile"] = rolling_close.quantile(0.2)
        data["price_high"] = (data["close"] >= data["close_80_percentile"]).astype(int)
        data["price_low"] = (data["close"] <= data["close_20_percentile"]).astype(int)
        # Combined signal: huge volume while the close sits at a window extreme.
        data["volume_price_spike"] = (
            (data["huge_volume"] == 1)
            & ((data["price_high"] == 1) | (data["price_low"] == 1))
        ).astype(int)

    def _export_to_excel(self, data: DataFrame) -> None:
        """Write ``data`` to an ``.xlsx`` file named after symbol, bar and date range."""
        if len(data) == 0:
            logging.warning("数据为空无法导出Excel文件")
            return
        for column in ("date_time", "symbol", "bar"):
            if column not in data.columns:
                logging.error("数据中缺少%s列无法导出Excel文件", column)
                return

        # Strip ":", "-" and whitespace so the dates are compact in the file name.
        start_date = re.sub(r"[\:\-\s]", "", str(data["date_time"].iloc[0]))
        end_date = re.sub(r"[\:\-\s]", "", str(data["date_time"].iloc[-1]))
        symbol = data["symbol"].iloc[0]
        bar = data["bar"].iloc[0]
        file_name = f"volume_spike_{symbol}_{bar}_{start_date}_{end_date}.xlsx"
        with pd.ExcelWriter(os.path.join(self.output_folder, file_name)) as writer:
            data.to_excel(writer, sheet_name="volume_spike", index=False)