# crypto_quant/core/huge_volume.py

from pandas import DataFrame
import logging
import os
import re
import pandas as pd
from datetime import datetime
from typing import Optional, List, Dict, Any, Tuple

# Configure the root logger at import time: INFO level, with each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)


class HugeVolume:
    def __init__(self, output_folder: str = "./output"):
        """
        Remember the output directory and make sure it exists on disk.

        :param output_folder: directory that Excel exports are written to;
            it is created (parents included) when it does not exist yet
        """
        export_dir = output_folder
        self.output_folder = export_dir
        # exist_ok=True makes repeated construction with the same folder safe.
        os.makedirs(export_dir, exist_ok=True)

    def _calculate_percentile_indicators(
        self,
        data: pd.DataFrame,
        window_size: int,
        percentiles: Optional[List[Tuple[float, str]]] = None,
    ) -> pd.DataFrame:
        """
        Add rolling close-price percentile columns and price-extreme flags.

        For every ``(percentile, suffix)`` pair, two columns are written into
        ``data`` (the frame is modified in place and also returned):

        * ``close_{suffix}_percentile``: rolling quantile of ``close`` over
          ``window_size`` rows; ``min_periods=1`` lets the first rows use a
          shorter window instead of producing NaN.
        * ``price_{suffix}_high`` when ``percentile >= 0.5``: 1 if ``close`` is
          at or above the rolling quantile, else 0.
          ``price_{suffix}_low`` when ``percentile < 0.5``: 1 if ``close`` is
          at or below the rolling quantile, else 0.

        :param data: DataFrame containing a ``close`` column
        :param window_size: rolling window length, in rows
        :param percentiles: list of ``(quantile, column-name suffix)`` pairs;
            when omitted, the 80/20/90/10 percentiles are used
        :return: the same DataFrame with the percentile columns added
        """
        # Build the default per call rather than sharing one mutable list
        # object across all invocations.
        if percentiles is None:
            percentiles = [(0.8, "80"), (0.2, "20"), (0.9, "90"), (0.1, "10")]

        rolling_close = data["close"].rolling(window=window_size, min_periods=1)

        for percentile, suffix in percentiles:
            percentile_col = f"close_{suffix}_percentile"
            data[percentile_col] = rolling_close.quantile(percentile)

            # Decide the direction from the quantile itself instead of a fixed
            # list of suffixes, so custom pairs such as (0.95, "95") are
            # treated as high thresholds as well.
            if percentile >= 0.5:
                data[f"price_{suffix}_high"] = (
                    data["close"] >= data[percentile_col]
                ).astype(int)
            else:
                data[f"price_{suffix}_low"] = (
                    data["close"] <= data[percentile_col]
                ).astype(int)

        return data

    def _calculate_volume_price_spikes(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Flag rows where a huge-volume bar coincides with a price extreme.

        Writes ``volume_80_20_price_spike`` and ``volume_90_10_price_spike``
        into ``data`` (modified in place and returned). Each flag is 1 when
        ``huge_volume`` is 1 and either the matching high flag or the matching
        low flag is 1, otherwise 0.

        :param data: DataFrame holding ``huge_volume`` plus the
            ``price_80_high`` / ``price_20_low`` / ``price_90_high`` /
            ``price_10_low`` flag columns
        :return: the same DataFrame with the two spike columns added
        """
        band_columns = (
            ("80_20", "price_80_high", "price_20_low"),
            ("90_10", "price_90_high", "price_10_low"),
        )
        is_huge = data["huge_volume"] == 1

        for band, high_col, low_col in band_columns:
            at_extreme = (data[high_col] == 1) | (data[low_col] == 1)
            data[f"volume_{band}_price_spike"] = (is_huge & at_extreme).astype(int)

        return data

    def detect_huge_volume(
        self,
        data: DataFrame,
        window_size: int = 50,
        threshold: float = 2.0,
        check_price: bool = False,
        only_output_huge_volume: bool = False,
        output_excel: bool = False,
    ) -> Optional[DataFrame]:
        """
        Detect abnormally large trading volume over a rolling window.

        Logic:
        1. Sort the rows by ``timestamp`` and slide a window over them.
        2. Flag a row with ``huge_volume = 1`` when its volume is greater than
           the window's mean volume plus ``threshold`` times the window's
           standard deviation, otherwise 0. A row whose window has no defined
           standard deviation (NaN threshold) is never flagged.
        3. When ``check_price`` is True, additionally check whether the close
           of each row is:
           a. at or above the 80th percentile of its window
           b. at or below the 20th percentile of its window
           c. at or above the 90th percentile of its window
           d. at or below the 10th percentile of its window
           and derive the combined volume/price spike flags.

        Columns added: ``volume_ma``, ``volume_std``, ``volume_threshold``,
        ``huge_volume``, ``volume_ratio`` (volume / ``volume_ma``),
        ``spike_intensity`` (``volume_ratio - 1``), ``create_time``, plus the
        percentile and spike columns when ``check_price`` is True. The input
        frame itself is not modified.

        Args:
            data: DataFrame with at least ``timestamp`` and ``volume`` columns;
                ``close`` is needed for ``check_price``; ``date_time``,
                ``symbol`` and ``bar`` are needed for ``output_excel``.
            window_size: rolling window length in rows, default 50.
            threshold: standard-deviation multiplier, default 2.0 (volume must
                exceed mean + 2 standard deviations).
            check_price: whether to compute the price-percentile flags,
                default False.
            only_output_huge_volume: keep only rows with ``huge_volume == 1``,
                default False.
            output_excel: write the result to an ``.xlsx`` file inside
                ``self.output_folder``, default False. Export is best-effort:
                failures are logged and the computed frame is still returned.
        Returns:
            The enriched DataFrame, or None when ``data`` is None/empty or a
            required column (``volume``, ``timestamp``) is missing. When
            ``check_price`` is True but ``close`` is missing, the frame
            computed so far is returned immediately (no filtering, no
            ``create_time``, no export).
        """
        if data is None or len(data) == 0:
            logging.warning("数据为空，无法进行成交量异常检测")
            return None

        # Both columns are indispensable: volume feeds the statistics and
        # timestamp defines the row order the rolling window depends on.
        for required in ("volume", "timestamp"):
            if required not in data.columns:
                logging.error(f"数据中缺少{required}列")
                return None

        # Work on a sorted copy so the caller's frame is left untouched.
        data = data.sort_values(by="timestamp", ascending=True).copy()

        # Rolling mean / standard deviation of the volume.
        rolling_volume = data["volume"].rolling(window=window_size, min_periods=1)
        data["volume_ma"] = rolling_volume.mean()
        data["volume_std"] = rolling_volume.std()

        # Volume threshold: mean + threshold * standard deviation.
        data["volume_threshold"] = data["volume_ma"] + threshold * data["volume_std"]

        # A NaN threshold compares as False, so such rows get 0.
        data["huge_volume"] = (data["volume"] > data["volume_threshold"]).astype(int)

        # Volume relative to its rolling mean, and the excess over 1.
        data["volume_ratio"] = data["volume"] / data["volume_ma"]
        data["spike_intensity"] = data["volume_ratio"] - 1

        if check_price:
            if "close" not in data.columns:
                logging.error("数据中缺少close列，无法进行价格检查")
                return data

            # Percentile indicators (80/20 and 90/10), then the spike flags.
            data = self._calculate_percentile_indicators(data, window_size)
            data = self._calculate_volume_price_spikes(data)

        if only_output_huge_volume:
            # Copy the filtered slice so the column assignment below writes to
            # an independent frame instead of a slice of the full frame.
            data = data[data["huge_volume"] == 1].copy()

        data["create_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        if output_excel:
            if len(data) == 0:
                logging.warning("数据为空，无法导出Excel文件")
                return data

            # The file name is built from these columns; without them the
            # export is skipped but the computed result is still returned.
            missing = [
                column
                for column in ("date_time", "symbol", "bar")
                if column not in data.columns
            ]
            if missing:
                logging.error(f"数据中缺少{'、'.join(missing)}列，无法导出Excel文件")
                return data

            # Strip ':', '-' and whitespace so the dates are file-name friendly.
            start_date = re.sub(r"[\:\-\s]", "", str(data["date_time"].iloc[0]))
            end_date = re.sub(r"[\:\-\s]", "", str(data["date_time"].iloc[-1]))
            symbol = data["symbol"].iloc[0]
            bar = data["bar"].iloc[0]
            file_name = f"volume_spike_{symbol}_{bar}_{start_date}_{end_date}.xlsx"
            try:
                with pd.ExcelWriter(
                    os.path.join(self.output_folder, file_name)
                ) as writer:
                    data.to_excel(writer, sheet_name="volume_spike", index=False)
            except Exception as e:
                logging.error(f"导出Excel文件失败: {e}")

        return data

    def next_periods_rise_or_fall(
        self,
        data: pd.DataFrame,
        periods: Optional[List[int]] = None,
        output_excel: bool = False
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Label the future price direction of each row and summarise the outcome
        of huge-volume bars that sit at a price extreme.

        1. For every value in ``periods`` the close ``period`` rows later is
           stored in ``next_{period}_close`` and classified in
           ``next_{period}_result`` as "rise", "fall" or "draw" (NaN when there
           is no such later row).
           Example with the data below:
           timestamp    close huge_volume
           1000000000   100   1
           1000000001   101   0
           1000000002   102   1
           1000000003   103   0
           1000000004   100   1
           1000000005   98    0

           the row with timestamp 1000000000 becomes:
           timestamp    close    huge_volume  next_3_close  next_3_result   next_5_close  next_5_result
           1000000000   100      1             103           rise            98            fall
           because the close 3 periods later is 103 (next_3_result is rise) and
           the close 5 periods later is 98 (next_5_result is fall).

        2. When any of the price flag columns (``price_80_high``,
           ``price_20_low``, ``price_90_high``, ``price_10_low``) is present,
           the returned data keeps only rows with ``huge_volume == 1`` that
           also have at least one of those flags set to 1.

        3. For every period and every present price flag with at least one
           matching huge-volume row, a statistics record is produced with the
           number of matching rows (``total_count``), how many of them were
           followed by a fall / rise (``fall_count`` / ``rise_count``) and the
           corresponding ratios, e.g. 100 rows with ``price_80_high`` of which
           50 fell after 3 periods gives ``fall_ratio`` 50/100 = 0.5.

        Args:
            data: DataFrame with ``timestamp`` and ``close``; ``huge_volume``
                is required when price flag columns are present, and ``symbol``
                / ``bar`` are read when statistics records are produced.
            periods: look-ahead distances in rows, default [3, 5].
            output_excel: accepted for interface compatibility; this method
                currently performs no Excel export.
        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: (processed data, statistics).
            The statistics frame always carries the same columns, even when it
            has no rows.
        """
        # Build the default per call rather than sharing one mutable list.
        if periods is None:
            periods = [3, 5]

        def _label_change(change):
            # Missing look-ahead values stay missing; any other value is
            # positive, negative or exactly zero.
            if pd.isna(change):
                return change
            if change > 0:
                return "rise"
            if change < 0:
                return "fall"
            return "draw"

        data = data.sort_values(by="timestamp", ascending=True)
        data = data.reset_index(drop=True)

        # Future close and its direction relative to the current close.
        for period in periods:
            data[f"next_{period}_close"] = data["close"].shift(-period)
            relative_change = data[f"next_{period}_close"] / data["close"] - 1
            data[f"next_{period}_result"] = relative_change.apply(_label_change)

        price_types = [
            column
            for column in ("price_80_high", "price_20_low", "price_90_high", "price_10_low")
            if column in data.columns
        ]

        # Keep rows that are huge-volume AND at some price extreme. The price
        # flags are OR-ed among themselves first; OR-ing them with the
        # huge-volume test would let non-huge-volume rows through.
        if price_types:
            at_price_extreme = (data[price_types] == 1).any(axis=1)
            data = data[(data["huge_volume"] == 1) & at_price_extreme]

        data = data.reset_index(drop=True)

        # Row selection and row count per price flag; flags without any
        # matching row produce no statistics record.
        selections = {}
        for price_type in price_types:
            selected = (data["huge_volume"] == 1) & (data[price_type] == 1)
            count = int(selected.sum())
            if count > 0:
                selections[price_type] = (selected, count)

        results = []
        for period in periods:
            outcome = data[f"next_{period}_result"]
            for price_type, (selected, count) in selections.items():
                fall_count = int((selected & (outcome == "fall")).sum())
                rise_count = int((selected & (outcome == "rise")).sum())
                results.append(
                    {
                        # count > 0 guarantees at least one row, so iloc[0] is safe.
                        "symbol": data["symbol"].iloc[0],
                        "bar": data["bar"].iloc[0],
                        "huge_volume": 1,
                        "price_type": price_type,
                        "next_period": period,
                        "fall_count": fall_count,
                        "rise_count": rise_count,
                        "fall_ratio": fall_count / count,
                        "rise_ratio": rise_count / count,
                        "total_count": count,
                    }
                )

        # Fix the column set so an empty result still exposes the schema.
        result_columns = [
            "symbol",
            "bar",
            "huge_volume",
            "price_type",
            "next_period",
            "fall_count",
            "rise_count",
            "fall_ratio",
            "rise_ratio",
            "total_count",
        ]
        result_data = pd.DataFrame(results, columns=result_columns)
        return data, result_data