# crypto_quant/core/biz/huge_volume.py

from pandas import DataFrame
import core.logger as logging
import os
import re
import pandas as pd
from datetime import datetime
from copy import deepcopy
from typing import Optional, List, Dict, Any, Tuple

logger = logging.logger


class HugeVolume:
    def __init__(self, output_folder: str = "./output"):
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)

    def _calculate_percentile_indicators(
        self, 
        data: pd.DataFrame, 
        window_size: int = 50, 
        price_column: str = "close",
        percentiles: List[Tuple[float, str]] = [(0.8, "80"), (0.2, "20"), (0.9, "90"), (0.1, "10")]
    ) -> pd.DataFrame:
        """
        计算分位数指标
        :param data: 数据DataFrame
        :param window_size: 窗口大小
        :param percentiles: 分位数配置列表，格式为[(分位数, 名称后缀)]
        :return: 包含分位数指标的DataFrame
        """
        for percentile, suffix in percentiles:
            # 计算分位数
            data[f"{price_column}_{suffix}_percentile"] = (
                data[price_column].rolling(window=window_size, min_periods=1).quantile(percentile)
            )
            
            # 判断价格是否达到分位数
            if suffix in ["80", "90"]:
                # 高点分位数
                data[f"{price_column}_{suffix}_high"] = (
                    data[price_column] >= data[f"{price_column}_{suffix}_percentile"]
                ).astype(int)
            else:
                # 低点分位数
                data[f"{price_column}_{suffix}_low"] = (
                    data[price_column] <= data[f"{price_column}_{suffix}_percentile"]
                ).astype(int)
        
        return data

    def detect_huge_volume(
        self,
        data: DataFrame,
        window_size: int = 50,
        threshold: float = 2.0,
        check_price: bool = False,
        only_output_huge_volume: bool = False,
        output_excel: bool = False,
    ) -> Optional[DataFrame]:
        """
        detect_volume_spike的函数逻辑：
        1. 根据window滑动行情数据
        2. 每一个window的最新的volume是否高于该window的volume的均值+2倍标准差，如果满足条件，则增加一列：huge_volume，值为1
        3. 如果check_price为True，则检查:
        a. 每一个window的close是否处于该window的80%分位数及以上
        b. 每一个window的close是否处于该window的20%分位数及以下
        c. 每一个window的close是否处于该window的90%分位数及以上
        d. 每一个window的close是否处于该window的10%分位数及以下

        Args:
            data: 包含成交量数据的DataFrame
            threshold: 标准差倍数，默认为2.0（即成交量超过均值+2倍标准差）
            window_size: 计算移动窗口的大小，默认50个周期
            check_price: 是否检查价格处于windows内的分位数位置，默认False
            only_output_huge_volume: 是否只输出巨量交易记录，默认False
            output_excel: 是否输出到Excel文件，默认False
        Returns:
            DataFrame: 包含异常检测结果的DataFrame
        """
        if data is None or len(data) == 0:
            logger.warning("数据为空，无法进行成交量异常检测")
            return None

        if "volume" not in data.columns:
            logger.error("数据中缺少volume列")
            return None

        # 按时间戳排序
        data = data.sort_values(by="timestamp", ascending=True).copy()
        data["window_size"] = window_size

        # 计算移动窗口的成交量均值和标准差
        data["volume_ma"] = (
            data["volume"].rolling(window=window_size, min_periods=1).mean()
        )
        data["volume_std"] = (
            data["volume"].rolling(window=window_size, min_periods=1).std()
        )

        # 计算成交量阈值（均值 + threshold倍标准差）
        data["volume_threshold"] = data["volume_ma"] + threshold * data["volume_std"]

        # 判断当前成交量是否超过阈值
        data["huge_volume"] = (data["volume"] > data["volume_threshold"]).astype(int)

        # 计算成交量比率
        data["volume_ratio"] = data["volume"] / data["volume_ma"]

        # 计算异常强度
        data["spike_intensity"] = data["volume_ratio"] - 1

        # 如果check_price为True，检查价格分位数
        if check_price:
            if "close" not in data.columns:
                logger.error("数据中缺少close列，无法进行价格检查")
                return data
            if "high" not in data.columns:
                logger.error("数据中缺少high列，无法进行价格检查")
                return data
            if "low" not in data.columns:
                logger.error("数据中缺少low列，无法进行价格检查")
                return data

            for price_column in ["close", "high", "low"]:
                # 计算分位数指标（80/20和90/10）
                data = self._calculate_percentile_indicators(data, window_size, price_column)
            

        if only_output_huge_volume:
            data = data[(data["huge_volume"] == 1)]
        
        data["create_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        if output_excel:
            # 检查数据是否为空
            if len(data) == 0:
                logger.warning("数据为空，无法导出Excel文件")
                return data

            start_date = data["date_time"].iloc[0]
            end_date = data["date_time"].iloc[-1]
            # remove punctuation from start_date and end_date
            start_date = re.sub(r"[\:\-\s]", "", str(start_date))
            end_date = re.sub(r"[\:\-\s]", "", str(end_date))
            symbol = data["symbol"].iloc[0]
            bar = data["bar"].iloc[0]
            file_name = f"volume_spike_{symbol}_{bar}_{window_size}_{start_date}_{end_date}.xlsx"
            try:
                with pd.ExcelWriter(
                    os.path.join(self.output_folder, file_name)
                ) as writer:
                    data.to_excel(writer, sheet_name="volume_spike", index=False)
            except Exception as e:
                logger.error(f"导出Excel文件失败: {e}")

        return data

    def next_periods_rise_or_fall(
        self, 
        data: pd.DataFrame, 
        window_size: int = 50,
        periods: List[int] = [1, 2, 3, 5, 10], 
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        1. 根据period_count，计算每个timestamp的下一个periods的rise_or_fall
        示例，period_count=3，则计算每个timestamp的下一个3个periods的rise_or_fall
        示例，data如下：
        timestamp    close huge_volume
        1000000000   100   1
        1000000001   101   0
        1000000002   102   1
        1000000003   103   0
        1000000004   100   1
        1000000005   98    0

        则对于timestamp，1000000000，计算结果为：
        timestamp    close    huge_volume  next_3_close  next_3_result   next_5_close  next_5_result
        1000000000   100      1             103           rise            98            fall
        因为之后第3个periods的close是103，所以next_3_result为rise
        因为之后第5个periods的close是98，所以next_3_result为fall
        2. 根据volume_ratio_percentile_10，将data分成10份，然后计算每一份的上涨与下跌次数，以及平均收益率
        3. 新建一个列表: result，计算之后n个周期，close上涨或下跌的比例
        a. 计算huge_volume为1时，且price_80_high为1时的数量，如100， 并且计算next_3_result为fall的次数，如50， 然后计算fall_ratio， 如50/100=0.5
        b. 计算huge_volume为1时，且price_80_high为1时的数量，如100， 并且计算next_5_result为fall的次数，如30， 然后计算fall_ratio， 如30/100=0.3
        c. 计算huge_volume为1时，且price_20_low为1时的数量，如100， 并且计算next_3_result为rise的次数，如50， 然后计算rise_ratio， 如50/100=0.5
        d. 计算huge_volume为1时，且price_20_low为1时的数量，如100， 并且计算next_5_result为rise的次数，如30， 然后计算rise_ratio， 如30/100=0.3
        e. 同样计算90/10分位数的统计
        
        Args:
            data: 包含巨量交易数据的DataFrame
            periods: 计算周期列表，默认[1, 2, 3, 5, 10]
            output_excel: 是否输出到Excel文件，默认False
        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: (处理后的数据, 统计结果)
        """
        # 将huge_volume, volume_80_20_price_spike, volume_90_10_price_spike, 
        # price_80_high, price_20_low, price_90_high, price_10_low设置为整型
        data["huge_volume"] = data["huge_volume"].astype(int)
        data["volume_80_20_price_spike"] = data["volume_80_20_price_spike"].astype(int)
        data["volume_90_10_price_spike"] = data["volume_90_10_price_spike"].astype(int)
        data["price_80_high"] = data["price_80_high"].astype(int)
        data["price_20_low"] = data["price_20_low"].astype(int)
        data["price_90_high"] = data["price_90_high"].astype(int)
        data["price_10_low"] = data["price_10_low"].astype(int)

        data = data.sort_values(by="timestamp", ascending=True)
        data = data.reset_index(drop=True)

        # 计算未来价格变化
        for period in periods:
            data[f"next_{period}_close"] = data["close"].shift(-period)
            data[f"next_{period}_change"] = (
                (data[f"next_{period}_close"] / data["close"] - 1) * 100
            )
            # 添加一列next_{period}_result，如果next_{period}_change大于0，则值为rise，如果next_{period}_change小于0，则值为fall，如果next_{period}_change等于0，则值为draw
            data[f"next_{period}_result"] = data[f"next_{period}_change"].apply(
                lambda x: (
                    "rise"
                    if pd.notna(x) and x > 0
                    else (
                        "fall"
                        if pd.notna(x) and x < 0
                        else "draw" if pd.notna(x) and x == 0 else x
                    )
                )
            )

        # 将volume_ratio按照百分位分成十份
        huge_volume_data = deepcopy(data[data["huge_volume"] == 1])
        huge_volume_data = huge_volume_data.sort_values(by="timestamp", ascending=True)
        huge_volume_data = huge_volume_data.reset_index(drop=True)
        huge_volume_data["volume_ratio_percentile"] = huge_volume_data["volume_ratio"].rank(pct=True)
        huge_volume_data["volume_ratio_percentile_10"] = huge_volume_data["volume_ratio_percentile"].apply(
            lambda x: 10 if x <= 0.1 else 20 
            if x <= 0.2 else 30 if x <= 0.3 else 40 
            if x <= 0.4 else 50 if x <= 0.5 else 60 
            if x <= 0.6 else 70 if x <= 0.7 else 80 
            if x <= 0.8 else 90 if x <= 0.9 else 100
        )
        huge_volume_ratio_percentile_10_mean = huge_volume_data.groupby("volume_ratio_percentile_10")["volume_ratio"].mean()
        percentile_10_mean = round(float(huge_volume_data["volume_ratio"].mean()), 4)
        # insert one row to huge_volume_ratio_percentile_10_mean, key is -1, value is percentile_10_mean
        huge_volume_ratio_percentile_10_mean.loc["-1"] = percentile_10_mean
        # transform huge_volume_ratio_percentile_10_mean index to int
        huge_volume_ratio_percentile_10_mean.index = huge_volume_ratio_percentile_10_mean.index.astype(int)
        # sort huge_volume_ratio_percentile_10_mean by index
        huge_volume_ratio_percentile_10_mean = huge_volume_ratio_percentile_10_mean.sort_index()

        results = []
        # iterate huge_volume_ratio_percentile_10_mean
        for index, value in huge_volume_ratio_percentile_10_mean.items():
            if index == -1:
                data_temp = deepcopy(huge_volume_data)
                volume_ratio_percentile_10 = "all"
                current_percentile_10_mean = percentile_10_mean
            else:
                data_temp = deepcopy(huge_volume_data[huge_volume_data["volume_ratio_percentile_10"] == index])
                volume_ratio_percentile_10 = str(index)
                current_percentile_10_mean = round(value, 4)

            data_temp = data_temp.reset_index(drop=True)
            data_temp = data_temp.sort_values(by="timestamp", ascending=True)
            data_temp = data_temp.reset_index(drop=True)
            
            # 过滤data, 只获取huge_volume为1，且价格处于分位数位置的行
            price_conditions = []
            if "price_80_high" in data_temp.columns:
                price_conditions.append(data_temp["price_80_high"] == 1)
            if "price_20_low" in data_temp.columns:
                price_conditions.append(data_temp["price_20_low"] == 1)
            if "price_90_high" in data_temp.columns:
                price_conditions.append(data_temp["price_90_high"] == 1)
            if "price_10_low" in data_temp.columns:
                price_conditions.append(data_temp["price_10_low"] == 1)
            
            if price_conditions:
                combined_condition = data_temp["huge_volume"] == 1
                for condition in price_conditions:
                    combined_condition = combined_condition | condition
                data_temp = data_temp[combined_condition]
            
            data_temp = data_temp.reset_index(drop=True)
            
            # 统计各种分位数情况的数量
            price_stats = {}
            for price_type in ["price_80_high", "price_20_low", "price_90_high", "price_10_low"]:
                if price_type in data.columns:
                    price_stats[price_type] = len(data_temp[(data_temp["huge_volume"] == 1) & (data_temp[price_type] == 1)])
            
            for period in periods:
                for price_type, count in price_stats.items():
                    if count > 0:
                        # 计算下跌次数
                        fall_count = len(
                            data_temp[
                                (data_temp["huge_volume"] == 1) & 
                                (data_temp[price_type] == 1) & 
                                (data_temp[f"next_{period}_result"] == "fall")
                            ]
                        )
                        # 计算上涨次数
                        rise_count = len(
                            data_temp[
                                (data_temp["huge_volume"] == 1) & 
                                (data_temp[price_type] == 1) & 
                                (data_temp[f"next_{period}_result"] == "rise")
                            ]
                        )

                        draw_count = len(
                            data_temp[
                                (data_temp["huge_volume"] == 1) & 
                                (data_temp[price_type] == 1) & 
                                (data_temp[f"next_{period}_result"] == "draw")
                            ]
                        )
                        # 根据data[f"next_{period}_result"]获得平均收益率
                        average_return = float(data_temp[(data_temp["huge_volume"] == 1) & (data_temp[price_type] == 1)]
                        [f"next_{period}_change"].mean())
                        max_return = float(data_temp[(data_temp["huge_volume"] == 1) & (data_temp[price_type] == 1)]
                        [f"next_{period}_change"].max())
                        min_return = float(data_temp[(data_temp["huge_volume"] == 1) & (data_temp[price_type] == 1)]
                        [f"next_{period}_change"].min())

                        results.append(
                            {
                                "symbol": data_temp["symbol"].iloc[0] if len(data_temp) > 0 else "",
                                "bar": data_temp["bar"].iloc[0] if len(data_temp) > 0 else "",
                                "window_size": window_size,
                                "huge_volume": 1,
                                "volume_ratio_percentile_10": volume_ratio_percentile_10,
                                "volume_ratio_percentile_10_mean": current_percentile_10_mean,
                                "price_type": price_type,
                                "next_period": period,
                                "average_return": average_return,
                                "max_return": max_return,
                                "min_return": min_return,
                                "rise_count": rise_count,
                                "rise_ratio": round((rise_count / count) * 100, 4),
                                "fall_count": fall_count,
                                "fall_ratio": round((fall_count / count) * 100, 4),
                                "draw_count": draw_count,
                                "draw_ratio": round((draw_count / count) * 100, 4),
                                "total_count": count,
                            }
                        )
        
        result_data = pd.DataFrame(results)
        return huge_volume_data, result_data
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
+								from pandas import DataFrame
-												invoke python file by schedule directly, instead of import way. The purpose is to make each event to be totally new initialization action.

											
										
										
											2025-08-15 03:37:06 +00:00
+								import core.logger as logging
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
+								import os
 								import re
 								import pandas as pd
-. support update huge volume data
a. update market data at first
b. According to the first updated market data timestamp, get earlier window-size records.
c. Calculate huge volume data by the latest market data
d. Only insert new data to huge volume table
2. rename code file to be more proper.

											
										
										
											2025-07-28 08:14:40 +00:00
+								from datetime import datetime
-												support statistics price-volume relationship

											
										
										
											2025-07-31 12:31:22 +00:00
+								from copy import deepcopy
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								from typing import Optional, List, Dict, Any, Tuple
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
-												invoke python file by schedule directly, instead of import way. The purpose is to make each event to be totally new initialization action.

											
										
										
											2025-08-15 03:37:06 +00:00
+								logger = logging.logger
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
-. support update huge volume data
a. update market data at first
b. According to the first updated market data timestamp, get earlier window-size records.
c. Calculate huge volume data by the latest market data
d. Only insert new data to huge volume table
2. rename code file to be more proper.

											
										
										
											2025-07-28 08:14:40 +00:00
+								class HugeVolume:
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
+								    def __init__(self, output_folder: str = "./output"):
 								        self.output_folder = output_folder
 								        os.makedirs(self.output_folder, exist_ok=True)
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								    def _calculate_percentile_indicators(
 								        self,
 								        data: pd.DataFrame,
-												complete market data monitor main logic

											
										
										
											2025-08-05 07:30:50 +00:00
+								        window_size: int = 50,
 								        price_column: str = "close",
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								        percentiles: List[Tuple[float, str]] = [(0.8, "80"), (0.2, "20"), (0.9, "90"), (0.1, "10")]
 								    ) -> pd.DataFrame:
 								        """
 								        计算分位数指标
 								        :param data: 数据DataFrame
 								        :param window_size: 窗口大小
 								        :param percentiles: 分位数配置列表，格式为[(分位数, 名称后缀)]
 								        :return: 包含分位数指标的DataFrame
 								        """
 								        for percentile, suffix in percentiles:
 								            # 计算分位数
-												complete market data monitor main logic

											
										
										
											2025-08-05 07:30:50 +00:00
+								            data[f"{price_column}_{suffix}_percentile"] = (
 								                data[price_column].rolling(window=window_size, min_periods=1).quantile(percentile)
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								            )
 								            # 判断价格是否达到分位数
 								            if suffix in ["80", "90"]:
 								                # 高点分位数
-												complete market data monitor main logic

											
										
										
											2025-08-05 07:30:50 +00:00
+								                data[f"{price_column}_{suffix}_high"] = (
 								                    data[price_column] >= data[f"{price_column}_{suffix}_percentile"]
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								                ).astype(int)
 								            else:
 								                # 低点分位数
-												complete market data monitor main logic

											
										
										
											2025-08-05 07:30:50 +00:00
+								                data[f"{price_column}_{suffix}_low"] = (
 								                    data[price_column] <= data[f"{price_column}_{suffix}_percentile"]
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								                ).astype(int)
 								        return data
-. support update huge volume data
a. update market data at first
b. According to the first updated market data timestamp, get earlier window-size records.
c. Calculate huge volume data by the latest market data
d. Only insert new data to huge volume table
2. rename code file to be more proper.

											
										
										
											2025-07-28 08:14:40 +00:00
+								    def detect_huge_volume(
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
+								        self,
 								        data: DataFrame,
-. support update huge volume data
a. update market data at first
b. According to the first updated market data timestamp, get earlier window-size records.
c. Calculate huge volume data by the latest market data
d. Only insert new data to huge volume table
2. rename code file to be more proper.

											
										
										
											2025-07-28 08:14:40 +00:00
+								        window_size: int = 50,
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
+								        threshold: float = 2.0,
 								        check_price: bool = False,
 								        only_output_huge_volume: bool = False,
 								        output_excel: bool = False,
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								    ) -> Optional[DataFrame]:
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
+								        """
 								        detect_volume_spike的函数逻辑：
 . 根据window滑动行情数据
 . 每一个window的最新的volume是否高于该window的volume的均值+2倍标准差，如果满足条件，则增加一列：huge_volume，值为1
 . 如果check_price为True，则检查:
 								        a. 每一个window的close是否处于该window的80%分位数及以上
-												refactor code for mysql data handler

											
										
										
											2025-07-28 04:29:31 +00:00
+								        b. 每一个window的close是否处于该window的20%分位数及以下
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								        c. 每一个window的close是否处于该window的90%分位数及以上
 								        d. 每一个window的close是否处于该window的10%分位数及以下
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
 								        Args:
 								            data: 包含成交量数据的DataFrame
 								            threshold: 标准差倍数，默认为2.0（即成交量超过均值+2倍标准差）
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								            window_size: 计算移动窗口的大小，默认50个周期
 								            check_price: 是否检查价格处于windows内的分位数位置，默认False
 								            only_output_huge_volume: 是否只输出巨量交易记录，默认False
 								            output_excel: 是否输出到Excel文件，默认False
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
+								        Returns:
 								            DataFrame: 包含异常检测结果的DataFrame
 								        """
 								        if data is None or len(data) == 0:
-												invoke python file by schedule directly, instead of import way. The purpose is to make each event to be totally new initialization action.

											
										
										
											2025-08-15 03:37:06 +00:00
+								            logger.warning("数据为空，无法进行成交量异常检测")
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
+								            return None
 								        if "volume" not in data.columns:
-												invoke python file by schedule directly, instead of import way. The purpose is to make each event to be totally new initialization action.

											
										
										
											2025-08-15 03:37:06 +00:00
+								            logger.error("数据中缺少volume列")
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
+								            return None
 								        # 按时间戳排序
 								        data = data.sort_values(by="timestamp", ascending=True).copy()
-												support get huge volume by different window sizes

											
										
										
											2025-07-30 09:28:01 +00:00
+								        data["window_size"] = window_size
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
 								        # 计算移动窗口的成交量均值和标准差
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								        data["volume_ma"] = (
 								            data["volume"].rolling(window=window_size, min_periods=1).mean()
 								        )
 								        data["volume_std"] = (
 								            data["volume"].rolling(window=window_size, min_periods=1).std()
 								        )
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
 								        # 计算成交量阈值（均值 + threshold倍标准差）
 								        data["volume_threshold"] = data["volume_ma"] + threshold * data["volume_std"]
 								        # 判断当前成交量是否超过阈值
 								        data["huge_volume"] = (data["volume"] > data["volume_threshold"]).astype(int)
 								        # 计算成交量比率
 								        data["volume_ratio"] = data["volume"] / data["volume_ma"]
 								        # 计算异常强度
 								        data["spike_intensity"] = data["volume_ratio"] - 1
 								        # 如果check_price为True，检查价格分位数
 								        if check_price:
 								            if "close" not in data.columns:
-												invoke python file by schedule directly, instead of import way. The purpose is to make each event to be totally new initialization action.

											
										
										
											2025-08-15 03:37:06 +00:00
+								                logger.error("数据中缺少close列，无法进行价格检查")
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
+								                return data
-												complete market data monitor main logic

											
										
										
											2025-08-05 07:30:50 +00:00
+								            if "high" not in data.columns:
-												invoke python file by schedule directly, instead of import way. The purpose is to make each event to be totally new initialization action.

											
										
										
											2025-08-15 03:37:06 +00:00
+								                logger.error("数据中缺少high列，无法进行价格检查")
-												complete market data monitor main logic

											
										
										
											2025-08-05 07:30:50 +00:00
+								                return data
 								            if "low" not in data.columns:
-												invoke python file by schedule directly, instead of import way. The purpose is to make each event to be totally new initialization action.

											
										
										
											2025-08-15 03:37:06 +00:00
+								                logger.error("数据中缺少low列，无法进行价格检查")
-												complete market data monitor main logic

											
										
										
											2025-08-05 07:30:50 +00:00
+								                return data
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
-												complete market data monitor main logic

											
										
										
											2025-08-05 07:30:50 +00:00
+								            for price_column in ["close", "high", "low"]:
 								                # 计算分位数指标（80/20和90/10）
 								                data = self._calculate_percentile_indicators(data, window_size, price_column)
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
 								        if only_output_huge_volume:
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								            data = data[(data["huge_volume"] == 1)]
-. support update huge volume data
a. update market data at first
b. According to the first updated market data timestamp, get earlier window-size records.
c. Calculate huge volume data by the latest market data
d. Only insert new data to huge volume table
2. rename code file to be more proper.

											
										
										
											2025-07-28 08:14:40 +00:00
+								        data["create_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
 								        if output_excel:
 								            # 检查数据是否为空
 								            if len(data) == 0:
-												invoke python file by schedule directly, instead of import way. The purpose is to make each event to be totally new initialization action.

											
										
										
											2025-08-15 03:37:06 +00:00
+								                logger.warning("数据为空，无法导出Excel文件")
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
+								                return data
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
+								            start_date = data["date_time"].iloc[0]
 								            end_date = data["date_time"].iloc[-1]
 								            # remove punctuation from start_date and end_date
 								            start_date = re.sub(r"[\:\-\s]", "", str(start_date))
 								            end_date = re.sub(r"[\:\-\s]", "", str(end_date))
 								            symbol = data["symbol"].iloc[0]
 								            bar = data["bar"].iloc[0]
-												support get huge volume by different window sizes

											
										
										
											2025-07-30 09:28:01 +00:00
+								            file_name = f"volume_spike_{symbol}_{bar}_{window_size}_{start_date}_{end_date}.xlsx"
-. support update huge volume data
a. update market data at first
b. According to the first updated market data timestamp, get earlier window-size records.
c. Calculate huge volume data by the latest market data
d. Only insert new data to huge volume table
2. rename code file to be more proper.

											
										
										
											2025-07-28 08:14:40 +00:00
+								            try:
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								                with pd.ExcelWriter(
 								                    os.path.join(self.output_folder, file_name)
 								                ) as writer:
-. support update huge volume data
a. update market data at first
b. According to the first updated market data timestamp, get earlier window-size records.
c. Calculate huge volume data by the latest market data
d. Only insert new data to huge volume table
2. rename code file to be more proper.

											
										
										
											2025-07-28 08:14:40 +00:00
+								                    data.to_excel(writer, sheet_name="volume_spike", index=False)
 								            except Exception as e:
-												invoke python file by schedule directly, instead of import way. The purpose is to make each event to be totally new initialization action.

											
										
										
											2025-08-15 03:37:06 +00:00
+								                logger.error(f"导出Excel文件失败: {e}")
-												support statistics huge volume based on market data

											
										
										
											2025-07-26 06:41:50 +00:00
 								        return data
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
 								    def next_periods_rise_or_fall(
 								        self,
 								        data: pd.DataFrame,
-												support get huge volume by different window sizes

											
										
										
											2025-07-30 09:28:01 +00:00
+								        window_size: int = 50,
-												support statistics price-volume relationship

											
										
										
											2025-07-31 12:31:22 +00:00
+								        periods: List[int] = [1, 2, 3, 5, 10],
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
 								        """
 . 根据period_count，计算每个timestamp的下一个periods的rise_or_fall
 								        示例，period_count=3，则计算每个timestamp的下一个3个periods的rise_or_fall
 								        示例，data如下：
 								        timestamp    close huge_volume
 								        1000000000   100   1
 								        1000000001   101   0
 								        1000000002   102   1
 								        1000000003   103   0
 								        1000000004   100   1
 								        1000000005   98    0
 								        则对于timestamp，1000000000，计算结果为：
 								        timestamp    close    huge_volume  next_3_close  next_3_result   next_5_close  next_5_result
 								        1000000000   100      1             103           rise            98            fall
 								        因为之后第3个periods的close是103，所以next_3_result为rise
 								        因为之后第5个periods的close是98，所以next_3_result为fall
-												support statistics price-volume relationship

											
										
										
											2025-07-31 12:31:22 +00:00
+. 根据volume_ratio_percentile_10，将data分成10份，然后计算每一份的上涨与下跌次数，以及平均收益率
 . 新建一个列表: result，计算之后n个周期，close上涨或下跌的比例
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								        a. 计算huge_volume为1时，且price_80_high为1时的数量，如100， 并且计算next_3_result为fall的次数，如50， 然后计算fall_ratio， 如50/100=0.5
 								        b. 计算huge_volume为1时，且price_80_high为1时的数量，如100， 并且计算next_5_result为fall的次数，如30， 然后计算fall_ratio， 如30/100=0.3
 								        c. 计算huge_volume为1时，且price_20_low为1时的数量，如100， 并且计算next_3_result为rise的次数，如50， 然后计算rise_ratio， 如50/100=0.5
 								        d. 计算huge_volume为1时，且price_20_low为1时的数量，如100， 并且计算next_5_result为rise的次数，如30， 然后计算rise_ratio， 如30/100=0.3
 								        e. 同样计算90/10分位数的统计
 								        Args:
 								            data: 包含巨量交易数据的DataFrame
-												support statistics price-volume relationship

											
										
										
											2025-07-31 12:31:22 +00:00
+								            periods: 计算周期列表，默认[1, 2, 3, 5, 10]
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								            output_excel: 是否输出到Excel文件，默认False
 								        Returns:
 								            Tuple[pd.DataFrame, pd.DataFrame]: (处理后的数据, 统计结果)
 								        """
-												support statistics price-volume relationship

											
										
										
											2025-07-31 12:31:22 +00:00
+								        # 将huge_volume, volume_80_20_price_spike, volume_90_10_price_spike,
 								        # price_80_high, price_20_low, price_90_high, price_10_low设置为整型
 								        data["huge_volume"] = data["huge_volume"].astype(int)
 								        data["volume_80_20_price_spike"] = data["volume_80_20_price_spike"].astype(int)
 								        data["volume_90_10_price_spike"] = data["volume_90_10_price_spike"].astype(int)
 								        data["price_80_high"] = data["price_80_high"].astype(int)
 								        data["price_20_low"] = data["price_20_low"].astype(int)
 								        data["price_90_high"] = data["price_90_high"].astype(int)
 								        data["price_10_low"] = data["price_10_low"].astype(int)
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								        data = data.sort_values(by="timestamp", ascending=True)
 								        data = data.reset_index(drop=True)
-												support statistics price-volume relationship

											
										
										
											2025-07-31 12:31:22 +00:00
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								        # 计算未来价格变化
 								        for period in periods:
 								            data[f"next_{period}_close"] = data["close"].shift(-period)
-												support statistics price-volume relationship

											
										
										
											2025-07-31 12:31:22 +00:00
+								            data[f"next_{period}_change"] = (
 								                (data[f"next_{period}_close"] / data["close"] - 1) * 100
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								            )
-												support statistics price-volume relationship

											
										
										
											2025-07-31 12:31:22 +00:00
+								            # 添加一列next_{period}_result，如果next_{period}_change大于0，则值为rise，如果next_{period}_change小于0，则值为fall，如果next_{period}_change等于0，则值为draw
 								            data[f"next_{period}_result"] = data[f"next_{period}_change"].apply(
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								                lambda x: (
 								                    "rise"
 								                    if pd.notna(x) and x > 0
 								                    else (
 								                        "fall"
 								                        if pd.notna(x) and x < 0
 								                        else "draw" if pd.notna(x) and x == 0 else x
 								                    )
 								                )
 								            )
-												support statistics price-volume relationship

											
										
										
											2025-07-31 12:31:22 +00:00
 								        # 将volume_ratio按照百分位分成十份
 								        huge_volume_data = deepcopy(data[data["huge_volume"] == 1])
 								        huge_volume_data = huge_volume_data.sort_values(by="timestamp", ascending=True)
 								        huge_volume_data = huge_volume_data.reset_index(drop=True)
 								        huge_volume_data["volume_ratio_percentile"] = huge_volume_data["volume_ratio"].rank(pct=True)
 								        huge_volume_data["volume_ratio_percentile_10"] = huge_volume_data["volume_ratio_percentile"].apply(
 								            lambda x: 10 if x <= 0.1 else 20
 								            if x <= 0.2 else 30 if x <= 0.3 else 40
 								            if x <= 0.4 else 50 if x <= 0.5 else 60
 								            if x <= 0.6 else 70 if x <= 0.7 else 80
 								            if x <= 0.8 else 90 if x <= 0.9 else 100
 								        )
 								        huge_volume_ratio_percentile_10_mean = huge_volume_data.groupby("volume_ratio_percentile_10")["volume_ratio"].mean()
 								        percentile_10_mean = round(float(huge_volume_data["volume_ratio"].mean()), 4)
 								        # insert one row to huge_volume_ratio_percentile_10_mean, key is -1, value is percentile_10_mean
 								        huge_volume_ratio_percentile_10_mean.loc["-1"] = percentile_10_mean
 								        # transform huge_volume_ratio_percentile_10_mean index to int
 								        huge_volume_ratio_percentile_10_mean.index = huge_volume_ratio_percentile_10_mean.index.astype(int)
 								        # sort huge_volume_ratio_percentile_10_mean by index
 								        huge_volume_ratio_percentile_10_mean = huge_volume_ratio_percentile_10_mean.sort_index()
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
+								        results = []
-												support statistics price-volume relationship

											
										
										
											2025-07-31 12:31:22 +00:00
+								        # iterate huge_volume_ratio_percentile_10_mean
 								        for index, value in huge_volume_ratio_percentile_10_mean.items():
 								            if index == -1:
 								                data_temp = deepcopy(huge_volume_data)
 								                volume_ratio_percentile_10 = "all"
 								                current_percentile_10_mean = percentile_10_mean
 								            else:
 								                data_temp = deepcopy(huge_volume_data[huge_volume_data["volume_ratio_percentile_10"] == index])
 								                volume_ratio_percentile_10 = str(index)
 								                current_percentile_10_mean = round(value, 4)
 								            data_temp = data_temp.reset_index(drop=True)
 								            data_temp = data_temp.sort_values(by="timestamp", ascending=True)
 								            data_temp = data_temp.reset_index(drop=True)
 								            # 过滤data, 只获取huge_volume为1，且价格处于分位数位置的行
 								            price_conditions = []
 								            if "price_80_high" in data_temp.columns:
 								                price_conditions.append(data_temp["price_80_high"] == 1)
 								            if "price_20_low" in data_temp.columns:
 								                price_conditions.append(data_temp["price_20_low"] == 1)
 								            if "price_90_high" in data_temp.columns:
 								                price_conditions.append(data_temp["price_90_high"] == 1)
 								            if "price_10_low" in data_temp.columns:
 								                price_conditions.append(data_temp["price_10_low"] == 1)
 								            if price_conditions:
 								                combined_condition = data_temp["huge_volume"] == 1
 								                for condition in price_conditions:
 								                    combined_condition = combined_condition | condition
 								                data_temp = data_temp[combined_condition]
 								            data_temp = data_temp.reset_index(drop=True)
 								            # 统计各种分位数情况的数量
 								            price_stats = {}
 								            for price_type in ["price_80_high", "price_20_low", "price_90_high", "price_10_low"]:
 								                if price_type in data.columns:
 								                    price_stats[price_type] = len(data_temp[(data_temp["huge_volume"] == 1) & (data_temp[price_type] == 1)])
 								            for period in periods:
 								                for price_type, count in price_stats.items():
 								                    if count > 0:
 								                        # 计算下跌次数
 								                        fall_count = len(
 								                            data_temp[
 								                                (data_temp["huge_volume"] == 1) &
 								                                (data_temp[price_type] == 1) &
 								                                (data_temp[f"next_{period}_result"] == "fall")
 								                            ]
 								                        )
 								                        # 计算上涨次数
 								                        rise_count = len(
 								                            data_temp[
 								                                (data_temp["huge_volume"] == 1) &
 								                                (data_temp[price_type] == 1) &
 								                                (data_temp[f"next_{period}_result"] == "rise")
 								                            ]
 								                        )
 								                        draw_count = len(
 								                            data_temp[
 								                                (data_temp["huge_volume"] == 1) &
 								                                (data_temp[price_type] == 1) &
 								                                (data_temp[f"next_{period}_result"] == "draw")
 								                            ]
 								                        )
 								                        # 根据data[f"next_{period}_result"]获得平均收益率
 								                        average_return = float(data_temp[(data_temp["huge_volume"] == 1) & (data_temp[price_type] == 1)]
 								                        [f"next_{period}_change"].mean())
 								                        max_return = float(data_temp[(data_temp["huge_volume"] == 1) & (data_temp[price_type] == 1)]
 								                        [f"next_{period}_change"].max())
 								                        min_return = float(data_temp[(data_temp["huge_volume"] == 1) & (data_temp[price_type] == 1)]
 								                        [f"next_{period}_change"].min())
 								                        results.append(
 								                            {
 								                                "symbol": data_temp["symbol"].iloc[0] if len(data_temp) > 0 else "",
 								                                "bar": data_temp["bar"].iloc[0] if len(data_temp) > 0 else "",
 								                                "window_size": window_size,
 								                                "huge_volume": 1,
 								                                "volume_ratio_percentile_10": volume_ratio_percentile_10,
 								                                "volume_ratio_percentile_10_mean": current_percentile_10_mean,
 								                                "price_type": price_type,
 								                                "next_period": period,
 								                                "average_return": average_return,
 								                                "max_return": max_return,
 								                                "min_return": min_return,
 								                                "rise_count": rise_count,
 								                                "rise_ratio": round((rise_count / count) * 100, 4),
 								                                "fall_count": fall_count,
 								                                "fall_ratio": round((fall_count / count) * 100, 4),
 								                                "draw_count": draw_count,
 								                                "draw_ratio": round((draw_count / count) * 100, 4),
 								                                "total_count": count,
 								                            }
 								                        )
-												update for more professional data fetch action

											
										
										
											2025-07-30 08:11:34 +00:00
 								        result_data = pd.DataFrame(results)
-												support statistics price-volume relationship

											
										
										
											2025-07-31 12:31:22 +00:00
+								        return huge_volume_data, result_data