From 38a4465e29968f3af4473a025bff0fcde745bd3f Mon Sep 17 00:00:00 2001
From: blade <8019068@qq.com>
Date: Fri, 8 Aug 2025 18:02:37 +0800
Subject: [PATCH] support hourly high-volume distribution statistics

---
 core/statistics/price_volume_stats.py | 222 +++++++++++++++++++++++++-
 1 file changed, 214 insertions(+), 8 deletions(-)

diff --git a/core/statistics/price_volume_stats.py b/core/statistics/price_volume_stats.py
index 490799d..7f8489e 100644
--- a/core/statistics/price_volume_stats.py
+++ b/core/statistics/price_volume_stats.py
@@ -51,6 +51,8 @@ class PriceVolumeStats:
         os.makedirs(self.stats_chart_dir, exist_ok=True)
 
     def batch_price_volume_statistics(self):
+        high_volume_hours_list = []
+        huge_high_volume_hours_list = []
         price_stats_list = []
         pct_change_stats_list = []
         peak_valley_data_list = []
@@ -83,6 +85,12 @@ class PriceVolumeStats:
                     else:
                         if data["timestamp"].iloc[-1] > latest_market_timestamp:
                             latest_market_timestamp = data["timestamp"].iloc[-1]
+                    # 统计高成交量小时分布
+                    logging.info(f"统计{symbol} {bar} 巨量小时分布数据")
+                    high_volume_hours_data = self.stats_high_volume_hours(data)
+                    high_volume_hours_list.append(high_volume_hours_data)
+                    huge_high_volume_hours_data = self.stats_high_volume_hours(data, 4)
+                    huge_high_volume_hours_list.append(huge_high_volume_hours_data)
                     logging.info(f"统计{symbol} {bar} 价格数据")
                     price_stats_data = self.calculate_price_statistics(data)
                     logging.info(f"统计{symbol} {bar} 涨跌百分比数据")
@@ -103,6 +111,10 @@ class PriceVolumeStats:
                     peak_valley_stats_list.append(peak_valley_stats_data)
                     volume_stats_list.append(volume_stats_data)
                     price_volume_stats_list.append(price_volume_stats_data)
+        high_volume_hours_df = pd.concat(high_volume_hours_list)
+        high_volume_hours_df.sort_values(by=["symbol", "bar", "hour"], inplace=True)
+        huge_high_volume_hours_df = pd.concat(huge_high_volume_hours_list)
+        huge_high_volume_hours_df.sort_values(by=["symbol", "bar", "hour"], inplace=True)
         price_stats_df = pd.DataFrame(price_stats_list)
         price_stats_df.sort_values(by=["symbol", "bar"], inplace=True)
         pct_change_stats_df = pd.DataFrame(pct_change_stats_list)
@@ -126,22 +138,32 @@ class PriceVolumeStats:
         output_file_path = os.path.join(self.stats_output_dir, output_file_name)
         logging.info(f"导出{output_file_path}")
         with pd.ExcelWriter(output_file_path) as writer:
-            price_stats_df.to_excel(writer, sheet_name="price_stats", index=False)
+            price_stats_df.to_excel(writer, sheet_name="价格统计", index=False)
             pct_change_stats_df.to_excel(
-                writer, sheet_name="pct_change_stats", index=False
+                writer, sheet_name="涨跌百分比统计", index=False
             )
             peak_valley_data_df.to_excel(
-                writer, sheet_name="peak_valley_data", index=False
+                writer, sheet_name="波峰波谷明细", index=False
             )
             peak_valley_stats_df.to_excel(
-                writer, sheet_name="peak_valley_stats", index=False
+                writer, sheet_name="波峰波谷统计", index=False
             )
-            volume_stats_df.to_excel(writer, sheet_name="volume_stats", index=False)
+            volume_stats_df.to_excel(writer, sheet_name="量能统计", index=False)
             price_volume_stats_df.to_excel(
-                writer, sheet_name="price_volume_stats", index=False
+                writer, sheet_name="量价统计", index=False
+            )
+            high_volume_hours_df.to_excel(
+                writer, sheet_name="放量小时分布", index=False
+            )
+            huge_high_volume_hours_df.to_excel(
+                writer, sheet_name="4倍放量小时分布", index=False
             )
         chart_dict = self.draw_price_change_peak_valley_chart(peak_valley_stats_df)
         self.output_chart_to_excel(output_file_path, chart_dict)
+        chart_dict = self.draw_high_volume_hours_chart(high_volume_hours_df, normal=True)
+        self.output_chart_to_excel(output_file_path, chart_dict)
+        chart_dict = self.draw_high_volume_hours_chart(huge_high_volume_hours_df, normal=False)
+        self.output_chart_to_excel(output_file_path, chart_dict)
         return price_stats_df, volume_stats_df, price_volume_stats_df
 
     def calculate_price_statistics(self, data: pd.DataFrame):
@@ -395,6 +417,99 @@ class PriceVolumeStats:
         stats_data = self.base_statistics(peak_valley_data, "price_change_ratio")
         return peak_valley_data, stats_data
 
+
+    def draw_high_volume_hours_chart(self, data: pd.DataFrame, normal: bool = True):
+        """
+        Draw the hourly huge-volume distribution charts, one PNG per symbol
+        and bar, saved under self.stats_chart_dir.
+
+        :param data: rows with the columns symbol, bar, hour,
+            huge_volume_count, huge_volume_rise_count, huge_volume_fall_count
+        :param normal: True uses the plain sheet/file names; False uses the
+            variants marked as 4x (see sheet_name and save_path below)
+        :return: None when data is None or empty, otherwise a dict of
+            {sheet name: {chart title: path of the saved PNG}}
+        """
+        if data is None or data.empty:
+            return None
+        # seaborn theme and font settings (these write to the global rcParams)
+        sns.set_theme(style="whitegrid")
+        plt.rcParams["font.sans-serif"] = ["SimHei"]  # a font name can be used directly
+        plt.rcParams["font.size"] = 11  # set the font size
+        plt.rcParams["axes.unicode_minus"] = False  # fix minus-sign rendering
+        chart_dict = {}
+        
+        for symbol in data["symbol"].unique():
+            symbol_data = data[data["symbol"] == symbol]
+            if normal:
+                sheet_name = f"{symbol}_量时分布图表"
+            else:
+                sheet_name = f"{symbol}_4倍量时分布图表"
+            chart_dict[sheet_name] = {}
+            for bar in symbol_data["bar"].unique():
+                bar_data = symbol_data[symbol_data["bar"] == bar].copy()
+                # Rename the columns to the Chinese labels used on the axes;
+                # huge_volume_ratio is renamed too when that column exists.
+                bar_data.rename(columns={"hour": "小时"}, inplace=True)
+                bar_data.rename(columns={"huge_volume_count": "巨量次数"}, inplace=True)
+                bar_data.rename(columns={"huge_volume_ratio": "巨量次数占比"}, inplace=True)
+                bar_data.rename(columns={"huge_volume_rise_count": "巨量上涨次数"}, inplace=True)
+                bar_data.rename(columns={"huge_volume_fall_count": "巨量下跌次数"}, inplace=True)
+                bar_data.reset_index(drop=True, inplace=True)
+                fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+                fig.suptitle(f"巨量小时分布 - {symbol} {bar}", fontsize=18)
+                # Left plot: huge-volume count per hour, drawn in blue.
+                # NOTE(review): hue="symbol" has one level here, so palette[1] is presumably unused - confirm.
+                palette = sns.color_palette("Blues_d", 2)
+                palette[0] = sns.color_palette("Blues_d", 2)[1]
+                palette[1] = sns.color_palette("Reds_d", 2)[1]
+                sns.barplot(
+                    ax=axes[0],
+                    x="小时",
+                    y="巨量次数",
+                    data=bar_data,
+                    hue="symbol",
+                    palette=palette,
+                    legend=False,
+                )
+                axes[0].set_title("巨量小时分布")
+                axes[0].set_ylabel("巨量次数")
+                # Right plot: rise and fall counts side by side, so melt the two
+                # count columns into long form (one row per hour and category).
+
+                df_long = pd.melt(bar_data, id_vars=['小时'], value_vars=['巨量上涨次数', '巨量下跌次数'],
+                  var_name='类别', value_name='次数')
+                # Red for the rise counts, green for the fall counts.
+                palette = sns.color_palette("Blues_d", 2)
+                palette[0] = sns.color_palette("Reds_d", 2)[1]
+                palette[1] = sns.color_palette("Greens_d", 2)[1]
+                sns.barplot(
+                    ax=axes[1],
+                    x="小时",
+                    y="次数",
+                    data=df_long,
+                    hue="类别",
+                    palette=palette,
+                    legend=False,
+                )
+                axes[1].set_title("巨量小时上涨下跌分布")
+                axes[1].set_ylabel("次数")
+                # Rotate the x-axis tick labels
+                for ax in axes.flat:
+                    for label in ax.get_xticklabels():
+                        label.set_rotation(45)
+                plt.tight_layout(rect=[0, 0, 1, 0.96])
+                if normal:
+                    save_path = os.path.join(self.stats_chart_dir, f"{symbol}_{bar}_high_volume_hours.png")
+                else:
+                    save_path = os.path.join(self.stats_chart_dir, f"{symbol}_{bar}_4_high_volume_hours.png")
+                plt.savefig(save_path, dpi=150)
+                plt.close(fig)
+                chart_dict[sheet_name][
+                    f"巨量小时分布 - {bar}"
+                ] = save_path
+        return chart_dict
+
     def draw_price_change_peak_valley_chart(self, data: pd.DataFrame):
         """
         绘制价格变化峰值和谷值图表（美观，保存到self.stats_chart_dir）
@@ -409,7 +524,7 @@ class PriceVolumeStats:
         plt.rcParams["font.sans-serif"] = ["SimHei"]  # 也可直接用字体名
         plt.rcParams["font.size"] = 11  # 设置字体大小
         plt.rcParams["axes.unicode_minus"] = False  # 解决负号显示问题
-        chart_dict = {"bar_peak_valley_chart": {}}
+        chart_dict = {"波峰波谷图表": {}}
         for bar in data["bar"].unique():
             bar_data = data[data["bar"] == bar]
             fig, axes = plt.subplots(2, 2, figsize=(14, 10))
@@ -470,7 +585,7 @@ class PriceVolumeStats:
             save_path = os.path.join(self.stats_chart_dir, f"peak_valley_{bar}.png")
             plt.savefig(save_path, dpi=150)
             plt.close(fig)
-            chart_dict["bar_peak_valley_chart"][
+            chart_dict["波峰波谷图表"][
                 f"波段变化峰值和谷值统计 - {bar}"
             ] = save_path
         return chart_dict
@@ -530,6 +645,97 @@ class PriceVolumeStats:
         wb.save(excel_file_path)
         print(f"Chart saved as {excel_file_path}")
 
+    def stats_high_volume_hours(self, data: pd.DataFrame, volume_ratio_threshold: int = None):
+        """
+        Count huge-volume bars per hour of day for one symbol/bar series.
+
+        Three counts are produced for every hour that occurs in the (optionally
+        filtered) data:
+
+        * huge_volume_count: sum of the huge_volume flag, i.e. the number of
+          rows with huge_volume == 1 when the flag is 0/1
+        * huge_volume_rise_count: huge-volume rows whose close is above the
+          previous row's close
+        * huge_volume_fall_count: huge-volume rows whose close is below the
+          previous row's close
+
+        The close-to-close change is taken on the full series before the
+        volume_ratio filter is applied, so every bar is compared with its real
+        predecessor instead of with the previous row that survived the filter.
+        Hours without a rising/falling huge-volume bar get 0 rather than NaN,
+        and the caller's DataFrame is left untouched (no columns are added or
+        converted in place).
+
+        :param data: market data with the columns symbol, bar, date_time, close
+            and huge_volume, plus volume_ratio when a threshold is given; rows
+            are assumed to be in chronological order (pct_change relies on it)
+        :param volume_ratio_threshold: when not None and > 0, only rows with
+            volume_ratio >= threshold are counted
+        :return: None when data is None; otherwise a DataFrame with the columns
+            symbol, bar, hour, huge_volume_count, huge_volume_rise_count and
+            huge_volume_fall_count, one row per hour (empty when no row is left
+            to count)
+        """
+        columns = [
+            "symbol",
+            "bar",
+            "hour",
+            "huge_volume_count",
+            "huge_volume_rise_count",
+            "huge_volume_fall_count",
+        ]
+        count_columns = columns[3:]
+        if data is None:
+            return None
+        if data.empty:
+            # Nothing to label or count; data.iloc[0] below would raise.
+            return pd.DataFrame(columns=columns)
+        symbol = data.iloc[0]["symbol"]
+        bar = data.iloc[0]["bar"]
+
+        # Build a private working frame instead of writing helper columns into
+        # the caller's DataFrame (or into a filtered slice of it, which would
+        # trigger pandas' SettingWithCopyWarning).
+        work = pd.DataFrame(
+            {
+                "hour": pd.to_datetime(data["date_time"]).dt.hour,
+                "huge_volume": data["huge_volume"],
+                # Computed on the unfiltered series on purpose, see docstring.
+                "pct_chg": data["close"].pct_change(),
+            }
+        )
+        if volume_ratio_threshold is not None and volume_ratio_threshold > 0:
+            work = work[data["volume_ratio"] >= volume_ratio_threshold]
+        if work.empty:
+            return pd.DataFrame(columns=columns)
+
+        flag = work["huge_volume"]
+        is_huge = flag == 1
+        # Keep the flag where the condition holds and 0 elsewhere, so that a
+        # per-hour sum yields the count and empty hours come out as 0.
+        per_row = pd.DataFrame(
+            {
+                "hour": work["hour"],
+                "huge_volume_count": flag,
+                "huge_volume_rise_count": flag.where(
+                    is_huge & (work["pct_chg"] > 0), 0
+                ),
+                "huge_volume_fall_count": flag.where(
+                    is_huge & (work["pct_chg"] < 0), 0
+                ),
+            }
+        )
+        # groupby sorts its keys, so rows come out ordered by hour.
+        result_df = per_row.groupby("hour")[count_columns].sum()
+        result_df.index.name = "hour"
+        result_df = result_df.reset_index()
+        result_df["symbol"] = symbol
+        result_df["bar"] = bar
+        # Identifying columns first; the count columns follow.
+        result_df = result_df[columns]
+        result_df.reset_index(drop=True, inplace=True)
+        return result_df
+
     def find_peaks_valleys(self, data: pd.DataFrame, window=10):
         """
         识别K线数据的波峰和波谷