PostgreSQL窗口函数

本章将详细介绍PostgreSQL中的窗口函数,包括排名函数、偏移函数、聚合函数等,帮助您执行复杂的分析查询。

窗口函数概述

什么是窗口函数

窗口函数是对一组行执行计算的函数,与普通聚合函数不同,窗口函数不会将多行合并为一行,而是为每一行返回一个值:

-- 窗口函数基本语法
SELECT 
    column1,
    column2,
    window_function(column3) OVER (
        [PARTITION BY column4] 
        [ORDER BY column5] 
        [ROWS|RANGE frame_clause]
    )
FROM table_name;

窗口函数与普通聚合函数的区别

-- 普通聚合函数(返回一行)
SELECT COUNT(*) AS total_users FROM users;

-- 窗口函数(为每行返回值)
SELECT 
    username,
    created_at,
    COUNT(*) OVER() AS total_users
FROM users;

排名函数

ROW_NUMBER

为结果集中的每一行分配唯一的连续数字:

SELECT 
    username,
    score,
    ROW_NUMBER() OVER (ORDER BY score DESC) AS row_num
FROM users
ORDER BY score DESC;

RANK

为结果集中的每一行分配排名,相同值获得相同排名,但会跳过后续排名:

SELECT 
    username,
    score,
    RANK() OVER (ORDER BY score DESC) AS rank_num
FROM users
ORDER BY score DESC;

DENSE_RANK

为结果集中的每一行分配排名,相同值获得相同排名,但不会跳过后续排名:

SELECT 
    username,
    score,
    DENSE_RANK() OVER (ORDER BY score DESC) AS dense_rank_num
FROM users
ORDER BY score DESC;

PERCENT_RANK

计算相对排名(0到1之间):

SELECT 
    username,
    score,
    PERCENT_RANK() OVER (ORDER BY score) AS percent_rank
FROM users
ORDER BY score;

CUME_DIST

计算累积分布:

SELECT 
    username,
    score,
    CUME_DIST() OVER (ORDER BY score) AS cumulative_dist
FROM users
ORDER BY score;

偏移函数

LAG

访问前一行的数据:

SELECT 
    date,
    sales_amount,
    LAG(sales_amount, 1) OVER (ORDER BY date) AS previous_day_sales,
    sales_amount - LAG(sales_amount, 1) OVER (ORDER BY date) AS daily_change
FROM daily_sales
ORDER BY date;

LEAD

访问后一行的数据:

SELECT 
    date,
    sales_amount,
    LEAD(sales_amount, 1) OVER (ORDER BY date) AS next_day_sales,
    LEAD(sales_amount, 1) OVER (ORDER BY date) - sales_amount AS next_day_change
FROM daily_sales
ORDER BY date;

FIRST_VALUE

获取窗口中的第一个值:

SELECT 
    date,
    sales_amount,
    FIRST_VALUE(sales_amount) OVER (ORDER BY date) AS first_day_sales
FROM daily_sales
ORDER BY date;

LAST_VALUE

获取窗口中的最后一个值:

SELECT 
    date,
    sales_amount,
    LAST_VALUE(sales_amount) OVER (
        ORDER BY date 
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS last_day_sales
FROM daily_sales
ORDER BY date;

聚合窗口函数

基本聚合函数

在窗口中使用聚合函数:

SELECT 
    department,
    employee_name,
    salary,
    COUNT(*) OVER (PARTITION BY department) AS dept_employee_count,
    AVG(salary) OVER (PARTITION BY department) AS dept_avg_salary,
    SUM(salary) OVER (PARTITION BY department) AS dept_total_salary,
    MIN(salary) OVER (PARTITION BY department) AS dept_min_salary,
    MAX(salary) OVER (PARTITION BY department) AS dept_max_salary
FROM employees
ORDER BY department, salary DESC;

运行总计

计算累计值:

SELECT 
    date,
    sales_amount,
    SUM(sales_amount) OVER (ORDER BY date) AS running_total
FROM daily_sales
ORDER BY date;

窗口框架

ROWS子句

定义基于行数的窗口框架:

SELECT 
    date,
    sales_amount,
    AVG(sales_amount) OVER (
        ORDER BY date 
        ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING
    ) AS moving_avg_5_days,
    AVG(sales_amount) OVER (
        ORDER BY date 
        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
    ) AS moving_avg_7_days
FROM daily_sales
ORDER BY date;

RANGE子句

定义基于值范围的窗口框架:

SELECT 
    employee_id,
    salary,
    COUNT(*) OVER (
        ORDER BY salary 
        RANGE BETWEEN 10000 PRECEDING AND 10000 FOLLOWING
    ) AS employees_in_salary_range
FROM employees
ORDER BY salary;

实际应用场景

销售数据分析

-- 创建销售数据表
CREATE TABLE sales_data (
    id SERIAL PRIMARY KEY,
    product_name VARCHAR(100),
    category VARCHAR(50),
    sale_date DATE,
    amount NUMERIC(10,2),
    quantity INTEGER
);

-- 插入示例数据
INSERT INTO sales_data (product_name, category, sale_date, amount, quantity) VALUES
('Product A', 'Electronics', '2023-10-01', 1000.00, 2),
('Product B', 'Electronics', '2023-10-01', 1500.00, 1),
('Product C', 'Clothing', '2023-10-01', 200.00, 5),
('Product A', 'Electronics', '2023-10-02', 800.00, 1),
('Product B', 'Electronics', '2023-10-02', 1200.00, 2),
('Product C', 'Clothing', '2023-10-02', 150.00, 3);

-- 销售排名分析
SELECT 
    product_name,
    category,
    SUM(amount) AS total_sales,
    RANK() OVER (ORDER BY SUM(amount) DESC) AS sales_rank,
    DENSE_RANK() OVER (PARTITION BY category ORDER BY SUM(amount) DESC) AS category_rank,
    ROUND(
        (SUM(amount) * 100.0 / SUM(SUM(amount)) OVER()), 
        2
    ) AS sales_percentage
FROM sales_data
GROUP BY product_name, category
ORDER BY total_sales DESC;

用户行为分析

-- 用户登录时间分析
SELECT 
    user_id,
    login_time,
    LAG(login_time) OVER (PARTITION BY user_id ORDER BY login_time) AS previous_login,
    login_time - LAG(login_time) OVER (PARTITION BY user_id ORDER BY login_time) AS time_since_last_login,
    ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY login_time) AS login_sequence
FROM user_logins
ORDER BY user_id, login_time;

财务数据分析

-- 股票价格分析
SELECT 
    stock_symbol,
    trade_date,
    closing_price,
    LAG(closing_price) OVER (PARTITION BY stock_symbol ORDER BY trade_date) AS previous_close,
    ROUND(
        ((closing_price - LAG(closing_price) OVER (PARTITION BY stock_symbol ORDER BY trade_date)) 
         / LAG(closing_price) OVER (PARTITION BY stock_symbol ORDER BY trade_date) * 100), 
        2
    ) AS daily_change_percent,
    AVG(closing_price) OVER (
        PARTITION BY stock_symbol 
        ORDER BY trade_date 
        ROWS BETWEEN 19 PRECEDING AND CURRENT ROW
    ) AS ma_20_days
FROM stock_prices
ORDER BY stock_symbol, trade_date;

绩效评估

-- 员工绩效评估
SELECT 
    department,
    employee_name,
    performance_score,
    salary,
    RANK() OVER (ORDER BY performance_score DESC) AS overall_rank,
    RANK() OVER (PARTITION BY department ORDER BY performance_score DESC) AS department_rank,
    CASE 
        WHEN RANK() OVER (PARTITION BY department ORDER BY performance_score DESC) = 1 THEN 'Top Performer'
        WHEN RANK() OVER (PARTITION BY department ORDER BY performance_score DESC) <= 3 THEN 'High Performer'
        ELSE 'Standard'
    END AS performance_category,
    salary - FIRST_VALUE(salary) OVER (PARTITION BY department ORDER BY performance_score DESC) AS salary_diff_from_top
FROM employees
ORDER BY department, performance_score DESC;

高级窗口函数

NTILE函数

将结果集分成指定数量的桶:

SELECT 
    employee_name,
    salary,
    NTILE(4) OVER (ORDER BY salary DESC) AS salary_quartile,
    CASE NTILE(4) OVER (ORDER BY salary DESC)
        WHEN 1 THEN 'Top 25%'
        WHEN 2 THEN 'Second 25%'
        WHEN 3 THEN 'Third 25%'
        WHEN 4 THEN 'Bottom 25%'
    END AS salary_group
FROM employees
ORDER BY salary DESC;

NTH_VALUE函数

获取窗口中第N个值:

SELECT 
    product_name,
    sales_amount,
    NTH_VALUE(product_name, 1) OVER (ORDER BY sales_amount DESC) AS top_product,
    NTH_VALUE(product_name, 2) OVER (ORDER BY sales_amount DESC) AS second_product,
    NTH_VALUE(product_name, 3) OVER (ORDER BY sales_amount DESC) AS third_product
FROM product_sales
ORDER BY sales_amount DESC;

性能优化

索引优化

为窗口函数创建合适的索引:

-- 为排序列创建索引
CREATE INDEX idx_employees_salary ON employees (salary);

-- 为分区列创建索引
CREATE INDEX idx_employees_department ON employees (department);

-- 为复合窗口函数创建复合索引
CREATE INDEX idx_employees_dept_salary ON employees (department, salary);

-- 为时间序列数据创建索引
CREATE INDEX idx_sales_date ON sales_data (sale_date);

查询优化

优化窗口函数查询:

-- 好的查询(使用适当索引)
SELECT 
    employee_name,
    salary,
    RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dept_rank
FROM employees
WHERE department IN ('IT', 'Sales', 'Marketing');

-- 避免在窗口函数中使用复杂表达式
SELECT 
    employee_name,
    salary,
    RANK() OVER (ORDER BY salary) AS salary_rank  -- 简单列
FROM employees;

-- 而不是
SELECT 
    employee_name,
    salary * 1.1 AS adjusted_salary,
    RANK() OVER (ORDER BY salary * 1.1) AS salary_rank  -- 复杂表达式

综合示例

-- 创建综合分析报告函数
CREATE OR REPLACE FUNCTION generate_sales_analysis_report()
RETURNS TABLE(
    period TEXT,
    total_sales NUMERIC,
    sales_growth_rate NUMERIC,
    top_product VARCHAR(100),
    best_performing_category VARCHAR(50),
    customer_segment TEXT
) AS $$
BEGIN
    RETURN QUERY
    WITH monthly_sales AS (
        SELECT 
            DATE_TRUNC('month', sale_date) AS sales_month,
            SUM(amount) AS monthly_total,
            LAG(SUM(amount)) OVER (ORDER BY DATE_TRUNC('month', sale_date)) AS previous_month_total
        FROM sales_data
        GROUP BY DATE_TRUNC('month', sale_date)
    ),
    product_performance AS (
        SELECT 
            product_name,
            SUM(amount) AS total_sales,
            RANK() OVER (ORDER BY SUM(amount) DESC) AS sales_rank
        FROM sales_data
        GROUP BY product_name
    ),
    category_performance AS (
        SELECT 
            category,
            SUM(amount) AS total_sales,
            RANK() OVER (ORDER BY SUM(amount) DESC) AS category_rank
        FROM sales_data
        GROUP BY category
    ),
    customer_analysis AS (
        SELECT 
            customer_id,
            SUM(amount) AS total_spent,
            NTILE(3) OVER (ORDER BY SUM(amount) DESC) AS customer_segment
        FROM sales_data
        GROUP BY customer_id
    )
    SELECT 
        TO_CHAR(ms.sales_month, 'YYYY-MM') AS period,
        ms.monthly_total AS total_sales,
        ROUND(
            ((ms.monthly_total - ms.previous_month_total) * 100.0 / ms.previous_month_total), 
            2
        ) AS sales_growth_rate,
        (SELECT product_name FROM product_performance WHERE sales_rank = 1 LIMIT 1) AS top_product,
        (SELECT category FROM category_performance WHERE category_rank = 1 LIMIT 1) AS best_performing_category,
        CASE 
            WHEN AVG(ca.customer_segment) = 1 THEN 'Premium'
            WHEN AVG(ca.customer_segment) = 2 THEN 'Standard'
            ELSE 'Basic'
        END AS customer_segment
    FROM monthly_sales ms
    LEFT JOIN sales_data sd ON DATE_TRUNC('month', sd.sale_date) = ms.sales_month
    LEFT JOIN customer_analysis ca ON sd.customer_id = ca.customer_id
    GROUP BY ms.sales_month, ms.monthly_total, ms.previous_month_total
    ORDER BY ms.sales_month;
END;
$$ LANGUAGE plpgsql;

-- 生成销售分析报告
SELECT * FROM generate_sales_analysis_report();

提示:窗口函数是PostgreSQL中非常强大的功能,特别适用于数据分析和报告场景。使用时要注意窗口框架的定义,合理使用PARTITION BY和ORDER BY子句,以及为相关列创建适当的索引以提高查询性能。