PostgreSQL窗口函数
本章将详细介绍PostgreSQL中的窗口函数,包括排名函数、偏移函数、聚合函数等,帮助您执行复杂的分析查询。
窗口函数概述
什么是窗口函数
窗口函数是对一组行执行计算的函数,与普通聚合函数不同,窗口函数不会将多行合并为一行,而是为每一行返回一个值:
-- 窗口函数基本语法
SELECT
column1,
column2,
window_function(column3) OVER (
[PARTITION BY column4]
[ORDER BY column5]
[ROWS|RANGE frame_clause]
)
FROM table_name;
窗口函数与普通聚合函数的区别
-- 普通聚合函数(返回一行)
SELECT COUNT(*) AS total_users FROM users;
-- 窗口函数(为每行返回值)
SELECT
username,
created_at,
COUNT(*) OVER() AS total_users
FROM users;
排名函数
ROW_NUMBER
为结果集中的每一行分配唯一的连续数字:
SELECT
username,
score,
ROW_NUMBER() OVER (ORDER BY score DESC) AS row_num
FROM users
ORDER BY score DESC;
RANK
为结果集中的每一行分配排名,相同值获得相同排名,但会跳过后续排名:
SELECT
username,
score,
RANK() OVER (ORDER BY score DESC) AS rank_num
FROM users
ORDER BY score DESC;
DENSE_RANK
为结果集中的每一行分配排名,相同值获得相同排名,但不会跳过后续排名:
SELECT
username,
score,
DENSE_RANK() OVER (ORDER BY score DESC) AS dense_rank_num
FROM users
ORDER BY score DESC;
PERCENT_RANK
计算相对排名(0到1之间):
SELECT
username,
score,
PERCENT_RANK() OVER (ORDER BY score) AS percent_rank
FROM users
ORDER BY score;
CUME_DIST
计算累积分布:
SELECT
username,
score,
CUME_DIST() OVER (ORDER BY score) AS cumulative_dist
FROM users
ORDER BY score;
偏移函数
LAG
访问前一行的数据:
SELECT
date,
sales_amount,
LAG(sales_amount, 1) OVER (ORDER BY date) AS previous_day_sales,
sales_amount - LAG(sales_amount, 1) OVER (ORDER BY date) AS daily_change
FROM daily_sales
ORDER BY date;
LEAD
访问后一行的数据:
SELECT
date,
sales_amount,
LEAD(sales_amount, 1) OVER (ORDER BY date) AS next_day_sales,
LEAD(sales_amount, 1) OVER (ORDER BY date) - sales_amount AS next_day_change
FROM daily_sales
ORDER BY date;
FIRST_VALUE
获取窗口中的第一个值:
SELECT
date,
sales_amount,
FIRST_VALUE(sales_amount) OVER (ORDER BY date) AS first_day_sales
FROM daily_sales
ORDER BY date;
LAST_VALUE
获取窗口中的最后一个值:
SELECT
date,
sales_amount,
LAST_VALUE(sales_amount) OVER (
ORDER BY date
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS last_day_sales
FROM daily_sales
ORDER BY date;
聚合窗口函数
基本聚合函数
在窗口中使用聚合函数:
SELECT
department,
employee_name,
salary,
COUNT(*) OVER (PARTITION BY department) AS dept_employee_count,
AVG(salary) OVER (PARTITION BY department) AS dept_avg_salary,
SUM(salary) OVER (PARTITION BY department) AS dept_total_salary,
MIN(salary) OVER (PARTITION BY department) AS dept_min_salary,
MAX(salary) OVER (PARTITION BY department) AS dept_max_salary
FROM employees
ORDER BY department, salary DESC;
运行总计
计算累计值:
SELECT
date,
sales_amount,
SUM(sales_amount) OVER (ORDER BY date) AS running_total
FROM daily_sales
ORDER BY date;
窗口框架
ROWS子句
定义基于行数的窗口框架:
SELECT
date,
sales_amount,
AVG(sales_amount) OVER (
ORDER BY date
ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING
) AS moving_avg_5_days,
AVG(sales_amount) OVER (
ORDER BY date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS moving_avg_7_days
FROM daily_sales
ORDER BY date;
RANGE子句
定义基于值范围的窗口框架:
SELECT
employee_id,
salary,
COUNT(*) OVER (
ORDER BY salary
RANGE BETWEEN 10000 PRECEDING AND 10000 FOLLOWING
) AS employees_in_salary_range
FROM employees
ORDER BY salary;
实际应用场景
销售数据分析
-- 创建销售数据表
CREATE TABLE sales_data (
id SERIAL PRIMARY KEY,
product_name VARCHAR(100),
category VARCHAR(50),
sale_date DATE,
amount NUMERIC(10,2),
quantity INTEGER
);
-- 插入示例数据
INSERT INTO sales_data (product_name, category, sale_date, amount, quantity) VALUES
('Product A', 'Electronics', '2023-10-01', 1000.00, 2),
('Product B', 'Electronics', '2023-10-01', 1500.00, 1),
('Product C', 'Clothing', '2023-10-01', 200.00, 5),
('Product A', 'Electronics', '2023-10-02', 800.00, 1),
('Product B', 'Electronics', '2023-10-02', 1200.00, 2),
('Product C', 'Clothing', '2023-10-02', 150.00, 3);
-- 销售排名分析
SELECT
product_name,
category,
SUM(amount) AS total_sales,
RANK() OVER (ORDER BY SUM(amount) DESC) AS sales_rank,
DENSE_RANK() OVER (PARTITION BY category ORDER BY SUM(amount) DESC) AS category_rank,
ROUND(
(SUM(amount) * 100.0 / SUM(SUM(amount)) OVER()),
2
) AS sales_percentage
FROM sales_data
GROUP BY product_name, category
ORDER BY total_sales DESC;
用户行为分析
-- 用户登录时间分析
SELECT
user_id,
login_time,
LAG(login_time) OVER (PARTITION BY user_id ORDER BY login_time) AS previous_login,
login_time - LAG(login_time) OVER (PARTITION BY user_id ORDER BY login_time) AS time_since_last_login,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY login_time) AS login_sequence
FROM user_logins
ORDER BY user_id, login_time;
财务数据分析
-- 股票价格分析
SELECT
stock_symbol,
trade_date,
closing_price,
LAG(closing_price) OVER (PARTITION BY stock_symbol ORDER BY trade_date) AS previous_close,
ROUND(
((closing_price - LAG(closing_price) OVER (PARTITION BY stock_symbol ORDER BY trade_date))
/ LAG(closing_price) OVER (PARTITION BY stock_symbol ORDER BY trade_date) * 100),
2
) AS daily_change_percent,
AVG(closing_price) OVER (
PARTITION BY stock_symbol
ORDER BY trade_date
ROWS BETWEEN 19 PRECEDING AND CURRENT ROW
) AS ma_20_days
FROM stock_prices
ORDER BY stock_symbol, trade_date;
绩效评估
-- 员工绩效评估
SELECT
department,
employee_name,
performance_score,
salary,
RANK() OVER (ORDER BY performance_score DESC) AS overall_rank,
RANK() OVER (PARTITION BY department ORDER BY performance_score DESC) AS department_rank,
CASE
WHEN RANK() OVER (PARTITION BY department ORDER BY performance_score DESC) = 1 THEN 'Top Performer'
WHEN RANK() OVER (PARTITION BY department ORDER BY performance_score DESC) <= 3 THEN 'High Performer'
ELSE 'Standard'
END AS performance_category,
salary - FIRST_VALUE(salary) OVER (PARTITION BY department ORDER BY performance_score DESC) AS salary_diff_from_top
FROM employees
ORDER BY department, performance_score DESC;
高级窗口函数
NTILE函数
将结果集分成指定数量的桶:
SELECT
employee_name,
salary,
NTILE(4) OVER (ORDER BY salary DESC) AS salary_quartile,
CASE NTILE(4) OVER (ORDER BY salary DESC)
WHEN 1 THEN 'Top 25%'
WHEN 2 THEN 'Second 25%'
WHEN 3 THEN 'Third 25%'
WHEN 4 THEN 'Bottom 25%'
END AS salary_group
FROM employees
ORDER BY salary DESC;
NTH_VALUE函数
获取窗口中第N个值:
SELECT
product_name,
sales_amount,
NTH_VALUE(product_name, 1) OVER (ORDER BY sales_amount DESC) AS top_product,
NTH_VALUE(product_name, 2) OVER (ORDER BY sales_amount DESC) AS second_product,
NTH_VALUE(product_name, 3) OVER (ORDER BY sales_amount DESC) AS third_product
FROM product_sales
ORDER BY sales_amount DESC;
性能优化
索引优化
为窗口函数创建合适的索引:
-- 为排序列创建索引
CREATE INDEX idx_employees_salary ON employees (salary);
-- 为分区列创建索引
CREATE INDEX idx_employees_department ON employees (department);
-- 为复合窗口函数创建复合索引
CREATE INDEX idx_employees_dept_salary ON employees (department, salary);
-- 为时间序列数据创建索引
CREATE INDEX idx_sales_date ON sales_data (sale_date);
查询优化
优化窗口函数查询:
-- 好的查询(使用适当索引)
SELECT
employee_name,
salary,
RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dept_rank
FROM employees
WHERE department IN ('IT', 'Sales', 'Marketing');
-- 避免在窗口函数中使用复杂表达式
SELECT
employee_name,
salary,
RANK() OVER (ORDER BY salary) AS salary_rank -- 简单列
FROM employees;
-- 而不是
SELECT
employee_name,
salary * 1.1 AS adjusted_salary,
RANK() OVER (ORDER BY salary * 1.1) AS salary_rank -- 复杂表达式
综合示例
-- 创建综合分析报告函数
CREATE OR REPLACE FUNCTION generate_sales_analysis_report()
RETURNS TABLE(
period TEXT,
total_sales NUMERIC,
sales_growth_rate NUMERIC,
top_product VARCHAR(100),
best_performing_category VARCHAR(50),
customer_segment TEXT
) AS $$
BEGIN
RETURN QUERY
WITH monthly_sales AS (
SELECT
DATE_TRUNC('month', sale_date) AS sales_month,
SUM(amount) AS monthly_total,
LAG(SUM(amount)) OVER (ORDER BY DATE_TRUNC('month', sale_date)) AS previous_month_total
FROM sales_data
GROUP BY DATE_TRUNC('month', sale_date)
),
product_performance AS (
SELECT
product_name,
SUM(amount) AS total_sales,
RANK() OVER (ORDER BY SUM(amount) DESC) AS sales_rank
FROM sales_data
GROUP BY product_name
),
category_performance AS (
SELECT
category,
SUM(amount) AS total_sales,
RANK() OVER (ORDER BY SUM(amount) DESC) AS category_rank
FROM sales_data
GROUP BY category
),
customer_analysis AS (
SELECT
customer_id,
SUM(amount) AS total_spent,
NTILE(3) OVER (ORDER BY SUM(amount) DESC) AS customer_segment
FROM sales_data
GROUP BY customer_id
)
SELECT
TO_CHAR(ms.sales_month, 'YYYY-MM') AS period,
ms.monthly_total AS total_sales,
ROUND(
((ms.monthly_total - ms.previous_month_total) * 100.0 / ms.previous_month_total),
2
) AS sales_growth_rate,
(SELECT product_name FROM product_performance WHERE sales_rank = 1 LIMIT 1) AS top_product,
(SELECT category FROM category_performance WHERE category_rank = 1 LIMIT 1) AS best_performing_category,
CASE
WHEN AVG(ca.customer_segment) = 1 THEN 'Premium'
WHEN AVG(ca.customer_segment) = 2 THEN 'Standard'
ELSE 'Basic'
END AS customer_segment
FROM monthly_sales ms
LEFT JOIN sales_data sd ON DATE_TRUNC('month', sd.sale_date) = ms.sales_month
LEFT JOIN customer_analysis ca ON sd.customer_id = ca.customer_id
GROUP BY ms.sales_month, ms.monthly_total, ms.previous_month_total
ORDER BY ms.sales_month;
END;
$$ LANGUAGE plpgsql;
-- 生成销售分析报告
SELECT * FROM generate_sales_analysis_report();
提示:窗口函数是PostgreSQL中非常强大的功能,特别适用于数据分析和报告场景。使用时要注意窗口框架的定义,合理使用PARTITION BY和ORDER BY子句,以及为相关列创建适当的索引以提高查询性能。