查询优化技术

本章将深入探讨PostgreSQL的查询优化技术,包括查询计划分析、性能调优方法、索引优化策略等,帮助您编写高效的SQL查询。

7.1 查询计划分析

EXPLAIN命令

使用EXPLAIN命令查看查询执行计划:

-- 基本EXPLAIN(不执行查询)
EXPLAIN SELECT * FROM users WHERE email = 'john@example.com';

-- EXPLAIN ANALYZE(执行查询并显示实际执行时间)
EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM users WHERE email = 'john@example.com';

-- 详细EXPLAIN
EXPLAIN (ANALYZE, BUFFERS, VERBOSE, FORMAT JSON) 
SELECT u.username, o.total_amount 
FROM users u 
JOIN orders o ON u.id = o.user_id 
WHERE u.is_active = true;

执行计划解读

常见的执行节点类型:

  • Seq Scan:顺序扫描全表
  • Index Scan:使用索引扫描
  • Index Only Scan:仅索引扫描(覆盖索引)
  • Bitmap Heap Scan:位图堆扫描
  • Nested Loop:嵌套循环连接
  • Hash Join:哈希连接
  • Merge Join:合并连接

关键指标

EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM users WHERE email = 'john@example.com';
/*
结果示例:
Index Scan using idx_users_email on users  (cost=0.28..8.30 rows=1 width=100)
  Index Cond: (email = 'john@example.com'::text)
  Buffers: shared hit=3
  Actual time=0.025..0.026 rows=1 loops=1
*/

-- 关键指标解释:
-- cost=0.28..8.30: 预估成本(启动成本..总成本)
-- rows=1: 预估返回行数
-- width=100: 平均行宽度(字节)
-- Buffers: shared hit=3: 使用的缓存页数
-- Actual time=0.025..0.026: 实际执行时间(启动时间..总时间)
-- rows=1 loops=1: 实际返回行数和执行次数

7.2 索引优化

索引选择性分析

-- 分析列的选择性
SELECT 
    COUNT(*) AS total_rows,
    COUNT(DISTINCT status) AS distinct_values,
    COUNT(DISTINCT status) * 100.0 / COUNT(*) AS selectivity_percent
FROM orders;

-- 分析复合索引的选择性
SELECT 
    COUNT(*) AS total_rows,
    COUNT(DISTINCT (status, priority)) AS distinct_combinations,
    COUNT(DISTINCT (status, priority)) * 100.0 / COUNT(*) AS selectivity_percent
FROM orders;

索引使用情况监控

-- 查看索引使用统计
SELECT 
    schemaname,
    tablename,
    indexname,
    idx_scan AS index_scans,
    idx_tup_read AS tuples_read,
    idx_tup_fetch AS tuples_fetched
FROM pg_stat_user_indexes
WHERE tablename = 'users'
ORDER BY idx_scan DESC;

-- 查看未使用的索引
SELECT 
    schemaname,
    tablename,
    indexname,
    pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM pg_stat_user_indexes 
WHERE idx_scan = 0
AND schemaname = 'public'
ORDER BY pg_relation_size(indexrelid) DESC;

7.3 查询优化技巧

避免全表扫描

-- 不好的查询(可能全表扫描)
SELECT * FROM users WHERE UPPER(email) = 'JOHN@EXAMPLE.COM';

-- 优化后的查询(使用函数索引)
CREATE INDEX idx_users_upper_email ON users (UPPER(email));
SELECT * FROM users WHERE UPPER(email) = 'JOHN@EXAMPLE.COM';

-- 或者修改查询逻辑
SELECT * FROM users WHERE email = LOWER('JOHN@EXAMPLE.COM');

优化JOIN操作

-- 确保JOIN列有索引
CREATE INDEX idx_orders_user_id ON orders (user_id);
CREATE INDEX idx_users_id ON users (id);

-- 使用适当的JOIN类型
EXPLAIN ANALYZE
SELECT u.username, COUNT(o.id) as order_count
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
WHERE u.is_active = true
GROUP BY u.id, u.username;

优化子查询

-- 不好的子查询
SELECT * FROM users 
WHERE id IN (SELECT user_id FROM orders WHERE total_amount > 1000);

-- 优化为JOIN
SELECT DISTINCT u.* 
FROM users u
JOIN orders o ON u.id = o.user_id 
WHERE o.total_amount > 1000;

-- 或使用EXISTS
SELECT * FROM users u
WHERE EXISTS (
    SELECT 1 FROM orders o 
    WHERE o.user_id = u.id AND o.total_amount > 1000
);

优化LIMIT查询

-- 为LIMIT查询创建合适的索引
CREATE INDEX idx_users_created_at ON users (created_at DESC);

-- 高效的分页查询
SELECT * FROM users 
ORDER BY created_at DESC 
LIMIT 20 OFFSET 0;

-- 更好的分页方式(避免OFFSET)
SELECT * FROM users 
WHERE created_at < '2023-10-01 12:00:00' 
ORDER BY created_at DESC 
LIMIT 20;

7.4 聚合查询优化

使用物化视图

-- 创建聚合物化视图
CREATE MATERIALIZED VIEW user_order_stats AS
SELECT 
    u.id,
    u.username,
    COUNT(o.id) AS order_count,
    SUM(o.total_amount) AS total_spent,
    AVG(o.total_amount) AS avg_order_value
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
GROUP BY u.id, u.username;

-- 为物化视图创建索引
CREATE INDEX idx_user_order_stats_user_id ON user_order_stats (id);
CREATE INDEX idx_user_order_stats_total_spent ON user_order_stats (total_spent DESC);

-- 刷新物化视图
REFRESH MATERIALIZED VIEW CONCURRENTLY user_order_stats;

优化GROUP BY

-- 为GROUP BY列创建索引
CREATE INDEX idx_orders_user_date ON orders (user_id, order_date);

-- 优化的聚合查询
SELECT 
    user_id,
    DATE_TRUNC('month', order_date) AS month,
    COUNT(*) AS order_count,
    SUM(total_amount) AS monthly_sales
FROM orders
WHERE order_date >= CURRENT_DATE - INTERVAL '1 year'
GROUP BY user_id, DATE_TRUNC('month', order_date)
ORDER BY user_id, month;

7.5 查询重写优化

使用CTE优化复杂查询

-- 使用CTE提高查询可读性和性能
WITH active_users AS (
    SELECT id, username, email
    FROM users
    WHERE is_active = true
),
user_orders AS (
    SELECT 
        user_id,
        COUNT(*) AS order_count,
        SUM(total_amount) AS total_spent
    FROM orders
    WHERE order_date >= CURRENT_DATE - INTERVAL '30 days'
    GROUP BY user_id
)
SELECT 
    au.username,
    au.email,
    COALESCE(uo.order_count, 0) AS order_count,
    COALESCE(uo.total_spent, 0) AS total_spent
FROM active_users au
LEFT JOIN user_orders uo ON au.id = uo.user_id
ORDER BY uo.total_spent DESC NULLS LAST;

窗口函数优化

-- 使用窗口函数避免子查询
SELECT 
    username,
    email,
    created_at,
    ROW_NUMBER() OVER (ORDER BY created_at DESC) AS row_num,
    RANK() OVER (PARTITION BY DATE(created_at) ORDER BY created_at) AS daily_rank
FROM users
WHERE is_active = true;

7.6 性能监控和调优

慢查询监控

-- 启用慢查询日志
-- 在postgresql.conf中设置:
-- log_min_duration_statement = 1000  # 记录执行时间超过1秒的查询

-- 查询慢查询日志
-- 需要配置logging_collector = on

-- 使用pg_stat_statements扩展监控查询性能
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;

-- 查看最耗时的查询
SELECT 
    query,
    calls,
    total_time,
    mean_time,
    rows
FROM pg_stat_statements
ORDER BY total_time DESC
LIMIT 10;

数据库统计信息

-- 查看表统计信息
SELECT 
    schemaname,
    tablename,
    seq_scan,
    seq_tup_read,
    idx_scan,
    idx_tup_fetch,
    n_tup_ins,
    n_tup_upd,
    n_tup_del
FROM pg_stat_user_tables
WHERE schemaname = 'public'
ORDER BY seq_scan DESC;

-- 更新表统计信息
ANALYZE users;
ANALYZE orders;

-- 更新所有表统计信息
ANALYZE;

缓存命中率监控

-- 查看缓存命中率
SELECT 
    blks_read,
    blks_hit,
    ROUND(blks_hit::float / (blks_hit + blks_read) * 100, 2) AS cache_hit_ratio
FROM pg_stat_database
WHERE datname = 'mydb';

-- 理想情况下缓存命中率应该在95%以上

7.7 配置参数优化

内存相关参数

-- shared_buffers: 共享内存缓冲区大小(建议设置为物理内存的25%)
-- work_mem: 每个查询操作使用的内存(排序、哈希等)
-- maintenance_work_mem: 维护操作使用的内存(VACUUM、CREATE INDEX等)
-- effective_cache_size: 优化器假设的可用缓存大小

-- 在postgresql.conf中配置:
shared_buffers = 256MB
work_mem = 4MB
maintenance_work_mem = 64MB
effective_cache_size = 1GB

并行查询参数

-- max_worker_processes: 最大工作进程数
-- max_parallel_workers_per_gather: 每个Gather节点的最大并行工作进程数
-- max_parallel_workers: 系统范围内的最大并行工作进程数

-- 在postgresql.conf中配置:
max_worker_processes = 8
max_parallel_workers_per_gather = 2
max_parallel_workers = 8

7.8 查询优化工具

pgBadger

pgBadger是一个PostgreSQL日志分析工具:

-- 安装pgBadger
-- 通过包管理器安装或从源码编译

-- 配置PostgreSQL记录查询日志
-- 在postgresql.conf中设置:
log_statement = 'all'
log_duration = on
log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '

-- 生成报告
pgbadger /path/to/postgresql.log -o report.html

EXPLAIN可视化工具

使用在线工具或本地工具可视化EXPLAIN输出:

  • Postgres Explain Visualizer (pev):在线EXPLAIN可视化工具
  • EXPLAIN.DEV:现代化的EXPLAIN分析工具

综合优化示例

-- 优化前的查询
EXPLAIN (ANALYZE, BUFFERS)
SELECT u.username, COUNT(o.id) as order_count, SUM(o.total_amount) as total_spent
FROM users u
JOIN orders o ON u.id = o.user_id
WHERE u.created_at > '2023-01-01' 
  AND o.order_date > '2023-01-01'
  AND u.is_active = true
GROUP BY u.id, u.username
HAVING COUNT(o.id) > 5
ORDER BY total_spent DESC
LIMIT 10;

-- 优化步骤:
-- 1. 创建必要的索引
CREATE INDEX idx_users_created_active ON users (created_at) WHERE is_active = true;
CREATE INDEX idx_orders_user_date ON orders (user_id, order_date);
CREATE INDEX idx_orders_date_user ON orders (order_date, user_id);

-- 2. 使用CTE重写查询
WITH active_users AS (
    SELECT id, username
    FROM users
    WHERE created_at > '2023-01-01' AND is_active = true
),
user_orders AS (
    SELECT 
        o.user_id,
        COUNT(*) as order_count,
        SUM(o.total_amount) as total_spent
    FROM orders o
    JOIN active_users au ON o.user_id = au.id
    WHERE o.order_date > '2023-01-01'
    GROUP BY o.user_id
    HAVING COUNT(*) > 5
)
SELECT 
    au.username,
    uo.order_count,
    uo.total_spent
FROM user_orders uo
JOIN active_users au ON uo.user_id = au.id
ORDER BY uo.total_spent DESC
LIMIT 10;

提示:查询优化是一个持续的过程。定期分析慢查询日志,监控数据库性能指标,并根据实际使用模式调整索引和配置参数。