PostgreSQL字符串函数

本章将详细介绍PostgreSQL中的字符串函数,包括字符串连接、大小写转换、子字符串提取、模式匹配等,帮助您高效处理文本数据。

字符串基本操作

字符串连接

多种方式连接字符串:

SELECT 
    'Hello' || ' ' || 'World' AS concat_operator,           -- 使用||操作符
    CONCAT('Hello', ' ', 'World') AS concat_function,       -- 使用CONCAT函数
    CONCAT_WS(' ', 'Hello', 'Beautiful', 'World') AS concat_ws_function; -- 使用分隔符连接

字符串长度

获取字符串长度:

SELECT 
    LENGTH('Hello World') AS char_length,                   -- 字符长度
    CHAR_LENGTH('Hello World') AS char_length_alt,          -- 同LENGTH
    OCTET_LENGTH('Hello World') AS byte_length;             -- 字节长度

大小写转换

转换字符串大小写:

SELECT 
    UPPER('Hello World') AS uppercase,                      -- 转大写
    LOWER('Hello World') AS lowercase,                      -- 转小写
    INITCAP('hello world') AS initcap;                      -- 首字母大写

字符串提取和定位

子字符串提取

从字符串中提取子串:

SELECT 
    SUBSTRING('Hello World' FROM 1 FOR 5) AS substring_pos, -- 从位置1开始取5个字符
    SUBSTRING('Hello World' FROM 'Hello') AS substring_pattern, -- 使用正则表达式
    LEFT('Hello World', 5) AS left_chars,                   -- 从左边取5个字符
    RIGHT('Hello World', 5) AS right_chars;                 -- 从右边取5个字符

字符串定位

查找字符或子串的位置:

SELECT 
    POSITION('World' IN 'Hello World') AS position_result,  -- 查找位置
    STRPOS('Hello World', 'World') AS strpos_result;        -- 同POSITION

字符串修改

字符串替换

替换字符串中的字符或子串:

SELECT 
    REPLACE('Hello World', 'World', 'PostgreSQL') AS replace_result, -- 替换子串
    TRANSLATE('Hello', 'el', 'ip') AS translate_result;     -- 字符映射替换

字符串填充

在字符串两侧填充字符:

SELECT 
    LPAD('Hello', 10, '*') AS left_padded,                  -- 左侧填充到10个字符
    RPAD('Hello', 10, '*') AS right_padded,                 -- 右侧填充到10个字符

字符串修剪

去除字符串首尾的字符:

SELECT 
    TRIM('  Hello World  ') AS trim_spaces,                 -- 去除首尾空格
    TRIM(LEADING 'x' FROM 'xxxHelloxxx') AS trim_leading,   -- 去除开头的x
    TRIM(TRAILING 'x' FROM 'xxxHelloxxx') AS trim_trailing, -- 去除结尾的x
    TRIM(BOTH 'x' FROM 'xxxHelloxxx') AS trim_both,         -- 去除首尾的x
    BTRIM('  Hello World  ') AS btrim_result,               -- 同TRIM
    LTRIM('  Hello World  ') AS ltrim_result,               -- 去除开头空格
    RTRIM('  Hello World  ') AS rtrim_result;               -- 去除结尾空格

模式匹配

LIKE操作符

使用LIKE进行简单模式匹配:

SELECT * FROM users 
WHERE username LIKE 'john%';                              -- 以john开头的用户名

SELECT * FROM users 
WHERE email LIKE '%@gmail.com';                           -- Gmail邮箱

SELECT * FROM products 
WHERE name LIKE 'A%B';                                    -- 以A开头B结尾的产品名

正则表达式匹配

使用正则表达式进行复杂模式匹配:

SELECT 
    'Hello World' ~ 'Hello' AS regex_match,               -- 匹配返回true
    'Hello World' ~* 'hello' AS regex_match_case_insensitive, -- 不区分大小写匹配
    'Hello World' !~ 'xyz' AS regex_not_match,            -- 不匹配返回true
    SUBSTRING('Hello World' FROM 'H.*o') AS regex_extract; -- 正则表达式提取

正则表达式替换

使用正则表达式替换文本:

SELECT 
    REGEXP_REPLACE('Hello World 123', '[0-9]+', 'NUM') AS regex_replace, -- 替换数字
    REGEXP_REPLACE('John Doe', '(\w+)\s+(\w+)', '\2, \1') AS regex_rearrange; -- 重排姓名

字符串分割

分割字符串

将字符串分割为数组:

SELECT 
    STRING_TO_ARRAY('apple,banana,orange', ',') AS string_array, -- 分割为数组
    UNNEST(STRING_TO_ARRAY('apple,banana,orange', ',')) AS unnested_array; -- 展开数组

数组转字符串

将数组连接为字符串:

SELECT 
    ARRAY_TO_STRING(ARRAY['apple', 'banana', 'orange'], ', ') AS array_string; -- 数组转字符串

特殊字符处理

重复字符串

重复字符串指定次数:

SELECT 
    REPEAT('Hello', 3) AS repeated_string;                -- 重复3次

反转字符串

将字符串反转:

SELECT 
    REVERSE('Hello World') AS reversed_string;            -- 反转字符串

ASCII和字符转换

ASCII码和字符之间的转换:

SELECT 
    ASCII('A') AS ascii_value,                            -- 获取字符ASCII码
    CHR(65) AS char_from_ascii;                           -- 根据ASCII码获取字符

实际应用场景

数据清洗

-- 清洗用户数据
UPDATE users 
SET 
    username = TRIM(LOWER(username)),                     -- 去除空格并转小写
    email = LOWER(TRIM(email)),                           -- 邮箱转小写并去空格
    phone = REGEXP_REPLACE(phone, '[^0-9]', '', 'g');     -- 只保留数字

格式化输出

-- 格式化姓名显示
SELECT 
    CONCAT(UPPER(LEFT(first_name, 1)), LOWER(SUBSTRING(first_name, 2))) || ' ' ||
    UPPER(last_name) AS formatted_name
FROM users;

-- 格式化电话号码
SELECT 
    CONCAT(
        '(', SUBSTRING(phone, 1, 3), ') ',
        SUBSTRING(phone, 4, 3), '-',
        SUBSTRING(phone, 7, 4)
    ) AS formatted_phone
FROM users;

搜索和过滤

-- 全文搜索
SELECT * FROM articles 
WHERE LOWER(title) LIKE '%postgresql%' 
   OR LOWER(content) LIKE '%postgresql%';

-- 使用正则表达式搜索邮箱格式
SELECT * FROM users 
WHERE email ~* '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$';

数据验证

-- 验证数据格式
SELECT 
    username,
    email,
    CASE 
        WHEN LENGTH(username) < 3 THEN 'Username too short'
        WHEN email !~* '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 'Invalid email'
        ELSE 'Valid'
    END AS validation_result
FROM users;

性能优化

索引优化

为字符串操作创建合适的索引:

-- 为前缀匹配创建索引
CREATE INDEX idx_users_username_prefix ON users (username varchar_pattern_ops);

-- 为大小写不敏感查询创建索引
CREATE INDEX idx_users_email_lower ON users (LOWER(email));

-- 为正则表达式创建索引(部分索引)
CREATE INDEX idx_users_email_gmail ON users (email) WHERE email LIKE '%@gmail.com';

函数索引

为常用字符串函数创建索引:

-- 为字符串长度创建索引
CREATE INDEX idx_products_name_length ON products (LENGTH(name));

-- 为子字符串创建索引
CREATE INDEX idx_users_username_prefix3 ON users (SUBSTRING(username, 1, 3));

综合示例

-- 创建用户报告函数
CREATE OR REPLACE FUNCTION generate_user_report()
RETURNS TABLE(
    user_category TEXT,
    user_count BIGINT,
    avg_username_length NUMERIC,
    most_common_domain TEXT
) AS $$
BEGIN
    RETURN QUERY
    WITH domain_stats AS (
        SELECT 
            SUBSTRING(email FROM POSITION('@' IN email) + 1) AS domain,
            COUNT(*) AS domain_count
        FROM users 
        WHERE email IS NOT NULL
        GROUP BY domain
        ORDER BY domain_count DESC
        LIMIT 1
    )
    SELECT 
        CASE 
            WHEN LENGTH(username) < 5 THEN 'Short'
            WHEN LENGTH(username) BETWEEN 5 AND 10 THEN 'Medium'
            ELSE 'Long'
        END AS user_category,
        COUNT(*) AS user_count,
        ROUND(AVG(LENGTH(username)), 2) AS avg_username_length,
        (SELECT domain FROM domain_stats) AS most_common_domain
    FROM users
    WHERE username IS NOT NULL
    GROUP BY 
        CASE 
            WHEN LENGTH(username) < 5 THEN 'Short'
            WHEN LENGTH(username) BETWEEN 5 AND 10 THEN 'Medium'
            ELSE 'Long'
        END
    ORDER BY user_count DESC;
END;
$$ LANGUAGE plpgsql;

-- 使用函数生成报告
SELECT * FROM generate_user_report();

提示:在处理大量文本数据时,合理使用字符串函数和索引可以显著提高查询性能。对于复杂的文本处理需求,考虑使用正则表达式或全文搜索功能。