正则进阶

分组与捕获

基本分组

python

import re

# (abc) - capturing group: the parenthesised part is matched as a unit and
# its text is stored under the next group number.
pattern = r"(\d{4})-(\d{2})-(\d{2})"
text = "2024-05-15"
match = re.search(pattern, text)

if match:
    print(match.group())      # '2024-05-15' (whole match)
    print(match.group(1))     # '2024' (first group)
    print(match.group(2))     # '05' (second group)
    print(match.group(3))     # '15' (third group)
    print(match.groups())     # ('2024', '05', '15')
    print(match.start(), match.end())  # 0 10

分组命名

python

# (?P<name>...) - named group: the captured text can be read back by name
pattern = r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})"
text = "2024-05-15"
match = re.search(pattern, text)

if match:
    print(match.group('year'))   # '2024'
    print(match.group('month'))  # '05'
    print(match.group('day'))     # '15'
    print(match.groupdict())      # {'year': '2024', 'month': '05', 'day': '15'}

非捕获分组

python

# (?:...) - non-capturing group: groups the alternatives without allocating
# a group number. Use it when grouping is needed but the text itself is not.
pattern = r"(?:the|a) (\w+)"  # only the word after the article is captured
text = "the cat and a dog"
matches = re.findall(pattern, text)
print(matches)  # ['cat', 'dog']

# Capturing vs. non-capturing, side by side
pattern_capture = r"(\d{4})-(\d{2})"
pattern_nocapture = r"(?:\d{4})-(\d{2})"
text = "2024-05"

print(re.search(pattern_capture, text).groups())      # ('2024', '05')
print(re.search(pattern_nocapture, text).groups())    # ('05',) - only one group

反向引用

python

# \1, \2 - refer back to the text captured by an earlier group
# Find doubled words
pattern = r"\b(\w+)\s+\1\b"  # \1 must repeat whatever (\w+) captured
text = "hello hello world the the"
matches = re.findall(pattern, text)
print(matches)  # ['hello', 'the']

# Named backreference
pattern = r"\b(?P<word>\w+)\s+(?P=word)\b"
text = "good good morning"
match = re.search(pattern, text)
print(match.group() if match else None)  # 'good good'

# Match an HTML element whose closing tag repeats the opening tag name
pattern = r"<(\w+)[^>]*>.*?</\1>"  # \1 repeats the tag name captured after '<'
text = "<div>content</div> <span>nested <b>bold</b></span>"
matches = re.findall(pattern, text)
# findall reports non-overlapping matches: the <span> element is matched as a
# whole, so the <b> nested inside it is consumed and never reported by itself.
print(matches)  # ['div', 'span']

零宽断言（Lookaround）

零宽断言是一种特殊结构，匹配一个位置，但不消耗字符。

正向先行断言 (?=...)

python

# (?=...) - succeeds only if ... follows; the lookahead text is not consumed
# Match a "hello" that is followed by "world"
pattern = r"hello(?=\s+world)"
text = "hello world hello python"
matches = re.findall(pattern, text)
print(matches)  # ['hello']

# Extract the number that precedes the currency unit (the unit is not returned)
pattern = r"\d+(?=\s*元)"
text = "苹果 5元 香蕉 3元"
matches = re.findall(pattern, text)
print(matches)  # ['5', '3']

负向先行断言 (?!...)

python

# (?!...) - succeeds only if ... does NOT follow
# Match "hello" when it is not followed by whitespace
pattern = r"hello(?!\s)"
text = "hello world hello"
matches = re.findall(pattern, text)
print(matches)  # ['hello'] (the first hello is followed by a space and is rejected)

# Exclude numbers that carry a particular unit suffix.
# The extra `\d` alternative stops the engine from backtracking to a shorter
# run of digits: with plain r"\d+(?!px)", "10px" still yields "1", because
# "1" is followed by "0" rather than "px".
pattern = r"\d+(?!\d|px)"  # digits not followed by another digit or by "px"
text = "10px 20em 30pt 40px"
matches = re.findall(pattern, text)
print(matches)  # ['20', '30']

正向后行断言 (?<=...)

python

# (?<=...) - succeeds only if ... precedes; the preceding text is not included
# Match the digits that come right after "$"
pattern = r"(?<=\$)\d+"
text = "Price: $100, €200, £300"
matches = re.findall(pattern, text)
print(matches)  # ['100']

# Extract the text between <b> and </b> without using a capturing group
pattern = r"(?<=<b>)[^<]+(?=</b>)"
text = "<b>bold</b> and <b>strong</b>"
matches = re.findall(pattern, text)
print(matches)  # ['bold', 'strong']

负向后行断言 (?<!...)

python

# (?<!...) - succeeds only if ... does NOT precede
# Match commas that are not preceded by a digit
pattern = r"(?<!\d),"
text = "1,234,567"
matches = re.findall(pattern, text)
print(matches)  # [] - both commas are preceded by a digit, so both are rejected

条件匹配

(?(id/name)yes|no)（注意：Python 的 re 模块只支持以分组编号或名称作为条件，不支持 (?(?=...)yes|no) 这种以断言作为条件的写法）

python

# Python's re module supports only group-based conditionals:
#   (?(id/name)yes|no) - try `yes` if the numbered/named group took part in
#   the match, otherwise try `no`.
# A lookaround used as the condition, (?(?=...)yes|no), is rejected when the
# pattern is compiled and raises re.error.

# A plain lookahead (not a conditional): only start at a quote that has a
# closing quote somewhere after it.
pattern = r'"(?=.*?")(.+?)"'
text = '"hello" and "world"'
matches = re.findall(pattern, text)
print(matches)  # ['hello', 'world']

# Group conditional: a closing '>' is required exactly when an opening '<'
# was captured by group 1; otherwise the address must run to the end.
pattern = r"(<)?(\w+@\w+(?:\.\w+)+)(?(1)>|$)"
candidates = ["<user@host.com>", "user@host.com", "<user@host.com", "user@host.com>"]
accepted = [bool(re.match(pattern, item)) for item in candidates]
print(accepted)  # [True, True, False, False]

# Keep tokens that start with http:// or https:// and skip everything else.
pattern = r"https?://\S+"
text = "https://example.com and plain text"
matches = re.findall(pattern, text)
print(matches)  # ['https://example.com']

Python re 模块高级用法

编译标志

python

text = "Hello\nWorld"

# re.IGNORECASE / re.I - case-insensitive matching
print(re.findall(r"hello", text, re.I))  # ['Hello']

# re.MULTILINE / re.M - ^ and $ also match at line boundaries
print(re.findall(r"^World", text, re.M))  # ['World']

# re.DOTALL / re.S - . also matches a newline
text = "hello\nworld"
print(re.findall(r"hello.world", text, re.S))  # ['hello\nworld']

# re.VERBOSE / re.X - whitespace in the pattern is ignored and # starts an
# in-pattern comment
pattern = r"""
\d{4}  # 年份
-      # 分隔符
\d{2}  # 月份
"""
print(re.findall(pattern, "2024-05", re.VERBOSE))  # ['2024-05']

# Verbose mode combined with named groups
pattern = r"""
(?P<year>\d{4})  # 年份
-
(?P<month>\d{2})  # 月份
"""
match = re.search(pattern, "2024-05", re.VERBOSE)
print(match.groupdict())  # {'year': '2024', 'month': '05'}

finditer 高级用法

python

# Walk the matches one at a time; each item is a Match object
pattern = r"\d+"
text = "a1b22c333d"

for match in re.finditer(pattern, text):
    # Prints: "Found: 1 at 1-2", "Found: 22 at 3-5", "Found: 333 at 6-9"
    print(f"Found: {match.group()} at {match.start()}-{match.end()}")

替换高级用法

python

# Use a function as the replacement to compute each substitution
def camel_to_snake(match):
    """Return the snake_case replacement for one matched uppercase letter.

    Args:
        match: Match object whose whole match is the uppercase letter.

    Returns:
        An underscore followed by the lower-cased matched text.
    """
    # group() is the whole match, so this works whether or not the pattern
    # wraps the letter in a capturing group. group(1) raises IndexError with
    # the group-less pattern r"[A-Z]" used below.
    return "_" + match.group().lower()


text = "helloWorld pythonTest"
result = re.sub(r"[A-Z]", camel_to_snake, text)
print(result)  # 'hello_world python_test'

# Numbered references in the replacement
text = "2024-05-15"
result = re.sub(r"(\d{4})-(\d{2})-(\d{2})", r"Year:\1 Month:\2 Day:\3", text)
print(result)  # 'Year:2024 Month:05 Day:15'

# Named references in the replacement.
# Only the matched "2024-05" is rewritten; the trailing "-15" is left as is.
result = re.sub(r"(?P<year>\d{4})-(?P<month>\d{2})", r"\g<month>/\g<year>", text)
print(result)  # '05/2024-15'

split 高级用法

python

# When the pattern contains a capturing group, the separators are kept in
# the result list
text = "2024-05-15"

# Plain split
print(re.split(r"-", text))  # ['2024', '05', '15']

# Split with a capturing group (separators are kept)
print(re.split(r"(-)", text))  # ['2024', '-', '05', '-', '15']

# Split a camelCase name before each uppercase letter.
# The lookahead matches an empty string; splitting on empty matches requires
# Python 3.7 or newer.
text = "helloWorldPython"
parts = re.split(r"(?=[A-Z])", text)
print(parts)  # ['hello', 'World', 'Python']

实用技巧

1. 验证并提取数据

python

def parse_log_line(line):
    """Split a log line into its level, timestamp and message.

    The line must start with a word (level), followed by a
    ``YYYY-MM-DD HH:MM:SS`` timestamp and then the message text.

    Args:
        line: The raw log line.

    Returns:
        A dict with keys ``level``, ``time`` and ``msg``, or ``None`` when
        the line does not start with that layout.
    """
    found = re.match(
        r"(?P<level>\w+)\s+(?P<time>\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})\s+(?P<msg>.+)",
        line,
    )
    return found.groupdict() if found else None


log = "ERROR 2024-05-15 10:30:45 Database connection failed"
result = parse_log_line(log)
print(result)
# {'level': 'ERROR', 'time': '2024-05-15 10:30:45', 'msg': 'Database connection failed'}

2. HTML/文本清理

python

def clean_html(html):
    """Strip tags from an HTML fragment and return its plain text.

    Tags are removed first, then character references are decoded, then
    every run of whitespace is collapsed to a single space.

    Args:
        html: The HTML fragment as a string.

    Returns:
        The plain text with entities decoded and whitespace normalised.
    """
    # Imported by name because the ``html`` parameter shadows the module.
    from html import unescape

    # Drop anything that looks like a tag.
    text = re.sub(r"<[^>]+>", "", html)
    # Decode all named and numeric character references in a single pass
    # (so "&amp;lt;" becomes "&lt;", not "<").
    text = unescape(text)
    # \s also matches the non-breaking space produced by "&nbsp;".
    return re.sub(r"\s+", " ", text).strip()


html = "<p>Hello&nbsp;World&lt;3&gt;</p>"
print(clean_html(html))  # 'Hello World<3>'

3. 密码强度验证

python

def validate_password(password):
    """Check a password against the strength rules, in order.

    Rules: at least 8 characters, an uppercase ASCII letter, a lowercase
    ASCII letter, a digit, and one of the listed special characters.

    Args:
        password: The candidate password.

    Returns:
        A ``(ok, message)`` tuple. ``ok`` is False with the message of the
        first rule that fails, or True with the success message.
    """
    if len(password) < 8:
        return False, "密码至少8位"

    # Each entry: (character class that must occur, message if it does not).
    required = (
        (r"[A-Z]", "需要包含大写字母"),
        (r"[a-z]", "需要包含小写字母"),
        (r"\d", "需要包含数字"),
        (r"[!@#$%^&*(),.?\":{}|<>]", "需要包含特殊字符"),
    )
    for char_class, complaint in required:
        if re.search(char_class, password) is None:
            return False, complaint
    return True, "密码强度合格"

4. 敏感信息脱敏

python

def mask_phone(phone):
    """Mask the middle four digits of every 3+4+4 digit run in *phone*.

    Text that contains no run of eleven digits is returned unchanged.
    """
    eleven_digits = r"(\d{3})\d{4}(\d{4})"
    keep_both_ends = r"\1****\2"
    return re.sub(eleven_digits, keep_both_ends, phone)

def mask_email(email):
    """Mask the part before each ``@``, keeping only its first character."""

    def _first_char_only(m):
        # group(1) is the text in front of the '@' that was matched.
        return m.group(1)[0] + "***@"

    return re.sub(r"(?P<name>[^@]+)@", _first_char_only, email)

def mask_id_card(id_card):
    """Mask an 18-character ID number, keeping only its last four characters.

    The first fourteen digits are replaced by six asterisks. The final
    character may be a digit or the check character ``X``/``x``; a pattern
    that allowed digits only would leave such numbers completely unmasked.

    Args:
        id_card: Text containing the ID number.

    Returns:
        The text with each ID number masked. Input without a matching
        number is returned unchanged.
    """
    return re.sub(r"\d{6}\d{8}(\d{3}[\dXx])", r"******\1", id_card)

print(mask_phone("13812345678"))     # 138****5678
print(mask_email("john@example.com")) # j***@example.com
print(mask_id_card("110101199001011234"))  # ******1234

5. URL 参数解析

python

def parse_url_params(url):
    """Extract the query-string parameters of *url* into a dict.

    Values are returned as written (no percent-decoding). Pairs without an
    ``=`` are skipped, and a later duplicate key overwrites an earlier one.

    Args:
        url: The URL to inspect.

    Returns:
        A dict mapping parameter names to values; empty when the URL has
        no query string.
    """
    # Stop at '#' so a trailing fragment is not glued onto the last value.
    match = re.search(r"\?([^#]+)", url)
    if not match:
        return {}

    # Split each pair on the first '=' only, so values may contain '='.
    return dict(
        pair.split("=", 1)
        for pair in match.group(1).split("&")
        if "=" in pair
    )


url = "https://example.com/search?q=python&page=1&sort=asc"
print(parse_url_params(url))
# {'q': 'python', 'page': '1', 'sort': 'asc'}

6. 代码注释移除

python

def remove_comments(code):
    """Strip ``#`` comments from Python source, leaving string literals intact.

    The pattern matches quoted strings and comments in one left-to-right
    pass. Strings are put back unchanged and comments are dropped, so a
    ``#`` inside a string is never treated as the start of a comment.

    Args:
        code: Python source text.

    Returns:
        The source with every comment removed. Whitespace before a comment
        and the line break after it are kept.
    """
    # NOTE(review): this is still a regex approximation; a triple-quoted
    # string containing an unbalanced quote character may be split wrongly.
    token = re.compile(
        r'("(?:\\[\s\S]|[^"\\])*"'      # group 1: double-quoted literal
        r"|'(?:\\[\s\S]|[^'\\])*')"     # group 1: single-quoted literal
        r"|#.*"                         # comment running to end of line
    )
    # group(1) is None for a comment match, which is replaced by nothing.
    return token.sub(lambda m: m.group(1) or "", code)


python_code = '''
# 这是注释
def hello():
    print("Hello, World!")  # 打印
'''
print(remove_comments(python_code))

正则表达式性能优化

1. 预编译

python

# Compile once when the same pattern is used many times
# NOTE(review): `text` is not defined in this snippet; it presumably relies on
# the value assigned by an earlier snippet.
pattern = re.compile(r"\d+")
for _ in range(1000):
    pattern.findall(text)

2. 避免贪婪过度

python

# A greedy quantifier grabs as much as it can and then backtracks
text = "<div>content</div><span>more</span>"

# Poor: greedy `.+` spans from the first '<' to the last '>', so the whole
# string comes back as a single match after backtracking
re.findall(r"<.+>", text)

# Better: a negated character class cannot run past the closing '>'
# (a lazy `<.+?>` would also work)
re.findall(r"<[^>]+>", text)

3. 使用字符类代替点号

python

# Poor: greedy `.+` runs to the end of the line and backtracks to the last quote
re.findall(r"class=\"(.+)\"", text)

# Better: a negated character class stops at the first closing quote
re.findall(r"class=\"([^\"]+)\"", text)

[[返回正则首页|reg/index]]

正则进阶 ​

分组与捕获 ​

基本分组 ​

分组命名 ​

非捕获分组 ​

反向引用 ​

零宽断言（Lookaround） ​

正向先行断言 (?=...) ​

负向先行断言 (?!...) ​

正向后行断言 (?<=...) ​

负向后行断言 (?<!...) ​

条件匹配 ​

(?(?=...)yes|no) ​

Python re 模块高级用法 ​

编译标志 ​

finditer 高级用法 ​

替换高级用法 ​

split 高级用法 ​

实用技巧 ​

1. 验证并提取数据 ​

2. HTML/文本清理 ​

3. 密码强度验证 ​

4. 敏感信息脱敏 ​

5. URL 参数解析 ​

6. 代码注释移除 ​

正则表达式性能优化 ​

1. 预编译 ​

2. 避免贪婪过度 ​

3. 使用字符类代替点号 ​

正则进阶

分组与捕获

基本分组

分组命名

非捕获分组

反向引用

零宽断言（Lookaround）

正向先行断言 (?=...)

负向先行断言 (?!...)

正向后行断言 (?<=...)

负向后行断言 (?<!...)

条件匹配

(?(?=...)yes|no)

Python re 模块高级用法

编译标志

finditer 高级用法

替换高级用法

split 高级用法

实用技巧

1. 验证并提取数据

2. HTML/文本清理

3. 密码强度验证

4. 敏感信息脱敏

5. URL 参数解析

6. 代码注释移除

正则表达式性能优化

1. 预编译

2. 避免贪婪过度

3. 使用字符类代替点号