正则进阶
分组与捕获
基本分组
python
import re
# (abc) - capturing group: the parenthesised part is matched as one unit and
# its text is stored under a 1-based group number.
pattern = r"(\d{4})-(\d{2})-(\d{2})"
text = "2024-05-15"
match = re.search(pattern, text)
if match:
    print(match.group())   # '2024-05-15' (the whole match, same as group(0))
    print(match.group(1))  # '2024' (first group)
    print(match.group(2))  # '05' (second group)
    print(match.group(3))  # '15' (third group)
    print(match.groups())  # ('2024', '05', '15')
    # end() is exclusive: a 10-character match starting at 0 spans 0..10.
    print(match.start(), match.end())  # 0 10
2
3
4
5
6
7
8
9
10
11
12
13
14
2
3
4
5
6
7
8
9
10
11
12
13
14
分组命名
python
# (?P<name>...) - named group: captured text is reachable by name as well as
# by number.
pattern = r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})"
text = "2024-05-15"
match = re.search(pattern, text)
if match:
    print(match.group('year'))   # '2024'
    print(match.group('month'))  # '05'
    print(match.group('day'))    # '15'
    # groupdict() maps every named group to the text it captured.
    print(match.groupdict())     # {'year': '2024', 'month': '05', 'day': '15'}
2
3
4
5
6
7
8
9
10
2
3
4
5
6
7
8
9
10
非捕获分组
python
# (?:...) - non-capturing group: groups without allocating a group number.
# Use it when grouping is needed (alternation, quantifiers) but the text is not.
pattern = r"(?:the|a) (\w+)" # only the word after the article is captured
text = "the cat and a dog"
matches = re.findall(pattern, text)
print(matches) # ['cat', 'dog']
# Capturing vs. non-capturing, side by side
pattern_capture = r"(\d{4})-(\d{2})"
pattern_nocapture = r"(?:\d{4})-(\d{2})"
text = "2024-05"
print(re.search(pattern_capture, text).groups()) # ('2024', '05')
print(re.search(pattern_nocapture, text).groups()) # ('05',) - the year part is grouped but not captured
2
3
4
5
6
7
8
9
10
11
12
13
14
2
3
4
5
6
7
8
9
10
11
12
13
14
反向引用
python
# \1, \2 - backreferences: match the same text an earlier group captured
# Find doubled words
pattern = r"\b(\w+)\s+\1\b" # \1 must repeat whatever (\w+) captured
text = "hello hello world the the"
matches = re.findall(pattern, text)
print(matches) # ['hello', 'the']
# Named backreference
pattern = r"\b(?P<word>\w+)\s+(?P=word)\b"
text = "good good morning"
match = re.search(pattern, text)
print(match.group() if match else None) # 'good good'
# Match an HTML element whose closing tag name equals its opening tag name
pattern = r"<(\w+)[^>]*>.*?</\1>" # \1 refers to the tag name captured inside <...>
text = "<div>content</div> <span>nested <b>bold</b></span>"
matches = re.findall(pattern, text)
# findall returns non-overlapping matches: the whole <span>...</span> element is
# consumed as one match, so the <b> nested inside it is never reported.
print(matches) # ['div', 'span']
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
零宽断言(Lookaround)
零宽断言是一种特殊结构,匹配一个位置,但不消耗字符。
正向先行断言 (?=...)
python
# (?=...) - positive lookahead: must be followed by ... (which is not consumed)
# Match a "hello" that is followed by "world"
pattern = r"hello(?=\s+world)"
text = "hello world hello python"
matches = re.findall(pattern, text)
print(matches) # ['hello']
# Extract the number that precedes the currency unit
pattern = r"\d+(?=\s*元)"
text = "苹果 5元 香蕉 3元"
matches = re.findall(pattern, text)
print(matches) # ['5', '3']
2
3
4
5
6
7
8
9
10
11
12
2
3
4
5
6
7
8
9
10
11
12
负向先行断言 (?!...)
python
# (?!...) - negative lookahead: must NOT be followed by ...
# Match a "hello" that is not followed by whitespace
pattern = r"hello(?!\s)"
text = "hello world hello"
matches = re.findall(pattern, text)
# The first hello is followed by a space and is rejected; only the last one
# (at the end of the string) matches.
print(matches)  # ['hello']

# Exclude numbers with a specific suffix.
# (?!px) alone is not enough: for "10px" the engine backtracks \d+ to "1",
# which is followed by "0" rather than "px", and reports a bogus "1".
# Also forbidding a following digit stops the number from being split.
pattern = r"\d+(?!\d|px)"  # a whole number that is not followed by px
text = "10px 20em 30pt 40px"
matches = re.findall(pattern, text)
print(matches)  # ['20', '30']
2
3
4
5
6
7
8
9
10
11
12
2
3
4
5
6
7
8
9
10
11
12
正向后行断言 (?<=...)
python
# (?<=...) - positive lookbehind: must be preceded by ... (which is not consumed)
# Match the digits that come right after "$"
pattern = r"(?<=\$)\d+"
text = "Price: $100, €200, £300"
matches = re.findall(pattern, text)
print(matches) # ['100']
# Extract the text inside <b>...</b> without using a capturing group
pattern = r"(?<=<b>)[^<]+(?=</b>)"
text = "<b>bold</b> and <b>strong</b>"
matches = re.findall(pattern, text)
print(matches) # ['bold', 'strong']
2
3
4
5
6
7
8
9
10
11
12
2
3
4
5
6
7
8
9
10
11
12
负向后行断言 (?<!...)
python
# (?<!...) - negative lookbehind: must NOT be preceded by ...
# Match commas that are not preceded by a digit
pattern = r"(?<!\d),"
text = "1,234,567"
matches = re.findall(pattern, text)
print(matches)  # [] - every comma here follows a digit, so all are rejected

text = "1,234 apples, pears"
matches = re.findall(pattern, text)
print(matches)  # [','] - only the comma after "apples" qualifies
2
3
4
5
6
2
3
4
5
6
条件匹配
(?(id/name)yes|no)(Python 的 re 模块只支持以捕获分组是否参与匹配作为条件,不支持 (?(?=...)yes|no) 这种以断言作为条件的写法)
python
# Python's re module supports only (?(id/name)yes|no): the condition is
# "did capturing group id/name take part in the match?". A lookaround cannot
# be the condition - a pattern such as (?(?=...)yes|no) raises re.error.

# Quoted strings: the lookahead only checks that a closing quote exists
# (this is a plain lookahead, not a conditional).
pattern = r'"(?=.*?")(.+?)"'
text = '"hello" and "world"'
matches = re.findall(pattern, text)
print(matches)  # ['hello', 'world']

# A real conditional: a closing ">" is required only if an opening "<"
# (group 1) was matched; otherwise the address must run to the end.
pattern = r"(<)?(\w+@\w+(?:\.\w+)+)(?(1)>|$)"
candidates = ["<user@host.com>", "user@host.com", "<user@host.com", "user@host.com>"]
matches = [s for s in candidates if re.match(pattern, s)]
print(matches)  # ['<user@host.com>', 'user@host.com']
2
3
4
5
6
7
8
9
10
11
12
13
2
3
4
5
6
7
8
9
10
11
12
13
Python re 模块高级用法
编译标志
python
text = "Hello\nWorld"
# re.IGNORECASE / re.I - case-insensitive matching
print(re.findall(r"hello", text, re.I)) # ['Hello']
# re.MULTILINE / re.M - ^ and $ also match at the start/end of each line
print(re.findall(r"^World", text, re.M)) # ['World']
# re.DOTALL / re.S - . also matches a newline
text = "hello\nworld"
print(re.findall(r"hello.world", text, re.S)) # ['hello\nworld']
# re.VERBOSE / re.X - whitespace in the pattern is ignored and "#" starts a
# comment inside the pattern string itself
pattern = r"""
\d{4} # 年份
- # 分隔符
\d{2} # 月份
"""
print(re.findall(pattern, "2024-05", re.VERBOSE)) # ['2024-05']
# VERBOSE layout combined with named groups
pattern = r"""
(?P<year>\d{4}) # 年份
-
(?P<month>\d{2}) # 月份
"""
match = re.search(pattern, "2024-05", re.VERBOSE)
print(match.groupdict()) # {'year': '2024', 'month': '05'}
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
finditer 高级用法
python
# finditer yields one Match object per hit, so each match can be handled
# individually and its position is available.
pattern = r"\d+"
text = "a1b22c333d"
for match in re.finditer(pattern, text):
    # Prints: "Found: 1 at 1-2", "Found: 22 at 3-5", "Found: 333 at 6-9"
    print(f"Found: {match.group()} at {match.start()}-{match.end()}")
2
3
4
5
6
2
3
4
5
6
替换高级用法
python
# Use a callable replacement when the substitution depends on the match
def camel_to_snake(match):
    """Return the replacement for one uppercase letter: "_" plus its lowercase form.

    Intended as the ``repl`` callback of ``re.sub(r"[A-Z]", camel_to_snake, s)``.

    Args:
        match: Match object for a single uppercase letter.

    Returns:
        The underscore-prefixed, lower-cased letter.
    """
    # The pattern has no capturing group, so group(1) would raise IndexError;
    # group() (group 0) is the whole match.
    return "_" + match.group().lower()


text = "helloWorld pythonTest"
result = re.sub(r"[A-Z]", camel_to_snake, text)
print(result)  # 'hello_world python_test'

# Numbered references in the replacement string
text = "2024-05-15"
result = re.sub(r"(\d{4})-(\d{2})-(\d{2})", r"Year:\1 Month:\2 Day:\3", text)
print(result)  # 'Year:2024 Month:05 Day:15'

# Named references; text outside the match (the trailing "-15") is left as is
result = re.sub(r"(?P<year>\d{4})-(?P<month>\d{2})", r"\g<month>/\g<year>", text)
print(result)  # '05/2024-15'
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
split 高级用法
python
# When the separator pattern contains a capturing group, the separators are
# kept in the result list
text = "2024-05-15"
# Plain split
print(re.split(r"-", text)) # ['2024', '05', '15']
# Split with a capturing group (separators are retained)
print(re.split(r"(-)", text)) # ['2024', '-', '05', '-', '15']
# Split a camelCase identifier at the position before each uppercase letter
text = "helloWorldPython"
parts = re.split(r"(?=[A-Z])", text)
print(parts) # ['hello', 'World', 'Python']
2
3
4
5
6
7
8
9
10
11
12
13
2
3
4
5
6
7
8
9
10
11
12
13
实用技巧
1. 验证并提取数据
python
def parse_log_line(line):
    """Parse one "LEVEL YYYY-MM-DD HH:MM:SS message" log line.

    Args:
        line: The log line; matching is anchored at its start.

    Returns:
        A dict with the keys 'level', 'time' and 'msg', or None when the line
        does not have the expected layout.
    """
    pattern = r"(?P<level>\w+)\s+(?P<time>\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})\s+(?P<msg>.+)"
    match = re.match(pattern, line)
    if match:
        return match.groupdict()
    return None


log = "ERROR 2024-05-15 10:30:45 Database connection failed"
result = parse_log_line(log)
print(result)
# {'level': 'ERROR', 'time': '2024-05-15 10:30:45', 'msg': 'Database connection failed'}
2
3
4
5
6
7
8
9
10
11
12
2
3
4
5
6
7
8
9
10
11
12
2. HTML/文本清理
python
def clean_html(html):
    """Strip HTML tags, decode character references and collapse whitespace.

    Args:
        html: Markup to clean.

    Returns:
        The plain text with entities such as &nbsp; / &lt; / &amp; decoded,
        runs of whitespace reduced to one space and both ends trimmed.
    """
    # Imported by name because the parameter shadows the `html` module here.
    from html import unescape

    # Remove tags first, so that a decoded "&lt;3&gt;" is not mistaken for one.
    text = re.sub(r"<[^>]+>", "", html)
    # Decode all character references in a single pass; "&amp;lt;" becomes
    # "&lt;" and is not decoded a second time.
    text = unescape(text)
    # Collapse whitespace (\s also covers the no-break space from &nbsp;).
    text = re.sub(r"\s+", " ", text).strip()
    return text


html = "<p>Hello&nbsp;World&lt;3&gt;</p>"
print(clean_html(html))  # 'Hello World<3>'
2
3
4
5
6
7
8
9
10
11
12
13
14
15
2
3
4
5
6
7
8
9
10
11
12
13
14
15
3. 密码强度验证
python
def validate_password(password):
    """Check a password against the strength rules.

    Rules, checked in this order:
    - at least 8 characters
    - contains an uppercase letter
    - contains a lowercase letter
    - contains a digit
    - contains one of the special characters !@#$%^&*(),.?":{}|<>

    Args:
        password: The candidate password.

    Returns:
        A ``(ok, message)`` tuple; ``message`` names the first rule that
        failed, or reports success when every rule passes.
    """
    if len(password) < 8:
        return False, "密码至少8位"

    # (required character class, message returned when it is missing)
    required = (
        (r"[A-Z]", "需要包含大写字母"),
        (r"[a-z]", "需要包含小写字母"),
        (r"\d", "需要包含数字"),
        (r"[!@#$%^&*(),.?\":{}|<>]", "需要包含特殊字符"),
    )
    for char_class, failure in required:
        if not re.search(char_class, password):
            return False, failure
    return True, "密码强度合格"
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
4. 敏感信息脱敏
python
def mask_phone(phone):
    """Mask the middle four digits of an 11-digit phone number.

    Keeps the first three and last four digits; text that does not contain
    eleven consecutive digits is returned unchanged.
    """
    return re.sub(r"(\d{3})\d{4}(\d{4})", r"\1****\2", phone)


def mask_email(email):
    """Mask the local part of an e-mail address, keeping its first character."""
    return re.sub(
        r"(?P<name>[^@]+)@",
        lambda m: m.group("name")[0] + "***@",  # read the group by its name
        email,
    )


def mask_id_card(id_card):
    """Mask an 18-character ID number, keeping only the last four characters.

    The final character may be the check letter X, so it is accepted in
    addition to digits; otherwise such numbers would be returned unmasked.
    """
    return re.sub(r"\d{6}\d{8}(\d{3}[\dXx])", r"******\1", id_card)


print(mask_phone("13812345678"))  # 138****5678
print(mask_email("john@example.com"))  # j***@example.com
print(mask_id_card("110101199001011234"))  # ******1234
2
3
4
5
6
7
8
9
10
11
12
13
14
15
2
3
4
5
6
7
8
9
10
11
12
13
14
15
5. URL 参数解析
python
def parse_url_params(url):
    """Parse the query string of a URL into a dict.

    Args:
        url: The URL to inspect.

    Returns:
        A dict mapping each parameter name to its raw (not percent-decoded)
        value. Pairs without "=" are skipped, a repeated key keeps its last
        value, and a URL without a query string yields an empty dict.
    """
    # Take everything after "?" up to the "#fragment", which is not part of
    # the query string and must not leak into the last value.
    match = re.search(r"\?([^#]*)", url)
    if not match:
        return {}
    params = {}
    for pair in match.group(1).split("&"):
        if "=" in pair:
            # Split on the first "=" only, so values may contain "=".
            key, value = pair.split("=", 1)
            params[key] = value
    return params


url = "https://example.com/search?q=python&page=1&sort=asc"
print(parse_url_params(url))
# {'q': 'python', 'page': '1', 'sort': 'asc'}
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
6. 代码注释移除
python
# Matches either a string literal (captured as group 1) or a "#" comment.
# String literals are listed first so that a "#" inside one is consumed as
# part of the string instead of starting a comment.
_STRING_OR_COMMENT = re.compile(
    r'("""[\s\S]*?"""'
    r"|'''[\s\S]*?'''"
    r'|"(?:\\.|[^"\\\n])*"'
    r"|'(?:\\.|[^'\\\n])*')"
    r"|#.*"
)


def remove_comments(code):
    """Remove "#" comments from Python source text.

    String literals are kept verbatim, so a "#" inside quotes is not treated
    as a comment. Trailing whitespace before a removed comment is left as is.

    NOTE(review): this is a regex approximation, not a tokenizer; unusual
    literals (e.g. a backslash-newline inside a single-quoted string) may not
    be recognised - confirm before relying on it for arbitrary code.

    Args:
        code: Python source text.

    Returns:
        The source with comments removed.
    """
    # Keep the literal when group 1 matched; otherwise the match is a comment.
    return _STRING_OR_COMMENT.sub(lambda m: m.group(1) or "", code)


python_code = '''
# 这是注释
def hello():
    print("Hello, World!") # 打印
'''
print(remove_comments(python_code))
2
3
4
5
6
7
8
9
10
11
12
13
2
3
4
5
6
7
8
9
10
11
12
13
正则表达式性能优化
1. 预编译
python
# Compile once when a pattern is used many times: the compiled object is
# reused instead of being looked up from the pattern string on every call.
pattern = re.compile(r"\d+")
text = "a1b22c333d"  # sample input so the snippet runs on its own
for _ in range(1000):
    pattern.findall(text)
2
3
4
2
3
4
2. 避免贪婪过度
python
# A greedy .+ first runs to the end of the text and then backtracks
text = "<div>content</div><span>more</span>"
# Bad: greedy - costs extra backtracking and here swallows everything from
# the first "<" to the last ">" as a single match
re.findall(r"<.+>", text)  # ['<div>content</div><span>more</span>']
# Good: a negated character class (not a lazy quantifier) stops at the first
# ">" without any backtracking
re.findall(r"<[^>]+>", text)  # ['<div>', '</div>', '<span>', '</span>']
2
3
4
5
6
7
8
2
3
4
5
6
7
8
3. 使用字符类代替点号
python
text = '<div class="box" id="main">'  # sample input so the snippet runs on its own
# Bad: greedy . runs past the closing quote and has to backtrack
re.findall(r"class=\"(.+)\"", text)  # ['box" id="main']
# Good: a character class that cannot cross the closing quote
re.findall(r"class=\"([^\"]+)\"", text)  # ['box']
2
3
4
5
2
3
4
5
[[返回正则首页|reg/index]]