正则基础
什么是正则表达式
正则表达式(Regular Expression)是一种用于匹配字符串的模式。它可以用简洁的方式描述字符串的特征,实现复杂的字符串查找、验证和替换。
字符类
基本字符类
python
import re
# [abc] - 匹配 a、b 或 c 中的任意一个字符
pattern = r"[aeiou]"
text = "hello world"
matches = re.findall(pattern, text)
print(matches) # ['e', 'o', 'o']
# [^abc] - 匹配除了 a、b、c 之外的其他字符
pattern = r"[^aeiou]"
text = "hello"
matches = re.findall(pattern, text)
print(matches) # ['h', 'l', 'l']
预定义字符类
python
# \d - 匹配任意数字,等价于 [0-9]
pattern = r"\d+"
text = "Order 12345 placed"
matches = re.findall(pattern, text)
print(matches) # ['12345']
# \D - 匹配任意非数字字符,等价于 [^0-9]
pattern = r"\D+"
text = "Order 12345 placed"
matches = re.findall(pattern, text)
print(matches) # ['Order ', ' placed']
# \w - 匹配字母、数字、下划线;在 ASCII 模式(re.ASCII)下等价于 [a-zA-Z0-9_],Python 3 的 str 模式默认还会匹配中文等 Unicode 字符
pattern = r"\w+"
text = "hello_world 123"
matches = re.findall(pattern, text)
print(matches) # ['hello_world', '123']
# \W - 匹配非字母、数字、下划线
pattern = r"\W+"
text = "hello world!"
matches = re.findall(pattern, text)
print(matches) # [' ', '!']
# \s - 匹配空白字符(空格、制表符、换行符)
pattern = r"\s+"
text = "hello world\n\ttab"
matches = re.findall(pattern, text)
print(matches) # [' ', '\n\t']
# . - 匹配除换行符外的任意字符
pattern = r"c.t"
text = "cat cut cot"
matches = re.findall(pattern, text)
print(matches) # ['cat', 'cut', 'cot']
数量词(量词)
数量词用于指定前面字符出现的次数。
基本量词
python
# * - 匹配 0 次或多次
pattern = r"ab*c" # b 出现 0 次或多次
print(re.findall(pattern, "ac abc abbc abbbc")) # ['ac', 'abc', 'abbc', 'abbbc']
# + - 匹配 1 次或多次
pattern = r"ab+c"
print(re.findall(pattern, "ac abc abbc")) # ['abc', 'abbc']
# ? - 匹配 0 次或 1 次
pattern = r"colou?r" # 可选 u
print(re.findall(pattern, "color colour")) # ['color', 'colour']
# {n} - 精确匹配 n 次
pattern = r"\d{4}" # 4 位数字
text = "2023年 12345月"
matches = re.findall(pattern, text)
print(matches) # ['2023', '1234']
# {n,} - 匹配至少 n 次
pattern = r"\d{2,}"
text = "1 12 123 1234"
matches = re.findall(pattern, text)
print(matches) # ['12', '123', '1234']
# {n,m} - 匹配 n 到 m 次
pattern = r"\d{2,4}"
text = "1 12 123 1234 12345"
matches = re.findall(pattern, text)
print(matches) # ['12', '123', '1234', '1234']
贪婪 vs 非贪婪
python
# 贪婪匹配:尽可能多地匹配
pattern = r"<.+>" # 贪婪
text = "<div>hello</div>"
matches = re.findall(pattern, text)
print(matches) # ['<div>hello</div>'](匹配整个字符串)
# 非贪婪匹配:尽可能少地匹配
pattern = r"<.+?>" # 加 ? 变成非贪婪
matches = re.findall(pattern, text)
print(matches) # ['<div>', '</div>']
# 使用 .+? 清理 HTML 标签
html = "<div>hello</div><span>world</span>"
clean = re.sub(r"<.+?>", "", html)
print(clean) # 'helloworld'
锚点(定位符)
锚点用于匹配位置,而不是具体字符。
基本锚点
python
# ^ - 匹配字符串开头(在多行模式下匹配行首)
pattern = r"^hello"
print(re.findall(pattern, "hello world\nhello python"))
# ['hello'](只匹配第一个)
# $ - 匹配字符串结尾(在多行模式下匹配行尾)
pattern = r"world$"
print(re.findall(pattern, "hello world\nhello world"))
# ['world'](只匹配最后一个)
# 多行模式
text = "hello\nworld\nhello python"
pattern = r"^hello"
matches = re.findall(pattern, text, re.MULTILINE)
print(matches) # ['hello', 'hello']
单词边界
python
# \b - 匹配单词边界
pattern = r"\bword\b" # 精确匹配 "word"
text = "a word in a sentence, not sword or words"
matches = re.findall(pattern, text)
print(matches) # ['word']
# \B - 匹配非单词边界
pattern = r"\Bword\B" # word 周围不是单词边界
text = "sword words swordsword"
matches = re.findall(pattern, text)
print(matches) # ['word'] 在 'swordsword' 中间
常用正则模式
匹配中文
python
# 匹配中文汉字
pattern = r"[\u4e00-\u9fff]+"
text = "Hello你好World世界Python"
matches = re.findall(pattern, text)
print(matches) # ['你好', '世界']
# 匹配中文标点
pattern = r"[,。!?、;:""''()]"
text = "你好,世界!"
matches = re.findall(pattern, text)
print(matches) # [',', '!']
匹配手机号
python
# Mainland-China mobile number: exactly 11 digits, starting with 1 and with a
# second digit in 3-9. The (?<!\d) / (?!\d) lookarounds reject candidates that
# are part of a longer digit run, so the first 11 digits of "138123456789"
# are no longer reported as a phone number.
pattern = r"(?<!\d)1[3-9]\d{9}(?!\d)"
print(re.findall(pattern, "13812345678 12345678901 138123456789"))
# ['13812345678']
# 手机号脱敏
phone = "13812345678"
masked = re.sub(r"(\d{3})\d{4}(\d{4})", r"\1****\2", phone)
print(masked) # '138****5678'
匹配邮箱
python
pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
text = "contact@example.com user.name@domain.co.uk"
matches = re.findall(pattern, text)
print(matches) # ['contact@example.com', 'user.name@domain.co.uk']
匹配 URL
python
pattern = r"https?://[^\s/$.?#].[^\s]*"
text = "Visit https://example.com or http://test.org"
matches = re.findall(pattern, text)
print(matches) # ['https://example.com', 'http://test.org']
匹配 IP 地址
python
# Simple version: checks only the dotted-quad shape, not the 0-255 range.
pattern = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
text = "Server at 192.168.1.1 and 10.0.0.256"
matches = re.findall(pattern, text)
print(matches) # ['192.168.1.1', '10.0.0.256']
# Strict version: every octet must be in 0-255. The (?<!\d) / (?!\d)
# lookarounds are required; without them "10.0.0.256" would still yield the
# partial match "10.0.0.25" instead of being rejected.
pattern = r"(?<!\d)(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)(?!\d)"
matches = re.findall(pattern, text)
print(matches) # ['192.168.1.1'](256 不合法被排除)
re 模块核心函数
re.findall()
python
# 返回所有匹配项的列表
pattern = r"\d+"
text = "abc 123 def 456"
result = re.findall(pattern, text)
print(result) # ['123', '456']
re.search()
python
# 返回第一个匹配项(Match 对象)
pattern = r"\d+"
text = "abc 123 def 456"
result = re.search(pattern, text)
if result:
    print(result.group()) # '123'
    print(result.start(), result.end()) # 4, 7
re.match()
python
# 只匹配字符串开头
pattern = r"\d+"
text = "123 abc"
result = re.match(pattern, text)
print(result.group() if result else None) # '123'
result = re.match(pattern, "abc 123")
print(result) # None
re.sub()
python
# 替换
text = "hello world"
result = re.sub(r"world", "python", text)
print(result) # 'hello python'
# 使用分组替换
text = "2023-05-15"
result = re.sub(r"(\d{4})-(\d{2})-(\d{2})", r"\3/\2/\1", text)
print(result) # '15/05/2023'
# 回调函数
def addThousands(match):
    """Return the matched digit run formatted with thousands separators.

    Meant to be passed to ``re.sub`` as the replacement callback.

    Args:
        match: A match object whose full match is a string of decimal digits.

    Returns:
        The number as a string with ``,`` between each group of three digits.
        The text is converted with ``int()``, so leading zeros are dropped.
    """
    num = int(match.group())
    return f"{num:,}"
text = "1234567"
result = re.sub(r"\d+", addThousands, text)
print(result) # '1,234,567'
re.split()
python
# 按模式分割
text = "apple,banana;orange,mango"
result = re.split(r"[,;]", text)
print(result) # ['apple', 'banana', 'orange', 'mango']
re.finditer()
python
# 返回迭代器,每次迭代得到一个 Match 对象
pattern = r"\d+"
text = "a1b2c3"
for match in re.finditer(pattern, text):
    print(f"Found: {match.group()} at {match.start()}-{match.end()}")
# Found: 1 at 1-2
# Found: 2 at 3-4
# Found: 3 at 5-6
编译正则表达式
python
# 预编译正则表达式,提高匹配效率
pattern = re.compile(r"\d+")
# 多次使用时只编译一次
text = "abc 123 def 456 ghi 789"
print(pattern.findall(text)) # ['123', '456', '789']
print(pattern.search(text).group()) # '123'
常见问题
转义字符
python
# 在正则中,特殊字符需要转义
# . * + ? ^ $ { } [ ] \ | ( )
pattern = r"price: \$\d+" # 匹配 "price: $100"
text = "The price: $100"
match = re.search(pattern, text)
print(match.group() if match else None) # 'price: $100'
忽略大小写
python
pattern = r"hello"
text = "Hello HELLO hello"
matches = re.findall(pattern, text, re.IGNORECASE)
print(matches) # ['Hello', 'HELLO', 'hello']
练习题
python
# 1. 验证密码强度(至少8位,包含大小写字母和数字)
def validate_password(password):
    """Check whether ``password`` meets the minimum strength rules.

    Args:
        password: The candidate password string.

    Returns:
        True when the password is at least 8 characters long and contains at
        least one uppercase letter A-Z, one lowercase letter a-z and one
        digit; False otherwise.
    """
    if len(password) < 8:
        return False
    # Each pattern is one required character category; all must be present.
    required = (r"[A-Z]", r"[a-z]", r"\d")
    return all(re.search(rule, password) for rule in required)
print(validate_password("Pass1234")) # True
print(validate_password("pass1234")) # False
# 2. 提取日期
pattern = r"(\d{4})-(\d{2})-(\d{2})"
text = "Date: 2024-01-15, Deadline: 2024-12-31"
matches = re.findall(pattern, text)
print(matches) # [('2024', '01', '15'), ('2024', '12', '31')]
# 3. 去除多余空格
text = "hello world \n python "
result = re.sub(r"\s+", " ", text).strip()
print(result) # 'hello world python'
[[返回正则首页|reg/index]]