正则基础

什么是正则表达式

正则表达式（Regular Expression）是一种用于匹配字符串的模式。它可以用简洁的方式描述字符串的特征，实现复杂的字符串查找、验证和替换。

字符类

基本字符类

python

import re

# [abc] - 匹配 a、b 或 c 中的任意一个字符
pattern = r"[aeiou]"
text = "hello world"
matches = re.findall(pattern, text)
print(matches)  # ['e', 'o', 'o']

# [^abc] - 匹配除了 a、b、c 之外的其他字符
pattern = r"[^aeiou]"
text = "hello"
matches = re.findall(pattern, text)
print(matches)  # ['h', 'l', 'l']

预定义字符类

python

# \d - 匹配任意数字；对 str 模式默认匹配所有 Unicode 十进制数字，仅在 re.ASCII 模式下等价于 [0-9]
pattern = r"\d+"
text = "Order 12345 placed"
matches = re.findall(pattern, text)
print(matches)  # ['12345']

# \D - 匹配任意非数字字符，等价于 [^0-9]
pattern = r"\D+"
text = "Order 12345 placed"
matches = re.findall(pattern, text)
print(matches)  # ['Order ', ' placed']

# \w - 匹配字母、数字、下划线；对 str 模式默认还会匹配中文等 Unicode 字符，仅在 re.ASCII 模式下等价于 [a-zA-Z0-9_]
pattern = r"\w+"
text = "hello_world 123"
matches = re.findall(pattern, text)
print(matches)  # ['hello_world', '123']

# \W - 匹配非字母、数字、下划线
pattern = r"\W+"
text = "hello world!"
matches = re.findall(pattern, text)
print(matches)  # [' ', '!']

# \s - 匹配空白字符（空格、制表符、换行符）
pattern = r"\s+"
text = "hello   world\n\ttab"
matches = re.findall(pattern, text)
print(matches)  # ['   ', '\n\t']

# . - 匹配除换行符外的任意字符
pattern = r"c.t"
text = "cat cut cot"
matches = re.findall(pattern, text)
print(matches)  # ['cat', 'cut', 'cot']

数量词（量词）

数量词用于指定前面字符出现的次数。

基本量词

python

# * - 匹配 0 次或多次
pattern = r"ab*c"  # b 出现 0 次或多次
print(re.findall(pattern, "ac abc abbc abbbc"))  # ['ac', 'abc', 'abbc', 'abbbc']

# + - 匹配 1 次或多次
pattern = r"ab+c"
print(re.findall(pattern, "ac abc abbc"))  # ['abc', 'abbc']

# ? - 匹配 0 次或 1 次
pattern = r"colou?r"  # 可选 u
print(re.findall(pattern, "color colour"))  # ['color', 'colour']

# {n} - 精确匹配 n 次
pattern = r"\d{4}"  # 4 位数字
text = "2023年 12345月"
matches = re.findall(pattern, text)
print(matches)  # ['2023', '1234']

# {n,} - 匹配至少 n 次
pattern = r"\d{2,}"
text = "1 12 123 1234"
matches = re.findall(pattern, text)
print(matches)  # ['12', '123', '1234']

# {n,m} - 匹配 n 到 m 次
pattern = r"\d{2,4}"
text = "1 12 123 1234 12345"
matches = re.findall(pattern, text)
print(matches)  # ['12', '123', '1234', '1234']

贪婪 vs 非贪婪

python

# 贪婪匹配：尽可能多地匹配
pattern = r"<.+>"  # 贪婪
text = "<div>hello</div>"
matches = re.findall(pattern, text)
print(matches)  # ['<div>hello</div>']（匹配整个字符串）

# 非贪婪匹配：尽可能少地匹配
pattern = r"<.+?>"  # 加 ? 变成非贪婪
matches = re.findall(pattern, text)
print(matches)  # ['<div>', '</div>']

# 使用 <.+?> 清理 HTML 标签
html = "<div>hello</div><span>world</span>"
clean = re.sub(r"<.+?>", "", html)
print(clean)  # 'helloworld'

锚点（定位符）

锚点用于匹配位置，而不是具体字符。

基本锚点

python

# ^ - 匹配字符串开头（在多行模式下匹配行首）
pattern = r"^hello"
print(re.findall(pattern, "hello world\nhello python"))
# ['hello']（只匹配第一个）

# $ - 匹配字符串结尾（在多行模式下匹配行尾）
pattern = r"world$"
print(re.findall(pattern, "hello world\nhello world"))
# ['world']（只匹配最后一个）

# 多行模式
text = "hello\nworld\nhello python"
pattern = r"^hello"
matches = re.findall(pattern, text, re.MULTILINE)
print(matches)  # ['hello', 'hello']

单词边界

python

# \b - 匹配单词边界
pattern = r"\bword\b"  # 精确匹配 "word"
text = "a word in a sentence, not sword or words"
matches = re.findall(pattern, text)
print(matches)  # ['word']

# \B - 匹配非单词边界
pattern = r"\Bword\B"  # word 周围不是单词边界
text = "sword words swordsword"
matches = re.findall(pattern, text)
print(matches)  # ['word']，来自 'swordsword' 中间的那个 word

常用正则模式

匹配中文

python

# 匹配中文汉字
pattern = r"[\u4e00-\u9fff]+"
text = "Hello你好World世界Python"
matches = re.findall(pattern, text)
print(matches)  # ['你好', '世界']

# Match Chinese (full-width) punctuation, including the curly quotes “ ” ‘ ’.
# The quotes must be the curly ones: an ASCII " inside the class would end the
# string literal early and silently change the pattern.
pattern = r"[，。！？、；：“”‘’（）]"
text = "你好，世界！"
matches = re.findall(pattern, text)
print(matches)  # ['，', '！']

匹配手机号

python

# Mainland-China mobile number: exactly 11 digits, first digit 1, second 3-9.
# The digit lookarounds keep the pattern from matching an 11-digit slice of a
# longer digit run (e.g. the first 11 digits of "138123456789").
pattern = r"(?<!\d)1[3-9]\d{9}(?!\d)"
print(re.findall(pattern, "13812345678 12345678901 138123456789"))
# ['13812345678']

# 手机号脱敏
phone = "13812345678"
masked = re.sub(r"(\d{3})\d{4}(\d{4})", r"\1****\2", phone)
print(masked)  # '138****5678'

匹配邮箱

python

pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
text = "contact@example.com user.name@domain.co.uk"
matches = re.findall(pattern, text)
print(matches)  # ['contact@example.com', 'user.name@domain.co.uk']

匹配 URL

python

pattern = r"https?://[^\s/$.?#].[^\s]*"
text = "Visit https://example.com or http://test.org"
matches = re.findall(pattern, text)
print(matches)  # ['https://example.com', 'http://test.org']

匹配 IP 地址

python

# Loose version: only checks the dotted-quad shape, not the 0-255 range.
pattern = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
text = "Server at 192.168.1.1 and 10.0.0.256"
matches = re.findall(pattern, text)
print(matches)  # ['192.168.1.1', '10.0.0.256']

# Strict version: every octet must be in 0-255.
# The (?<!\d) / (?!\d) guards are required: without them the engine backtracks
# and returns the prefix '10.0.0.25' of the invalid address '10.0.0.256'.
pattern = r"(?<!\d)(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)(?!\d)"
matches = re.findall(pattern, text)
print(matches)  # ['192.168.1.1'] (10.0.0.256 is rejected: 256 is out of range)

re 模块核心函数

re.findall()

python

# 返回所有匹配项的列表
pattern = r"\d+"
text = "abc 123 def 456"
result = re.findall(pattern, text)
print(result)  # ['123', '456']

re.search()

python

# 返回第一个匹配项（Match 对象）
pattern = r"\d+"
text = "abc 123 def 456"
result = re.search(pattern, text)
if result:
    print(result.group())  # '123'
    print(result.start(), result.end())  # 4 7

re.match()

python

# 只匹配字符串开头
pattern = r"\d+"
text = "123 abc"
result = re.match(pattern, text)
print(result.group() if result else None)  # '123'

result = re.match(pattern, "abc 123")
print(result)  # None

re.sub()

python

# 替换
text = "hello world"
result = re.sub(r"world", "python", text)
print(result)  # 'hello python'

# 使用分组替换
text = "2023-05-15"
result = re.sub(r"(\d{4})-(\d{2})-(\d{2})", r"\3/\2/\1", text)
print(result)  # '15/05/2023'

# 回调函数
def addThousands(match):
    """Return the matched digit run with comma thousands separators.

    Meant to be passed to ``re.sub`` as the replacement callback; ``match``
    is the Match object for one run of digits.
    """
    matched_digits = match.group()
    return format(int(matched_digits), ",")

text = "1234567"
result = re.sub(r"\d+", addThousands, text)
print(result)  # '1,234,567'

re.split()

python

# 按模式分割
text = "apple,banana;orange,mango"
result = re.split(r"[,;]", text)
print(result)  # ['apple', 'banana', 'orange', 'mango']

re.finditer()

python

# 返回迭代器，每次迭代得到一个 Match 对象
pattern = r"\d+"
text = "a1b2c3"
for match in re.finditer(pattern, text):
    print(f"Found: {match.group()} at {match.start()}-{match.end()}")
# Found: 1 at 1-2
# Found: 2 at 3-4
# Found: 3 at 5-6

编译正则表达式

python

# 预编译正则表达式，提高匹配效率
pattern = re.compile(r"\d+")

# 多次使用时只编译一次
text = "abc 123 def 456 ghi 789"
print(pattern.findall(text))  # ['123', '456', '789']
print(pattern.search(text).group())  # '123'

常见问题

转义字符

python

# 在正则中，特殊字符需要转义
# . * + ? ^ $ { } [ ] \ | ( )
pattern = r"price: \$\d+"  # 匹配 "price: $100"
text = "The price: $100"
match = re.search(pattern, text)
print(match.group() if match else None)  # 'price: $100'

忽略大小写

python

pattern = r"hello"
text = "Hello HELLO hello"
matches = re.findall(pattern, text, re.IGNORECASE)
print(matches)  # ['Hello', 'HELLO', 'hello']

练习题

python

# 1. 验证密码强度（至少8位，包含大小写字母和数字）
def validate_password(password):
    """Check password strength.

    A password passes when it is at least 8 characters long and contains
    an uppercase letter A-Z, a lowercase letter a-z and a digit.

    Returns True when every rule holds, otherwise False.
    """
    required_patterns = (r"[A-Z]", r"[a-z]", r"\d")
    long_enough = len(password) >= 8
    # Short-circuits exactly like the guard clauses: no regex runs on short input.
    return long_enough and all(
        re.search(rule, password) is not None for rule in required_patterns
    )

print(validate_password("Pass1234"))  # True
print(validate_password("pass1234"))   # False

# 2. 提取日期
pattern = r"(\d{4})-(\d{2})-(\d{2})"
text = "Date: 2024-01-15, Deadline: 2024-12-31"
matches = re.findall(pattern, text)
print(matches)  # [('2024', '01', '15'), ('2024', '12', '31')]

# 3. 去除多余空格
text = "hello    world  \n  python   "
result = re.sub(r"\s+", " ", text).strip()
print(result)  # 'hello world python'

[[返回正则首页|reg/index]]

正则基础 ​

什么是正则表达式 ​

字符类 ​

基本字符类 ​

预定义字符类 ​

数量词（量词） ​

基本量词 ​

贪婪 vs 非贪婪 ​

锚点（定位符） ​

基本锚点 ​

单词边界 ​

常用正则模式 ​

匹配中文 ​

匹配手机号 ​

匹配邮箱 ​

匹配 URL ​

匹配 IP 地址 ​

re 模块核心函数 ​

re.findall() ​

re.search() ​

re.match() ​

re.sub() ​

re.split() ​

re.finditer() ​

编译正则表达式 ​

常见问题 ​

转义字符 ​

忽略大小写 ​

练习题 ​

正则基础

什么是正则表达式

字符类

基本字符类

预定义字符类

数量词（量词）

基本量词

贪婪 vs 非贪婪

锚点（定位符）

基本锚点

单词边界

常用正则模式

匹配中文

匹配手机号

匹配邮箱

匹配 URL

匹配 IP 地址

re 模块核心函数

re.findall()

re.search()

re.match()

re.sub()

re.split()

re.finditer()

编译正则表达式

常见问题

转义字符

忽略大小写

练习题