哈希表
哈希表的原理
哈希表(Hash Table)是一种以键值对形式存储数据的数据结构,通过哈希函数将键映射到数组的特定位置,实现 O(1) 平均时间复杂度的查找、插入和删除。
核心概念
- 哈希函数:将键转换为数组下标的函数
- 哈希冲突:不同键经过哈希函数得到相同下标的现象
- 负载因子:已存储元素数 / 哈希表容量,影响冲突概率和性能
键 → 哈希函数 → 数组下标 → 存储位置1
Python 中的哈希表
Python 的 dict 就是基于哈希表实现的:
python
# Python dict 的底层实现
d = {'name': 'Alice', 'age': 30, 'city': 'Beijing'}
# 内部使用哈希函数计算键的哈希值
print(hash('name')) # 键的哈希值
print(hash('age'))
# 基本操作都是 O(1)
print(d['name']) # 查找
d['gender'] = 'F' # 插入
del d['age'] # 删除1
2
3
4
5
6
7
8
9
10
11
2
3
4
5
6
7
8
9
10
11
自定义哈希函数
对于自定义对象,需要实现 __hash__ 和 __eq__ 方法:
python
class Person:
def __init__(self, name: str, age: int):
self.name = name
self.age = age
def __hash__(self):
# 使用元组组合多个字段
return hash((self.name, self.age))
def __eq__(self, other):
if not isinstance(other, Person):
return False
return self.name == other.name and self.age == other.age
p1 = Person("Alice", 30)
p2 = Person("Alice", 30)
p3 = Person("Bob", 25)
# p1 和 p2 的哈希值相同,可以被识别为相等的键
print(hash(p1) == hash(p2)) # True
print(p1 == p2) # True
# 用于 dict 的键
people = {p1: "Engineer", p3: "Designer"}
print(people[p2]) # "Engineer"(因为 p1 == p2)1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
哈希冲突的解决方法
1. 链地址法(Separate Chaining)
每个哈希桶维护一个链表(或动态数组),所有哈希到同一位置的元素都存在这个链表中。
python
class HashMapChaining:
"""链地址法实现哈希表"""
def __init__(self, capacity: int = 16, load_factor: float = 0.75):
self._capacity = capacity
self._load_factor = load_factor
self._size = 0
# 每个桶是一个列表,存放 (key, value) 对
self._buckets = [[] for _ in range(capacity)]
def _hash(self, key) -> int:
return hash(key) % self._capacity
def _resize(self):
"""扩容并重新哈希"""
old_buckets = self._buckets
self._capacity *= 2
self._size = 0
self._buckets = [[] for _ in range(self._capacity)]
for bucket in old_buckets:
for key, val in bucket:
self.put(key, val)
def put(self, key, value):
if self._size / self._capacity >= self._load_factor:
self._resize()
idx = self._hash(key)
bucket = self._buckets[idx]
for i, (k, v) in enumerate(bucket):
if k == key:
bucket[i] = (key, value)
return
bucket.append((key, value))
self._size += 1
def get(self, key, default=None):
idx = self._hash(key)
bucket = self._buckets[idx]
for k, v in bucket:
if k == key:
return v
return default
def remove(self, key):
idx = self._hash(key)
bucket = self._buckets[idx]
for i, (k, v) in enumerate(bucket):
if k == key:
del bucket[i]
self._size -= 1
return True
return False
def __len__(self):
return self._size
def __repr__(self):
items = []
for bucket in self._buckets:
for k, v in bucket:
items.append(f"{k!r}: {v!r}")
return "{" + ", ".join(items) + "}"
# 测试
h = HashMapChaining()
h.put("name", "Alice")
h.put("age", 30)
h.put("city", "Beijing")
print(h) # {'name': 'Alice', 'age': 30, 'city': 'Beijing'}
print(len(h)) # 3
print(h.get("age")) # 30
h.remove("age")
print(h.get("age")) # None1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
2. 开放地址法(Open Addressing)
当发生冲突时,探测其他可用的空槽位。
python
class HashMapOpenAddr:
"""开放地址法(线性探测)实现哈希表"""
class Entry:
def __init__(self, key=None, value=None, deleted=False):
self.key = key
self.value = value
self.deleted = deleted
def __init__(self, capacity: int = 16, load_factor: float = 0.75):
self._capacity = capacity
self._load_factor = load_factor
self._size = 0
self._used = 0 # 已使用(未删除)的槽位数
self._table = [self.Entry() for _ in range(capacity)]
def _hash(self, key) -> int:
return hash(key) % self._capacity
def _probe(self, key, start: int) -> int:
"""线性探测下一个位置"""
idx = start
step = 0
while self._table[idx].key is not None and self._table[idx].key != key:
if self._table[idx].deleted:
# 遇到已删除的槽位,尝试在此位置插入
break
idx = (idx + 1) % self._capacity
step += 1
if step >= self._capacity:
raise RuntimeError("Hash table is full")
return idx
def put(self, key, value):
if self._used / self._capacity >= self._load_factor:
self._resize()
idx = self._hash(key)
idx = self._probe(key, idx)
if self._table[idx].key is None or self._table[idx].deleted:
self._used += 1
self._table[idx] = self.Entry(key, value)
self._size += 1
def get(self, key, default=None):
idx = self._hash(key)
step = 0
while self._table[idx].key is not None:
if (self._table[idx].key == key and
not self._table[idx].deleted):
return self._table[idx].value
idx = (idx + 1) % self._capacity
step += 1
if step >= self._capacity:
break
return default
def remove(self, key):
idx = self._hash(key)
step = 0
while self._table[idx].key is not None:
if (self._table[idx].key == key and
not self._table[idx].deleted):
self._table[idx].deleted = True
self._size -= 1
return True
idx = (idx + 1) % self._capacity
step += 1
if step >= self._capacity:
break
return False
def _resize(self):
old_table = self._table
self._capacity *= 2
self._size = 0
self._used = 0
self._table = [self.Entry() for _ in range(self._capacity)]
for entry in old_table:
if entry.key is not None and not entry.deleted:
self.put(entry.key, entry.value)
def __repr__(self):
items = []
for entry in self._table:
if entry.key is not None and not entry.deleted:
items.append(f"{entry.key!r}: {entry.value!r}")
return "{" + ", ".join(items) + "}"1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
哈希表的应用
1. 两数之和(LeetCode 1)
最经典的哈希表应用。
python
def two_sum(nums: list[int], target: int) -> list[int]:
"""
在数组中找出和为目标值的两数的下标
时间复杂度: O(n)
空间复杂度: O(n)
"""
seen = {} # value -> index
for i, num in enumerate(nums):
complement = target - num
if complement in seen:
return [seen[complement], i]
seen[num] = i
return []
# 测试
print(two_sum([2, 7, 11, 15], 9)) # [0, 1]
print(two_sum([3, 2, 4], 6)) # [1, 2]
print(two_sum([3, 3], 6)) # [0, 1]1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
2. 字母异位词分组(LeetCode 49)
将字母异位词(由相同字母组成但顺序不同的词)分组。
python
def group_anagrams(strs: list[str]) -> list[list[str]]:
"""
将字母异位词分组
时间复杂度: O(n * k),k 为字符串平均长度
"""
from collections import defaultdict
# 键:按字母排序后的字符串
# 或者用字符计数作为键 (tuple of 26 counts)
groups = defaultdict(list)
for s in strs:
# 方法1:排序作为键
# key = ''.join(sorted(s))
# 方法2:字符计数作为键(更快)
key = [0] * 26
for ch in s:
key[ord(ch) - ord('a')] += 1
key = tuple(key)
groups[key].append(s)
return list(groups.values())
# 测试
strs = ["eat", "tea", "tan", "ate", "nat", "bat"]
result = group_anagrams(strs)
print(result)
# [['eat', 'tea', 'ate'], ['tan', 'nat'], ['bat']]1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
3. 快乐数(LeetCode 202)
判断一个数是否是「快乐数」。
python
def is_happy(n: int) -> bool:
"""
判断 n 是否是快乐数
快乐数:各位平方和最终等于 1
非快乐数:会陷入循环,永远不等于 1
"""
seen = set()
while n != 1 and n not in seen:
seen.add(n)
n = sum(int(d) ** 2 for d in str(n))
return n == 1
def is_happy_floyd(n: int) -> bool:
"""
快慢指针法检测循环
快乐数的各位平方和最终会收敛到 1
非快乐数会进入一个不包含 1 的循环
"""
def next_num(x):
return sum(int(d) ** 2 for d in str(x))
slow = next_num(n)
fast = next_num(next_num(n))
while slow != fast:
slow = next_num(slow)
fast = next_num(next_num(fast))
return slow == 1
# 测试
print(is_happy(19)) # True
print(is_happy(2)) # False1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
4. LRU 缓存(LeetCode 146)
结合哈希表和双向链表实现 O(1) 的 LRU 缓存。
python
class DLinkedNode:
def __init__(self, key=0, value=0):
self.key = key
self.value = value
self.prev = None
self.next = None
class LRUCache:
"""
LRU 缓存
哈希表 + 双向链表实现 O(1) 查找、插入、删除
"""
def __init__(self, capacity: int):
self._capacity = capacity
self._cache = {} # key -> DLinkedNode
# 哨兵节点,避免边界判断
self._head = DLinkedNode() # 最近使用
self._tail = DLinkedNode() # 最久未使用
self._head.next = self._tail
self._tail.prev = self._head
def _add_to_head(self, node: DLinkedNode):
"""添加到最近使用位置"""
node.prev = self._head
node.next = self._head.next
self._head.next.prev = node
self._head.next = node
def _remove_node(self, node: DLinkedNode):
"""从链表中移除节点"""
node.prev.next = node.next
node.next.prev = node.prev
def _move_to_head(self, node: DLinkedNode):
"""将节点移到最近使用位置"""
self._remove_node(node)
self._add_to_head(node)
def _pop_tail(self) -> DLinkedNode:
"""移除最久未使用的节点"""
node = self._tail.prev
self._remove_node(node)
return node
def get(self, key: int) -> int:
if key not in self._cache:
return -1
node = self._cache[key]
self._move_to_head(node)
return node.value
def put(self, key: int, value: int):
if key in self._cache:
node = self._cache[key]
node.value = value
self._move_to_head(node)
else:
node = DLinkedNode(key, value)
self._cache[key] = node
self._add_to_head(node)
if len(self._cache) > self._capacity:
removed = self._pop_tail()
del self._cache[removed.key]
# 测试
cache = LRUCache(2)
cache.put(1, 1) # {1=1}
cache.put(2, 2) # {1=1, 2=2}
print(cache.get(1)) # 1,返回后 {2=2, 1=1}
cache.put(3, 3) # 删除最久未使用的 2,{1=1, 3=3}
print(cache.get(2)) # -1 (不存在)
cache.put(4, 4) # 删除最久未使用的 1,{3=3, 4=4}
print(cache.get(1)) # -1
print(cache.get(3)) # 3
print(cache.get(4)) # 41
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
5. 查找共同字符(LeetCode 1002)
python
def common_chars(words: list[str]) -> list[str]:
"""
查找所有在每个字符串中都出现的字符
"""
from collections import Counter
# 第一个词的字符计数作为基准
result = Counter(words[0])
# 与后续每个词的计数取交集
for word in words[1:]:
result &= Counter(word)
# 展开为字符列表
return list(result.elements())
# 测试
print(common_chars(["bella", "label", "roller"]))
# ['e', 'l', 'l']1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
哈希表的设计要点
负载因子与扩容
负载因子(α = n/m)越大,冲突概率越高。一般当 α > 0.75 时进行扩容:
python
# 扩容策略
def _maybe_resize(self):
if self._size / self._capacity >= self._load_factor:
new_capacity = self._capacity * 2
self._resize(new_capacity)1
2
3
4
5
2
3
4
5
哈希函数的设计原则
- 均匀分布:哈希值应均匀分布在数组各处
- 计算简单:哈希函数的计算成本不能太高
- 确定性:相同输入必须产生相同输出
常见哈希函数:
- 除法取余:
h(k) = k mod m(m 选素数效果更好) - 乘法散列:
h(k) = floor(m * (k * A mod 1))(Knuth 乘法) - 全域哈希:随机选择哈希函数,防止最坏情况攻击
一致性哈希
在分布式系统中,一致性哈希(Consistent Hashing)用于将数据分布到多个节点:
python
import hashlib
class ConsistentHash:
"""简化版一致性哈希"""
def __init__(self, nodes: list[str] = None, replicas: int = 100):
self._replicas = replicas
self._ring = {} # 哈希值 -> 节点
self._sorted_keys = []
nodes = nodes or []
for node in nodes:
self.add_node(node)
def _hash(self, key: str) -> int:
return int(hashlib.md5(key.encode()).hexdigest(), 16)
def add_node(self, node: str):
for i in range(self._replicas):
hash_val = self._hash(f"{node}:{i}")
self._ring[hash_val] = node
self._sorted_keys.append(hash_val)
self._sorted_keys.sort()
def remove_node(self, node: str):
for i in range(self._replicas):
hash_val = self._hash(f"{node}:{i}")
del self._ring[hash_val]
self._sorted_keys.remove(hash_val)
def get_node(self, key: str) -> str:
if not self._ring:
return None
hash_val = self._hash(key)
# 二分查找第一个 >= hash_val 的节点
for h in self._sorted_keys:
if h >= hash_val:
return self._ring[h]
# 循环回到第一个节点
return self._ring[self._sorted_keys[0]]
# 测试
ch = ConsistentHash(['node1', 'node2', 'node3'])
for key in ['key1', 'key2', 'key3', 'key4', 'key5']:
print(f"{key} -> {ch.get_node(key)}")1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
总结
| 操作 | 平均时间复杂度 | 最坏时间复杂度 |
|---|---|---|
| 查找 | O(1) | O(n) |
| 插入 | O(1) | O(n) |
| 删除 | O(1) | O(n) |
| 扩容 | - | O(n)(均摊 O(1)) |
哈希表是面试和工程中的核心数据结构,理解其原理对于解决复杂问题至关重要。
[[返回 算法首页|../index]]