布隆过滤器¶

概述¶

布隆过滤器（Bloom Filter）是一种**空间效率极高**的概率型数据结构，由 Burton Howard Bloom 于 1970 年提出。它用于判断一个元素是否在集合中，具有以下特性：

核心特性

空间高效：使用位数组，空间复杂度远低于哈希表
查询快速：O(k) 时间复杂度，k 为哈希函数数量
假阳性：可能误判"存在"，但不会误判"不存在"
不可删除：标准布隆过滤器不支持删除操作

生活类比

想象一个图书馆的图书登记系统：每本书入库时，都要根据书名在多个登记册的对应位置上打勾。检查一本书是否登记过时，只要发现任意一个登记册的对应位置没有打勾，这本书就一定没有登记过；但如果所有对应位置都打了勾，可能是这本书确实登记过，也可能是其他书碰巧在相同位置打了勾（假阳性）。

核心原理¶

数据结构¶

布隆过滤器由两部分组成：

Text Only

布隆过滤器结构:
┌─────────────────────────────────────────────────────┐
│                    位数组 (Bit Array)                │
│  ┌───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┐ │
│  │ 0 │ 1 │ 0 │ 1 │ 1 │ 0 │ 0 │ 1 │ 0 │ 1 │ 0 │ 0 │ │
│  └───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┘ │
│    0   1   2   3   4   5   6   7   8   9  10  11    │
└─────────────────────────────────────────────────────┘
                    ↓
┌─────────────────────────────────────────────────────┐
│              k 个独立的哈希函数                       │
│           h₁(x), h₂(x), h₃(x), ..., hₖ(x)           │
└─────────────────────────────────────────────────────┘

工作流程¶

flowchart TB
    subgraph "插入操作 Insert"
        A1["输入元素 x"] --> B1["计算 k 个哈希值"]
        B1 --> C1["h₁(x), h₂(x), ..., hₖ(x)"]
        C1 --> D1["将对应位置设为 1"]
    end
    
    subgraph "查询操作 Query"
        A2["输入元素 x"] --> B2["计算 k 个哈希值"]
        B2 --> C2["检查所有对应位"]
        C2 --> D2{"所有位都为 1?"}
        D2 -->|是| E2["返回: 可能存在"]
        D2 -->|否| F2["返回: 一定不存在"]
    end
    
    style E2 fill:#FFF3E0,stroke:#FF9800
    style F2 fill:#E8F5E9,stroke:#4CAF50

插入操作详解¶

Text Only

插入元素 "apple" 的过程 (m=12, k=3):

步骤1: 计算哈希值
┌────────────────────────────────────────┐
│  h₁("apple") = 5381 → 5381 % 12 = 5   │
│  h₂("apple") = 2166  → 2166 % 12 = 6  │
│  h₃("apple") = 8927  → 8927 % 12 = 11 │
└────────────────────────────────────────┘

步骤2: 设置位数组
初始状态:
┌───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┐
│ 0 │ 0 │ 0 │ 0 │ 0 │ 0 │ 0 │ 0 │ 0 │ 0 │ 0 │ 0 │
└───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┘
  0   1   2   3   4   5   6   7   8   9  10  11

设置后:
┌───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┐
│ 0 │ 0 │ 0 │ 0 │ 0 │ 1 │ 1 │ 0 │ 0 │ 0 │ 0 │ 1 │
└───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┘
  0   1   2   3   4   5   6   7   8   9  10  11
                      ↑   ↑                  ↑
                      h₁  h₂                 h₃

查询操作详解¶

Text Only

查询元素 "banana" 的过程:

步骤1: 计算哈希值
┌────────────────────────────────────────┐
│  h₁("banana") = 3521 → 3521 % 12 = 5  │
│  h₂("banana") = 7842 → 7842 % 12 = 6  │
│  h₃("banana") = 1234 → 1234 % 12 = 10 │
└────────────────────────────────────────┘

步骤2: 检查位数组
┌───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┐
│ 0 │ 0 │ 0 │ 0 │ 0 │ 1 │ 1 │ 0 │ 0 │ 0 │ 0 │ 1 │
└───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┘
  0   1   2   3   4   5   6   7   8   9  10  11
                      ↓   ↓              ↓
                     ✓(1) ✓(1)          ✗(0)

结果: bit[10] = 0 → "banana" 一定不存在 ✓

假阳性现象¶

Text Only

假阳性示例:

假设已插入: "apple", "orange", "grape"

位数组状态:
┌───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┐
│ 0 │ 1 │ 0 │ 1 │ 1 │ 1 │ 1 │ 0 │ 1 │ 0 │ 1 │ 1 │
└───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┘
  0   1   2   3   4   5   6   7   8   9  10  11

查询 "banana" (未插入):
┌────────────────────────────────────────┐
│  h₁("banana") = 5  → bit[5] = 1  ✓    │
│  h₂("banana") = 6  → bit[6] = 1  ✓    │
│  h₃("banana") = 10 → bit[10] = 1 ✓    │
└────────────────────────────────────────┘

结果: 所有位都为 1 → "banana" 可能存在
实际: "banana" 未插入 → 这是假阳性！

graph TB
    subgraph "假阳性原因"
        A["元素 A 设置了位置 1, 5, 8"]
        B["元素 B 设置了位置 3, 5, 11"]
        C["元素 C 设置了位置 1, 8, 11"]
        D["查询元素 D 需要 5, 8, 11"]
        E["这些位置已被 A, B, C 恰好覆盖"]
        F["导致 D 被误判为存在"]
    end
    
    A --> E
    B --> E
    C --> E
    D --> E
    E --> F
    
    style F fill:#FFF3E0,stroke:#FF9800

数学原理¶

假阳性概率推导¶

设位数组大小为 m，插入 n 个元素，使用 k 个哈希函数。

推导过程

插入一个元素后，某一位仍为 0 的概率：(1 - 1/m)^k ≈ e^(-k/m)
插入 n 个元素后，某一位仍为 0 的概率：(1 - 1/m)^(nk) ≈ e^(-nk/m)
查询一个不存在的元素时，所有 k 位都为 1 的概率：

\[P_{fp} = \left(1 - e^{-kn/m}\right)^k\]

最优参数选择¶

最优哈希函数数量

当位数组大小 m 和元素数量 n 确定时，最优哈希函数数量为：

k = (m/n) × ln(2) ≈ 0.693 × (m/n)

最优位数组大小

给定预期元素数量 n 和可接受假阳性率 p 时：

m = -n × ln(p) / (ln(2))²

参数关系图¶

graph LR
    subgraph "参数关系"
        A["预期元素数 n"] --> C["计算最优 m 和 k"]
        B["可接受假阳性率 p"] --> C
        C --> D["位数组大小 m"]
        C --> E["哈希函数数 k"]
    end
    
    subgraph "示例: n=100万, p=1%"
        F["m ≈ 9.6M bits ≈ 1.2MB"]
        G["k ≈ 7"]
    end
    
    D --> F
    E --> G

空间效率对比¶

Text Only

存储 100 万元素，假阳性率 1%:

┌────────────────────────────────────────────────────┐
│ 数据结构          │ 空间占用      │ 相对大小       │
├────────────────────────────────────────────────────┤
│ 哈希表(整数)      │ ~32 MB       │ 100%          │
│ 哈希表(字符串)    │ ~50-100 MB   │ 150-300%      │
│ 布隆过滤器        │ ~1.2 MB      │ 3.75%         │
│ 位图(整数范围已知)│ ~1.2 MB      │ 3.75%         │
└────────────────────────────────────────────────────┘

布隆过滤器节省约 96% 的空间！

基本实现¶

以下依次给出 C、C++、Python、Java、Go、Rust 六种语言的实现：

C
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <stdint.h>

/* Bloom filter state: a packed bit array plus the parameters used to index it. */
typedef struct {
    uint8_t *bits;      /* packed bit array, 8 bits per byte; heap-allocated by bloomCreate */
    int size;           /* number of addressable bits (m); hash values are reduced modulo this */
    int hashCount;      /* number of hash probes performed per item (k) */
} BloomFilter;

/*
 * Seeded 32-bit hash built from the MurmurHash3 mixing constants.
 *
 * Each input byte goes through one full mixing round (the reference
 * algorithm consumes 4-byte blocks instead), so for non-empty input the
 * result is not expected to match reference MurmurHash3_x86_32.
 *
 * key  - bytes to hash; only the first `len` bytes are read
 * len  - number of bytes to hash; also folded into the final state
 * seed - initial state; different seeds yield different hash functions
 *
 * Returns the 32-bit hash value.
 */
uint32_t murmur3_32(const char *key, int len, uint32_t seed) {
    const uint32_t mulA = 0xcc9e2d51;
    const uint32_t mulB = 0x1b873593;
    const uint8_t *bytes = (const uint8_t *)key;
    uint32_t state = seed;
    int pos = 0;

    while (pos < len) {
        uint32_t lane = bytes[pos];
        lane *= mulA;
        lane = (lane << 15) | (lane >> 17);     /* rotate left by 15 */
        lane *= mulB;

        state ^= lane;
        state = (state << 13) | (state >> 19);  /* rotate left by 13 */
        state = state * 5 + 0xe6546b64;
        pos++;
    }

    /* Finalization: fold in the length, then avalanche the bits. */
    state ^= len;
    state ^= state >> 16;
    state *= 0x85ebca6b;
    state ^= state >> 13;
    state *= 0xc2b2ae35;
    state ^= state >> 16;

    return state;
}

/*
 * Allocate a Bloom filter with `size` bits (all cleared) that probes
 * `hashCount` hash positions per item.
 *
 * Returns NULL when `size` or `hashCount` is not positive (a zero-bit filter
 * would later divide by zero when reducing hashes modulo `size`) or when
 * memory allocation fails. Release the result with bloomFree().
 */
BloomFilter* bloomCreate(int size, int hashCount) {
    if (size <= 0 || hashCount <= 0) {
        return NULL;
    }

    BloomFilter *bf = (BloomFilter*)malloc(sizeof(BloomFilter));
    if (bf == NULL) {
        return NULL;
    }

    /* Round up to whole bytes; compute in size_t so size + 7 cannot overflow int. */
    size_t byteCount = ((size_t)size + 7) / 8;
    bf->bits = (uint8_t*)calloc(byteCount, sizeof(uint8_t));
    if (bf->bits == NULL) {
        free(bf);   /* do not leak the header when the bit array cannot be allocated */
        return NULL;
    }

    bf->size = size;
    bf->hashCount = hashCount;
    return bf;
}

/*
 * Create a Bloom filter sized for `expectedItems` insertions at the target
 * false-positive rate, using the standard formulas
 *
 *     m = ceil(-n * ln(p) / (ln 2)^2)      bits
 *     k = round((m / n) * ln 2)            hash functions, clamped to [1, 32]
 *
 * m is rounded up so the target rate is not exceeded, and k is rounded to the
 * nearest integer. The ratio m / n is computed in floating point; integer
 * division would discard the fractional bits-per-item and skew k.
 *
 * Returns NULL when expectedItems <= 0, when falsePositiveRate is not strictly
 * between 0 and 1 (NaN included), when the required bit count does not fit in
 * an int, or when allocation fails.
 */
BloomFilter* bloomCreateOptimal(int expectedItems, double falsePositiveRate) {
    /* Written as a negated range test so that NaN is rejected as well. */
    if (expectedItems <= 0 || !(falsePositiveRate > 0.0 && falsePositiveRate < 1.0)) {
        return NULL;
    }

    const double ln2 = log(2.0);
    const double n = (double)expectedItems;

    double bitsNeeded = ceil(-(n * log(falsePositiveRate)) / (ln2 * ln2));
    if (bitsNeeded < 1.0) {
        bitsNeeded = 1.0;
    }

    /* Largest int value, derived without <limits.h>; converting a larger
     * double to int would be undefined behavior. */
    const double maxInt = (double)(~0u >> 1);
    if (bitsNeeded > maxInt) {
        return NULL;
    }

    long k = lround((bitsNeeded / n) * ln2);
    if (k < 1) k = 1;
    if (k > 32) k = 32;

    return bloomCreate((int)bitsNeeded, (int)k);
}

/* Set bit number `index` in the packed array `bits` (bit 0 is the least
 * significant bit of bits[0]). */
static void setBit(uint8_t *bits, int index) {
    const int byteOffset = index / 8;
    const int bitOffset = index % 8;
    bits[byteOffset] |= (1 << bitOffset);
}

/* Return 1 if bit number `index` in the packed array `bits` is set, else 0
 * (bit 0 is the least significant bit of bits[0]). */
static int getBit(uint8_t *bits, int index) {
    const uint8_t cell = bits[index / 8];
    const int shift = index % 8;
    return (cell >> shift) & 1;
}

/*
 * Insert the NUL-terminated string `item` into the filter.
 *
 * The item is hashed once per configured hash function, using the probe
 * number as the seed, and the bit selected by each hash (modulo the filter
 * size) is set.
 */
void bloomAdd(BloomFilter *bf, const char *item) {
    int itemLen = strlen(item);
    int probe = 0;

    while (probe < bf->hashCount) {
        uint32_t digest = murmur3_32(item, itemLen, probe);
        int slot = digest % bf->size;
        setBit(bf->bits, slot);
        probe++;
    }
}

/*
 * Test whether the NUL-terminated string `item` may be in the filter.
 *
 * Returns 0 as soon as any probed bit is clear (the item was definitely never
 * added) and 1 when every probed bit is set (the item is possibly present;
 * this can be a false positive).
 */
int bloomContains(BloomFilter *bf, const char *item) {
    int itemLen = strlen(item);
    int allSet = 1;

    /* Stop probing at the first clear bit. */
    for (int probe = 0; allSet && probe < bf->hashCount; probe++) {
        uint32_t digest = murmur3_32(item, itemLen, probe);
        int slot = digest % bf->size;
        allSet = getBit(bf->bits, slot);
    }
    return allSet;
}

/*
 * Release a filter and its bit array.
 *
 * Passing NULL is a no-op, mirroring free(NULL), so callers can free
 * unconditionally on cleanup paths.
 */
void bloomFree(BloomFilter *bf) {
    if (bf == NULL) {
        return;
    }
    free(bf->bits);
    free(bf);
}

C++
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>

/// Bloom filter over strings, backed by a packed bit vector.
///
/// `contains` never returns false for an item that was added; it may return
/// true for an item that was not (a false positive).
class BloomFilter {
private:
    std::vector<uint8_t> bits;  // packed bit array, 8 bits per byte
    int hashCount;              // number of hash probes per item (k)
    int size;                   // number of addressable bits (m)

    /// Seeded FNV-1a hash of `item`; `i` selects the hash function.
    ///
    /// The seed is XORed into the offset basis so that it is mixed
    /// non-linearly with every byte. A polynomial hash that merely starts
    /// from the seed (hash = hash * 31 + c) yields i * 31^len + H(item): two
    /// equal-length strings that collide for one seed (e.g. "Aa" and "BB")
    /// then collide for every seed, which defeats the purpose of k probes.
    uint32_t getHash(const std::string& item, int i) const {
        uint32_t hash = 2166136261u ^ static_cast<uint32_t>(i);
        for (char c : item) {
            hash ^= static_cast<uint8_t>(c);  // hash raw byte values, independent of char signedness
            hash *= 16777619u;
        }
        return hash;
    }

    /// Bit position probed for `item` by hash function `i`, in [0, size).
    int bitIndex(const std::string& item, int i) const {
        return static_cast<int>(getHash(item, i) % static_cast<uint32_t>(size));
    }

public:
    /// Create a filter with `numBits` cleared bits and `numHashes` probes per item.
    /// @throws std::invalid_argument if either argument is not positive.
    BloomFilter(int numBits, int numHashes)
        : bits(numBits > 0 ? (static_cast<std::size_t>(numBits) + 7) / 8 : 0, 0),
          hashCount(numHashes),
          size(numBits) {
        if (numBits <= 0 || numHashes <= 0) {
            throw std::invalid_argument("BloomFilter: numBits and numHashes must be positive");
        }
    }

    /// Create a filter sized for `expectedItems` insertions at the target
    /// false-positive rate:
    ///   m = ceil(-n * ln(p) / (ln 2)^2),  k = round((m / n) * ln 2) clamped to [1, 32].
    /// @throws std::invalid_argument if expectedItems <= 0, if
    ///         falsePositiveRate is not strictly between 0 and 1, or if the
    ///         required bit count does not fit in an int.
    static BloomFilter create(int expectedItems, double falsePositiveRate) {
        // Negated range test so that NaN is rejected as well.
        if (expectedItems <= 0 || !(falsePositiveRate > 0.0 && falsePositiveRate < 1.0)) {
            throw std::invalid_argument(
                "BloomFilter::create: expectedItems must be positive and 0 < falsePositiveRate < 1");
        }

        const double ln2 = std::log(2.0);
        const double n = static_cast<double>(expectedItems);
        const double bitsNeeded = std::ceil(-(n * std::log(falsePositiveRate)) / (ln2 * ln2));
        if (bitsNeeded > static_cast<double>(std::numeric_limits<int>::max())) {
            throw std::invalid_argument("BloomFilter::create: required bit count exceeds int range");
        }

        const int m = std::max(1, static_cast<int>(bitsNeeded));
        // m / n is evaluated in floating point; integer division would drop
        // the fractional bits-per-item and skew k.
        const long rounded = std::lround((bitsNeeded / n) * ln2);
        const int k = static_cast<int>(std::max(1L, std::min(rounded, 32L)));
        return BloomFilter(m, k);
    }

    /// Insert `item`: set the bit selected by each of the k hash functions.
    void add(const std::string& item) {
        for (int i = 0; i < hashCount; i++) {
            const int index = bitIndex(item, i);
            bits[index / 8] |= (1 << (index % 8));
        }
    }

    /// Return false if `item` was definitely never added, true if it is
    /// possibly present (all k probed bits are set).
    bool contains(const std::string& item) const {
        for (int i = 0; i < hashCount; i++) {
            const int index = bitIndex(item, i);
            if (!(bits[index / 8] & (1 << (index % 8)))) return false;
        }
        return true;
    }
};

Python
import math
from typing import List

class BloomFilter:
    """Bloom filter over strings, backed by a packed bit array.

    ``contains`` never returns False for an item that was added; it may
    return True for an item that was not (a false positive).
    """

    def __init__(self, size: int, hash_count: int):
        """Create a filter with ``size`` cleared bits and ``hash_count`` probes per item.

        Raises:
            ValueError: if ``size`` or ``hash_count`` is not positive.
        """
        if size <= 0 or hash_count <= 0:
            raise ValueError("size and hash_count must be positive")
        self.size = size
        self.hash_count = hash_count
        # One byte per 8 bits, rounded up; bytearray stores them compactly.
        self.bits = bytearray((size + 7) // 8)

    @classmethod
    def create(cls, expected_items: int, false_positive_rate: float):
        """Create a filter sized for ``expected_items`` at the target false-positive rate.

        Uses m = ceil(-n * ln(p) / (ln 2)^2) bits and
        k = round((m / n) * ln 2) hash functions, clamped to [1, 32].

        Raises:
            ValueError: if ``expected_items`` is not positive or
                ``false_positive_rate`` is not strictly between 0 and 1.
        """
        if expected_items <= 0:
            raise ValueError("expected_items must be positive")
        if not 0.0 < false_positive_rate < 1.0:
            raise ValueError("false_positive_rate must be strictly between 0 and 1")
        ln2 = math.log(2)
        # Round m up so the target rate is not exceeded.
        m = max(1, math.ceil(-(expected_items * math.log(false_positive_rate)) / (ln2 * ln2)))
        # Round k to the nearest integer rather than truncating it.
        k = round((m / expected_items) * ln2)
        k = max(1, min(k, 32))
        return cls(m, k)

    def _hash(self, item: str, seed: int) -> int:
        """Return the bit index in ``[0, size)`` for ``item`` under hash function ``seed``.

        Seeded 32-bit FNV-1a over the UTF-8 bytes. The seed is XORed into the
        offset basis so it mixes non-linearly with every byte. A polynomial
        hash that merely starts from the seed yields
        ``seed * 31**len + H(item)``: equal-length strings that collide for
        one seed (e.g. "Aa" and "BB") then collide for every seed.
        """
        hash_val = (0x811C9DC5 ^ seed) & 0xFFFFFFFF
        for byte in item.encode("utf-8"):
            # Mask to 32 bits so the value stays bounded instead of growing as a big int.
            hash_val = ((hash_val ^ byte) * 0x01000193) & 0xFFFFFFFF
        return hash_val % self.size

    def _set_bit(self, index: int):
        """Set bit ``index`` (bit 0 is the least significant bit of byte 0)."""
        self.bits[index // 8] |= (1 << (index % 8))

    def _get_bit(self, index: int) -> bool:
        """Return True if bit ``index`` is set."""
        return bool(self.bits[index // 8] & (1 << (index % 8)))

    def add(self, item: str):
        """Insert ``item`` by setting the bit chosen by each hash function."""
        for i in range(self.hash_count):
            self._set_bit(self._hash(item, i))

    def contains(self, item: str) -> bool:
        """Return False if ``item`` was definitely never added, True if it is possibly present."""
        return all(self._get_bit(self._hash(item, i)) for i in range(self.hash_count))

Java
/**
 * Bloom filter over strings, backed by a packed bit array.
 *
 * <p>{@link #contains} never returns {@code false} for an item that was added;
 * it may return {@code true} for an item that was not (a false positive).
 */
public class BloomFilter {
    /** 32-bit FNV-1a offset basis (the unsigned value 2166136261). */
    private static final int FNV_OFFSET_BASIS = 0x811C9DC5;
    /** 32-bit FNV prime. */
    private static final int FNV_PRIME = 0x01000193;

    private final byte[] bits;     // packed bit array, 8 bits per byte
    private final int size;        // number of addressable bits (m)
    private final int hashCount;   // number of hash probes per item (k)

    /**
     * Creates a filter with {@code size} cleared bits and {@code hashCount} probes per item.
     *
     * @throws IllegalArgumentException if either argument is not positive
     */
    public BloomFilter(int size, int hashCount) {
        if (size <= 0 || hashCount <= 0) {
            throw new IllegalArgumentException("size and hashCount must be positive");
        }
        this.size = size;
        this.hashCount = hashCount;
        // Compute in long so size + 7 cannot overflow int.
        this.bits = new byte[(int) (((long) size + 7) / 8)];
    }

    /**
     * Creates a filter sized for {@code expectedItems} insertions at the target
     * false-positive rate: m = ceil(-n * ln(p) / (ln 2)^2) bits and
     * k = round((m / n) * ln 2) hash functions, clamped to [1, 32].
     *
     * @throws IllegalArgumentException if {@code expectedItems} is not positive,
     *         if {@code falsePositiveRate} is not strictly between 0 and 1, or if
     *         the required bit count does not fit in an int
     */
    public static BloomFilter create(int expectedItems, double falsePositiveRate) {
        if (expectedItems <= 0) {
            throw new IllegalArgumentException("expectedItems must be positive");
        }
        // Negated range test so that NaN is rejected as well.
        if (!(falsePositiveRate > 0.0 && falsePositiveRate < 1.0)) {
            throw new IllegalArgumentException("falsePositiveRate must be strictly between 0 and 1");
        }
        double ln2 = Math.log(2);
        double bitsNeeded = Math.ceil(-(expectedItems * Math.log(falsePositiveRate)) / (ln2 * ln2));
        if (bitsNeeded > Integer.MAX_VALUE) {
            throw new IllegalArgumentException("required bit count exceeds int range");
        }
        int m = Math.max(1, (int) bitsNeeded);
        // m / n is evaluated in floating point; integer division would drop
        // the fractional bits-per-item and skew k.
        int k = (int) Math.round((bitsNeeded / expectedItems) * ln2);
        k = Math.max(1, Math.min(k, 32));
        return new BloomFilter(m, k);
    }

    /**
     * Returns the bit index in [0, size) for {@code item} under hash function {@code seed}.
     *
     * <p>Seeded 32-bit FNV-1a over the string's UTF-16 code units. The seed is
     * XORed into the offset basis so it mixes non-linearly with every
     * character. A polynomial hash that merely starts from the seed yields
     * seed * 31^len + H(item): equal-length strings that collide for one seed
     * (e.g. "Aa" and "BB") then collide for every seed.
     */
    private int getHash(String item, int seed) {
        int hash = FNV_OFFSET_BASIS ^ seed;
        for (int i = 0; i < item.length(); i++) {
            hash ^= item.charAt(i);
            hash *= FNV_PRIME;   // int multiplication wraps modulo 2^32
        }
        // Treat the hash as unsigned so the index is never negative.
        return Integer.remainderUnsigned(hash, size);
    }

    /** Inserts {@code item} by setting the bit chosen by each hash function. */
    public void add(String item) {
        for (int i = 0; i < hashCount; i++) {
            int index = getHash(item, i);
            bits[index / 8] |= (1 << (index % 8));
        }
    }

    /**
     * Returns {@code false} if {@code item} was definitely never added,
     * {@code true} if it is possibly present (all probed bits are set).
     */
    public boolean contains(String item) {
        for (int i = 0; i < hashCount; i++) {
            int index = getHash(item, i);
            if ((bits[index / 8] & (1 << (index % 8))) == 0) return false;
        }
        return true;
    }
}

Go
// BloomFilter is a Bloom filter over strings backed by a packed bit slice.
type BloomFilter struct {
    bits      []byte // packed bit array, 8 bits per byte
    size      int    // number of addressable bits; hash values are reduced modulo size
    hashCount int    // number of hash probes performed per item
}

// NewBloomFilter returns a filter with size cleared bits that probes
// hashCount hash positions per item.
//
// It panics if size or hashCount is not positive: a zero-bit filter would
// divide by zero on the first Add, and a filter with no hash functions would
// report every item as present.
func NewBloomFilter(size, hashCount int) *BloomFilter {
    if size <= 0 {
        panic("bloom: size must be positive")
    }
    if hashCount <= 0 {
        panic("bloom: hashCount must be positive")
    }
    // Round up to whole bytes without computing size+7, which could overflow.
    byteCount := size / 8
    if size%8 != 0 {
        byteCount++
    }
    return &BloomFilter{
        bits:      make([]byte, byteCount),
        size:      size,
        hashCount: hashCount,
    }
}

// NewBloomFilterOptimal returns a filter sized for expectedItems insertions
// at the target false-positive rate, using
//
//     m = ceil(-n * ln(p) / (ln 2)^2)   bits
//     k = round((m / n) * ln 2)          hash functions, clamped to [1, 32]
//
// m is rounded up so the target rate is not exceeded, and k is rounded to the
// nearest integer instead of being truncated.
//
// It panics if expectedItems is not positive, if falsePositiveRate is not
// strictly between 0 and 1 (NaN included), or if the required bit count does
// not fit in an int.
func NewBloomFilterOptimal(expectedItems int, falsePositiveRate float64) *BloomFilter {
    if expectedItems <= 0 {
        panic("bloom: expectedItems must be positive")
    }
    // Negated range test so that NaN is rejected as well.
    if !(falsePositiveRate > 0 && falsePositiveRate < 1) {
        panic("bloom: falsePositiveRate must be strictly between 0 and 1")
    }

    n := float64(expectedItems)
    bitsNeeded := math.Ceil(-n * math.Log(falsePositiveRate) / (math.Ln2 * math.Ln2))
    // ^uint(0)>>1 is the largest int; converting a larger float is not well defined.
    if bitsNeeded >= float64(^uint(0)>>1) {
        panic("bloom: required bit count exceeds int range")
    }

    m := int(bitsNeeded)
    if m < 1 {
        m = 1
    }
    k := int(math.Round(bitsNeeded / n * math.Ln2))
    if k < 1 {
        k = 1
    }
    if k > 32 {
        k = 32
    }
    return NewBloomFilter(m, k)
}

// getHash returns the bit index in [0, bf.size) for item under hash function
// number seed.
//
// It is a seeded 32-bit FNV-1a over the string's bytes. The seed is XORed into
// the offset basis so that it mixes non-linearly with every byte. A polynomial
// hash that merely starts from the seed yields seed*31^len + H(item): two
// equal-length strings that collide for one seed (e.g. "Aa" and "BB") then
// collide for every seed. Working in unsigned arithmetic also removes the
// sign fix-up, which could still leave the minimum int value negative.
func (bf *BloomFilter) getHash(item string, seed int) int {
    hash := uint32(2166136261) ^ uint32(seed)
    for i := 0; i < len(item); i++ {
        hash ^= uint32(item[i])
        hash *= 16777619 // wraps modulo 2^32
    }
    // Reduce in uint64 so a size above 2^32 is not truncated to a smaller modulus.
    return int(uint64(hash) % uint64(bf.size))
}

// Add inserts item into the filter by setting the bit selected by each of the
// filter's hash functions.
func (bf *BloomFilter) Add(item string) {
    for seed := 0; seed < bf.hashCount; seed++ {
        pos := bf.getHash(item, seed)
        byteIdx := pos / 8
        mask := byte(1) << (pos % 8)
        bf.bits[byteIdx] |= mask
    }
}

// Contains reports whether item may have been added. It returns false as soon
// as any probed bit is clear (item was definitely never added) and true when
// every probed bit is set (item is possibly present).
func (bf *BloomFilter) Contains(item string) bool {
    seed := 0
    for seed < bf.hashCount {
        pos := bf.getHash(item, seed)
        mask := byte(1) << (pos % 8)
        if bf.bits[pos/8]&mask == 0 {
            return false
        }
        seed++
    }
    return true
}

Rust
/// Bloom filter over strings, backed by a packed bit vector.
///
/// `contains` never returns `false` for an item that was added; it may return
/// `true` for an item that was not (a false positive).
pub struct BloomFilter {
    /// Packed bit array, 8 bits per byte.
    bits: Vec<u8>,
    /// Number of addressable bits (m).
    size: usize,
    /// Number of hash probes per item (k).
    hash_count: usize,
}

impl BloomFilter {
    /// Creates a filter with `size` cleared bits and `hash_count` probes per item.
    ///
    /// # Panics
    ///
    /// Panics if `size` or `hash_count` is zero: a zero-bit filter would
    /// divide by zero when reducing hashes, and a filter without hash
    /// functions would report every item as present.
    pub fn new(size: usize, hash_count: usize) -> Self {
        assert!(size > 0, "size must be positive");
        assert!(hash_count > 0, "hash_count must be positive");
        // Round up to whole bytes without computing size + 7, which could overflow.
        let byte_count = size / 8 + usize::from(size % 8 != 0);
        BloomFilter {
            bits: vec![0; byte_count],
            size,
            hash_count,
        }
    }

    /// Creates a filter sized for `expected_items` insertions at the target
    /// false-positive rate: `m = ceil(-n * ln(p) / (ln 2)^2)` bits and
    /// `k = round((m / n) * ln 2)` hash functions, clamped to `[1, 32]`.
    ///
    /// # Panics
    ///
    /// Panics if `expected_items` is zero or `false_positive_rate` is not
    /// strictly between 0 and 1 (NaN included).
    pub fn create_optimal(expected_items: usize, false_positive_rate: f64) -> Self {
        assert!(expected_items > 0, "expected_items must be positive");
        assert!(
            false_positive_rate > 0.0 && false_positive_rate < 1.0,
            "false_positive_rate must be strictly between 0 and 1"
        );
        let ln2 = 2f64.ln();
        let n = expected_items as f64;
        // Round m up so the target rate is not exceeded.
        let bits_needed = (-(n * false_positive_rate.ln()) / (ln2 * ln2)).ceil();
        let m = (bits_needed as usize).max(1);
        // Round k to the nearest integer rather than truncating it.
        let k = ((bits_needed / n) * ln2).round() as usize;
        let k = k.max(1).min(32);
        Self::new(m, k)
    }

    /// Returns the bit index in `[0, size)` for `item` under hash function `seed`.
    ///
    /// Seeded 32-bit FNV-1a over the UTF-8 bytes. The seed is XORed into the
    /// offset basis so it mixes non-linearly with every byte. A polynomial
    /// hash that merely starts from the seed yields `seed * 31^len + H(item)`:
    /// equal-length strings that collide for one seed (e.g. "Aa" and "BB")
    /// then collide for every seed.
    fn get_hash(&self, item: &str, seed: usize) -> usize {
        let mut hash = 0x811C_9DC5u32 ^ (seed as u32);
        for byte in item.bytes() {
            hash ^= u32::from(byte);
            hash = hash.wrapping_mul(0x0100_0193);
        }
        (hash as usize) % self.size
    }

    /// Inserts `item` by setting the bit chosen by each hash function.
    pub fn add(&mut self, item: &str) {
        for i in 0..self.hash_count {
            let index = self.get_hash(item, i);
            self.bits[index / 8] |= 1 << (index % 8);
        }
    }

    /// Returns `false` if `item` was definitely never added, `true` if it is
    /// possibly present (all probed bits are set).
    pub fn contains(&self, item: &str) -> bool {
        (0..self.hash_count).all(|i| {
            let index = self.get_hash(item, i);
            self.bits[index / 8] & (1 << (index % 8)) != 0
        })
    }
}