跳转至

LSM树

概述

LSM树(Log-Structured Merge Tree)是一种针对写密集型场景优化的数据结构,广泛应用于现代数据库和存储系统。通过将随机写转换为顺序写,大幅提升写入性能。

核心思想

LSM树的核心思想是将数据分层存储,新数据先写入内存,达到阈值后刷新到磁盘,并通过合并操作维护有序性。

LSM树特点

  • 写入性能极高:将随机写转换为顺序写
  • 读取需要合并:可能需要查询多层
  • 空间放大:存在冗余数据,需要压缩
  • 适合写密集场景:日志系统、时序数据库

基本结构

LSM树由多个层级组成:

内存层
MemTable
(活跃)
Immutable
MemTable
↓ Flush
磁盘层
Level 0: [SSTable1] [SSTable2] [SSTable3]
Level 1: [SSTable4] [SSTable5]
Level 2: [SSTable6] [SSTable7] [SSTable8] [SSTable9]
...

核心组件

1. MemTable(内存表)

C
#define MAX_MEMTABLE_SIZE 1000

typedef struct KVPair {
    char key[64];
    char value[256];
    long long timestamp;
    int deleted;
} KVPair;

typedef struct MemTable {
    KVPair entries[MAX_MEMTABLE_SIZE];
    int count;
    int capacity;
} MemTable;

MemTable* createMemTable() {
    MemTable *mt = (MemTable*)malloc(sizeof(MemTable));
    mt->count = 0;
    mt->capacity = MAX_MEMTABLE_SIZE;
    return mt;
}

void memTablePut(MemTable *mt, char *key, char *value) {
    if (mt->count >= mt->capacity) {
        return;
    }
    
    strcpy(mt->entries[mt->count].key, key);
    strcpy(mt->entries[mt->count].value, value);
    mt->entries[mt->count].timestamp = time(NULL);
    mt->entries[mt->count].deleted = 0;
    mt->count++;
}

int memTableGet(MemTable *mt, char *key, char *value) {
    for (int i = mt->count - 1; i >= 0; i--) {
        if (strcmp(mt->entries[i].key, key) == 0) {
            if (mt->entries[i].deleted) {
                return -1;
            }
            strcpy(value, mt->entries[i].value);
            return 1;
        }
    }
    return 0;
}

2. SSTable(排序字符串表)

C
#define BLOCK_SIZE 4096

typedef struct SSTableBlock {
    KVPair pairs[100];
    int count;
    char minKey[64];
    char maxKey[64];
} SSTableBlock;

typedef struct SSTable {
    char filename[256];
    int level;
    long long size;
    char minKey[64];
    char maxKey[64];
    SSTableBlock *blocks;
    int blockCount;
} SSTable;

typedef struct BloomFilter {
    unsigned char *bits;
    int size;
    int hashCount;
} BloomFilter;

3. LSM树结构

C
#define MAX_LEVELS 7

typedef struct LSMTree {
    MemTable *memTable;
    MemTable *immutableMemTable;
    SSTable **levels[MAX_LEVELS];
    int levelCounts[MAX_LEVELS];
    int maxMemTableSize;
    int sizeRatio;
    BloomFilter *bloomFilters[MAX_LEVELS];
} LSMTree;

LSMTree* createLSMTree() {
    LSMTree *lsm = (LSMTree*)malloc(sizeof(LSMTree));
    lsm->memTable = createMemTable();
    lsm->immutableMemTable = NULL;
    lsm->maxMemTableSize = MAX_MEMTABLE_SIZE;
    lsm->sizeRatio = 10;
    
    for (int i = 0; i < MAX_LEVELS; i++) {
        lsm->levels[i] = NULL;
        lsm->levelCounts[i] = 0;
    }
    
    return lsm;
}

核心操作实现

写入操作

C
void lsmPut(LSMTree *lsm, char *key, char *value) {
    memTablePut(lsm->memTable, key, value);
    
    if (lsm->memTable->count >= lsm->maxMemTableSize) {
        if (lsm->immutableMemTable != NULL) {
            waitForFlush(lsm);
        }
        lsm->immutableMemTable = lsm->memTable;
        lsm->memTable = createMemTable();
        flushMemTable(lsm);
    }
}

Flush操作

C
int compareKVPair(const void *a, const void *b) {
    return strcmp(((KVPair*)a)->key, ((KVPair*)b)->key);
}

SSTable* flushToSSTable(MemTable *mt) {
    qsort(mt->entries, mt->count, sizeof(KVPair), compareKVPair);
    
    SSTable *sstable = (SSTable*)malloc(sizeof(SSTable));
    sstable->level = 0;
    sstable->size = mt->count;
    sstable->blockCount = (mt->count + 99) / 100;
    sstable->blocks = (SSTableBlock*)malloc(sstable->blockCount * sizeof(SSTableBlock));
    
    strcpy(sstable->minKey, mt->entries[0].key);
    strcpy(sstable->maxKey, mt->entries[mt->count - 1].key);
    
    for (int i = 0; i < sstable->blockCount; i++) {
        SSTableBlock *block = &sstable->blocks[i];
        block->count = 0;
        
        for (int j = 0; j < 100 && i * 100 + j < mt->count; j++) {
            block->pairs[j] = mt->entries[i * 100 + j];
            block->count++;
        }
        
        strcpy(block->minKey, block->pairs[0].key);
        strcpy(block->maxKey, block->pairs[block->count - 1].key);
    }
    
    return sstable;
}

void flushMemTable(LSMTree *lsm) {
    if (!lsm->immutableMemTable) return;
    
    SSTable *sstable = flushToSSTable(lsm->immutableMemTable);
    
    lsm->levels[0] = (SSTable**)realloc(lsm->levels[0], 
        (lsm->levelCounts[0] + 1) * sizeof(SSTable*));
    lsm->levels[0][lsm->levelCounts[0]] = sstable;
    lsm->levelCounts[0]++;
    
    free(lsm->immutableMemTable);
    lsm->immutableMemTable = NULL;
    
    checkCompaction(lsm, 0);
}

读取操作

C
int lsmGet(LSMTree *lsm, char *key, char *value) {
    if (memTableGet(lsm->memTable, key, value)) {
        return 1;
    }
    
    if (lsm->immutableMemTable && 
        memTableGet(lsm->immutableMemTable, key, value)) {
        return 1;
    }
    
    for (int level = 0; level < MAX_LEVELS; level++) {
        for (int i = 0; i < lsm->levelCounts[level]; i++) {
            SSTable *sstable = lsm->levels[level][i];
            
            if (strcmp(key, sstable->minKey) < 0 || 
                strcmp(key, sstable->maxKey) > 0) {
                continue;
            }
            
            if (sstableGet(sstable, key, value)) {
                return 1;
            }
        }
    }
    
    return 0;
}

int sstableGet(SSTable *sstable, char *key, char *value) {
    for (int i = 0; i < sstable->blockCount; i++) {
        SSTableBlock *block = &sstable->blocks[i];
        
        if (strcmp(key, block->minKey) < 0 || 
            strcmp(key, block->maxKey) > 0) {
            continue;
        }
        
        for (int j = 0; j < block->count; j++) {
            if (strcmp(block->pairs[j].key, key) == 0) {
                if (block->pairs[j].deleted) {
                    return -1;
                }
                strcpy(value, block->pairs[j].value);
                return 1;
            }
        }
    }
    return 0;
}

合并压缩操作

C
void mergeSSTables(SSTable *t1, SSTable *t2, SSTable *result) {
    int i = 0, j = 0, k = 0;
    int totalCount = t1->size + t2->size;
    
    result->blocks = (SSTableBlock*)malloc(sizeof(SSTableBlock));
    
    while (i < t1->size && j < t2->size) {
        KVPair *p1 = getKVPair(t1, i);
        KVPair *p2 = getKVPair(t2, j);
        
        int cmp = strcmp(p1->key, p2->key);
        
        if (cmp < 0) {
            addKVPair(result, p1);
            i++;
        } else if (cmp > 0) {
            addKVPair(result, p2);
            j++;
        } else {
            if (p1->timestamp > p2->timestamp) {
                addKVPair(result, p1);
            } else {
                addKVPair(result, p2);
            }
            i++;
            j++;
        }
    }
    
    while (i < t1->size) {
        addKVPair(result, getKVPair(t1, i++));
    }
    
    while (j < t2->size) {
        addKVPair(result, getKVPair(t2, j++));
    }
}

void checkCompaction(LSMTree *lsm, int level) {
    int threshold = lsm->sizeRatio;
    for (int i = 0; i < level; i++) {
        threshold *= lsm->sizeRatio;
    }
    
    if (lsm->levelCounts[level] > threshold) {
        compact(lsm, level);
    }
}

void compact(LSMTree *lsm, int level) {
    if (level >= MAX_LEVELS - 1) return;
    
    SSTable *merged = NULL;
    
    for (int i = 0; i < lsm->levelCounts[level]; i++) {
        if (merged == NULL) {
            merged = lsm->levels[level][i];
        } else {
            SSTable *newMerged = (SSTable*)malloc(sizeof(SSTable));
            mergeSSTables(merged, lsm->levels[level][i], newMerged);
            free(merged);
            merged = newMerged;
        }
    }
    
    lsm->levels[level + 1] = (SSTable**)realloc(lsm->levels[level + 1],
        (lsm->levelCounts[level + 1] + 1) * sizeof(SSTable*));
    lsm->levels[level + 1][lsm->levelCounts[level + 1]] = merged;
    lsm->levelCounts[level + 1]++;
    
    free(lsm->levels[level]);
    lsm->levels[level] = NULL;
    lsm->levelCounts[level] = 0;
    
    checkCompaction(lsm, level + 1);
}

删除操作

C
void lsmDelete(LSMTree *lsm, char *key) {
    KVPair tombstone;
    strcpy(tombstone.key, key);
    strcpy(tombstone.value, "");
    tombstone.timestamp = time(NULL);
    tombstone.deleted = 1;
    
    memTablePut(lsm->memTable, key, "");
    lsm->memTable->entries[lsm->memTable->count - 1].deleted = 1;
    
    if (lsm->memTable->count >= lsm->maxMemTableSize) {
        if (lsm->immutableMemTable != NULL) {
            waitForFlush(lsm);
        }
        lsm->immutableMemTable = lsm->memTable;
        lsm->memTable = createMemTable();
        flushMemTable(lsm);
    }
}

C++ 实现

C++
template<typename K, typename V>
class LSMTree {
private:
    struct KVPair {
        K key;
        V value;
        long long timestamp;
        bool deleted;
    };
    
    class MemTable {
    private:
        std::map<K, KVPair> data;
        size_t maxSize;
        
    public:
        MemTable(size_t size) : maxSize(size) {}
        
        void put(const K& key, const V& value) {
            data[key] = {key, value, std::chrono::system_clock::now().time_since_epoch().count(), false};
        }
        
        void markDeleted(const K& key) {
            if (data.count(key)) {
                data[key].deleted = true;
            }
        }
        
        std::optional<V> get(const K& key) {
            if (data.count(key) && !data[key].deleted) {
                return data[key].value;
            }
            return std::nullopt;
        }
        
        bool isFull() const { return data.size() >= maxSize; }
        const std::map<K, KVPair>& getData() const { return data; }
        void clear() { data.clear(); }
    };
    
    class SSTable {
    private:
        std::vector<KVPair> data;
        int level;
        K minKey, maxKey;
        
    public:
        SSTable(const std::vector<KVPair>& entries, int lvl) : level(lvl) {
            data = entries;
            if (!data.empty()) {
                minKey = data.front().key;
                maxKey = data.back().key;
            }
        }
        
        std::optional<V> get(const K& key) {
            auto it = std::lower_bound(data.begin(), data.end(), key,
                [](const KVPair& p, const K& k) { return p.key < k; });
            
            if (it != data.end() && it->key == key && !it->deleted) {
                return it->value;
            }
            return std::nullopt;
        }
        
        bool mayContain(const K& key) const {
            return key >= minKey && key <= maxKey;
        }
    };
    
    std::unique_ptr<MemTable> memTable;
    std::unique_ptr<MemTable> immutableMemTable;
    std::vector<std::vector<std::unique_ptr<SSTable>>> levels;
    size_t maxMemTableSize;
    int sizeRatio;
    
    void flush() {
        std::vector<KVPair> entries;
        for (const auto& [k, v] : immutableMemTable->getData()) {
            entries.push_back(v);
        }
        
        levels[0].push_back(std::make_unique<SSTable>(entries, 0));
        immutableMemTable->clear();
        immutableMemTable = nullptr;
        
        checkCompaction(0);
    }
    
    void checkCompaction(int level) {
        int threshold = sizeRatio;
        for (int i = 0; i < level; i++) {
            threshold *= sizeRatio;
        }
        
        if (levels[level].size() > threshold) {
            compact(level);
        }
    }
    
    void compact(int level);
    
public:
    LSMTree(size_t memSize = 1000, int ratio = 10) 
        : maxMemTableSize(memSize), sizeRatio(ratio) {
        memTable = std::make_unique<MemTable>(memSize);
        levels.resize(7);
    }
    
    void put(const K& key, const V& value);
    std::optional<V> get(const K& key);
    void remove(const K& key);
};

时间复杂度分析

操作 平均时间复杂度 最坏时间复杂度 说明
写入 O(1) O(1) 内存写入,极快
读取 O(log n) O(n) 需要查询多层
删除 O(1) O(1) 写入墓碑标记
范围查询 O(k + log n) O(n) k为结果数量
Flush O(n log n) O(n log n) 排序并写入磁盘
Compaction O(n) O(n) 合并SSTable

空间复杂度

  • 内存使用:O(M)(M为MemTable大小)
  • 磁盘使用:O(n × (1 + α))(α为空间放大系数)
  • 墓碑开销:额外存储删除标记

压缩策略

1. Size-Tiered Compaction

  • 当某层SSTable数量超过阈值时触发
  • 合并该层所有SSTable到下一层
  • 写放大较高,但实现简单

2. Leveled Compaction

  • 每层维持有序且不重叠的SSTable
  • Level i的大小是Level i-1的sizeRatio倍
  • 写放大较低,适合读密集场景

3. FIFO Compaction

  • 按时间顺序压缩
  • 适合时序数据
  • 自动删除旧数据

优化技术

1. 布隆过滤器

C
1
2
3
4
5
6
7
8
typedef struct BloomFilter {
    unsigned char *bits;
    int size;
    int hashCount;
} BloomFilter;

void bloomAdd(BloomFilter *bf, char *key);
int bloomMayContain(BloomFilter *bf, char *key);

2. 稀疏索引

只在SSTable中存储部分键,减少索引大小。

3. 数据压缩

对SSTable块使用Snappy、LZ4等压缩算法。

应用场景

  1. NoSQL数据库:LevelDB、RocksDB、Cassandra、HBase
  2. 时序数据库:InfluxDB、TimescaleDB
  3. 日志系统:Kafka、Elasticsearch
  4. 键值存储:Redis(AOF)、BadgerDB

优点与缺点

优点

  1. 写入性能极高:顺序写,充分利用磁盘带宽
  2. 批量写入友好:适合写入密集场景
  3. 可扩展性好:分层存储,支持海量数据
  4. 压缩效率高:SSTable有序,压缩效果好

缺点

  1. 读取性能较低:可能需要查询多层
  2. 空间放大:存在数据冗余
  3. 写放大:压缩操作产生额外写入
  4. 不适合随机读:不如B+树高效

与B+树比较

特性 LSM树 B+树
写入性能 极高(顺序写) 中等(随机写)
读取性能 中等(多层查询) 高(单次查询)
空间利用率 较低(冗余)
范围查询 高效 高效
适用场景 写密集 读密集/混合

参考资料