LSM树
概述
LSM树(Log-Structured Merge Tree)是一种针对写密集型场景优化的数据结构,广泛应用于现代数据库和存储系统。通过将随机写转换为顺序写,大幅提升写入性能。
核心思想
LSM树的核心思想是将数据分层存储,新数据先写入内存,达到阈值后刷新到磁盘,并通过合并操作维护有序性。
LSM树特点
- 写入性能极高:将随机写转换为顺序写
- 读取需要合并:可能需要查询多层
- 空间放大:存在冗余数据,需要压缩
- 适合写密集场景:日志系统、时序数据库
基本结构
LSM树由多个层级组成:
↓ Flush
磁盘层
Level 0: [SSTable1] [SSTable2] [SSTable3]
Level 1: [SSTable4] [SSTable5]
Level 2: [SSTable6] [SSTable7] [SSTable8] [SSTable9]
...
核心组件
1. MemTable(内存表)
| C |
|---|
| #define MAX_MEMTABLE_SIZE 1000
typedef struct KVPair {
char key[64];
char value[256];
long long timestamp;
int deleted;
} KVPair;
typedef struct MemTable {
KVPair entries[MAX_MEMTABLE_SIZE];
int count;
int capacity;
} MemTable;
MemTable* createMemTable() {
MemTable *mt = (MemTable*)malloc(sizeof(MemTable));
mt->count = 0;
mt->capacity = MAX_MEMTABLE_SIZE;
return mt;
}
void memTablePut(MemTable *mt, char *key, char *value) {
if (mt->count >= mt->capacity) {
return;
}
strcpy(mt->entries[mt->count].key, key);
strcpy(mt->entries[mt->count].value, value);
mt->entries[mt->count].timestamp = time(NULL);
mt->entries[mt->count].deleted = 0;
mt->count++;
}
int memTableGet(MemTable *mt, char *key, char *value) {
for (int i = mt->count - 1; i >= 0; i--) {
if (strcmp(mt->entries[i].key, key) == 0) {
if (mt->entries[i].deleted) {
return -1;
}
strcpy(value, mt->entries[i].value);
return 1;
}
}
return 0;
}
|
2. SSTable(排序字符串表)
| C |
|---|
| #define BLOCK_SIZE 4096
typedef struct SSTableBlock {
KVPair pairs[100];
int count;
char minKey[64];
char maxKey[64];
} SSTableBlock;
typedef struct SSTable {
char filename[256];
int level;
long long size;
char minKey[64];
char maxKey[64];
SSTableBlock *blocks;
int blockCount;
} SSTable;
typedef struct BloomFilter {
unsigned char *bits;
int size;
int hashCount;
} BloomFilter;
|
3. LSM树结构
| C |
|---|
| #define MAX_LEVELS 7
typedef struct LSMTree {
MemTable *memTable;
MemTable *immutableMemTable;
SSTable **levels[MAX_LEVELS];
int levelCounts[MAX_LEVELS];
int maxMemTableSize;
int sizeRatio;
BloomFilter *bloomFilters[MAX_LEVELS];
} LSMTree;
LSMTree* createLSMTree() {
LSMTree *lsm = (LSMTree*)malloc(sizeof(LSMTree));
lsm->memTable = createMemTable();
lsm->immutableMemTable = NULL;
lsm->maxMemTableSize = MAX_MEMTABLE_SIZE;
lsm->sizeRatio = 10;
for (int i = 0; i < MAX_LEVELS; i++) {
lsm->levels[i] = NULL;
lsm->levelCounts[i] = 0;
}
return lsm;
}
|
核心操作实现
写入操作
| C |
|---|
| void lsmPut(LSMTree *lsm, char *key, char *value) {
memTablePut(lsm->memTable, key, value);
if (lsm->memTable->count >= lsm->maxMemTableSize) {
if (lsm->immutableMemTable != NULL) {
waitForFlush(lsm);
}
lsm->immutableMemTable = lsm->memTable;
lsm->memTable = createMemTable();
flushMemTable(lsm);
}
}
|
Flush操作
| C |
|---|
| int compareKVPair(const void *a, const void *b) {
return strcmp(((KVPair*)a)->key, ((KVPair*)b)->key);
}
SSTable* flushToSSTable(MemTable *mt) {
qsort(mt->entries, mt->count, sizeof(KVPair), compareKVPair);
SSTable *sstable = (SSTable*)malloc(sizeof(SSTable));
sstable->level = 0;
sstable->size = mt->count;
sstable->blockCount = (mt->count + 99) / 100;
sstable->blocks = (SSTableBlock*)malloc(sstable->blockCount * sizeof(SSTableBlock));
strcpy(sstable->minKey, mt->entries[0].key);
strcpy(sstable->maxKey, mt->entries[mt->count - 1].key);
for (int i = 0; i < sstable->blockCount; i++) {
SSTableBlock *block = &sstable->blocks[i];
block->count = 0;
for (int j = 0; j < 100 && i * 100 + j < mt->count; j++) {
block->pairs[j] = mt->entries[i * 100 + j];
block->count++;
}
strcpy(block->minKey, block->pairs[0].key);
strcpy(block->maxKey, block->pairs[block->count - 1].key);
}
return sstable;
}
void flushMemTable(LSMTree *lsm) {
if (!lsm->immutableMemTable) return;
SSTable *sstable = flushToSSTable(lsm->immutableMemTable);
lsm->levels[0] = (SSTable**)realloc(lsm->levels[0],
(lsm->levelCounts[0] + 1) * sizeof(SSTable*));
lsm->levels[0][lsm->levelCounts[0]] = sstable;
lsm->levelCounts[0]++;
free(lsm->immutableMemTable);
lsm->immutableMemTable = NULL;
checkCompaction(lsm, 0);
}
|
读取操作
| C |
|---|
| int lsmGet(LSMTree *lsm, char *key, char *value) {
if (memTableGet(lsm->memTable, key, value)) {
return 1;
}
if (lsm->immutableMemTable &&
memTableGet(lsm->immutableMemTable, key, value)) {
return 1;
}
for (int level = 0; level < MAX_LEVELS; level++) {
for (int i = 0; i < lsm->levelCounts[level]; i++) {
SSTable *sstable = lsm->levels[level][i];
if (strcmp(key, sstable->minKey) < 0 ||
strcmp(key, sstable->maxKey) > 0) {
continue;
}
if (sstableGet(sstable, key, value)) {
return 1;
}
}
}
return 0;
}
int sstableGet(SSTable *sstable, char *key, char *value) {
for (int i = 0; i < sstable->blockCount; i++) {
SSTableBlock *block = &sstable->blocks[i];
if (strcmp(key, block->minKey) < 0 ||
strcmp(key, block->maxKey) > 0) {
continue;
}
for (int j = 0; j < block->count; j++) {
if (strcmp(block->pairs[j].key, key) == 0) {
if (block->pairs[j].deleted) {
return -1;
}
strcpy(value, block->pairs[j].value);
return 1;
}
}
}
return 0;
}
|
合并压缩操作
| C |
|---|
| void mergeSSTables(SSTable *t1, SSTable *t2, SSTable *result) {
int i = 0, j = 0, k = 0;
int totalCount = t1->size + t2->size;
result->blocks = (SSTableBlock*)malloc(sizeof(SSTableBlock));
while (i < t1->size && j < t2->size) {
KVPair *p1 = getKVPair(t1, i);
KVPair *p2 = getKVPair(t2, j);
int cmp = strcmp(p1->key, p2->key);
if (cmp < 0) {
addKVPair(result, p1);
i++;
} else if (cmp > 0) {
addKVPair(result, p2);
j++;
} else {
if (p1->timestamp > p2->timestamp) {
addKVPair(result, p1);
} else {
addKVPair(result, p2);
}
i++;
j++;
}
}
while (i < t1->size) {
addKVPair(result, getKVPair(t1, i++));
}
while (j < t2->size) {
addKVPair(result, getKVPair(t2, j++));
}
}
void checkCompaction(LSMTree *lsm, int level) {
int threshold = lsm->sizeRatio;
for (int i = 0; i < level; i++) {
threshold *= lsm->sizeRatio;
}
if (lsm->levelCounts[level] > threshold) {
compact(lsm, level);
}
}
void compact(LSMTree *lsm, int level) {
if (level >= MAX_LEVELS - 1) return;
SSTable *merged = NULL;
for (int i = 0; i < lsm->levelCounts[level]; i++) {
if (merged == NULL) {
merged = lsm->levels[level][i];
} else {
SSTable *newMerged = (SSTable*)malloc(sizeof(SSTable));
mergeSSTables(merged, lsm->levels[level][i], newMerged);
free(merged);
merged = newMerged;
}
}
lsm->levels[level + 1] = (SSTable**)realloc(lsm->levels[level + 1],
(lsm->levelCounts[level + 1] + 1) * sizeof(SSTable*));
lsm->levels[level + 1][lsm->levelCounts[level + 1]] = merged;
lsm->levelCounts[level + 1]++;
free(lsm->levels[level]);
lsm->levels[level] = NULL;
lsm->levelCounts[level] = 0;
checkCompaction(lsm, level + 1);
}
|
删除操作
| C |
|---|
| void lsmDelete(LSMTree *lsm, char *key) {
KVPair tombstone;
strcpy(tombstone.key, key);
strcpy(tombstone.value, "");
tombstone.timestamp = time(NULL);
tombstone.deleted = 1;
memTablePut(lsm->memTable, key, "");
lsm->memTable->entries[lsm->memTable->count - 1].deleted = 1;
if (lsm->memTable->count >= lsm->maxMemTableSize) {
if (lsm->immutableMemTable != NULL) {
waitForFlush(lsm);
}
lsm->immutableMemTable = lsm->memTable;
lsm->memTable = createMemTable();
flushMemTable(lsm);
}
}
|
C++ 实现
| C++ |
|---|
| template<typename K, typename V>
class LSMTree {
private:
struct KVPair {
K key;
V value;
long long timestamp;
bool deleted;
};
class MemTable {
private:
std::map<K, KVPair> data;
size_t maxSize;
public:
MemTable(size_t size) : maxSize(size) {}
void put(const K& key, const V& value) {
data[key] = {key, value, std::chrono::system_clock::now().time_since_epoch().count(), false};
}
void markDeleted(const K& key) {
if (data.count(key)) {
data[key].deleted = true;
}
}
std::optional<V> get(const K& key) {
if (data.count(key) && !data[key].deleted) {
return data[key].value;
}
return std::nullopt;
}
bool isFull() const { return data.size() >= maxSize; }
const std::map<K, KVPair>& getData() const { return data; }
void clear() { data.clear(); }
};
class SSTable {
private:
std::vector<KVPair> data;
int level;
K minKey, maxKey;
public:
SSTable(const std::vector<KVPair>& entries, int lvl) : level(lvl) {
data = entries;
if (!data.empty()) {
minKey = data.front().key;
maxKey = data.back().key;
}
}
std::optional<V> get(const K& key) {
auto it = std::lower_bound(data.begin(), data.end(), key,
[](const KVPair& p, const K& k) { return p.key < k; });
if (it != data.end() && it->key == key && !it->deleted) {
return it->value;
}
return std::nullopt;
}
bool mayContain(const K& key) const {
return key >= minKey && key <= maxKey;
}
};
std::unique_ptr<MemTable> memTable;
std::unique_ptr<MemTable> immutableMemTable;
std::vector<std::vector<std::unique_ptr<SSTable>>> levels;
size_t maxMemTableSize;
int sizeRatio;
void flush() {
std::vector<KVPair> entries;
for (const auto& [k, v] : immutableMemTable->getData()) {
entries.push_back(v);
}
levels[0].push_back(std::make_unique<SSTable>(entries, 0));
immutableMemTable->clear();
immutableMemTable = nullptr;
checkCompaction(0);
}
void checkCompaction(int level) {
int threshold = sizeRatio;
for (int i = 0; i < level; i++) {
threshold *= sizeRatio;
}
if (levels[level].size() > threshold) {
compact(level);
}
}
void compact(int level);
public:
LSMTree(size_t memSize = 1000, int ratio = 10)
: maxMemTableSize(memSize), sizeRatio(ratio) {
memTable = std::make_unique<MemTable>(memSize);
levels.resize(7);
}
void put(const K& key, const V& value);
std::optional<V> get(const K& key);
void remove(const K& key);
};
|
时间复杂度分析
| 操作 |
平均时间复杂度 |
最坏时间复杂度 |
说明 |
| 写入 |
O(1) |
O(1) |
内存写入,极快 |
| 读取 |
O(log n) |
O(n) |
需要查询多层 |
| 删除 |
O(1) |
O(1) |
写入墓碑标记 |
| 范围查询 |
O(k + log n) |
O(n) |
k为结果数量 |
| Flush |
O(n log n) |
O(n log n) |
排序并写入磁盘 |
| Compaction |
O(n) |
O(n) |
合并SSTable |
空间复杂度
- 内存使用:O(M)(M为MemTable大小)
- 磁盘使用:O(n × (1 + α))(α为空间放大系数)
- 墓碑开销:额外存储删除标记
压缩策略
1. Size-Tiered Compaction
- 当某层SSTable数量超过阈值时触发
- 合并该层所有SSTable到下一层
- 写放大较高,但实现简单
2. Leveled Compaction
- 每层维持有序且不重叠的SSTable
- Level i的大小是Level i-1的sizeRatio倍
- 写放大较低,适合读密集场景
3. FIFO Compaction
优化技术
1. 布隆过滤器
| C |
|---|
| typedef struct BloomFilter {
unsigned char *bits;
int size;
int hashCount;
} BloomFilter;
void bloomAdd(BloomFilter *bf, char *key);
int bloomMayContain(BloomFilter *bf, char *key);
|
2. 稀疏索引
只在SSTable中存储部分键,减少索引大小。
3. 数据压缩
对SSTable块使用Snappy、LZ4等压缩算法。
应用场景
- NoSQL数据库:LevelDB、RocksDB、Cassandra、HBase
- 时序数据库:InfluxDB、TimescaleDB
- 日志系统:Kafka、Elasticsearch
- 键值存储:Redis(AOF)、BadgerDB
优点与缺点
优点
- 写入性能极高:顺序写,充分利用磁盘带宽
- 批量写入友好:适合写入密集场景
- 可扩展性好:分层存储,支持海量数据
- 压缩效率高:SSTable有序,压缩效果好
缺点
- 读取性能较低:可能需要查询多层
- 空间放大:存在数据冗余
- 写放大:压缩操作产生额外写入
- 不适合随机读:不如B+树高效
与B+树比较
| 特性 |
LSM树 |
B+树 |
| 写入性能 |
极高(顺序写) |
中等(随机写) |
| 读取性能 |
中等(多层查询) |
高(单次查询) |
| 空间利用率 |
较低(冗余) |
高 |
| 范围查询 |
高效 |
高效 |
| 适用场景 |
写密集 |
读密集/混合 |
参考资料