mirror of
https://github.com/torvalds/linux.git
synced 2026-01-25 07:47:50 +00:00
This patch introduces dm-pcache, a new DM target that places a DAX-
capable persistent-memory device in front of any slower block device and
uses it as a high-throughput, low-latency cache.
Design highlights
-----------------
- DAX data path – data is copied directly between DRAM and the pmem
mapping, bypassing the block layer’s overhead.
- Segmented, crash-consistent layout
- all layout metadata are dual-replicated and CRC-protected.
- atomic kset flushes; key replay on mount guarantees cache integrity
even after power loss.
- Striped multi-tree index
- Multi‑tree indexing for high parallelism.
- overlap-resolution logic ensures non-intersecting cached extents.
- Background services
- write-back worker flushes dirty keys in order, preserving backing-device
crash consistency. This is important for checkpointing in cloud storage.
- garbage collector reclaims clean segments when utilisation exceeds a
tunable threshold.
- Data integrity – optional CRC32 on cached payload; metadata always protected.
Comparison with existing block-level caches
---------------------------------------------------------------------------------------------------------------------------------
| Feature | pcache (this patch) | bcache | dm-writecache |
|----------------------------------|---------------------------------|------------------------------|---------------------------|
| pmem access method | DAX | bio (block I/O) | DAX |
| Write latency (4 K rand-write) | ~5 µs | ~20 µs | ~5 µs |
| Concurrency | multi subtree index | global index tree | single tree + wc_lock |
| IOPS (4K randwrite, 32 numjobs) | 2.1 M | 352 K | 283 K |
| Read-cache support | YES | YES | NO |
| Deployment | no re-format of backend | backend devices must be | no re-format of backend |
| | | reformatted | |
| Write-back ordering | log-structured; | no ordering guarantee | no ordering guarantee |
| | preserves app-IO-order | | |
| Data integrity checks | metadata + data CRC(optional) | metadata CRC only | none |
---------------------------------------------------------------------------------------------------------------------------------
Signed-off-by: Dongsheng Yang <dongsheng.yang@linux.dev>
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
62 lines
1.6 KiB
C
62 lines
1.6 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
#include <linux/dax.h>
|
|
|
|
#include "pcache_internal.h"
|
|
#include "cache_dev.h"
|
|
#include "segment.h"
|
|
|
|
/**
 * segment_copy_to_bio - copy bytes out of a segment's data area into a bio
 * @segment:  segment whose ->data area is the copy source
 * @data_off: byte offset added to @segment->data to find the first source byte
 * @data_len: number of bytes to copy
 * @bio:      bio whose bvecs receive the data
 * @bio_off:  number of bytes to skip past the bio's current iterator position
 *
 * Return: 0 if exactly @data_len bytes were copied, -EIO on a short copy.
 */
int segment_copy_to_bio(struct pcache_segment *segment,
		u32 data_off, u32 data_len, struct bio *bio, u32 bio_off)
{
	struct iov_iter dst_iter;
	size_t done;

	/* Wrap the bio's bvecs, from its current index onward, as the destination. */
	iov_iter_bvec(&dst_iter, ITER_DEST,
		      &bio->bi_io_vec[bio->bi_iter.bi_idx],
		      bio_segments(bio), bio->bi_iter.bi_size);

	/* Carry over how far into the first bvec the bio iterator already is. */
	dst_iter.iov_offset = bio->bi_iter.bi_bvec_done;

	if (bio_off)
		iov_iter_advance(&dst_iter, bio_off);

	done = _copy_mc_to_iter(segment->data + data_off, data_len, &dst_iter);

	/* Anything less than the full length is reported as an I/O error. */
	return done == data_len ? 0 : -EIO;
}
|
|
|
|
/**
 * segment_copy_from_bio - copy bytes from a bio into a segment's data area
 * @segment:  segment whose ->data area is the copy destination
 * @data_off: byte offset added to @segment->data to find the first destination byte
 * @data_len: number of bytes to copy
 * @bio:      bio whose bvecs supply the data
 * @bio_off:  number of bytes to skip past the bio's current iterator position
 *
 * The copy uses _copy_from_iter_flushcache(); pmem_wmb() is issued only when
 * the full @data_len bytes were copied.
 *
 * Return: 0 if exactly @data_len bytes were copied, -EIO on a short copy.
 */
int segment_copy_from_bio(struct pcache_segment *segment,
		u32 data_off, u32 data_len, struct bio *bio, u32 bio_off)
{
	struct iov_iter src_iter;
	size_t done;

	/* Wrap the bio's bvecs, from its current index onward, as the source. */
	iov_iter_bvec(&src_iter, ITER_SOURCE,
		      &bio->bi_io_vec[bio->bi_iter.bi_idx],
		      bio_segments(bio), bio->bi_iter.bi_size);

	/* Carry over how far into the first bvec the bio iterator already is. */
	src_iter.iov_offset = bio->bi_iter.bi_bvec_done;

	if (bio_off)
		iov_iter_advance(&src_iter, bio_off);

	done = _copy_from_iter_flushcache(segment->data + data_off, data_len,
					  &src_iter);
	if (done == data_len) {
		/* Full copy: issue the pmem write barrier before reporting success. */
		pmem_wmb();
		return 0;
	}

	return -EIO;
}
|
|
|
|
void pcache_segment_init(struct pcache_cache_dev *cache_dev, struct pcache_segment *segment,
|
|
struct pcache_segment_init_options *options)
|
|
{
|
|
segment->seg_info = options->seg_info;
|
|
segment_info_set_type(segment->seg_info, options->type);
|
|
|
|
segment->cache_dev = cache_dev;
|
|
segment->seg_id = options->seg_id;
|
|
segment->data_size = PCACHE_SEG_SIZE - options->data_off;
|
|
segment->data = CACHE_DEV_SEGMENT(cache_dev, options->seg_id) + options->data_off;
|
|
}
|