Merge pull request 'Block manager: limit simultaneous block reads from disk' (#1157) from block-max-simultaneous-reads into main-v1

Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/1157
This commit is contained in:
Alex
2025-09-13 15:53:24 +00:00
4 changed files with 54 additions and 3 deletions


@@ -24,6 +24,7 @@ db_engine = "lmdb"
block_size = "1M"
block_ram_buffer_max = "256MiB"
block_max_concurrent_reads = 16
lmdb_map_size = "1T"
@@ -96,6 +97,7 @@ The following gives details about each available configuration option.
Top-level configuration options, in alphabetical order:
[`allow_punycode`](#allow_punycode),
[`allow_world_readable_secrets`](#allow_world_readable_secrets),
[`block_max_concurrent_reads`](#block_max_concurrent_reads),
[`block_ram_buffer_max`](#block_ram_buffer_max),
[`block_size`](#block_size),
[`bootstrap_peers`](#bootstrap_peers),
@@ -522,6 +524,29 @@ node.
The default value is 256MiB.
#### `block_max_concurrent_reads` (since `v1.3.0` / `v2.1.0`) {#block_max_concurrent_reads}
The maximum number of blocks (individual files in the data directory) open
simultaneously for reading.
Reducing this number does not limit the number of data blocks that can be
transferred over the network simultaneously. This option only acts as a
backpressure mechanism for HDD read speed: it helps avoid a situation where so
many requests come in at once that Garage reads too many block files
simultaneously and makes no timely progress on any of the reads.
When a request to read a data block comes in over the network, the request
waits for one of the `block_max_concurrent_reads` slots to become available
(internally implemented with a semaphore). Once it has acquired a read slot,
it reads the entire block file into RAM and releases the slot as soon as the
file has been read. Only after the slot is released does the block's data
start being transferred over the network. If the request fails to acquire a
read slot within 15 seconds, it fails with a timeout error.
Timeout events can be monitored through the `block_read_semaphore_timeouts`
metric in Prometheus: a non-zero number of such events indicates an I/O
bottleneck on HDD read speed.
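The behaviour described above can be sketched with tokio's `Semaphore`. The
following is an illustrative, standalone example, not Garage's actual code:
the `read_block_limited` function and the file path are hypothetical, and it
assumes the `tokio` crate with the `full` feature. The slot count (16) and
timeout (15 seconds) mirror the documented defaults.

```rust
use std::time::Duration;
use tokio::sync::Semaphore;

async fn read_block_limited(sem: &Semaphore, path: &str) -> std::io::Result<Vec<u8>> {
    // Wait for a free read slot, but give up after 15 seconds.
    let permit = tokio::select! {
        p = sem.acquire() => p.expect("semaphore closed"),
        _ = tokio::time::sleep(Duration::from_secs(15)) => {
            // In Garage, this branch also increments the
            // block_read_semaphore_timeouts metric.
            return Err(std::io::Error::new(
                std::io::ErrorKind::TimedOut,
                "read_semaphore acquire timeout",
            ));
        }
    };

    // Read the whole block file into RAM while holding the slot...
    let data = tokio::fs::read(path).await?;

    // ...and release the slot before the data is sent over the network.
    drop(permit);
    Ok(data)
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // At most 16 block files are read from disk at the same time.
    let sem = Semaphore::new(16);
    let data = read_block_limited(&sem, "/tmp/example-block").await?;
    println!("read {} bytes", data.len());
    Ok(())
}
```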
#### `lmdb_map_size` {#lmdb_map_size}
This parameter can be used to set the map size used by LMDB,


@@ -50,6 +50,8 @@ pub const INLINE_THRESHOLD: usize = 3072;
// to delete the block locally.
pub(crate) const BLOCK_GC_DELAY: Duration = Duration::from_secs(600);
const BLOCK_READ_SEMAPHORE_TIMEOUT: Duration = Duration::from_secs(15);
/// RPC messages used to share blocks of data between nodes
#[derive(Debug, Serialize, Deserialize)]
pub enum BlockRpc {
@@ -87,6 +89,7 @@ pub struct BlockManager {
disable_scrub: bool,
mutation_lock: Vec<Mutex<BlockManagerLocked>>,
read_semaphore: Semaphore,
pub rc: BlockRc,
pub resync: BlockResyncManager,
@@ -176,6 +179,8 @@ impl BlockManager {
.iter()
.map(|_| Mutex::new(BlockManagerLocked()))
.collect::<Vec<_>>(),
read_semaphore: Semaphore::new(config.block_max_concurrent_reads),
rc,
resync,
system,
@@ -557,9 +562,6 @@ impl BlockManager {
match self.find_block(hash).await {
Some(p) => self.read_block_from(hash, &p).await,
None => {
// Not found but maybe we should have had it ??
self.resync
.put_to_resync(hash, 2 * self.system.rpc_helper().rpc_timeout())?;
return Err(Error::Message(format!(
"block {:?} not found on node",
hash
@@ -581,6 +583,15 @@ impl BlockManager {
) -> Result<DataBlock, Error> {
let (header, path) = block_path.as_parts_ref();
let permit = tokio::select! {
sem = self.read_semaphore.acquire() => sem.ok_or_message("acquire read semaphore")?,
_ = tokio::time::sleep(BLOCK_READ_SEMAPHORE_TIMEOUT) => {
self.metrics.block_read_semaphore_timeouts.add(1);
debug!("read block {:?}: read_semaphore acquire timeout", hash);
return Err(Error::Message("read block: read_semaphore acquire timeout".into()));
}
};
let mut f = fs::File::open(&path).await?;
let mut data = vec![];
f.read_to_end(&mut data).await?;
@@ -605,6 +616,8 @@ impl BlockManager {
return Err(Error::CorruptData(*hash));
}
drop(permit);
Ok(data)
}


@@ -22,6 +22,7 @@ pub struct BlockManagerMetrics {
pub(crate) bytes_read: BoundCounter<u64>,
pub(crate) block_read_duration: BoundValueRecorder<f64>,
pub(crate) block_read_semaphore_timeouts: BoundCounter<u64>,
pub(crate) bytes_written: BoundCounter<u64>,
pub(crate) block_write_duration: BoundValueRecorder<f64>,
pub(crate) delete_counter: BoundCounter<u64>,
@@ -119,6 +120,11 @@ impl BlockManagerMetrics {
.with_description("Duration of block read operations")
.init()
.bind(&[]),
block_read_semaphore_timeouts: meter
.u64_counter("block.read_semaphore_timeouts")
.with_description("Number of block reads that failed due to semaphore acquire timeout")
.init()
.bind(&[]),
bytes_written: meter
.u64_counter("block.bytes_written")
.with_description("Number of bytes written to disk")


@@ -75,6 +75,10 @@ pub struct Config {
)]
pub block_ram_buffer_max: usize,
/// Maximum number of concurrent reads of block files on disk
#[serde(default = "default_block_max_concurrent_reads")]
pub block_max_concurrent_reads: usize,
/// Skip the permission check of secret files. Useful when
/// POSIX ACLs (or more complex chmods) are used.
#[serde(default)]
@@ -280,6 +284,9 @@ fn default_block_size() -> usize {
fn default_block_ram_buffer_max() -> usize {
256 * 1024 * 1024
}
fn default_block_max_concurrent_reads() -> usize {
16
}
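Because the field carries `#[serde(default = "default_block_max_concurrent_reads")]`,
the new option can be omitted from existing configuration files. A minimal
sketch of that fallback behaviour, using a hypothetical reduced `BlockConfig`
struct rather than Garage's real `Config` (assumes the `serde` and `toml` crates):

```rust
use serde::Deserialize;

// Hypothetical stand-in for Garage's Config struct, showing
// only the new field and its serde default.
#[derive(Deserialize)]
struct BlockConfig {
    #[serde(default = "default_block_max_concurrent_reads")]
    block_max_concurrent_reads: usize,
}

fn default_block_max_concurrent_reads() -> usize {
    16
}

fn main() {
    // Field absent from the TOML: serde falls back to the default (16).
    let cfg: BlockConfig = toml::from_str("").unwrap();
    assert_eq!(cfg.block_max_concurrent_reads, 16);

    // Field present: the configured value is used instead.
    let cfg: BlockConfig = toml::from_str("block_max_concurrent_reads = 4").unwrap();
    assert_eq!(cfg.block_max_concurrent_reads, 4);
}
```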
fn default_consistency_mode() -> String {
"consistent".into()