Merge pull request 'Block manager: limit simultaneous block reads from disk' (#1157) from block-max-simultaneous-reads into main-v1

Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/1157
This commit is contained in:
Alex
2025-09-13 15:53:24 +00:00
4 changed files with 54 additions and 3 deletions


@@ -24,6 +24,7 @@ db_engine = "lmdb"
block_size = "1M"
block_ram_buffer_max = "256MiB"
block_max_concurrent_reads = 16
lmdb_map_size = "1T"
@@ -96,6 +97,7 @@ The following gives details about each available configuration option.
Top-level configuration options, in alphabetical order:
[`allow_punycode`](#allow_punycode),
[`allow_world_readable_secrets`](#allow_world_readable_secrets),
[`block_max_concurrent_reads`](#block_max_concurrent_reads),
[`block_ram_buffer_max`](#block_ram_buffer_max),
[`block_size`](#block_size),
[`bootstrap_peers`](#bootstrap_peers),
@@ -522,6 +524,29 @@ node.
The default value is 256MiB.
#### `block_max_concurrent_reads` (since `v1.3.0` / `v2.1.0`) {#block_max_concurrent_reads}
The maximum number of blocks (individual files in the data directory) open
simultaneously for reading.
Reducing this number does not limit the number of data blocks that can be
transferred over the network simultaneously. This option only acts as a
backpressure mechanism for HDD read speed: it helps avoid a situation where so
many requests come in at once that Garage reads too many block files
simultaneously and makes no timely progress on any of the reads.
When a request to read a data block comes in over the network, the request
waits for one of the `block_max_concurrent_reads` slots to become available
(internally implemented with a semaphore). Once it has acquired a read slot,
it reads the entire block file into RAM and releases the slot as soon as the
file has been read. Only after the slot is released does the block's data
start being transferred over the network. If the request fails to acquire a
read slot within 15 seconds, it fails with a timeout error.
Timeout events can be monitored through the `block_read_semaphore_timeouts`
metric in Prometheus: a non-zero number of such events indicates an I/O
bottleneck on HDD read speed.
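The behaviour described above can be sketched with tokio's `Semaphore`. The
following is an illustrative, standalone example, not Garage's actual code:
the `read_block_limited` function and the file path are hypothetical, and it
assumes the `tokio` crate with the `full` feature. The slot count (16) and
timeout (15 seconds) mirror the documented defaults.

```rust
use std::time::Duration;
use tokio::sync::Semaphore;

async fn read_block_limited(sem: &Semaphore, path: &str) -> std::io::Result<Vec<u8>> {
    // Wait for a free read slot, but give up after 15 seconds.
    let permit = tokio::select! {
        p = sem.acquire() => p.expect("semaphore closed"),
        _ = tokio::time::sleep(Duration::from_secs(15)) => {
            // In Garage, this branch also increments the
            // block_read_semaphore_timeouts metric.
            return Err(std::io::Error::new(
                std::io::ErrorKind::TimedOut,
                "read_semaphore acquire timeout",
            ));
        }
    };

    // Read the whole block file into RAM while holding the slot...
    let data = tokio::fs::read(path).await?;

    // ...and release the slot before the data is sent over the network.
    drop(permit);
    Ok(data)
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // At most 16 block files are read from disk at the same time.
    let sem = Semaphore::new(16);
    let data = read_block_limited(&sem, "/tmp/example-block").await?;
    println!("read {} bytes", data.len());
    Ok(())
}
```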
#### `lmdb_map_size` {#lmdb_map_size}
This parameter can be used to set the map size used by LMDB,


@@ -50,6 +50,8 @@ pub const INLINE_THRESHOLD: usize = 3072;
// to delete the block locally.
pub(crate) const BLOCK_GC_DELAY: Duration = Duration::from_secs(600);
const BLOCK_READ_SEMAPHORE_TIMEOUT: Duration = Duration::from_secs(15);
/// RPC messages used to share blocks of data between nodes
#[derive(Debug, Serialize, Deserialize)]
pub enum BlockRpc {
@@ -87,6 +89,7 @@ pub struct BlockManager {
disable_scrub: bool,
mutation_lock: Vec<Mutex<BlockManagerLocked>>,
read_semaphore: Semaphore,
pub rc: BlockRc,
pub resync: BlockResyncManager,
@@ -176,6 +179,8 @@ impl BlockManager {
.iter()
.map(|_| Mutex::new(BlockManagerLocked()))
.collect::<Vec<_>>(),
read_semaphore: Semaphore::new(config.block_max_concurrent_reads),
rc,
resync,
system,
@@ -557,9 +562,6 @@ impl BlockManager {
match self.find_block(hash).await {
Some(p) => self.read_block_from(hash, &p).await,
None => {
// Not found but maybe we should have had it ??
self.resync
.put_to_resync(hash, 2 * self.system.rpc_helper().rpc_timeout())?;
return Err(Error::Message(format!(
"block {:?} not found on node",
hash
@@ -581,6 +583,15 @@ impl BlockManager {
) -> Result<DataBlock, Error> {
let (header, path) = block_path.as_parts_ref();
let permit = tokio::select! {
sem = self.read_semaphore.acquire() => sem.ok_or_message("acquire read semaphore")?,
_ = tokio::time::sleep(BLOCK_READ_SEMAPHORE_TIMEOUT) => {
self.metrics.block_read_semaphore_timeouts.add(1);
debug!("read block {:?}: read_semaphore acquire timeout", hash);
return Err(Error::Message("read block: read_semaphore acquire timeout".into()));
}
};
let mut f = fs::File::open(&path).await?;
let mut data = vec![];
f.read_to_end(&mut data).await?;
@@ -605,6 +616,8 @@ impl BlockManager {
return Err(Error::CorruptData(*hash));
}
drop(permit);
Ok(data)
}


@@ -22,6 +22,7 @@ pub struct BlockManagerMetrics {
pub(crate) bytes_read: BoundCounter<u64>,
pub(crate) block_read_duration: BoundValueRecorder<f64>,
pub(crate) block_read_semaphore_timeouts: BoundCounter<u64>,
pub(crate) bytes_written: BoundCounter<u64>,
pub(crate) block_write_duration: BoundValueRecorder<f64>,
pub(crate) delete_counter: BoundCounter<u64>,
@@ -119,6 +120,11 @@ impl BlockManagerMetrics {
.with_description("Duration of block read operations")
.init()
.bind(&[]),
block_read_semaphore_timeouts: meter
.u64_counter("block.read_semaphore_timeouts")
.with_description("Number of block reads that failed due to semaphore acquire timeout")
.init()
.bind(&[]),
bytes_written: meter
.u64_counter("block.bytes_written")
.with_description("Number of bytes written to disk")


@@ -75,6 +75,10 @@ pub struct Config {
)]
pub block_ram_buffer_max: usize,
/// Maximum number of concurrent reads of block files on disk
#[serde(default = "default_block_max_concurrent_reads")]
pub block_max_concurrent_reads: usize,
/// Skip the permission check of secret files. Useful when
/// POSIX ACLs (or more complex chmods) are used.
#[serde(default)]
@@ -280,6 +284,9 @@ fn default_block_size() -> usize {
fn default_block_ram_buffer_max() -> usize {
256 * 1024 * 1024
}
fn default_block_max_concurrent_reads() -> usize {
16
}
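Because the field carries `#[serde(default = "default_block_max_concurrent_reads")]`,
the new option can be omitted from existing configuration files. A minimal
sketch of that fallback behaviour, using a hypothetical reduced `BlockConfig`
struct rather than Garage's real `Config` (assumes the `serde` and `toml` crates):

```rust
use serde::Deserialize;

// Hypothetical stand-in for Garage's Config struct, showing
// only the new field and its serde default.
#[derive(Deserialize)]
struct BlockConfig {
    #[serde(default = "default_block_max_concurrent_reads")]
    block_max_concurrent_reads: usize,
}

fn default_block_max_concurrent_reads() -> usize {
    16
}

fn main() {
    // Field absent from the TOML: serde falls back to the default (16).
    let cfg: BlockConfig = toml::from_str("").unwrap();
    assert_eq!(cfg.block_max_concurrent_reads, 16);

    // Field present: the configured value is used instead.
    let cfg: BlockConfig = toml::from_str("block_max_concurrent_reads = 4").unwrap();
    assert_eq!(cfg.block_max_concurrent_reads, 4);
}
```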
fn default_consistency_mode() -> String {
"consistent".into()