mirror of
https://git.deuxfleurs.fr/Deuxfleurs/garage.git
synced 2026-01-25 05:26:22 +00:00
Merge pull request 'Block manager: limit simultaneous block reads from disk' (#1157) from block-max-simultaneous-reads into main-v1
Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/1157
This commit is contained in:
@@ -24,6 +24,7 @@ db_engine = "lmdb"
|
||||
|
||||
block_size = "1M"
|
||||
block_ram_buffer_max = "256MiB"
|
||||
block_max_concurrent_reads = 16
|
||||
|
||||
lmdb_map_size = "1T"
|
||||
|
||||
@@ -96,6 +97,7 @@ The following gives details about each available configuration option.
|
||||
Top-level configuration options, in alphabetical order:
|
||||
[`allow_punycode`](#allow_punycode),
|
||||
[`allow_world_readable_secrets`](#allow_world_readable_secrets),
|
||||
[`block_max_concurrent_reads`](#block_max_concurrent_reads),
|
||||
[`block_ram_buffer_max`](#block_ram_buffer_max),
|
||||
[`block_size`](#block_size),
|
||||
[`bootstrap_peers`](#bootstrap_peers),
|
||||
@@ -522,6 +524,29 @@ node.
|
||||
|
||||
The default value is 256MiB.
|
||||
|
||||
#### `block_max_concurrent_reads` (since `v1.3.0` / `v2.1.0`) {#block_max_concurrent_reads}
|
||||
|
||||
The maximum number of blocks (individual files in the data directory) open
|
||||
simultaneously for reading.
|
||||
|
||||
Reducing this number does not limit the number of data blocks that can be
|
||||
transferred through the network simultaneously. This mechanism was just added
|
||||
as a backpressure mechanism for HDD read speed: it helps avoid a situation
|
||||
where too many requests are coming in and Garage is reading too many block
|
||||
files simultaneously, thus not making timely progress on any of the reads.
|
||||
|
||||
When a request to read a data block comes in through the network, the request
|
||||
waits for one of the `block_max_concurrent_reads` slots to be available
|
||||
(internally implemented using a Semaphore object). Once it has acquired a read
|
||||
slot, it reads the entire block file to RAM and frees the slot as soon as the
|
||||
block file is finished reading. Only after the slot is released will the
|
||||
block's data start being transferred over the network. If the request fails to
|
||||
acquire a reading slot within 15 seconds, it fails with a timeout error.
|
||||
Timeout events can be monitored through the `block_read_semaphore_timeouts`
|
||||
metric in Prometheus: a non-zero number of such events indicates an I/O
|
||||
bottleneck on HDD read speed.
|
||||
|
||||
|
||||
#### `lmdb_map_size` {#lmdb_map_size}
|
||||
|
||||
This parameter can be used to set the map size used by LMDB,
|
||||
|
||||
@@ -50,6 +50,8 @@ pub const INLINE_THRESHOLD: usize = 3072;
|
||||
// to delete the block locally.
|
||||
pub(crate) const BLOCK_GC_DELAY: Duration = Duration::from_secs(600);
|
||||
|
||||
const BLOCK_READ_SEMAPHORE_TIMEOUT: Duration = Duration::from_secs(15);
|
||||
|
||||
/// RPC messages used to share blocks of data between nodes
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub enum BlockRpc {
|
||||
@@ -87,6 +89,7 @@ pub struct BlockManager {
|
||||
disable_scrub: bool,
|
||||
|
||||
mutation_lock: Vec<Mutex<BlockManagerLocked>>,
|
||||
read_semaphore: Semaphore,
|
||||
|
||||
pub rc: BlockRc,
|
||||
pub resync: BlockResyncManager,
|
||||
@@ -176,6 +179,8 @@ impl BlockManager {
|
||||
.iter()
|
||||
.map(|_| Mutex::new(BlockManagerLocked()))
|
||||
.collect::<Vec<_>>(),
|
||||
|
||||
read_semaphore: Semaphore::new(config.block_max_concurrent_reads),
|
||||
rc,
|
||||
resync,
|
||||
system,
|
||||
@@ -557,9 +562,6 @@ impl BlockManager {
|
||||
match self.find_block(hash).await {
|
||||
Some(p) => self.read_block_from(hash, &p).await,
|
||||
None => {
|
||||
// Not found but maybe we should have had it ??
|
||||
self.resync
|
||||
.put_to_resync(hash, 2 * self.system.rpc_helper().rpc_timeout())?;
|
||||
return Err(Error::Message(format!(
|
||||
"block {:?} not found on node",
|
||||
hash
|
||||
@@ -581,6 +583,15 @@ impl BlockManager {
|
||||
) -> Result<DataBlock, Error> {
|
||||
let (header, path) = block_path.as_parts_ref();
|
||||
|
||||
let permit = tokio::select! {
|
||||
sem = self.read_semaphore.acquire() => sem.ok_or_message("acquire read semaphore")?,
|
||||
_ = tokio::time::sleep(BLOCK_READ_SEMAPHORE_TIMEOUT) => {
|
||||
self.metrics.block_read_semaphore_timeouts.add(1);
|
||||
debug!("read block {:?}: read_semaphore acquire timeout", hash);
|
||||
return Err(Error::Message("read block: read_semaphore acquire timeout".into()));
|
||||
}
|
||||
};
|
||||
|
||||
let mut f = fs::File::open(&path).await?;
|
||||
let mut data = vec![];
|
||||
f.read_to_end(&mut data).await?;
|
||||
@@ -605,6 +616,8 @@ impl BlockManager {
|
||||
return Err(Error::CorruptData(*hash));
|
||||
}
|
||||
|
||||
drop(permit);
|
||||
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ pub struct BlockManagerMetrics {
|
||||
|
||||
pub(crate) bytes_read: BoundCounter<u64>,
|
||||
pub(crate) block_read_duration: BoundValueRecorder<f64>,
|
||||
pub(crate) block_read_semaphore_timeouts: BoundCounter<u64>,
|
||||
pub(crate) bytes_written: BoundCounter<u64>,
|
||||
pub(crate) block_write_duration: BoundValueRecorder<f64>,
|
||||
pub(crate) delete_counter: BoundCounter<u64>,
|
||||
@@ -119,6 +120,11 @@ impl BlockManagerMetrics {
|
||||
.with_description("Duration of block read operations")
|
||||
.init()
|
||||
.bind(&[]),
|
||||
block_read_semaphore_timeouts: meter
|
||||
.u64_counter("block.read_semaphore_timeouts")
|
||||
.with_description("Number of block reads that failed due to semaphore acquire timeout")
|
||||
.init()
|
||||
.bind(&[]),
|
||||
bytes_written: meter
|
||||
.u64_counter("block.bytes_written")
|
||||
.with_description("Number of bytes written to disk")
|
||||
|
||||
@@ -75,6 +75,10 @@ pub struct Config {
|
||||
)]
|
||||
pub block_ram_buffer_max: usize,
|
||||
|
||||
/// Maximum number of concurrent reads of block files on disk
|
||||
#[serde(default = "default_block_max_concurrent_reads")]
|
||||
pub block_max_concurrent_reads: usize,
|
||||
|
||||
/// Skip the permission check of secret files. Useful when
|
||||
/// POSIX ACLs (or more complex chmods) are used.
|
||||
#[serde(default)]
|
||||
@@ -280,6 +284,9 @@ fn default_block_size() -> usize {
|
||||
/// Returns 256 MiB expressed in bytes (256 * 1024 * 1024).
/// NOTE(review): presumably the serde default for `block_ram_buffer_max`; the
/// attribute wiring it up is not visible in this hunk — confirm.
fn default_block_ram_buffer_max() -> usize {
|
||||
256 * 1024 * 1024
|
||||
}
|
||||
/// Default for `block_max_concurrent_reads`, referenced by that field's
/// `#[serde(default = "default_block_max_concurrent_reads")]` attribute:
/// 16 concurrent reads of block files on disk.
fn default_block_max_concurrent_reads() -> usize {
|
||||
16
|
||||
}
|
||||
|
||||
fn default_consistency_mode() -> String {
|
||||
"consistent".into()
|
||||
|
||||
Reference in New Issue
Block a user