diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8ad3a688..89165678 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -49,3 +49,11 @@ jobs:
 
       - name: Run fixture-based tests
         uses: ./.github/actions/run-fixture-tests
+
+      # Reuses the release build from the test step; validates the benchmark
+      # harness end-to-end and its JSON output contract in a few seconds.
+      - name: Benchmark smoke (mock crypto)
+        run: |
+          cargo run --release --bin ethlambda -- benchmark synthetic --mock-crypto \
+            --num-validators 4 --warmup-slots 4 --iterations 3 --format json \
+            | jq -e '.schema_version == 1 and (.samples | length == 3)'
diff --git a/Cargo.lock b/Cargo.lock
index c57423fa..6ec93afe 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2012,6 +2012,7 @@ dependencies = [
  "clap",
  "ethlambda-blockchain",
  "ethlambda-crypto",
+ "ethlambda-metrics",
  "ethlambda-network-api",
  "ethlambda-p2p",
  "ethlambda-rpc",
@@ -2024,6 +2025,7 @@ dependencies = [
  "libssz-types",
  "reqwest",
  "serde",
+ "serde_json",
  "serde_yaml_ng",
  "thiserror 2.0.18",
  "tikv-jemallocator",
diff --git a/Makefile b/Makefile
index d28dc505..6d3399bf 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: help fmt lint docker-build shadow-build shadow-docker-build run-devnet test docs docs-deps docs-serve
+.PHONY: help fmt lint bench docker-build shadow-build shadow-docker-build run-devnet test docs docs-deps docs-serve
 
 help: ## 📚 Show help for each of the Makefile recipes
 	@grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
@@ -13,6 +13,11 @@ test: leanSpec/fixtures ## 🧪 Run all tests
 	# Tests need to be run on release to avoid stack overflows during signature verification/aggregation
 	cargo test --workspace --release
 
+BENCH_ARGS ?= synthetic --mock-crypto
+
+bench: ## 🏁 Benchmark block building offline (override BENCH_ARGS to customize)
+	cargo run --release --bin ethlambda -- benchmark $(BENCH_ARGS)
+
 GIT_COMMIT=$(shell git rev-parse HEAD)
 GIT_BRANCH=$(shell git rev-parse --abbrev-ref HEAD)
 DOCKER_TAG?=local
diff --git a/bin/ethlambda/Cargo.toml b/bin/ethlambda/Cargo.toml
index 3b9e5582..0711920f 100644
--- a/bin/ethlambda/Cargo.toml
+++ b/bin/ethlambda/Cargo.toml
@@ -21,6 +21,7 @@ shadow-integration = ["ethlambda-crypto/shadow-integration"]
 [dependencies]
 ethlambda-blockchain.workspace = true
 ethlambda-crypto.workspace = true
+ethlambda-metrics.workspace = true
 ethlambda-network-api.workspace = true
 ethlambda-p2p.workspace = true
 ethlambda-types.workspace = true
@@ -37,6 +38,7 @@ tracing.workspace = true
 tracing-subscriber = "0.3"
 
 serde.workspace = true
+serde_json.workspace = true
 serde_yaml_ng.workspace = true
 hex.workspace = true
 
diff --git a/bin/ethlambda/build.rs b/bin/ethlambda/build.rs
index ad4184ed..bb2a9e87 100644
--- a/bin/ethlambda/build.rs
+++ b/bin/ethlambda/build.rs
@@ -1,3 +1,5 @@
+use std::path::PathBuf;
+
 use vergen_git2::{Emitter, Git2Builder, RustcBuilder};
 
 fn main() -> Result<(), Box<dyn std::error::Error>> {
@@ -12,5 +14,45 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         .add_instructions(&git2)?
         .emit()?;
 
+    emit_leansig_rev();
+
     Ok(())
 }
+
+/// Embed the resolved leansig git revision from the workspace Cargo.lock.
+///
+/// leansig is pinned to a moving branch, so a `cargo update` changes the
+/// measured crypto with zero ethlambda diff; benchmark reports embed this
+/// revision to keep results interpretable across lock bumps.
+fn emit_leansig_rev() {
+    let rev = leansig_rev_from_lockfile().unwrap_or_else(|| "unknown".to_string());
+    println!("cargo:rustc-env=ETHLAMBDA_LEANSIG_REV={rev}");
+    if let Some(lockfile) = workspace_lockfile() {
+        println!("cargo:rerun-if-changed={}", lockfile.display());
+    }
+}
+
+fn workspace_lockfile() -> Option<PathBuf> {
+    let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").ok()?;
+    Some(PathBuf::from(manifest_dir).join("../../Cargo.lock"))
+}
+
+/// leansig's git revision from Cargo.lock; `None` if the lockfile, the
+/// package, or a `#<rev>` fragment in its `source` (e.g. registry) is missing.
+fn leansig_rev_from_lockfile() -> Option<String> {
+    let lockfile = std::fs::read_to_string(workspace_lockfile()?).ok()?;
+    let mut in_leansig_package = false;
+    for line in lockfile.lines().map(str::trim) {
+        match line {
+            "[[package]]" => in_leansig_package = false,
+            "name = \"leansig\"" => in_leansig_package = true,
+            // source = "git+https://github.com/leanEthereum/leanSig?branch=devnet4#<rev>"
+            _ if in_leansig_package && line.starts_with("source = ") => {
+                let (_, rev) = line.trim_end_matches('"').rsplit_once('#')?;
+                return Some(rev.to_string());
+            }
+            _ => {}
+        }
+    }
+    None
+}
diff --git a/bin/ethlambda/src/benchmark/corpus.rs b/bin/ethlambda/src/benchmark/corpus.rs
new file mode 100644
index 00000000..330c7cc6
--- /dev/null
+++ b/bin/ethlambda/src/benchmark/corpus.rs
@@ -0,0 +1,145 @@
+//! Synthetic benchmark corpus: deterministic validators, a genesis store, and
+//! per-slot attestation-pool seeding.
+
+use std::sync::Arc;
+
+use ethlambda_blockchain::store::produce_attestation_data;
+use ethlambda_storage::{Store, backend::InMemoryBackend};
+use ethlambda_types::{
+    attestation::{AggregationBits, HashedAttestationData},
+    block::SingleMessageAggregate,
+    state::{State, Validator, ValidatorPubkeyBytes},
+};
+
+/// Fixed genesis time for synthetic runs. The harness derives every tick
+/// timestamp from slot numbers relative to this value and never reads the wall
+/// clock, so runs are reproducible at any time of day.
+const GENESIS_TIME: u64 = 1_700_000_000;
+
+pub(crate) struct SyntheticCorpus {
+    num_validators: u64,
+    proofs_per_data: u64,
+}
+
+impl SyntheticCorpus {
+    pub(crate) fn new(num_validators: u64, proofs_per_data: u64) -> Self {
+        Self {
+            num_validators,
+            proofs_per_data,
+        }
+    }
+
+    /// Build a genesis store over an in-memory backend with `num_validators`
+    /// seed-derived validators.
+    ///
+    /// Pubkeys are deterministic placeholder bytes: in mock-crypto mode no code
+    /// path decodes them (signature verification is skipped and best-proof
+    /// compaction never resolves pubkeys).
+    pub(crate) fn genesis_store(&self, seed: u64) -> Store {
+        let mut rng_state = seed;
+        let validators = (0..self.num_validators)
+            .map(|index| Validator {
+                attestation_pubkey: synthetic_pubkey(&mut rng_state),
+                proposal_pubkey: synthetic_pubkey(&mut rng_state),
+                index,
+            })
+            .collect();
+        let genesis_state = State::from_genesis(GENESIS_TIME, validators);
+        Store::from_anchor_state(Arc::new(InMemoryBackend::new()), genesis_state)
+    }
+
+    /// Seed the pending ("new") pool with the full validator set's attestations
+    /// for `attestation_slot`, split into `proofs_per_data` disjoint aggregates.
+    ///
+    /// Mirrors what committee aggregators gossip during a slot: several
+    /// aggregates for the same `AttestationData`, each covering a validator
+    /// subset. The proposal tick then promotes them to the known pool, exactly
+    /// as on a live node. Entries are inserted in a fixed order because pool
+    /// insertion order pins within-entry proof choice during selection.
+    pub(crate) fn seed_pool(&self, store: &mut Store, attestation_slot: u64) {
+        let data = produce_attestation_data(store, attestation_slot);
+        let entries = participant_groups(self.num_validators, self.proofs_per_data)
+            .into_iter()
+            .map(|participants| {
+                (
+                    HashedAttestationData::new(data.clone()),
+                    SingleMessageAggregate::empty(participants),
+                )
+            })
+            .collect();
+        store.insert_new_aggregated_payloads_batch(entries);
+    }
+}
+
+/// Partition validators 0..num_validators into `groups` disjoint bitfields,
+/// assigning validator `i` to group `i % groups`. Every group is non-empty
+/// (groups is capped at the validator count) and the union covers every
+/// validator exactly once.
+fn participant_groups(num_validators: u64, groups: u64) -> Vec<AggregationBits> {
+    let groups = groups.clamp(1, num_validators);
+    (0..groups)
+        .map(|group| {
+            let mut bits = AggregationBits::with_length(num_validators as usize)
+                .expect("validator count is within the bitlist limit");
+            for index in (group..num_validators).step_by(groups as usize) {
+                bits.set(index as usize, true)
+                    .expect("index is within the bitlist length");
+            }
+            bits
+        })
+        .collect()
+}
+
+/// One splitmix64 step: bumps `state` by a fixed increment and returns the
+/// mixed word. Keeps placeholder pubkey bytes reproducible without rand.
+fn splitmix64(state: &mut u64) -> u64 {
+    let advanced = state.wrapping_add(0x9e37_79b9_7f4a_7c15);
+    *state = advanced;
+    let mixed = (advanced ^ (advanced >> 30)).wrapping_mul(0xbf58_476d_1ce4_e5b9);
+    let mixed = (mixed ^ (mixed >> 27)).wrapping_mul(0x94d0_49bb_1331_11eb);
+    mixed ^ (mixed >> 31)
+}
+
+fn synthetic_pubkey(rng_state: &mut u64) -> ValidatorPubkeyBytes {
+    let mut bytes = [0u8; 52];
+    for chunk in bytes.chunks_mut(8) {
+        let word = splitmix64(rng_state).to_le_bytes();
+        chunk.copy_from_slice(&word[..chunk.len()]);
+    }
+    bytes
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ethlambda_types::attestation::validator_indices;
+
+    #[test]
+    fn participant_groups_partition_all_validators() {
+        for (validators, groups) in [(8u64, 2u64), (8, 3), (5, 8), (1, 1), (4096, 4)] {
+            let partition = participant_groups(validators, groups);
+            assert_eq!(partition.len() as u64, groups.min(validators));
+            let mut seen = vec![0u32; validators as usize];
+            for bits in &partition {
+                let indices: Vec<u64> = validator_indices(bits).collect();
+                assert!(!indices.is_empty(), "every group must be non-empty");
+                for index in indices {
+                    seen[index as usize] += 1;
+                }
+            }
+            assert!(
+                seen.iter().all(|&count| count == 1),
+                "every validator must appear in exactly one group: {seen:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn synthetic_pubkeys_are_deterministic() {
+        let (mut a, mut b) = (42u64, 42u64);
+        assert_eq!(synthetic_pubkey(&mut a), synthetic_pubkey(&mut b));
+        // Fresh states, so seed 42's first draw is compared with seed 43's.
+        let (mut a, mut c) = (42u64, 43u64);
+        assert_ne!(synthetic_pubkey(&mut a), synthetic_pubkey(&mut c));
+    }
+}
diff --git a/bin/ethlambda/src/benchmark/mod.rs b/bin/ethlambda/src/benchmark/mod.rs
new file mode 100644
index 00000000..2e079dc4
--- /dev/null
+++ b/bin/ethlambda/src/benchmark/mod.rs
@@ -0,0 +1,285 @@
+//! Offline block-building benchmark (`ethlambda benchmark`).
+//!
+//! Drives the exact production proposer path — `produce_block_with_signatures`,
+//! the same entry `BlockChainServer::propose_block` uses — against a synthetic
+//! in-memory chain, and reports per-phase timing distributions. Gossip publish
+//! and the slot-alignment sleep are outside the measured span, matching the
+//! node's own `lean_block_building_time_seconds` boundary.
+//!
+//! See docs/plans/block-building-benchmark.md for the design and roadmap
+//! (real-crypto pools and replay-from-datadir land in later milestones).
+
+mod corpus;
+mod report;
+
+use std::collections::{BTreeMap, HashMap};
+use std::path::PathBuf;
+use std::time::Instant;
+
+use ethlambda_blockchain::block_builder::ProposerConfig;
+use ethlambda_blockchain::metrics::BLOCK_PROPOSAL_ATTESTATION_BUILD_PHASES;
+use ethlambda_blockchain::store::{on_block_without_verification, produce_block_with_signatures};
+use ethlambda_storage::NEW_PAYLOAD_CAP;
+use ethlambda_types::block::{MultiMessageAggregate, SignedBlock};
+use ethlambda_types::primitives::HashTreeRoot as _;
+use eyre::WrapErr as _;
+
+use report::{Environment, Params, Report, Sample};
+
+#[derive(Debug, clap::Args)]
+pub(crate) struct BenchmarkOptions {
+    #[command(subcommand)]
+    workload: Workload,
+}
+
+#[derive(Debug, clap::Subcommand)]
+enum Workload {
+    /// Benchmark block building on a synthetic in-memory chain.
+    Synthetic(SyntheticOptions),
+}
+
+#[derive(Debug, clap::Args)]
+struct SyntheticOptions {
+    /// Number of validators in the synthetic genesis.
+    #[arg(long, default_value = "8", value_parser = clap::value_parser!(u64).range(1..=4096))]
+    num_validators: u64,
+    /// Unmeasured chain-advancement slots before measuring. Builds and imports
+    /// one block per slot so the measured builds run on a state with
+    /// representative historical roots and justifications, and warms the state
+    /// cache.
+    #[arg(long, default_value = "8")]
+    warmup_slots: u64,
+    /// Aggregate proofs seeded per AttestationData, mimicking committee
+    /// aggregators covering disjoint validator subsets. The default of 1 (one
+    /// full-coverage proof per data) keeps justification/finalization
+    /// advancing every slot. Higher values exercise multi-proof selection and
+    /// same-data collapse, but without --enable-proposer-aggregation the block
+    /// then carries only the best partial proof (< 2/3 coverage), so
+    /// justification stalls — the real coverage cost of disabling proposer
+    /// aggregation.
+    #[arg(long, default_value = "1", value_parser = clap::value_parser!(u64).range(1..))]
+    proofs_per_data: u64,
+    /// Deterministic seed for the synthetic validator set. Two runs with the
+    /// same seed and parameters produce identical per-iteration block roots.
+    #[arg(long, default_value = "42")]
+    seed: u64,
+    #[command(flatten)]
+    common: CommonOptions,
+}
+
+#[derive(Debug, clap::Args)]
+struct CommonOptions {
+    /// Measured iterations (one built block each), after warmup.
+    #[arg(long, default_value = "10", value_parser = clap::value_parser!(u64).range(1..))]
+    iterations: u64,
+    /// Seed pools with empty placeholder proofs instead of real XMSS/leanVM
+    /// crypto. Measures selection + best-proof compaction + state transition
+    /// only; runs in seconds. Conflicts with --enable-proposer-aggregation,
+    /// whose recursive aggregation needs real proof bytes.
+    #[arg(long, conflicts_with = "enable_proposer_aggregation")]
+    mock_crypto: bool,
+    /// Mirrors the node flag: collapse same-data proofs via recursive leanVM
+    /// aggregation instead of keeping the single best-coverage proof.
+    #[arg(long)]
+    enable_proposer_aggregation: bool,
+    /// Mirrors the node flag: distinct AttestationData cap per built block.
+    #[arg(long, default_value = "3")]
+    max_attestations_per_block: usize,
+    /// Report format printed to stdout. Logs go to stderr, so JSON output can
+    /// be piped directly (e.g. into jq).
+    #[arg(long, value_enum, default_value_t = OutputFormat::Human)]
+    format: OutputFormat,
+    /// Also write the JSON report to this file.
+    #[arg(long)]
+    output: Option<PathBuf>,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, clap::ValueEnum)]
+enum OutputFormat {
+    Human,
+    Json,
+}
+
+pub(crate) fn run(options: BenchmarkOptions) -> eyre::Result<()> {
+    let Workload::Synthetic(synthetic) = options.workload;
+    run_synthetic(synthetic)
+}
+
+fn run_synthetic(options: SyntheticOptions) -> eyre::Result<()> {
+    let common = &options.common;
+    eyre::ensure!(
+        common.mock_crypto,
+        "real-crypto benchmarking is not implemented yet; rerun with --mock-crypto"
+    );
+    // The pending pool evicts whole data-root entries FIFO once its proof cap
+    // is exceeded, so a single slot's batch larger than the cap would silently
+    // seed nothing and every measured block would be empty.
+    eyre::ensure!(
+        options.proofs_per_data as usize <= NEW_PAYLOAD_CAP,
+        "--proofs-per-data {} exceeds the pending-pool capacity ({NEW_PAYLOAD_CAP}); \
+         one slot's batch would be evicted whole and every measured block would be empty",
+        options.proofs_per_data
+    );
+
+    let proposer_config = ProposerConfig {
+        enable_proposer_aggregation: common.enable_proposer_aggregation,
+        max_attestations_per_block: common.max_attestations_per_block,
+    };
+    let corpus = corpus::SyntheticCorpus::new(options.num_validators, options.proofs_per_data);
+    let mut store = corpus.genesis_store(options.seed);
+
+    let total_slots = options
+        .warmup_slots
+        .checked_add(common.iterations)
+        .ok_or_else(|| eyre::eyre!("--warmup-slots plus --iterations overflows u64"))?;
+    let mut samples = Vec::new(); // not preallocated: --iterations is an unbounded u64
+    for slot in 1..=total_slots {
+        // Seed the pending pool with the previous slot's attestations, exactly
+        // where gossip aggregates would sit before the proposal tick promotes
+        // them to the known pool. Entries from earlier slots stay in the known
+        // pool, as they would on a live node.
+        corpus.seed_pool(&mut store, slot - 1);
+        eyre::ensure!(
+            store.new_aggregated_payloads_count() > 0,
+            "seeded attestations were evicted from the pending pool at slot {slot}; \
+             the measured workload would not match the requested parameters"
+        );
+        let pool_entries =
+            store.new_aggregated_payloads_count() + store.known_aggregated_payloads_count();
+
+        // Round-robin proposer, matching `is_proposer`.
+        let proposer = slot % options.num_validators;
+
+        let before = phase_snapshot();
+        let build_start = Instant::now();
+        let (block, aggregates, _checkpoints) =
+            produce_block_with_signatures(&mut store, slot, proposer, proposer_config)
+                .wrap_err_with(|| format!("block build failed at slot {slot}"))?;
+        let wall_seconds = build_start.elapsed().as_secs_f64();
+        let phases = phase_deltas(&before, &phase_snapshot())?;
+
+        let block_root = block.hash_tree_root();
+        let attestations_packed = block.body.attestations.len();
+        let aggregates_count = aggregates.len();
+
+        // Import the built block (outside the measured span) so the next
+        // iteration builds one slot ahead of head, like a live proposer;
+        // building repeatedly on a fixed head would make `process_slots` cost
+        // grow with the iteration index.
+        let signed_block = SignedBlock {
+            message: block,
+            proof: MultiMessageAggregate::default(),
+        };
+        on_block_without_verification(&mut store, signed_block)
+            .wrap_err_with(|| format!("importing the built block failed at slot {slot}"))?;
+
+        let measured = slot > options.warmup_slots;
+        let label = if measured { "measured" } else { "warmup" };
+        eprintln!(
+            "[{slot}/{total_slots}] {label}: built block in {:.3}ms \
+             (attestations={attestations_packed}, pool_entries={pool_entries})",
+            wall_seconds * 1e3,
+        );
+
+        if measured {
+            let overhead_seconds = wall_seconds - phases.values().sum::<f64>();
+            samples.push(Sample {
+                iteration: slot - options.warmup_slots,
+                slot,
+                proposer,
+                block_root: format!("0x{}", hex::encode(block_root.0)),
+                wall_seconds,
+                phases,
+                overhead_seconds,
+                attestations_packed,
+                aggregates: aggregates_count,
+                pool_entries,
+            });
+        }
+    }
+
+    eyre::ensure!(
+        samples.len() as u64 == common.iterations,
+        "collected {} samples but expected {}; the measured-slot accounting drifted",
+        samples.len(),
+        common.iterations
+    );
+
+    let params = Params {
+        mode: "synthetic",
+        mock_crypto: common.mock_crypto,
+        num_validators: options.num_validators,
+        warmup_slots: options.warmup_slots,
+        proofs_per_data: options.proofs_per_data,
+        seed: options.seed,
+        iterations: common.iterations,
+        enable_proposer_aggregation: common.enable_proposer_aggregation,
+        max_attestations_per_block: common.max_attestations_per_block,
+    };
+    let report = Report::new(Environment::collect(), params, samples);
+
+    match common.format {
+        OutputFormat::Human => println!("{}", report.human_table()),
+        OutputFormat::Json => println!("{}", report.to_json()?),
+    }
+    if let Some(path) = &common.output {
+        std::fs::write(path, report.to_json()?)
+            .wrap_err_with(|| format!("failed to write report to {}", path.display()))?;
+        eprintln!("report written to {}", path.display());
+    }
+
+    Ok(())
+}
+
+const PHASE_HISTOGRAM: &str = "lean_block_proposal_attestation_build_phase_seconds";
+
+/// Per-phase (sample_sum, sample_count) snapshot of the block-proposal phase
+/// histogram, read from the default prometheus registry.
+type PhaseSnapshot = HashMap<String, (f64, u64)>;
+
+fn phase_snapshot() -> PhaseSnapshot {
+    ethlambda_metrics::gather()
+        .iter()
+        .filter(|family| family.name() == PHASE_HISTOGRAM)
+        .flat_map(|family| family.get_metric())
+        .filter_map(|metric| {
+            let phase = metric
+                .get_label()
+                .iter()
+                .find(|label| label.name() == "phase")?
+                .value()
+                .to_string();
+            let histogram = metric.get_histogram();
+            Some((
+                phase,
+                (histogram.get_sample_sum(), histogram.get_sample_count()),
+            ))
+        })
+        .collect()
+}
+
+/// Exact per-iteration phase durations from two snapshots around one build.
+///
+/// Histogram sums accumulate the raw f64 seconds of every observation, so the
+/// sum delta IS the build's phase time — bucket boundaries play no role. The
+/// count must advance by exactly 1 per phase (each phase observes once per
+/// `build_block` in this single-threaded process); anything else means the
+/// accounting drifted and attribution would be wrong, so it is a hard error.
+fn phase_deltas(
+    before: &PhaseSnapshot,
+    after: &PhaseSnapshot,
+) -> eyre::Result<BTreeMap<String, f64>> {
+    let mut deltas = BTreeMap::new();
+    for &phase in BLOCK_PROPOSAL_ATTESTATION_BUILD_PHASES {
+        let (sum_before, count_before) = before.get(phase).copied().unwrap_or((0.0, 0));
+        let (sum_after, count_after) = after.get(phase).copied().unwrap_or((0.0, 0));
+        let observations = count_after.saturating_sub(count_before);
+        eyre::ensure!(
+            observations == 1,
+            "phase '{phase}' was observed {observations} times during one build (expected 1); \
+             phase attribution would be wrong"
+        );
+        deltas.insert(phase.to_string(), sum_after - sum_before);
+    }
+    Ok(deltas)
+}
diff --git a/bin/ethlambda/src/benchmark/report.rs b/bin/ethlambda/src/benchmark/report.rs
new file mode 100644
index 00000000..1583f0d9
--- /dev/null
+++ b/bin/ethlambda/src/benchmark/report.rs
@@ -0,0 +1,303 @@
+//! Statistics and report emission for the block-building benchmark.
+//!
+//! Raw per-iteration samples are always included in the JSON report: outliers
+//! are never discarded (XMSS signing and OTS window advancement produce
+//! legitimate heavy tails worth inspecting), and per-iteration block roots let
+//! a baseline-vs-optimized diff prove an optimization changed only speed, not
+//! which attestations get selected.
+
+use std::collections::BTreeMap;
+use std::fmt::Write as _;
+
+use serde::Serialize;
+
+use crate::version;
+
+/// Coefficient-of-variation threshold above which wall-time results are
+/// flagged as too noisy to compare, per the benchmarking workflow standard.
+const CV_WARN_THRESHOLD: f64 = 0.10;
+
+#[derive(Debug, Serialize)]
+pub(crate) struct Sample {
+    pub iteration: u64,
+    pub slot: u64,
+    pub proposer: u64,
+    /// Determinism checksum: same seed + params must reproduce the same roots.
+    pub block_root: String,
+    pub wall_seconds: f64,
+    /// Per-phase seconds from histogram sum deltas.
+    pub phases: BTreeMap<String, f64>,
+    /// Wall time not attributed to any phase: the `produce_block_with_signatures`
+    /// preamble (tick advance, pool promotion, fork-choice head update, pool
+    /// deep-clone, block-roots scan) plus measurement slack.
+    pub overhead_seconds: f64,
+    pub attestations_packed: usize,
+    pub aggregates: usize,
+    /// Pool entries (new + known) visible to this build; reported so pool
+    /// growth across iterations is visible in the samples.
+    pub pool_entries: usize,
+}
+
+#[derive(Debug, Serialize)]
+pub(crate) struct Environment {
+    pub client_version: &'static str,
+    /// Resolved leansig git revision from Cargo.lock. leansig is pinned to a
+    /// moving branch, so results are not comparable across revisions.
+    pub leansig_rev: &'static str,
+    pub os: &'static str,
+    pub arch: &'static str,
+    pub available_parallelism: usize,
+}
+
+impl Environment {
+    pub(crate) fn collect() -> Self {
+        Self {
+            client_version: version::CLIENT_VERSION,
+            leansig_rev: env!("ETHLAMBDA_LEANSIG_REV"),
+            os: std::env::consts::OS,
+            arch: std::env::consts::ARCH,
+            available_parallelism: std::thread::available_parallelism()
+                .map(|n| n.get())
+                .unwrap_or(0),
+        }
+    }
+}
+
+#[derive(Debug, Serialize)]
+pub(crate) struct Params {
+    pub mode: &'static str,
+    pub mock_crypto: bool,
+    pub num_validators: u64,
+    pub warmup_slots: u64,
+    pub proofs_per_data: u64,
+    pub seed: u64,
+    pub iterations: u64,
+    pub enable_proposer_aggregation: bool,
+    pub max_attestations_per_block: usize,
+}
+
+#[derive(Debug, Serialize)]
+pub(crate) struct Stats {
+    pub count: usize,
+    pub min_seconds: f64,
+    pub mean_seconds: f64,
+    pub p50_seconds: f64,
+    pub p90_seconds: f64,
+    pub max_seconds: f64,
+    /// Coefficient of variation (population stddev / mean); 0 unless mean > 0.
+    pub cv: f64,
+}
+
+#[derive(Debug, Serialize)]
+pub(crate) struct Summary {
+    pub phases: BTreeMap<String, Stats>,
+    pub overhead: Stats,
+    pub wall: Stats,
+}
+
+#[derive(Debug, Serialize)]
+pub(crate) struct Report {
+    pub schema_version: u32,
+    pub environment: Environment,
+    pub params: Params,
+    pub samples: Vec<Sample>,
+    pub summary: Summary,
+}
+
+impl Report {
+    pub(crate) fn new(environment: Environment, params: Params, samples: Vec<Sample>) -> Self {
+        let mut phases: BTreeMap<String, Stats> = BTreeMap::new();
+        if let Some(first) = samples.first() {
+            for phase in first.phases.keys() {
+                let values: Vec<f64> = samples
+                    .iter()
+                    .filter_map(|sample| sample.phases.get(phase).copied())
+                    .collect();
+                phases.insert(phase.clone(), stats(&values));
+            }
+        }
+        let overhead = stats(
+            &samples
+                .iter()
+                .map(|sample| sample.overhead_seconds)
+                .collect::<Vec<_>>(),
+        );
+        let wall = stats(
+            &samples
+                .iter()
+                .map(|sample| sample.wall_seconds)
+                .collect::<Vec<_>>(),
+        );
+
+        if wall.cv > CV_WARN_THRESHOLD {
+            eprintln!(
+                "warning: wall-time coefficient of variation is {:.1}% (>{:.0}%); \
+                 results are noisy — check for background load or increase --iterations",
+                wall.cv * 100.0,
+                CV_WARN_THRESHOLD * 100.0
+            );
+        }
+
+        Self {
+            schema_version: 1,
+            environment,
+            params,
+            samples,
+            summary: Summary {
+                phases,
+                overhead,
+                wall,
+            },
+        }
+    }
+
+    pub(crate) fn to_json(&self) -> eyre::Result<String> {
+        serde_json::to_string_pretty(self).map_err(Into::into)
+    }
+
+    pub(crate) fn human_table(&self) -> String {
+        let mut out = String::new();
+        let params = &self.params;
+        let env = &self.environment;
+        let crypto = if params.mock_crypto { "mock" } else { "real" };
+        let _ = writeln!(
+            out,
+            "Block-building benchmark — {} workload ({crypto} crypto)",
+            params.mode
+        );
+        let _ = writeln!(
+            out,
+            "  validators={} warmup_slots={} iterations={} proofs_per_data={} seed={}",
+            params.num_validators,
+            params.warmup_slots,
+            params.iterations,
+            params.proofs_per_data,
+            params.seed
+        );
+        let _ = writeln!(
+            out,
+            "  enable_proposer_aggregation={} max_attestations_per_block={}",
+            params.enable_proposer_aggregation, params.max_attestations_per_block
+        );
+        let _ = writeln!(
+            out,
+            "  {} leansig={} os={} arch={} threads={}",
+            env.client_version, env.leansig_rev, env.os, env.arch, env.available_parallelism
+        );
+        let _ = writeln!(out);
+        let _ = writeln!(
+            out,
+            "  {:<18} {:>5} {:>10} {:>10} {:>10} {:>10} {:>10}",
+            "phase", "count", "min", "mean", "p50", "p90", "max"
+        );
+        for (phase, stats) in &self.summary.phases {
+            let _ = writeln!(out, "{}", stats_row(phase, stats));
+        }
+        let _ = writeln!(out, "{}", stats_row("overhead", &self.summary.overhead));
+        let _ = writeln!(out, "{}", stats_row("wall", &self.summary.wall));
+        out
+    }
+}
+
+fn stats_row(name: &str, stats: &Stats) -> String {
+    format!(
+        "  {:<18} {:>5} {:>10} {:>10} {:>10} {:>10} {:>10}",
+        name,
+        stats.count,
+        format_ms(stats.min_seconds),
+        format_ms(stats.mean_seconds),
+        format_ms(stats.p50_seconds),
+        format_ms(stats.p90_seconds),
+        format_ms(stats.max_seconds),
+    )
+}
+
+fn format_ms(seconds: f64) -> String {
+    format!("{:.3}ms", seconds * 1e3)
+}
+
+fn stats(values: &[f64]) -> Stats {
+    if values.is_empty() {
+        return Stats {
+            count: 0,
+            min_seconds: 0.0,
+            mean_seconds: 0.0,
+            p50_seconds: 0.0,
+            p90_seconds: 0.0,
+            max_seconds: 0.0,
+            cv: 0.0,
+        };
+    }
+    let mut sorted = values.to_vec();
+    sorted.sort_by(|a, b| a.total_cmp(b));
+    let count = sorted.len();
+    let mean = sorted.iter().sum::<f64>() / count as f64;
+    let variance = sorted
+        .iter()
+        .map(|value| (value - mean).powi(2))
+        .sum::<f64>()
+        / count as f64;
+    let cv = if mean > 0.0 {
+        variance.sqrt() / mean
+    } else {
+        0.0
+    };
+    Stats {
+        count,
+        min_seconds: sorted[0],
+        mean_seconds: mean,
+        p50_seconds: percentile(&sorted, 0.50),
+        p90_seconds: percentile(&sorted, 0.90),
+        max_seconds: sorted[count - 1],
+        cv,
+    }
+}
+
+/// Sample at index `round((len - 1) * q)` of a non-empty sorted slice: exact
+/// sample values, no interpolation, so an even count's p50 is the upper median.
+fn percentile(sorted: &[f64], q: f64) -> f64 {
+    let index = ((sorted.len() - 1) as f64 * q).round() as usize;
+    sorted[index]
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn percentile_handles_single_sample() {
+        let sorted = [7.0];
+        assert_eq!(percentile(&sorted, 0.0), 7.0);
+        assert_eq!(percentile(&sorted, 0.5), 7.0);
+        assert_eq!(percentile(&sorted, 1.0), 7.0);
+    }
+
+    #[test]
+    fn percentile_odd_and_even_lengths() {
+        let odd = [1.0, 2.0, 3.0, 4.0, 5.0];
+        assert_eq!(percentile(&odd, 0.5), 3.0);
+        assert_eq!(percentile(&odd, 1.0), 5.0);
+        let even = [1.0, 2.0, 3.0, 4.0];
+        assert_eq!(percentile(&even, 0.5), 3.0);
+        assert_eq!(percentile(&even, 0.0), 1.0);
+    }
+
+    #[test]
+    fn stats_on_known_values() {
+        let stats = stats(&[2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]);
+        assert_eq!(stats.count, 8);
+        assert_eq!(stats.min_seconds, 2.0);
+        assert_eq!(stats.max_seconds, 9.0);
+        assert_eq!(stats.mean_seconds, 5.0);
+        // population stddev of this classic set is 2.0 => cv = 0.4
+        assert!((stats.cv - 0.4).abs() < 1e-12);
+    }
+
+    #[test]
+    fn stats_on_empty_input_is_zeroed() {
+        let stats = stats(&[]);
+        assert_eq!(stats.count, 0);
+        assert_eq!(stats.mean_seconds, 0.0);
+        assert_eq!(stats.cv, 0.0);
+    }
+}
diff --git a/bin/ethlambda/src/cli.rs b/bin/ethlambda/src/cli.rs
index 4bbe1683..9f00c827 100644
--- a/bin/ethlambda/src/cli.rs
+++ b/bin/ethlambda/src/cli.rs
@@ -5,24 +5,46 @@ use std::path::PathBuf;
 
 use crate::version;
 
+// Node options plus optional sub-commands.
+//
+// The seven node-required arguments are declared `Option<T>` with
+// `required = true`: together with `subcommand_negates_reqs`, clap keeps
+// enforcing them (with its native missing-argument errors) for the flat node
+// invocation while letting sub-commands parse without any of them. Plain
+// non-`Option` fields would make sub-command invocations fail during derive
+// extraction even though validation was negated.
+// `args_conflicts_with_subcommands` rejects mixed invocations
+// (e.g. `--genesis x benchmark`) instead of silently ignoring the node flags.
+//
+// NOT a doc comment: clap derive turns struct doc comments into the
+// `long_about` shown by `--help`, and this note is for maintainers, not users.
 #[derive(Debug, clap::Parser)]
-#[command(name = "ethlambda", author = "LambdaClass", version = version::CLIENT_VERSION, about = "ethlambda consensus client")]
+#[command(
+    name = "ethlambda",
+    author = "LambdaClass",
+    version = version::CLIENT_VERSION,
+    about = "ethlambda consensus client",
+    subcommand_negates_reqs = true,
+    args_conflicts_with_subcommands = true
+)]
 pub(crate) struct CliOptions {
+    #[command(subcommand)]
+    pub(crate) command: Option<Command>,
     /// Path to the chain genesis config (e.g., config.yaml).
-    #[arg(long)]
-    pub(crate) genesis: PathBuf,
+    #[arg(long, required = true)]
+    pub(crate) genesis: Option<PathBuf>,
     /// Path to the validator registry (e.g., annotated_validators.yaml).
-    #[arg(long)]
-    pub(crate) validators: PathBuf,
+    #[arg(long, required = true)]
+    pub(crate) validators: Option<PathBuf>,
     /// Path to the bootnode list (e.g., nodes.yaml).
-    #[arg(long)]
-    pub(crate) bootnodes: PathBuf,
+    #[arg(long, required = true)]
+    pub(crate) bootnodes: Option<PathBuf>,
     /// Path to validator-config.yaml (validator name registry for metrics labels).
-    #[arg(long)]
-    pub(crate) validator_config: PathBuf,
+    #[arg(long, required = true)]
+    pub(crate) validator_config: Option<PathBuf>,
     /// Directory containing per-validator XMSS keys (e.g., hash-sig-keys/).
-    #[arg(long)]
-    pub(crate) hash_sig_keys_dir: PathBuf,
+    #[arg(long, required = true)]
+    pub(crate) hash_sig_keys_dir: Option<PathBuf>,
     #[arg(long, default_value = "9000")]
     pub(crate) gossipsub_port: u16,
     #[arg(long, default_value = "127.0.0.1")]
@@ -31,11 +53,11 @@ pub(crate) struct CliOptions {
     pub(crate) api_port: u16,
     #[arg(long, default_value = "5054")]
     pub(crate) metrics_port: u16,
-    #[arg(long)]
-    pub(crate) node_key: PathBuf,
+    #[arg(long, required = true)]
+    pub(crate) node_key: Option<PathBuf>,
     /// The node ID to look up in annotated_validators.yaml (e.g., "ethlambda_0")
-    #[arg(long)]
-    pub(crate) node_id: String,
+    #[arg(long, required = true)]
+    pub(crate) node_id: Option<String>,
     /// Base URL(s) of checkpoint-sync peer API servers (e.g., http://peer:5052).
     /// When set, skips genesis initialization and fetches the finalized state
     /// and block from each peer's `/lean/v0/states/finalized` and
@@ -150,3 +172,118 @@ pub(crate) struct ShadowOptions {
     )]
     pub(crate) shadow_xmss_fake_proof_size: u64,
 }
+
+#[derive(Debug, clap::Subcommand)]
+pub(crate) enum Command {
+    /// Benchmark block building offline against a controlled workload.
+    Benchmark(crate::benchmark::BenchmarkOptions),
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use clap::Parser as _;
+    use clap::error::ErrorKind;
+
+    /// The flat node invocation shape used by lean-quickstart, the Dockerfile,
+    /// and the devnet skills. It must keep parsing unchanged.
+    const FLAT_INVOCATION: &[&str] = &[
+        "ethlambda",
+        "--genesis",
+        "config.yaml",
+        "--validators",
+        "annotated_validators.yaml",
+        "--bootnodes",
+        "nodes.yaml",
+        "--validator-config",
+        "validator-config.yaml",
+        "--hash-sig-keys-dir",
+        "hash-sig-keys/",
+        "--node-key",
+        "node.key",
+        "--node-id",
+        "ethlambda_0",
+        "--gossipsub-port",
+        "9001",
+        "--is-aggregator",
+    ];
+
+    #[test]
+    fn flat_node_invocation_parses_unchanged() {
+        let options = CliOptions::try_parse_from(FLAT_INVOCATION).expect("flat invocation parses");
+        assert!(options.command.is_none());
+        assert_eq!(options.genesis.as_deref(), Some("config.yaml".as_ref()));
+        assert_eq!(options.node_id.as_deref(), Some("ethlambda_0"));
+        assert_eq!(options.gossipsub_port, 9001);
+        assert!(options.is_aggregator);
+    }
+
+    #[test]
+    fn missing_required_node_flag_keeps_clap_error() {
+        let without_genesis: Vec<&str> = FLAT_INVOCATION
+            .iter()
+            .enumerate()
+            .filter(|(i, _)| *i != 1 && *i != 2)
+            .map(|(_, arg)| *arg)
+            .collect();
+        let err = CliOptions::try_parse_from(without_genesis)
+            .expect_err("missing --genesis must still error");
+        assert_eq!(err.kind(), ErrorKind::MissingRequiredArgument);
+    }
+
+    #[test]
+    fn benchmark_subcommand_parses_without_node_args() {
+        let options = CliOptions::try_parse_from([
+            "ethlambda",
+            "benchmark",
+            "synthetic",
+            "--mock-crypto",
+            "--iterations",
+            "3",
+        ])
+        .expect("benchmark subcommand parses without node args");
+        assert!(matches!(options.command, Some(Command::Benchmark(_))));
+        assert!(options.genesis.is_none());
+        assert!(options.node_id.is_none());
+    }
+
+    #[test]
+    fn node_flags_mixed_with_subcommand_are_rejected() {
+        let err = CliOptions::try_parse_from([
+            "ethlambda",
+            "--genesis",
+            "config.yaml",
+            "benchmark",
+            "synthetic",
+        ])
+        .expect_err("mixing node flags with a subcommand must be rejected");
+        assert_eq!(err.kind(), ErrorKind::ArgumentConflict);
+    }
+
+    #[test]
+    fn mock_crypto_conflicts_with_proposer_aggregation() {
+        let err = CliOptions::try_parse_from([
+            "ethlambda",
+            "benchmark",
+            "synthetic",
+            "--mock-crypto",
+            "--enable-proposer-aggregation",
+        ])
+        .expect_err("--mock-crypto cannot drive real leanVM aggregation");
+        assert_eq!(err.kind(), ErrorKind::ArgumentConflict);
+    }
+
+    #[test]
+    fn node_id_value_named_benchmark_is_not_a_subcommand() {
+        let mut args: Vec<&str> = FLAT_INVOCATION.to_vec();
+        let node_id_position = args
+            .iter()
+            .position(|arg| *arg == "ethlambda_0")
+            .expect("node id value present");
+        args[node_id_position] = "benchmark";
+        let options =
+            CliOptions::try_parse_from(args).expect("flag values must not become subcommands");
+        assert!(options.command.is_none());
+        assert_eq!(options.node_id.as_deref(), Some("benchmark"));
+    }
+}
diff --git a/bin/ethlambda/src/main.rs b/bin/ethlambda/src/main.rs
index b933b928..c5d2c64d 100644
--- a/bin/ethlambda/src/main.rs
+++ b/bin/ethlambda/src/main.rs
@@ -1,3 +1,4 @@
+mod benchmark;
 mod checkpoint_sync;
 mod cli;
 mod fd_limit;
@@ -71,15 +72,31 @@ const ASCII_ART: &str = r#"
 #[cfg_attr(not(feature = "shadow-integration"), tokio::main)]
 #[cfg_attr(feature = "shadow-integration", tokio::main(flavor = "current_thread"))]
 async fn main() -> eyre::Result<()> {
-    let filter = EnvFilter::builder()
-        .with_default_directive(tracing::Level::INFO.into())
-        .from_env_lossy();
-    let subscriber = Registry::default().with(tracing_subscriber::fmt::layer().with_filter(filter));
-    tracing::subscriber::set_global_default(subscriber)
-        .wrap_err("failed to set global tracing subscriber")?;
-
     let options = CliOptions::parse();
 
+    // Benchmark mode logs to stderr (default WARN) so the report on stdout
+    // stays pipe-clean; the node path keeps its stdout INFO logging.
+    if options.command.is_some() {
+        let filter = EnvFilter::builder()
+            .with_default_directive(tracing::Level::WARN.into())
+            .from_env_lossy();
+        let subscriber = Registry::default().with(
+            tracing_subscriber::fmt::layer()
+                .with_writer(std::io::stderr)
+                .with_filter(filter),
+        );
+        tracing::subscriber::set_global_default(subscriber)
+            .wrap_err("failed to set global tracing subscriber")?;
+    } else {
+        let filter = EnvFilter::builder()
+            .with_default_directive(tracing::Level::INFO.into())
+            .from_env_lossy();
+        let subscriber =
+            Registry::default().with(tracing_subscriber::fmt::layer().with_filter(filter));
+        tracing::subscriber::set_global_default(subscriber)
+            .wrap_err("failed to set global tracing subscriber")?;
+    }
+
     #[cfg(feature = "shadow-integration")]
     init_shadow_cost(&options.shadow);
 
@@ -88,6 +105,10 @@ async fn main() -> eyre::Result<()> {
     ethlambda_blockchain::metrics::set_node_info("ethlambda", version::CLIENT_VERSION);
     ethlambda_blockchain::metrics::set_node_start_time();
 
+    if let Some(cli::Command::Benchmark(benchmark_options)) = options.command {
+        return benchmark::run(benchmark_options);
+    }
+
     let rpc_config = RpcConfig {
         http_address: options.http_address,
         api_port: options.api_port,
@@ -117,12 +138,19 @@ async fn main() -> eyre::Result<()> {
         return run_test_driver(rpc_config).await;
     }
 
-    let node_p2p_key = read_hex_file_bytes(&options.node_key).wrap_err_with(|| {
-        format!(
-            "failed to load node key from {}",
-            options.node_key.display()
-        )
-    })?;
+    // clap enforces the node-required arguments when no sub-command is given
+    // (`subcommand_negates_reqs` only lifts them for sub-commands, which
+    // returned above), so these unwraps cannot fail on the node path.
+    let config_path = require_arg(options.genesis, "--genesis")?;
+    let validators_path = require_arg(options.validators, "--validators")?;
+    let bootnodes_path = require_arg(options.bootnodes, "--bootnodes")?;
+    let validator_config = require_arg(options.validator_config, "--validator-config")?;
+    let validator_keys_dir = require_arg(options.hash_sig_keys_dir, "--hash-sig-keys-dir")?;
+    let node_key_path = require_arg(options.node_key, "--node-key")?;
+    let node_id = require_arg(options.node_id, "--node-id")?;
+
+    let node_p2p_key = read_hex_file_bytes(&node_key_path)
+        .wrap_err_with(|| format!("failed to load node key from {}", node_key_path.display()))?;
     let p2p_socket = SocketAddr::new(IpAddr::from([0, 0, 0, 0]), options.gossipsub_port);
 
     #[cfg(all(not(target_env = "msvc"), feature = "jemalloc"))]
@@ -130,13 +158,7 @@ async fn main() -> eyre::Result<()> {
     #[cfg(any(target_env = "msvc", not(feature = "jemalloc")))]
     info!("Using system allocator");
 
-    info!(node_key=?options.node_key, "got node key");
-
-    let config_path = options.genesis;
-    let bootnodes_path = options.bootnodes;
-    let validators_path = options.validators;
-    let validator_config = options.validator_config;
-    let validator_keys_dir = options.hash_sig_keys_dir;
+    info!(node_key=?node_key_path, "got node key");
 
     let config_yaml = std::fs::read_to_string(&config_path).wrap_err_with(|| {
         format!(
@@ -180,9 +202,8 @@ async fn main() -> eyre::Result<()> {
 
     let bootnodes = read_bootnodes(&bootnodes_path)?;
 
-    let validator_keys =
-        read_validator_keys(&validators_path, &validator_keys_dir, &options.node_id)
-            .wrap_err("failed to load validator keys")?;
+    let validator_keys = read_validator_keys(&validators_path, &validator_keys_dir, &node_id)
+        .wrap_err("failed to load validator keys")?;
 
     let data_dir =
         std::path::absolute(&options.data_dir).unwrap_or_else(|_| options.data_dir.clone());
@@ -589,6 +610,14 @@ fn read_validator_keys(
     Ok(validator_keys)
 }
 
+/// Unwrap a node-required CLI argument.
+///
+/// clap's `required = true` guarantees presence whenever no sub-command is
+/// given, so a failure here means the CLI definition and the node path drifted.
+fn require_arg<T>(value: Option<T>, flag: &str) -> eyre::Result<T> {
+    value.ok_or_else(|| eyre::eyre!("missing required argument {flag}"))
+}
+
 fn read_hex_file_bytes(path: impl AsRef<Path>) -> eyre::Result<Vec<u8>> {
     let path = path.as_ref();
     let file_content = std::fs::read_to_string(path)
diff --git a/crates/blockchain/src/block_builder.rs b/crates/blockchain/src/block_builder.rs
index 919a7719..b25d8c7c 100644
--- a/crates/blockchain/src/block_builder.rs
+++ b/crates/blockchain/src/block_builder.rs
@@ -737,10 +737,14 @@ fn extend_proofs_greedily(
     }
 
     let mut covered: HashSet<u64> = HashSet::new();
-    let mut remaining_indices: HashSet<usize> = (0..proofs.len()).collect();
+    let mut remaining_indices: Vec<usize> = (0..proofs.len()).collect();
 
     while !remaining_indices.is_empty() {
-        // Pick proof covering the most uncovered validators (count only, no allocation)
+        // Pick proof covering the most uncovered validators (count only, no
+        // allocation). Coverage ties break to the lowest index (pool insertion
+        // order): a HashSet here would let hash-iteration order pick an
+        // arbitrary equal-coverage winner, making the built block's
+        // aggregation bits differ from run to run.
         let best = remaining_indices
             .iter()
             .map(|&idx| {
@@ -750,7 +754,7 @@ fn extend_proofs_greedily(
                     .count();
                 (idx, count)
             })
-            .max_by_key(|&(_, count)| count);
+            .max_by_key(|&(idx, count)| (count, Reverse(idx)));
 
         let Some((best_idx, best_count)) = best else {
             break;
@@ -777,7 +781,7 @@ fn extend_proofs_greedily(
 
         covered.extend(new_covered);
         selected.push((att, proof.clone()));
-        remaining_indices.remove(&best_idx);
+        remaining_indices.retain(|&idx| idx != best_idx);
     }
 }
 
diff --git a/crates/storage/src/lib.rs b/crates/storage/src/lib.rs
index 9b21dc85..6b9a89a0 100644
--- a/crates/storage/src/lib.rs
+++ b/crates/storage/src/lib.rs
@@ -5,4 +5,6 @@ mod state_diff;
 mod store;
 
 pub use api::{ALL_TABLES, StorageBackend, StorageReadView, StorageWriteBatch, Table};
-pub use store::{ForkCheckpoints, GetForkchoiceStoreError, MAX_RESUMABLE_DB_STATE_AGE, Store};
+pub use store::{
+    ForkCheckpoints, GetForkchoiceStoreError, MAX_RESUMABLE_DB_STATE_AGE, NEW_PAYLOAD_CAP, Store,
+};
diff --git a/crates/storage/src/store.rs b/crates/storage/src/store.rs
index c916bfca..f0a181b4 100644
--- a/crates/storage/src/store.rs
+++ b/crates/storage/src/store.rs
@@ -120,7 +120,9 @@ const AGGREGATED_PAYLOAD_CAP: usize = 512;
 
 /// Hard cap for the new (pending) aggregated payload buffer.
 /// Smaller than known since new payloads are drained every interval (~4s).
-const NEW_PAYLOAD_CAP: usize = 64;
+/// Public so pool-seeding callers (the block-building benchmark) can reject
+/// workloads that a single insertion batch would silently evict.
+pub const NEW_PAYLOAD_CAP: usize = 64;
 
 /// Hard cap for the gossip signature buffer (individual signatures, not distinct data_roots).
 /// With 4 validators and 4-second slots, 2048 signatures covers ~512 slots (~34 min).
diff --git a/docs/plans/block-building-benchmark.md b/docs/plans/block-building-benchmark.md
new file mode 100644
index 00000000..9c3ec9b4
--- /dev/null
+++ b/docs/plans/block-building-benchmark.md
@@ -0,0 +1,145 @@
+# Plan: `ethlambda benchmark` — offline block-building benchmark sub-command
+
+## Context
+
+The README roadmap lists **"Optimize block building" (issue #465)** as the top near-term
+priority, but block building is only observable today through Prometheus histograms on a
+live devnet — there is no reproducible, offline way to measure it or to compare an
+optimization against a baseline. This adds an `ethlambda benchmark` sub-command that
+drives the exact production proposer code path against controlled workloads.
+
+Fixed scope decisions: offline harness; synthetic **and** replay-from-datadir workloads;
+real XMSS/leanVM crypto by default with a mock fast mode.
+
+## What gets measured
+
+The proposer pipeline as executed at interval 4, entered through the same functions the
+actor calls:
+
+```
+produce_block_with_signatures (crates/blockchain/src/store.rs:788)   ← already public
+  ├─ preamble: on_tick → interval 0, promote attestations,
+  │            fork-choice head, pool deep-clone      → reported as derived "build_overhead"
+  └─ build_block: select_payloads → compact → stf_simulate
+seal_block (extracted from crates/blockchain/src/lib.rs:504-631, see refactor)
+  └─ sign → wrap_proposer_type1 (leanVM) → merge_type_2 (leanVM)
+```
+
+**Excluded** (same boundary as the node's own `time_block_building` metric): gossip
+publish, slot-alignment sleep, block import.
+
+**Phase capture with zero hot-path changes**: the existing
+`lean_block_proposal_attestation_build_phase_seconds` HistogramVec accumulates exact f64
+sums, observed exactly once per phase per build — the harness deltas per-label sums
+between iterations (prometheus 0.14 exposes `get_sample_sum()`, readable in-process).
+Guards: assert per-phase count advanced by exactly 1, and warn if `wall − Σphases`
+exceeds 2%.
+
+**Statistics**: warmup 3 + 10 iterations (defaults, configurable); min/mean/p50/p90/max +
+CV>10% warning per phase; raw samples always exported; outliers never auto-discarded
+(XMSS rejection-sampling and OTS window advancement produce legitimate tails). Each
+iteration records `block.hash_tree_root()` — diffing root sequences between baseline and
+optimized runs proves an optimization changed only speed, not attestation selection.
+
+## CLI (verified on clap 4.6.1)
+
+Every existing flat invocation (devnet skills, Dockerfile, lean-quickstart) parses
+byte-for-byte unchanged.
+
+- `cli.rs`: add `#[command(subcommand_negates_reqs = true,
+  args_conflicts_with_subcommands = true)]` + `command: Option<Command>`; the 7 required
+  args become `Option<T>` with `#[arg(long, required = true)]` (mandatory — non-Option
+  fields break `ethlambda benchmark` even with `negates_reqs`). Missing-arg error
+  messages for the node path are unchanged; mixed `--genesis x benchmark` is rejected.
+- `main.rs`: early branch right after parse (mirrors the HIVE test-driver early-return);
+  node path unwraps the Options in one helper.
+
+```
+ethlambda benchmark synthetic  --num-validators 8 --warmup-slots 8
+                               --proofs-per-data 1 --seed 42 [--key-cache-dir <dir>]  # cache: M2
+ethlambda benchmark replay     --data-dir <path> --genesis config.yaml [--no-copy]
+                               [--validators … --hash-sig-keys-dir … --node-id …]  # enables seal
+common:  --iterations 10 --mock-crypto --enable-proposer-aggregation
+         --max-attestations-per-block 3 --format human|json --output <path>
+```
+
+Implementation refinements (M1): there is no `--pool-datas` knob — the pool
+accumulates one distinct `AttestationData` per elapsed slot naturally, exactly
+as on a live node, and per-sample `pool_entries` makes the growth visible.
+`--proofs-per-data` defaults to 1 (a single full-coverage aggregate per data,
+what a committee aggregator emits) so justification/finalization advance every
+slot; higher values exercise multi-proof selection but stall justification
+without proposer aggregation — the real coverage cost of that node flag.
+Warmup slots double as chain advancement, so there is no separate warmup-
+iterations knob.
+
+Known pre-existing issue (unrelated): `lean-quickstart/client-cmds/ethlambda-cmd.sh`
+still uses `--custom-network-config-dir`, removed in #321 — needs an upstream fix.
+
+## Harness design (`bin/ethlambda/src/benchmark/{mod,keys,corpus,report}.rs`)
+
+- **Iteration model**: slots advance monotonically, proposer rotates `slot % N` (matches
+  round-robin `is_proposer`); each built block is imported via
+  `on_block_without_verification` so the empty-slot gap stays constant; the pool is
+  re-seeded per iteration in fixed seeded order (insertion order pins proof choice).
+- **Keys**: seeded in-process keygen, cached on disk keyed by (leansig rev, seed, index,
+  role). Minimal-window keygen costs ~1s/key in release (verified empirically; the window
+  floors at 131,072 epochs — ample for thousands of bench slots; the 2^32 lifetime is
+  fixed in the type and unaffected). Arbitrary N, no Docker, no fixture download.
+- **Synthetic corpus**: `State::from_genesis` + `InMemoryBackend`; K warmup blocks; pool
+  = one `AttestationData` per elapsed slot (no `--pool-datas` knob) × `--proofs-per-data` real
+  type-1 proofs via `aggregate_signatures` (built outside the timed span, progress on stderr).
+- **`--mock-crypto`**: empty proofs, forces the `keep_best` path (clap `conflicts_with
+  --enable-proposer-aggregation`, since `compact` invokes the real prover), seal skipped
+  and reported as null-not-zero. Runs in seconds → CI smoke test.
+- **Replay (v1 scope)**: copies the datadir before opening (mandatory — `on_tick`/head
+  updates write Metadata per interval and RocksDB has no read-only mode; `--no-copy`
+  opt-out with a warning). Loads via `Store::from_db_state`, builds at head+1. Pools are
+  in-memory-only and unrecoverable from disk, so v1 replay measures selection + STF +
+  state-root realism on real deep states; supplying the node's key trio additionally
+  enables the seal phases. Type-2 splitting / pool recording = deferred future work.
+- **Report**: human table + `--format json` (stdout pipe-clean, logs to stderr) with
+  `schema_version`, environment (CPU model, cores, OS, ethlambda rev via vergen, leansig
+  lock rev via a small `build.rs` Cargo.lock parse — leansig tracks the moving `devnet4`
+  branch), full params + seed, per-iteration raw samples. One configuration per process
+  invocation (global cumulative histograms, rayon/prover state).
+
+## The one library refactor
+
+Extract `crates/blockchain/src/lib.rs:504-631` (proposer sign → type-1 wrap → pubkey
+resolution → type-2 merge) into `pub fn seal_block(...) -> Result<SignedBlock,
+SealBlockError>` in the blockchain crate; `propose_block` calls it. Justified: the
+benchmark cannot reach these phases otherwise (a bin-side copy would drift), it collapses
+six repeated error-return-with-metric blocks into one `match` (net-negative LOC), and
+adding `sign`/`wrap_proposer_type1`/`merge_type_2` labels to the existing phase histogram
+gives production dashboards the currently-untimed expensive steps issue #465 targets.
+Verbatim move, own commit, devnet smoke before merge. `build_block` stays `pub(crate)`.
+
+## Milestones
+
+| | Deliverable | Files |
+|---|---|---|
+| **M1** — CLI + mock end-to-end | `ethlambda benchmark synthetic --mock-crypto` runs in seconds; table + JSON; flat-invocation compat tests; `make bench`; CI smoke step in the existing Test job. Includes one small library fix found by the determinism gate: `extend_proofs_greedily` kept its candidate set in a `HashSet`, so equal-coverage proof ties were broken by randomized hash order and block contents differed run to run — ties now break to the lowest pool index | `cli.rs`, `main.rs`, `benchmark/{mod,corpus,report}.rs`, `build.rs` (leansig rev), `Makefile`, `ci.yml`, `block_builder.rs` (tie-break) |
+| **M2** — real crypto | `seal_block` extraction (first commit) + 3 new phase labels; seeded keygen + cache; real type-1 pools; all 7 phases measured; first baseline JSON recorded | `crates/blockchain/src/{seal.rs,lib.rs,metrics.rs}`, `benchmark/keys.rs`, `types/src/signature.rs` (keygen wrapper) |
+| **M3** — replay + docs | replay mode against a devnet-runner datadir; `docs/benchmarking.md` + `SUMMARY.md` + README roadmap line | `benchmark/corpus.rs`, docs |
+
+One PR per milestone; `make fmt/lint/test` before each; M2 additionally gated by a devnet
+smoke via `test-branch.sh`.
+
+## Verification
+
+- clap `try_parse_from` tests: flat invocation parses, missing-arg errors preserved,
+  `benchmark` parses without node args, mixed invocation rejected.
+- Determinism: two same-seed runs produce identical per-iteration block-root sequences.
+- Accounting: Σphases ≥ 98% of wall per iteration, per-phase count deltas == 1.
+- CI mock smoke: `benchmark synthetic --mock-crypto --num-validators 4 --warmup-slots 4
+  --iterations 3 --format json | jq -e '.schema_version == 1 and (.samples | length == 3)'`.
+
+## Main risks
+
+- Real-mode setup cost: iterations × pool proofs of leanVM proving → default real run
+  takes minutes (mitigated: mock mode, small defaults, ETA logging, key cache).
+- `seal_block` extraction touches consensus-critical `propose_block` — verbatim
+  extraction, careful review of the six error branches, devnet smoke.
+- Cross-run comparability: rayon-parallel proving is machine/load-sensitive and leansig
+  is a moving branch — the env block in every report is the guard, not a fix.