From 9ba3cb4969d960987aa97da54beac08cb843e6d8 Mon Sep 17 00:00:00 2001
From: SimJeg <sjegou@nvidia.com>
Date: Mon, 15 Dec 2025 08:17:12 +0100
Subject: [PATCH 1/3] Rename compress_questions to query_aware

Signed-off-by: SimJeg <sjegou@nvidia.com>
---
 evaluation/evaluate.py          | 16 ++++++++--------
 evaluation/evaluate_config.yaml |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py
index 62b08612..832c9ccc 100644
--- a/evaluation/evaluate.py
+++ b/evaluation/evaluate.py
@@ -50,7 +50,7 @@ class EvaluationConfig:
     fraction: float = 1.0
     max_new_tokens: Optional[int] = None
     max_context_length: Optional[int] = None
-    compress_questions: bool = False
+    query_aware: bool = False
     needle_depth: Optional[int] = None
 
     # Decoding parameters
@@ -136,8 +136,8 @@ def get_results_dir(self, output_dir: Path) -> Path:
             components.append(f"fraction{self.fraction:.3f}")
         if self.max_context_length is not None:
             components.append(f"max_context{self.max_context_length}")
-        if self.compress_questions:
-            components.append("compressed_questions")
+        if self.query_aware:
+            components.append("query_aware")
         if self.key_channel_compression_ratio is not None:
             components.append(f"key_channel_cr{self.key_channel_compression_ratio:.2f}")
         if self.needle_depth is not None and self.dataset == "needle_in_haystack":
@@ -323,17 +323,17 @@ def _load_and_prepare_dataset(self):
             )
 
         if isinstance(self.press, FinchPress):
-            if not self.config.compress_questions:
-                logger.error("FinchPress requires 'compress_questions' to be set to True.")
-                raise ValueError("FinchPress requires compress_questions to be set to True")
+            if not self.config.query_aware:
+                logger.error("FinchPress requires 'query_aware' to be set to True.")
+                raise ValueError("FinchPress requires query_aware to be set to True")
             # FinchPress uses a delimiter token to separate context and question
             # So we need to update the tokenizer and the model embeddings.
             logger.info("FinchPress detected, updating model and tokenizer with delimiter token.")
             self.press.update_model_and_tokenizer(self.pipeline.model, self.pipeline.tokenizer)  # type: ignore[attr-defined]
             df["context"] = df["context"] + self.press.delimiter_token  # type: ignore[attr-defined, index]
 
-        if self.config.compress_questions:
-            logger.info("Compressing questions into context.")
+        if self.config.query_aware:
+            logger.info("Query-aware compression: including question in context for compression.")
             df["context"] = df["context"] + df["question"]  # type: ignore[index]
             df["question"] = ""  # type: ignore[index]
 
diff --git a/evaluation/evaluate_config.yaml b/evaluation/evaluate_config.yaml
index 04a01115..60980257 100644
--- a/evaluation/evaluate_config.yaml
+++ b/evaluation/evaluate_config.yaml
@@ -14,7 +14,7 @@ key_channel_compression_ratio: null                # For ThinKPress and Composed
 fraction: 1.0                                     # Fraction of dataset to evaluate (0.0 to 1.0), for quick testing
 max_new_tokens: null                              # Maximum new tokens to generate (null = use dataset default)
 max_context_length: null                          # Maximum context length (null = use model maximum)
-compress_questions: false                         # Whether to compress questions with context
+query_aware: false                                # Whether to include the question in the context for query-aware compression
 needle_depth: null                                # Depth (int or list of ints) percentage of the needle in the haystack (0 to 100), only for needle_in_haystack dataset
 
 device: null  # Device to use (null = auto-detect, "cuda:0", "cpu", etc.)

From e3e1159fc7c6300b3bea6cbc124f1096774dc14d Mon Sep 17 00:00:00 2001
From: SimJeg <sjegou@nvidia.com>
Date: Mon, 15 Dec 2025 08:23:18 +0100
Subject: [PATCH 2/3] Document query_aware option in README

Signed-off-by: SimJeg <sjegou@nvidia.com>
---
 evaluation/README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/evaluation/README.md b/evaluation/README.md
index 971fecd7..d91b5aa4 100644
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -32,6 +32,8 @@ python evaluate.py --config_file <your_config.yaml>
 
 Customize your evaluation by editing `evaluate_config.yaml`. This allows you to flexibly configure a variety of settings, like the `fraction` of dataset to use (for quick testing) and the model arguments (e.g. for scaling RoPE). For complete parameter details, see the `evaluation_config.yaml`
 
+💡 Set `query_aware: true` to include the question in the context during compression. This enables query-aware compression as used in methods like SnapKV and FinchPress.
+
 
 ### Available Presses and Datasets 
 We support evaluation with all the presses implemented in the library (and possible combinations). 
@@ -84,4 +86,4 @@ Further methods could be explored:
 - Move beyond pruning, as this method is fundamentally limited (see last figure in [this notebook](../notebooks/expected_attention.ipynb))
 - Fine-tuning LLMs to deal with compressed KV caches
 
-We encourage contributions to explore these ideas and improve the performance of long-context LLMs with compressed caches. We provide benchmark results from 7 presses and 3 models. We include a variant of SnapKV where we include the question in the compression process as in the original paper (snapkv w/ question). All performance curves can be found in the [assets](assets) directory, and predictions are available [here](https://drive.google.com/drive/folders/14BilGw07v8tOUUct-5nDhQlN3zIX9BUf?usp=drive_link).
\ No newline at end of file
+We encourage contributions to explore these ideas and improve the performance of long-context LLMs with compressed caches. We provide benchmark results from 7 presses and 3 models. We include a variant of SnapKV with query-aware compression as in the original paper (snapkv w/ query_aware). All performance curves can be found in the [assets](assets) directory, and predictions are available [here](https://drive.google.com/drive/folders/14BilGw07v8tOUUct-5nDhQlN3zIX9BUf?usp=drive_link).
\ No newline at end of file

From 994808a8c39e81f171412c4a2eb31d3d64d32c24 Mon Sep 17 00:00:00 2001
From: SimJeg <sjegou@nvidia.com>
Date: Mon, 15 Dec 2025 08:23:27 +0100
Subject: [PATCH 3/3] Remove Discussion section from README

Signed-off-by: SimJeg <sjegou@nvidia.com>
---
 evaluation/README.md | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/evaluation/README.md b/evaluation/README.md
index d91b5aa4..b16a8369 100644
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -74,16 +74,4 @@ Where:
 Use the provided `evaluate.sh` script to run multiple presses simultaneously across different GPUs with varying compression ratios.
 
 ### Leaderboard 🥇
-After evaluating your model, you can easily submit it to the [KVPress Leaderboard](https://huggingface.co/spaces/nvidia/kvpress-leaderboard) on Hugging Face! Just copy the output directory in the huggingface space, and your method will soon be displayed in the leaderboard.
-
-### Discussion
-The methods benchmarked so far are not able to efficiently compress the KV cache while maintaining performance on several long-context datasets and models.
-In particular, exact information retrieval tasks such as kv-retrieval are challenging for the current methods.
-Further methods could be explored:
-- {Layer,Head}-wise pruning: pruning with a different compression ratio for each layer or head as in [DMC](https://arxiv.org/abs/2403.09636), [FastGen](https://arxiv.org/abs/2310.01801) or [DuoAttention](https://arxiv.org/abs/2410.10819)
-- Adaptive pruning: pruning based on a score, and not a uniform fixed ratio
-- Taking into account inter-layer dependencies such as in [PyramidKV](https://arxiv.org/abs/2406.02069)
-- Move beyond pruning, as this method is fundamentally limited (see last figure in [this notebook](../notebooks/expected_attention.ipynb))
-- Fine-tuning LLMs to deal with compressed KV caches
-
-We encourage contributions to explore these ideas and improve the performance of long-context LLMs with compressed caches. We provide benchmark results from 7 presses and 3 models. We include a variant of SnapKV with query-aware compression as in the original paper (snapkv w/ query_aware). All performance curves can be found in the [assets](assets) directory, and predictions are available [here](https://drive.google.com/drive/folders/14BilGw07v8tOUUct-5nDhQlN3zIX9BUf?usp=drive_link).
\ No newline at end of file
+After evaluating your model, you can easily submit it to the [KVPress Leaderboard](https://huggingface.co/spaces/nvidia/kvpress-leaderboard) on Hugging Face! Just copy the output directory in the huggingface space, and your method will soon be displayed in the leaderboard.
\ No newline at end of file