From 9ba3cb4969d960987aa97da54beac08cb843e6d8 Mon Sep 17 00:00:00 2001 From: SimJeg Date: Mon, 15 Dec 2025 08:17:12 +0100 Subject: [PATCH 1/3] Rename to query-aware Signed-off-by: SimJeg --- evaluation/evaluate.py | 16 ++++++++-------- evaluation/evaluate_config.yaml | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py index 62b08612..832c9ccc 100644 --- a/evaluation/evaluate.py +++ b/evaluation/evaluate.py @@ -50,7 +50,7 @@ class EvaluationConfig: fraction: float = 1.0 max_new_tokens: Optional[int] = None max_context_length: Optional[int] = None - compress_questions: bool = False + query_aware: bool = False needle_depth: Optional[int] = None # Decoding parameters @@ -136,8 +136,8 @@ def get_results_dir(self, output_dir: Path) -> Path: components.append(f"fraction{self.fraction:.3f}") if self.max_context_length is not None: components.append(f"max_context{self.max_context_length}") - if self.compress_questions: - components.append("compressed_questions") + if self.query_aware: + components.append("query_aware") if self.key_channel_compression_ratio is not None: components.append(f"key_channel_cr{self.key_channel_compression_ratio:.2f}") if self.needle_depth is not None and self.dataset == "needle_in_haystack": @@ -323,17 +323,17 @@ def _load_and_prepare_dataset(self): ) if isinstance(self.press, FinchPress): - if not self.config.compress_questions: - logger.error("FinchPress requires 'compress_questions' to be set to True.") - raise ValueError("FinchPress requires compress_questions to be set to True") + if not self.config.query_aware: + logger.error("FinchPress requires 'query_aware' to be set to True.") + raise ValueError("FinchPress requires query_aware to be set to True") # FinchPress uses a delimiter token to separate context and question # So we need to update the tokenizer and the model embeddings. 
logger.info("FinchPress detected, updating model and tokenizer with delimiter token.") self.press.update_model_and_tokenizer(self.pipeline.model, self.pipeline.tokenizer) # type: ignore[attr-defined] df["context"] = df["context"] + self.press.delimiter_token # type: ignore[attr-defined, index] - if self.config.compress_questions: - logger.info("Compressing questions into context.") + if self.config.query_aware: + logger.info("Query-aware compression: including question in context for compression.") df["context"] = df["context"] + df["question"] # type: ignore[index] df["question"] = "" # type: ignore[index] diff --git a/evaluation/evaluate_config.yaml b/evaluation/evaluate_config.yaml index 04a01115..60980257 100644 --- a/evaluation/evaluate_config.yaml +++ b/evaluation/evaluate_config.yaml @@ -14,7 +14,7 @@ key_channel_compression_ratio: null # For ThinKPress and Composed fraction: 1.0 # Fraction of dataset to evaluate (0.0 to 1.0), for quick testing max_new_tokens: null # Maximum new tokens to generate (null = use dataset default) max_context_length: null # Maximum context length (null = use model maximum) -compress_questions: false # Whether to compress questions with context +query_aware: false # Whether to include question in context for query-aware compression needle_depth: null # Depth (int or list of ints) percentage of the needle in the haystack (0 to 100), only for needle_in_haystack dataset device: null # Device to use (null = auto-detect, "cuda:0", "cpu", etc.) 
From e3e1159fc7c6300b3bea6cbc124f1096774dc14d Mon Sep 17 00:00:00 2001 From: SimJeg Date: Mon, 15 Dec 2025 08:23:18 +0100 Subject: [PATCH 2/3] Update README Signed-off-by: SimJeg --- evaluation/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/evaluation/README.md b/evaluation/README.md index 971fecd7..d91b5aa4 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -32,6 +32,8 @@ python evaluate.py --config_file Customize your evaluation by editing `evaluate_config.yaml`. This allows you to flexibly configure a variety of settings, like the `fraction` of dataset to use (for quick testing) and the model arguments (e.g. for scaling RoPE). For complete parameter details, see the `evaluation_config.yaml` +💡 Set `query_aware: true` to include the question in the context during compression. This enables query-aware compression as used in methods like SnapKV and FinchPress. + ### Available Presses and Datasets We support evaluation with all the presses implemented in the library (and possible combinations). @@ -84,4 +86,4 @@ Further methods could be explored: - Move beyond pruning, as this method is fundamentally limited (see last figure in [this notebook](../notebooks/expected_attention.ipynb)) - Fine-tuning LLMs to deal with compressed KV caches -We encourage contributions to explore these ideas and improve the performance of long-context LLMs with compressed caches. We provide benchmark results from 7 presses and 3 models. We include a variant of SnapKV where we include the question in the compression process as in the original paper (snapkv w/ question). All performance curves can be found in the [assets](assets) directory, and predictions are available [here](https://drive.google.com/drive/folders/14BilGw07v8tOUUct-5nDhQlN3zIX9BUf?usp=drive_link). \ No newline at end of file +We encourage contributions to explore these ideas and improve the performance of long-context LLMs with compressed caches. 
We provide benchmark results from 7 presses and 3 models. We include a variant of SnapKV with query-aware compression as in the original paper (snapkv w/ query_aware). All performance curves can be found in the [assets](assets) directory, and predictions are available [here](https://drive.google.com/drive/folders/14BilGw07v8tOUUct-5nDhQlN3zIX9BUf?usp=drive_link). \ No newline at end of file From 994808a8c39e81f171412c4a2eb31d3d64d32c24 Mon Sep 17 00:00:00 2001 From: SimJeg Date: Mon, 15 Dec 2025 08:23:27 +0100 Subject: [PATCH 3/3] Update README Signed-off-by: SimJeg --- evaluation/README.md | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index d91b5aa4..b16a8369 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -74,16 +74,4 @@ Where: Use the provided `evaluate.sh` script to run multiple presses simultaneously across different GPUs with varying compression ratios. ### Leaderboard 🥇 -After evaluating your model, you can easily submit it to the [KVPress Leaderboard](https://huggingface.co/spaces/nvidia/kvpress-leaderboard) on Hugging Face! Just copy the output directory in the huggingface space, and your method will soon be displayed in the leaderboard. - -### Discussion -The methods benchmarked so far are not able to efficiently compress the KV cache while maintaining performance on several long-context datasets and models. -In particular, exact information retrieval tasks such as kv-retrieval are challenging for the current methods. 
-Further methods could be explored: -- {Layer,Head}-wise pruning: pruning with a different compression ratio for each layer or head as in [DMC](https://arxiv.org/abs/2403.09636), [FastGen](https://arxiv.org/abs/2310.01801) or [DuoAttention](https://arxiv.org/abs/2410.10819) -- Adaptive pruning: pruning based on a score, and not a uniform fixed ratio -- Taking into account inter-layer dependencies such as in [PyramidKV](https://arxiv.org/abs/2406.02069) -- Move beyond pruning, as this method is fundamentally limited (see last figure in [this notebook](../notebooks/expected_attention.ipynb)) -- Fine-tuning LLMs to deal with compressed KV caches - -We encourage contributions to explore these ideas and improve the performance of long-context LLMs with compressed caches. We provide benchmark results from 7 presses and 3 models. We include a variant of SnapKV with query-aware compression as in the original paper (snapkv w/ query_aware). All performance curves can be found in the [assets](assets) directory, and predictions are available [here](https://drive.google.com/drive/folders/14BilGw07v8tOUUct-5nDhQlN3zIX9BUf?usp=drive_link). \ No newline at end of file +After evaluating your model, you can easily submit it to the [KVPress Leaderboard](https://huggingface.co/spaces/nvidia/kvpress-leaderboard) on Hugging Face! Just copy the output directory to the Hugging Face Space, and your method will soon be displayed on the leaderboard. \ No newline at end of file