NVIDIA · maxjeblick · Oct 9, 2025 · Aug 27, 2025 · Aug 27, 2025 · Aug 27, 2025
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -4,10 +4,13 @@ Description of your PR. Fixes # (issue) (if applicable)
 
 ## Checklist
 
-- Tests are working (`make test`)
-- Code is formatted correctly (`make style`, on errors try fix with `make format`)
-- Copyright header is included
+Before submitting a PR, please make sure:
+
+- [ ] Tests are working (`make test`)
+- [ ] Code is formatted correctly (`make style`; if it reports errors, try fixing them with `make format`)
+- [ ] Copyright header is included
 - [ ] All commits are signed-off  using `git commit -s`
+
 - [ ] (new press) `mypress_press.py` is in the `presses` directory
 - [ ] (new press) `MyPress` is in `__init__.py` 
 - [ ] (new press) `README.md` is updated with a 1 liner about the new press in the Available presses section

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -16,6 +16,14 @@ jobs:
         with:
           python-version: 3.10.11
 
+      - name: Setup CUDA
+        uses: Jimver/cuda-toolkit@v0.2.16
+        with:
+          cuda: '12.5.0'
+
+      - name: Set CUDA_HOME
+        run: echo "CUDA_HOME=/usr/local/cuda" >> $GITHUB_ENV
+
       - name: Install uv
         uses: astral-sh/setup-uv@v6
         with:
@@ -25,3 +33,5 @@ jobs:
         run: uv sync --all-groups
 
       - run: make test
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
diff --git a/Makefile b/Makefile
@@ -41,9 +41,30 @@ reports:
 
+# Run the test suite and save the verbose output to reports/pytest_output.log.
+# The pipe into `tee` hides pytest's exit status, so the log is scanned afterwards
+# and the target fails if any test was skipped, failed, or ended in an error.
 .PHONY: test
 test: reports
+	$(UV) pip install optimum-quanto
+	$(UV) pip install flash-attn
 	PYTHONPATH=. \
 	$(UV) run pytest \
 		--cov-report xml:reports/coverage.xml \
 		--cov=kvpress/ \
 		--junitxml=./reports/junit.xml \
-		tests/
+		-v \
+		tests/ | tee reports/pytest_output.log
+	@if grep -q "SKIPPED" reports/pytest_output.log; then \
+		echo "Error: Tests were skipped. All tests must run."; \
+		grep "SKIPPED" reports/pytest_output.log; \
+		exit 1; \
+	fi
+	@if grep -q "FAILED" reports/pytest_output.log; then \
+		echo "Error: Some tests failed."; \
+		grep "FAILED" reports/pytest_output.log; \
+		exit 1; \
+	fi
+	@if grep -q "ERROR" reports/pytest_output.log; then \
+		echo "Error: Some tests errored during setup, teardown or collection."; \
+		grep "ERROR" reports/pytest_output.log; \
+		exit 1; \
+	fi
diff --git a/pyproject.toml b/pyproject.toml
@@ -91,4 +91,4 @@ disable_error_code = ["attr-defined"]
 
 [[tool.mypy.overrides]]
 module = "kvpress.pipeline"
-disable_error_code = ["attr-defined", "assignment", "override"]
+disable_error_code = ["attr-defined", "assignment", "override"]
diff --git a/tests/fixtures.py b/tests/fixtures.py
@@ -7,29 +7,37 @@
 from transformers import AutoModelForCausalLM, pipeline
 
 
+def get_device():
+    """Helper function that returns the appropriate device (GPU if available, otherwise CPU)"""
+    return "cuda:0" if torch.cuda.is_available() else "cpu"
+
+
 @pytest.fixture(scope="session")
 def unit_test_model():
-    return AutoModelForCausalLM.from_pretrained("MaxJeblick/llama2-0b-unit-test").eval()
+    model = AutoModelForCausalLM.from_pretrained("MaxJeblick/llama2-0b-unit-test").eval()
+    return model.to(get_device())
 
 
 @pytest.fixture(scope="session")
 def unit_test_model_output_attention():
-    return AutoModelForCausalLM.from_pretrained(
+    model = AutoModelForCausalLM.from_pretrained(
         "MaxJeblick/llama2-0b-unit-test", attn_implementation="eager", output_attentions=True
     ).eval()
+    return model.to(get_device())
 
 
 @pytest.fixture(scope="session")
 def danube_500m_model():
-    return AutoModelForCausalLM.from_pretrained("h2oai/h2o-danube3-500m-chat").eval()
+    model = AutoModelForCausalLM.from_pretrained("h2oai/h2o-danube3-500m-chat").eval()
+    return model.to(get_device())
 
 
 @pytest.fixture(scope="session")
 def kv_press_unit_test_pipeline():
     return pipeline(
         "kv-press-text-generation",
         model="maxjeblick/llama2-0b-unit-test",
-        device=0 if torch.cuda.is_available() else -1,
+        device=get_device(),
     )
 
 
@@ -38,11 +46,46 @@ def kv_press_danube_pipeline():
     return pipeline(
         "kv-press-text-generation",
         model="h2oai/h2o-danube3-500m-chat",
-        device=0 if torch.cuda.is_available() else -1,
+        device=get_device(),
     )
 
 
 @pytest.fixture(scope="session")
+def kv_press_adaptive_pipeline():
+    """Flexible pipeline that uses GPU+flash attention if available, otherwise CPU"""
+    device = get_device()
+    ckpt = "meta-llama/Llama-3.2-1B-Instruct"
+
+    # Use flash attention only if GPU is available
+    model_kwargs = {}
+    if torch.cuda.is_available():
+        model_kwargs["attn_implementation"] = "flash_attention_2"
+
+    pipe = pipeline(
+        "kv-press-text-generation",
+        model=ckpt,
+        device=device,
+        torch_dtype="auto",
+        model_kwargs=model_kwargs,
+    )
+    return pipe
+
+
+@pytest.fixture(scope="class")
+def kv_press_llama3_1_flash_attn_pipeline():
+    device = "cuda:0"
+    ckpt = "meta-llama/Llama-3.1-8B-Instruct"
+    attn_implementation = "flash_attention_2"
+    pipe = pipeline(
+        "kv-press-text-generation",
+        model=ckpt,
+        device=device,
+        model_kwargs={"attn_implementation": attn_implementation, "torch_dtype": torch.bfloat16},
+    )
+    return pipe
+
+
+@pytest.fixture(scope="class")
 def kv_press_llama3_2_flash_attn_pipeline():
     device = "cuda:0"
     ckpt = "meta-llama/Llama-3.2-1B-Instruct"
@@ -56,10 +99,10 @@ def kv_press_llama3_2_flash_attn_pipeline():
     return pipe
 
 
-@pytest.fixture(scope="session")
-def kv_press_llama3_1_flash_attn_pipeline():
+@pytest.fixture(scope="class")
+def kv_press_qwen3_flash_attn_pipeline():
     device = "cuda:0"
-    ckpt = "meta-llama/Llama-3.1-8B-Instruct"
+    ckpt = "Qwen/Qwen3-4B-Instruct-2507"
     attn_implementation = "flash_attention_2"
     pipe = pipeline(
         "kv-press-text-generation",

diff --git a/tests/integration/test_ruler.py b/tests/integration/test_ruler.py
@@ -6,9 +6,9 @@
 import torch
 from transformers import DynamicCache, QuantoQuantizedCache
 from transformers.utils import is_flash_attn_2_available, is_optimum_quanto_available
-
+from kvpress import QFilterPress
 from tests.default_presses import default_presses
-from tests.fixtures import kv_press_llama3_1_flash_attn_pipeline  # noqa: F401
+from tests.fixtures import kv_press_llama3_2_flash_attn_pipeline, kv_press_qwen3_flash_attn_pipeline  # noqa: F401
 
 
 @pytest.fixture(scope="session")
@@ -18,40 +18,94 @@ def df_ruler():
     return df
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU is not available")
-@pytest.mark.skipif(not is_flash_attn_2_available(), reason="flash_attn is not installed")
-@pytest.mark.parametrize("press_dict", default_presses)
-@pytest.mark.parametrize("cache", ["dynamic", "quantized"])
-@pytest.mark.parametrize("compression_ratio", [0, 0.1])
-def test_ruler_is_correct(
-    kv_press_llama3_1_flash_attn_pipeline, df_ruler, press_dict, cache, compression_ratio  # noqa: F811
-):
-    cls = press_dict["cls"]
-    kwargs = press_dict["kwargs"][0]
-    press = cls(**kwargs)
-    if not hasattr(cls, "compression_ratio"):
-        pytest.skip(reason="Press does not support compression_ratio")
-    try:
-        # set compression ratio to a small value for testing
-        # we don't want to max out compression, but rather test if cache compression works
-        press.compression_ratio = compression_ratio
-    except AttributeError:
-        # pytest.skip(reason="Press does not support setting compression_ratio")
-        pass
-
-    if cache == "dynamic":
-        cache = DynamicCache()
-    elif cache == "quantized" and is_optimum_quanto_available():
-        cache = QuantoQuantizedCache(config=kv_press_llama3_1_flash_attn_pipeline.model.config, nbits=4)
-    elif cache == "quantized" and not is_optimum_quanto_available():
-        pytest.skip("Quanto is not installed")
-    else:
-        raise ValueError(f"Unknown cache type: {cache}")
-
-    idx = 0
-    context = df_ruler.iloc[idx]["context"]
-    question = df_ruler.iloc[idx]["question"]
-    true_answer = df_ruler.iloc[idx]["answer"][0]
-
-    pred_answer = kv_press_llama3_1_flash_attn_pipeline(context, question=question, press=press, cache=cache)["answer"]
-    assert true_answer in pred_answer
+class TestRuler:
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU is not available")
+    @pytest.mark.skipif(not is_flash_attn_2_available(), reason="flash_attn is not installed")
+    @pytest.mark.parametrize("press_dict", default_presses)
+    @pytest.mark.parametrize("cache", ["dynamic", "quantized"])
+    @pytest.mark.parametrize("compression_ratio", [0, 0.1])
+    def test_ruler_is_correct(
+        self, kv_press_qwen3_flash_attn_pipeline, df_ruler, press_dict, cache, compression_ratio  # noqa: F811
+    ):
+        cls = press_dict["cls"]
+        kwargs = press_dict["kwargs"][0]
+        press = cls(**kwargs)
+        if not hasattr(cls, "compression_ratio"):
+            pytest.skip(reason="Press does not support compression_ratio")
+        try:
+            # set compression ratio to a small value for testing
+            # we don't want to max out compression, but rather test if cache compression works
+            press.compression_ratio = compression_ratio
+        except AttributeError:
+            # pytest.skip(reason="Press does not support setting compression_ratio")
+            pass
+
+        if cache == "dynamic":
+            cache = DynamicCache()
+        elif cache == "quantized" and is_optimum_quanto_available():
+            cache = QuantoQuantizedCache(config=kv_press_qwen3_flash_attn_pipeline.model.config, nbits=4)
+        elif cache == "quantized" and not is_optimum_quanto_available():
+            pytest.skip("Quanto is not installed")
+        else:
+            raise ValueError(f"Unknown cache type: {cache}")
+
+        idx = 6  # sample 6 is answered correctly by the Qwen3 model for all configurations
+        context = df_ruler.iloc[idx]["context"]
+        question = df_ruler.iloc[idx]["question"]
+        true_answer = df_ruler.iloc[idx]["answer"][0]
+
+        if isinstance(press, QFilterPress):
+            # QFilterPress doesn't support Qwen3 4B; it is covered by TestRulerForQFilter instead.
+            return
+        else:
+            pred_answer = kv_press_qwen3_flash_attn_pipeline(
+                context,
+                question=question,
+                press=press,
+                cache=cache
+            )["answer"]
+        assert true_answer in pred_answer
+
+
+class TestRulerForQFilter:
+    """RULER smoke test for QFilterPress, run with the Llama 3.2 flash-attention pipeline fixture."""
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU is not available")
+    @pytest.mark.skipif(not is_flash_attn_2_available(), reason="flash_attn is not installed")
+    @pytest.mark.parametrize("cache", ["dynamic", "quantized"])
+    @pytest.mark.parametrize("compression_ratio", [0, 0.1])
+    def test_ruler_is_correct_for_qfilter(
+        self, kv_press_llama3_2_flash_attn_pipeline, df_ruler, cache, compression_ratio  # noqa: F811
+    ):
+        """Check that the first RULER sample is still answered correctly with QFilterPress.
+
+        The compression ratio is kept small on purpose: the goal is to verify that
+        cache compression works end to end, not to max out compression.
+        """
+        # Build the press with the ratio under test instead of creating it with a
+        # placeholder value and overwriting the attribute afterwards.
+        press = QFilterPress(compression_ratio=compression_ratio)
+
+        # Replace the parametrized cache name with an actual cache object.
+        if cache == "dynamic":
+            cache = DynamicCache()
+        elif cache == "quantized" and is_optimum_quanto_available():
+            cache = QuantoQuantizedCache(config=kv_press_llama3_2_flash_attn_pipeline.model.config, nbits=4)
+        elif cache == "quantized" and not is_optimum_quanto_available():
+            pytest.skip("Quanto is not installed")
+        else:
+            raise ValueError(f"Unknown cache type: {cache}")
+
+        idx = 0
+        context = df_ruler.iloc[idx]["context"]
+        question = df_ruler.iloc[idx]["question"]
+        true_answer = df_ruler.iloc[idx]["answer"][0]
+
+        pred_answer = kv_press_llama3_2_flash_attn_pipeline(
+            context,
+            question=question,
+            press=press,
+            cache=cache
+        )["answer"]
+        # The reference answer only has to appear somewhere in the generated text.
+        assert true_answer in pred_answer
diff --git a/tests/presses/test_block_press.py b/tests/presses/test_block_press.py
@@ -33,7 +33,7 @@ def test_block_press_is_streaming_top_k(unit_test_model):  # noqa: F811
     """
     press = HiddenStatesPress(compression_ratio=0.5)
     generator = torch.Generator().manual_seed(0)
-    input_ids = torch.randint(0, 1024, (1, 256), generator=generator)
+    input_ids = torch.randint(0, 1024, (1, 256), generator=generator).to(unit_test_model.device)
     keys_hash = []
     values_hash = []
 

diff --git a/tests/presses/test_finch_press.py b/tests/presses/test_finch_press.py
@@ -16,6 +16,6 @@ def test_finch_press(unit_test_model):  # noqa: F811
     ]:
         press.delimiter_token_id = unit_test_model.config.eos_token_id
         with press(unit_test_model):
-            input_ids = torch.arange(10, 20)
+            input_ids = torch.arange(10, 20).to(unit_test_model.device)
             input_ids[8] = press.delimiter_token_id
             unit_test_model(input_ids.unsqueeze(0))
diff --git a/tests/presses/test_flash_attention.py b/tests/presses/test_flash_attention.py
@@ -7,19 +7,20 @@
 from transformers.utils import is_flash_attn_2_available
 
 from kvpress import KnormPress
-from tests.fixtures import kv_press_llama3_1_flash_attn_pipeline  # noqa: F401
+from tests.fixtures import kv_press_qwen3_flash_attn_pipeline  # noqa: F401
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU is not available")
-@pytest.mark.skipif(not is_flash_attn_2_available(), reason="flash_attn is not installed")
-def test_fa_works(kv_press_llama3_1_flash_attn_pipeline):  # noqa: F811
-    # test if fa2 runs, see https://github.com/huggingface/transformers/releases/tag/v4.55.2
-    # and https://github.com/NVIDIA/kvpress/pull/115
-    model = kv_press_llama3_1_flash_attn_pipeline.model
-    tok = AutoTokenizer.from_pretrained("h2oai/h2o-danube3-500m-chat")
-    inputs = tok("Hello, how are you? bla bla how are you? this is some text lala ddd", return_tensors="pt").to(
-        model.device
-    )
+class TestFlashAttention:
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU is not available")
+    @pytest.mark.skipif(not is_flash_attn_2_available(), reason="flash_attn is not installed")
+    def test_fa_works(self, kv_press_qwen3_flash_attn_pipeline):  # noqa: F811
+        # test if fa2 runs, see https://github.com/huggingface/transformers/releases/tag/v4.55.2
+        # and https://github.com/NVIDIA/kvpress/pull/115
+        model = kv_press_qwen3_flash_attn_pipeline.model
+        tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Instruct-2507")
+        inputs = tok("Hello, how are you? bla bla how are you? this is some text lala ddd", return_tensors="pt").to(
+            model.device
+        )
 
-    with KnormPress(0.8)(model):
-        model.generate(**inputs, max_new_tokens=10, do_sample=False)
+        with KnormPress(0.8)(model):
+            model.generate(**inputs, max_new_tokens=10, do_sample=False)
diff --git a/tests/presses/test_head_compression.py b/tests/presses/test_head_compression.py
@@ -28,7 +28,7 @@ def test_wrapper_head_compression(unit_test_model, wrapper_press, compression_ra
     p = KnormPress(compression_ratio=compression_ratio)
     press = wrapper_press(press=p)
     with press(unit_test_model):
-        input_ids = torch.randint(0, 1024, (1, 128))
+        input_ids = torch.randint(0, 1024, (1, 128)).to(unit_test_model.device)
         unit_test_model(input_ids, past_key_values=DynamicCache()).past_key_values
 
     assert unit_test_model.model.layers[0].self_attn.masked_key_indices is not None
@@ -47,7 +47,7 @@ def test_wrapper_head_compression(unit_test_model, wrapper_press, compression_ra
 def test_head_compression(unit_test_model, press, compression_ratio, layerwise):  # noqa: F811
     press = KVzipPress(compression_ratio=compression_ratio, layerwise=layerwise)
     with press(unit_test_model):
-        input_ids = torch.randint(0, 1024, (1, 128))
+        input_ids = torch.randint(0, 1024, (1, 128)).to(unit_test_model.device)
         unit_test_model(input_ids, past_key_values=DynamicCache()).past_key_values
 
     assert unit_test_model.model.layers[0].self_attn.masked_key_indices is not None