diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8af5182c..2daa81ba 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -60,13 +60,14 @@ jobs:
         if: matrix.install-profile == 'nlp'
         run: |
           pip install -e ".[test,cli,nlp]" -r requirements-test.txt
-          python -m spacy download en_core_web_sm
+          python -m spacy download en_core_web_lg

       - name: Install dependencies (nlp-advanced)
         if: matrix.install-profile == 'nlp-advanced'
         run: |
           pip install -e ".[test,cli,nlp,nlp-advanced]" -r requirements-test.txt
-          python -m spacy download en_core_web_sm
+          python -m spacy download en_core_web_lg
+          datafog download-model urchade/gliner_multi_pii-v1 --engine gliner

       - name: Run tests (core)
         if: matrix.install-profile == 'core'
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 53a39dfc..91a459aa 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -128,7 +128,8 @@ jobs:
           python -m pip install --upgrade pip
           pip install -e ".[all,test]"
           pip install -r requirements-test.txt
-          pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
+          python -m spacy download en_core_web_lg
+          datafog download-model urchade/gliner_multi_pii-v1 --engine gliner

       - name: Run tests with segfault protection
         run: |
diff --git a/datafog/client.py b/datafog/client.py
index a76a30dd..c3d493c8 100644
--- a/datafog/client.py
+++ b/datafog/client.py
@@ -181,7 +181,7 @@ def download_model(
     Download a model for specified engine.

     Examples:
-        spaCy: datafog download-model en_core_web_sm --engine spacy
+        spaCy: datafog download-model en_core_web_lg --engine spacy
         GLiNER: datafog download-model urchade/gliner_multi_pii-v1 --engine gliner
     """
     if engine == "spacy":
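
Note on the workflow changes above: both CI and release now pin the large spaCy model and fetch the GLiNER weights explicitly, so a missing model fails the pipeline at install time rather than mid-test. A minimal preflight sketch of the same check for local environments follows; it is not part of datafog's API, and it assumes `spacy` and `huggingface_hub` are importable (the model names are the ones pinned in the workflows):

```python
import spacy.util
from huggingface_hub import snapshot_download
from huggingface_hub.utils import LocalEntryNotFoundError


def models_are_available() -> bool:
    # spaCy models install as regular packages, so a package check suffices.
    if not spacy.util.is_package("en_core_web_lg"):
        return False
    try:
        # local_files_only=True never touches the network; it raises if the
        # GLiNER weights are not already in the local Hugging Face cache.
        snapshot_download("urchade/gliner_multi_pii-v1", local_files_only=True)
    except LocalEntryNotFoundError:
        return False
    return True


if not models_are_available():
    raise SystemExit(
        "Run: python -m spacy download en_core_web_lg && "
        "datafog download-model urchade/gliner_multi_pii-v1 --engine gliner"
    )
```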
" + f"Download it explicitly with: datafog download-model {model_name} --engine spacy" + ) from exc return [ent for ent in nlp.pipe_labels["ner"]] diff --git a/datafog/processing/image_processing/donut_processor.py b/datafog/processing/image_processing/donut_processor.py index 7e100585..50022b6a 100644 --- a/datafog/processing/image_processing/donut_processor.py +++ b/datafog/processing/image_processing/donut_processor.py @@ -6,14 +6,10 @@ from images of documents. """ -import importlib -import importlib.util import json import logging import os import re -import subprocess -import sys from typing import TYPE_CHECKING, Any from .image_downloader import ImageDownloader @@ -43,13 +39,12 @@ def __init__(self, model_path="naver-clova-ix/donut-base-finetuned-cord-v2"): self.model_path = model_path self.downloader = ImageDownloader() - def ensure_installed(self, package_name): - try: - importlib.import_module(package_name) - except ImportError: - subprocess.check_call( - [sys.executable, "-m", "pip", "install", package_name] - ) + @staticmethod + def _missing_dependency_message(package_name: str) -> str: + return ( + f"Donut OCR requires {package_name}. " + "Install with: pip install datafog[nlp-advanced,ocr]" + ) def preprocess_image(self, image: "Image.Image") -> Any: import numpy as np @@ -86,40 +81,40 @@ async def extract_text_from_image(self, image: "Image.Image") -> str: "PYTEST_DONUT=yes is set, running actual OCR in test environment" ) - # Only import torch and transformers when actually needed and not in test environment try: - # Check if torch is available before trying to import it - try: - # Try to find the module without importing it - spec = importlib.util.find_spec("torch") - if spec is None: - # If we're in a test that somehow bypassed the IN_TEST_ENV check, - # still return a mock result instead of failing - logging.warning("torch module not found, returning mock result") - return json.dumps({"text": "Mock OCR text (torch not available)"}) - - # Ensure dependencies are installed - self.ensure_installed("torch") - self.ensure_installed("transformers") - except ImportError: - # If importlib.util is not available, fall back to direct try/except - pass - - # Import dependencies only when needed try: import torch + except ImportError as exc: + raise ImportError(self._missing_dependency_message("torch")) from exc + + try: from transformers import DonutProcessor as TransformersDonutProcessor from transformers import VisionEncoderDecoderModel except ImportError as e: - logging.warning(f"Import error: {e}, returning mock result") - return json.dumps({"text": f"Mock OCR text (import error: {e})"}) + raise ImportError( + self._missing_dependency_message("transformers") + ) from e # Preprocess the image image_np = self.preprocess_image(image) # Initialize model components - processor = TransformersDonutProcessor.from_pretrained(self.model_path) - model = VisionEncoderDecoderModel.from_pretrained(self.model_path) + try: + processor = TransformersDonutProcessor.from_pretrained( + self.model_path, + local_files_only=True, + ) + model = VisionEncoderDecoderModel.from_pretrained( + self.model_path, + local_files_only=True, + ) + except OSError as exc: + raise RuntimeError( + f"Donut model {self.model_path!r} is not available locally. " + "Download it explicitly before using Donut OCR, or pass a local " + "model path." 
diff --git a/datafog/processing/spark_processing/pyspark_udfs.py b/datafog/processing/spark_processing/pyspark_udfs.py
index 413e6ef3..2d7e2bc5 100644
--- a/datafog/processing/spark_processing/pyspark_udfs.py
+++ b/datafog/processing/spark_processing/pyspark_udfs.py
@@ -2,17 +2,16 @@
 PySpark UDFs for PII annotation and related utilities.

 This module provides functions for PII (Personally Identifiable Information) annotation
-using SpaCy models in a PySpark environment. It includes utilities for installing
-dependencies, creating and broadcasting PII annotator UDFs, and performing PII annotation
-on text data.
+using SpaCy models in a PySpark environment. It includes utilities for validating
+dependencies, creating and broadcasting PII annotator UDFs, and performing PII
+annotation on text data.
 """

 import importlib
-import subprocess
-import sys

 PII_ANNOTATION_LABELS = ["DATE_TIME", "LOC", "NRP", "ORG", "PER"]
 MAXIMAL_STRING_SIZE = 1000000
+DEFAULT_SPACY_MODEL = "en_core_web_lg"


 def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
@@ -45,7 +44,7 @@ def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:


 def broadcast_pii_annotator_udf(
-    spark_session=None, spacy_model: str = "en_core_web_lg"
+    spark_session=None, spacy_model: str = DEFAULT_SPACY_MODEL
 ):
     """Broadcast PII annotator across Spark cluster and create UDF"""
     ensure_installed("pyspark")
@@ -69,5 +68,14 @@ def broadcast_pii_annotator_udf(
 def ensure_installed(package_name):
     try:
         importlib.import_module(package_name)
-    except ImportError:
-        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
+    except ImportError as exc:
+        if package_name == "pyspark":
+            extra = "distributed"
+        elif package_name == "spacy":
+            extra = "nlp"
+        else:
+            extra = "all"
+        raise ImportError(
+            f"{package_name} is required for Spark PII UDF support. "
+            f"Install with: pip install datafog[{extra}]"
+        ) from exc
diff --git a/datafog/processing/text_processing/gliner_annotator.py b/datafog/processing/text_processing/gliner_annotator.py
index cbaeca8c..5a195f89 100644
--- a/datafog/processing/text_processing/gliner_annotator.py
+++ b/datafog/processing/text_processing/gliner_annotator.py
@@ -79,14 +79,18 @@ def create(

         try:
             # Load the GLiNER model
-            model = GLiNER.from_pretrained(model_name)
+            model = GLiNER.from_pretrained(model_name, local_files_only=True)
             logging.info(f"Successfully loaded GLiNER model: {model_name}")

             return cls(model=model, entity_types=entity_types, model_name=model_name)

         except Exception as e:
             logging.error(f"Failed to load GLiNER model {model_name}: {str(e)}")
-            raise
+            raise RuntimeError(
+                f"GLiNER model {model_name!r} is not available locally. "
+                "Download it explicitly with: "
+                f"datafog download-model {model_name} --engine gliner"
+            ) from e

     def annotate(self, text: str) -> Dict[str, List[str]]:
         """
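
For GLiNER, `local_files_only=True` splits the model lifecycle into an explicit online download and a guaranteed-offline load. A sketch of that round trip, assuming the `gliner` package is installed and that the CLI's download step amounts to a plain `from_pretrained` call (an assumption, since the CLI internals are not shown in this diff):

```python
from gliner import GLiNER

MODEL = "urchade/gliner_multi_pii-v1"

# One-time, network-allowed download into the local Hugging Face cache
# (roughly what `datafog download-model ... --engine gliner` is assumed to do).
GLiNER.from_pretrained(MODEL)

# Runtime load: never touches the network, raises if the cache is missing.
model = GLiNER.from_pretrained(MODEL, local_files_only=True)
print(model.predict_entities("Jane Doe <jane@example.com>", ["person", "email"]))
```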
" + "Download it explicitly with: " + f"datafog download-model {model_name} --engine gliner" + ) from e def annotate(self, text: str) -> Dict[str, List[str]]: """ diff --git a/datafog/processing/text_processing/spacy_pii_annotator.py b/datafog/processing/text_processing/spacy_pii_annotator.py index e871db8a..d2d1d765 100644 --- a/datafog/processing/text_processing/spacy_pii_annotator.py +++ b/datafog/processing/text_processing/spacy_pii_annotator.py @@ -24,39 +24,34 @@ "WORK_OF_ART", ] MAXIMAL_STRING_SIZE = 1000000 +DEFAULT_SPACY_MODEL = "en_core_web_lg" class SpacyPIIAnnotator(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) nlp: Any + model_name: str = DEFAULT_SPACY_MODEL @classmethod - def create(cls) -> "SpacyPIIAnnotator": - import spacy - + def create(cls, model_name: str = DEFAULT_SPACY_MODEL) -> "SpacyPIIAnnotator": try: - nlp = spacy.load("en_core_web_lg") - except OSError: - import subprocess - import sys + import spacy + except ImportError as exc: + raise ImportError( + "SpaCy engine requires the nlp extra. " + "Install with: pip install datafog[nlp]" + ) from exc - interpreter_location = sys.executable - subprocess.run( - [ - interpreter_location, - "-m", - "pip", - "install", - "--no-deps", - "--no-cache-dir", - "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl", - ], - check=True, - ) - nlp = spacy.load("en_core_web_lg") + try: + nlp = spacy.load(model_name) + except OSError as exc: + raise ImportError( + f"spaCy model {model_name!r} is not installed. " + f"Download it explicitly with: datafog download-model {model_name} --engine spacy" + ) from exc - return cls(nlp=nlp) + return cls(nlp=nlp, model_name=model_name) def annotate(self, text: str) -> Dict[str, List[str]]: try: diff --git a/datafog/services/spark_service.py b/datafog/services/spark_service.py index 5b7db28b..bf7d2e48 100644 --- a/datafog/services/spark_service.py +++ b/datafog/services/spark_service.py @@ -1,14 +1,12 @@ """ Spark service for data processing and analysis. -Provides a wrapper around PySpark functionality, including session creation, -JSON reading, and package management. +Provides a wrapper around PySpark functionality, including session creation and +JSON reading. """ import importlib import os -import subprocess -import sys from typing import List @@ -16,14 +14,13 @@ class SparkService: """ Manages Spark operations and dependencies. - Initializes a Spark session, handles imports, and provides methods for - data reading and package installation. + Initializes a Spark session, handles imports, and provides methods for data + reading. """ def __init__(self, master=None): self.master = master - # Ensure pyspark is installed first self.ensure_installed("pyspark") # Now import necessary modules after ensuring pyspark is installed @@ -84,16 +81,8 @@ def read_json(self, path: str) -> List[dict]: def ensure_installed(self, package_name): try: importlib.import_module(package_name) - except ImportError: - print(f"Installing {package_name}...") - try: - subprocess.check_call( - [sys.executable, "-m", "pip", "install", package_name] - ) - print(f"{package_name} installed successfully.") - except subprocess.CalledProcessError as e: - print(f"Failed to install {package_name}: {e}") - raise ImportError( - f"Could not install {package_name}. " - f"Please install it manually with 'pip install {package_name}'." - ) + except ImportError as exc: + raise ImportError( + f"{package_name} is required for Spark support. 
" + "Install with: pip install datafog[distributed]" + ) from exc diff --git a/setup.py b/setup.py index 43eaff3b..7c3b7992 100644 --- a/setup.py +++ b/setup.py @@ -45,6 +45,7 @@ distributed_deps = [ "pandas>=2.0.0", "numpy>=1.24.0", + "pyspark>=3.5.0", ] web_deps = [ diff --git a/tests/test_donut_lazy_import.py b/tests/test_donut_lazy_import.py index a0b62ab9..80c9ec09 100644 --- a/tests/test_donut_lazy_import.py +++ b/tests/test_donut_lazy_import.py @@ -1,5 +1,4 @@ import sys -from unittest.mock import patch from datafog.services.image_service import ImageService @@ -46,21 +45,5 @@ def test_lazy_import_mechanism(): # Verify that the extract_text_from_image method exists assert hasattr(processor, "extract_text_from_image") - # Mock importlib.import_module to prevent actual imports - with patch("importlib.import_module") as mock_import: - # Set up the mock to return a dummy module - mock_import.return_value = type("DummyModule", (), {}) - - # Mock the ensure_installed method to prevent actual installation - with patch.object(processor, "ensure_installed"): - # Try to call extract_text_from_image which should trigger imports - try: - # We don't actually need to run it asynchronously for this test - # Just call the method directly to see if it tries to import - processor.ensure_installed("torch") - except Exception: - # Ignore any exceptions - pass - - # Verify ensure_installed was called - assert processor.ensure_installed.called + # Runtime package installation helpers should not exist on the processor. + assert not hasattr(processor, "ensure_installed") diff --git a/tests/test_gliner_annotator.py b/tests/test_gliner_annotator.py index bde66d02..fbf0bbdf 100644 --- a/tests/test_gliner_annotator.py +++ b/tests/test_gliner_annotator.py @@ -53,7 +53,8 @@ def test_gliner_annotator_creation_with_dependencies(self, mock_gliner_module): assert "person" in annotator.entity_types assert "email" in annotator.entity_types mock_gliner_class.from_pretrained.assert_called_with( - "urchade/gliner_multi_pii-v1" + "urchade/gliner_multi_pii-v1", + local_files_only=True, ) def test_gliner_annotator_custom_model(self, mock_gliner_module): @@ -69,7 +70,10 @@ def test_gliner_annotator_custom_model(self, mock_gliner_module): assert annotator.model_name == "urchade/gliner_base" assert annotator.entity_types == custom_entities - mock_gliner_class.from_pretrained.assert_called_with("urchade/gliner_base") + mock_gliner_class.from_pretrained.assert_called_with( + "urchade/gliner_base", + local_files_only=True, + ) def test_gliner_annotate_text(self, mock_gliner_module): """Test GLiNER text annotation.""" @@ -353,23 +357,29 @@ def test_text_service_valid_engines(self): elif engine in ["spacy", "auto"]: # Mock spaCy dependencies - with patch( - "datafog.processing.text_processing.spacy_pii_annotator.SpacyPIIAnnotator" - ): - from datafog.services.text_service import TextService + from datafog.services.text_service import TextService - service = TextService(engine=engine) - assert service.engine == engine + with patch.object(TextService, "_ensure_spacy_available"): + with patch.object( + TextService, + "_create_spacy_annotator", + return_value=Mock(), + ): + service = TextService(engine=engine) + assert service.engine == engine elif engine in ["gliner", "smart"]: # Mock GLiNER dependencies - with patch( - "datafog.processing.text_processing.gliner_annotator.GLiNERAnnotator" - ): - from datafog.services.text_service import TextService + from datafog.services.text_service import TextService - service = 
diff --git a/tests/test_runtime_dependency_safety.py b/tests/test_runtime_dependency_safety.py
new file mode 100644
index 00000000..adc787ff
--- /dev/null
+++ b/tests/test_runtime_dependency_safety.py
@@ -0,0 +1,83 @@
+import sys
+from pathlib import Path
+
+import pytest
+
+
+def test_runtime_code_does_not_install_packages() -> None:
+    blocked_snippets = [
+        "subprocess.check_call",
+        "subprocess.run",
+        '"-m", "pip"',
+        '"pip", "install"',
+        "'pip', 'install'",
+    ]
+    offenders = []
+
+    for path in Path("datafog").rglob("*.py"):
+        source = path.read_text()
+        for snippet in blocked_snippets:
+            if snippet in source:
+                offenders.append(f"{path}: {snippet}")
+
+    assert offenders == []
+
+
+def test_spacy_pii_missing_model_requires_explicit_download(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    class FakeSpacy:
+        @staticmethod
+        def load(_model_name):
+            raise OSError("model not installed")
+
+    monkeypatch.setitem(sys.modules, "spacy", FakeSpacy())
+
+    from datafog.processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator
+
+    with pytest.raises(ImportError, match="Download it explicitly"):
+        SpacyPIIAnnotator.create()
+
+
+def test_spark_missing_dependency_requires_explicit_install(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    from datafog.services import spark_service
+
+    def missing_module(_package_name):
+        raise ImportError("missing")
+
+    monkeypatch.setattr(spark_service.importlib, "import_module", missing_module)
+
+    service = spark_service.SparkService.__new__(spark_service.SparkService)
+    with pytest.raises(ImportError, match=r"datafog\[distributed\]"):
+        service.ensure_installed("pyspark")
+
+
+def test_spark_udf_missing_dependency_requires_explicit_install(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    from datafog.processing.spark_processing import pyspark_udfs
+
+    def missing_module(_package_name):
+        raise ImportError("missing")
+
+    monkeypatch.setattr(pyspark_udfs.importlib, "import_module", missing_module)
+
+    with pytest.raises(ImportError, match=r"datafog\[nlp\]"):
+        pyspark_udfs.ensure_installed("spacy")
+
+
+@pytest.mark.asyncio
+async def test_donut_missing_dependency_requires_explicit_install(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    from datafog.processing.image_processing import donut_processor
+
+    monkeypatch.setattr(donut_processor, "IN_TEST_ENV", False)
+    monkeypatch.setitem(sys.modules, "torch", None)
+
+    processor = donut_processor.DonutProcessor()
+
+    with pytest.raises(ImportError, match=r"datafog\[nlp-advanced,ocr\]"):
+        await processor.extract_text_from_image(object())
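
Taken together, the patch replaces implicit runtime installs with a fail-fast contract: missing dependencies and models surface as ImportError/RuntimeError with an explicit install or download command. The resulting user-facing behavior, using the default model name from the diff (the printed message paraphrases the format defined above):

```python
from datafog.processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator

try:
    annotator = SpacyPIIAnnotator.create()  # defaults to en_core_web_lg
except ImportError as exc:
    # e.g. "spaCy model 'en_core_web_lg' is not installed. Download it
    # explicitly with: datafog download-model en_core_web_lg --engine spacy"
    print(exc)
```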