Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,47 @@ jobs:
flags: ${{ matrix.install-profile }}-py${{ matrix.python-version }}
token: ${{ secrets.CODECOV_TOKEN }}

profile-smoke:
  # Runs tests/test_install_profiles.py once per install profile on Python 3.11.
  runs-on: ubuntu-latest
  strategy:
    # Keep the remaining profiles running when one of them fails.
    fail-fast: false
    matrix:
      install-profile: [core, cli, nlp, nlp-advanced, ocr, distributed, web]
  steps:
    - uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: "3.11"
        cache: "pip"

    - name: Upgrade pip
      run: python -m pip install --upgrade pip

    # The "core" profile is installed with the test extra only.
    - name: Install dependencies (core)
      if: matrix.install-profile == 'core'
      run: pip install -e ".[test]"

    # Every other profile adds its own extra next to the test extra.
    - name: Install dependencies (profile)
      if: matrix.install-profile != 'core'
      run: pip install -e ".[test,${{ matrix.install-profile }}]"

    # The profile under test is handed to pytest through the environment.
    - name: Run install profile smoke test
      env:
        DATAFOG_INSTALL_PROFILE: ${{ matrix.install-profile }}
      run: pytest tests/test_install_profiles.py -q

wheel-size:
runs-on: ubuntu-latest
steps:
Expand Down
26 changes: 8 additions & 18 deletions datafog/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,17 +171,13 @@ def _gliner_entities(text: str) -> list[Entity]:
def _get_spacy_annotator():
try:
from .processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator
except ImportError:
return _UnavailableAnnotator(
"SpaCy engine requires the nlp extra. Install with: pip install datafog[nlp]"
)
except ImportError as exc:
return _UnavailableAnnotator(str(exc))

try:
return SpacyPIIAnnotator.create()
except ImportError:
return _UnavailableAnnotator(
"SpaCy engine requires the nlp extra. Install with: pip install datafog[nlp]"
)
except ImportError as exc:
return _UnavailableAnnotator(str(exc))
except Exception as exc:
return _UnavailableAnnotator(
f"SpaCy engine initialization failed: {type(exc).__name__}: {exc}"
Expand All @@ -192,19 +188,13 @@ def _get_spacy_annotator():
def _get_gliner_annotator():
try:
from .processing.text_processing.gliner_annotator import GLiNERAnnotator
except ImportError:
return _UnavailableAnnotator(
"GLiNER engine requires the nlp-advanced extra. "
"Install with: pip install datafog[nlp-advanced]"
)
except ImportError as exc:
return _UnavailableAnnotator(str(exc))

try:
annotator = GLiNERAnnotator.create()
except ImportError:
return _UnavailableAnnotator(
"GLiNER engine requires the nlp-advanced extra. "
"Install with: pip install datafog[nlp-advanced]"
)
except ImportError as exc:
return _UnavailableAnnotator(str(exc))
except Exception as exc:
return _UnavailableAnnotator(
f"GLiNER engine initialization failed: {type(exc).__name__}: {exc}"
Expand Down
3 changes: 1 addition & 2 deletions datafog/models/spacy_nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from uuid import uuid4

import spacy
from rich.progress import track

from .annotator import AnnotationResult, AnnotatorRequest

Expand Down Expand Up @@ -53,7 +52,7 @@ def annotate_text(self, text: str, language: str = "en") -> List[AnnotationResul
)
doc = self.nlp(annotator_request.text)
results = []
for ent in track(doc.ents, description="Processing entities"):
for ent in doc.ents:
result = AnnotationResult(
start=ent.start_char,
end=ent.end_char,
Expand Down
161 changes: 80 additions & 81 deletions datafog/telemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@
"""

import hashlib
import importlib.util
import json
import os
import platform
import sys
import threading
import time
import urllib.request
Expand Down Expand Up @@ -114,44 +116,28 @@ def _get_duration_bucket(duration_ms: float) -> str:

def _detect_installed_extras() -> list:
"""Probe which optional extras are installed."""
extras = []

try:
import spacy # noqa: F401

extras.append("nlp")
except ImportError:
pass

try:
import gliner # noqa: F401

extras.append("nlp-advanced")
except ImportError:
pass

try:
import pytesseract # noqa: F401

extras.append("ocr")
except ImportError:
pass

try:
import typer # noqa: F401

extras.append("cli")
except ImportError:
pass

try:
import pyspark # noqa: F401

extras.append("distributed")
except ImportError:
pass

return extras
def _module_available(module_name: str) -> bool:
module = sys.modules.get(module_name)
if module is not None and getattr(module, "__spec__", None) is None:
return True
try:
return importlib.util.find_spec(module_name) is not None
except (ImportError, ValueError):
return False

module_to_extra = {
"spacy": "nlp",
"gliner": "nlp-advanced",
"pytesseract": "ocr",
"typer": "cli",
"pyspark": "distributed",
}
return [
extra
for module_name, extra in module_to_extra.items()
if _module_available(module_name)
]


def _detect_ci() -> bool:
Expand All @@ -170,39 +156,69 @@ def _detect_ci() -> bool:
return any(os.environ.get(v) for v in ci_vars)


def _send_event(event_name: str, properties: dict) -> None:
"""POST event to PostHog /capture/ endpoint in a daemon thread.
def _post_event(event_name: str, properties: dict) -> None:
"""POST event to PostHog /capture/ endpoint.

Fire-and-forget: failures are silently ignored.
Fire-and-forget callers run this in daemon threads. Failures are silently
ignored so telemetry can never affect SDK behavior.
"""
try:
payload = json.dumps(
{
"api_key": _POSTHOG_API_KEY,
"event": event_name,
"properties": {
"distinct_id": _get_anonymous_id(),
**properties,
},
"timestamp": time.strftime("%Y-%m-%dT%H:%M:%S.000Z", time.gmtime()),
}
).encode("utf-8")

req = urllib.request.Request(
f"{_POSTHOG_HOST}/capture/",
data=payload,
headers={"Content-Type": "application/json"},
method="POST",
)
urllib.request.urlopen(req, timeout=5)
except Exception:
pass


def _send_event(event_name: str, properties: dict) -> None:
    """POST event to PostHog /capture/ endpoint in a daemon thread.

    Does nothing when telemetry is disabled. Otherwise the call returns as
    soon as the thread is started; the HTTP request itself is performed by
    ``_post_event`` on that thread.

    Args:
        event_name: Name of the event to send.
        properties: Event properties passed through to ``_post_event``.
    """
    if not _is_telemetry_enabled():
        return

    sender = threading.Thread(
        target=_post_event,
        args=(event_name, properties),
        daemon=True,
    )
    sender.start()


def _send_init_event() -> None:
    """Build and send the process init event without blocking API calls.

    All of the work (version lookup, platform probing, extras detection and
    the HTTP POST) happens on a daemon thread, so the caller only pays for
    starting that thread. This function does not check whether telemetry is
    enabled.
    """

    def _post_init():
        # Fall back to a placeholder when the version module cannot be
        # imported, so the init event is still sent.
        try:
            from .__about__ import __version__
        except Exception:
            __version__ = "unknown"

        uname = platform.uname()
        _post_event(
            "datafog_init",
            {
                "package_version": __version__,
                "python_version": platform.python_version(),
                "os": uname.system,
                "os_version": uname.release,
                "arch": uname.machine,
                "installed_extras": _detect_installed_extras(),
                "is_ci": _detect_ci(),
            },
        )

    t = threading.Thread(target=_post_init, daemon=True)
    t.start()


Expand All @@ -220,24 +236,7 @@ def _ensure_initialized() -> None:
if not _is_telemetry_enabled():
return

try:
from .__about__ import __version__
except Exception:
__version__ = "unknown"

uname = platform.uname()
_send_event(
"datafog_init",
{
"package_version": __version__,
"python_version": platform.python_version(),
"os": uname.system,
"os_version": uname.release,
"arch": uname.machine,
"installed_extras": _detect_installed_extras(),
"is_ci": _detect_ci(),
},
)
_send_init_event()


def track_function_call(function_name: str, module: str, **kwargs) -> None:
Expand Down
Loading
Loading