From 7957e839922c7c39f90b2935fcdfbb63d3f1e6ce Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 13 Feb 2026 10:33:31 -0800 Subject: [PATCH 01/27] chore(deps): bump sentencepiece from 0.2.0 to 0.2.1 (#114) Bumps [sentencepiece](https://github.com/google/sentencepiece) from 0.2.0 to 0.2.1. - [Release notes](https://github.com/google/sentencepiece/releases) - [Commits](https://github.com/google/sentencepiece/compare/v0.2.0...v0.2.1) --- updated-dependencies: - dependency-name: sentencepiece dependency-version: 0.2.1 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b4a6e1e7..c531c681 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ requests==2.32.3 spacy==3.7.5 pydantic==2.11.4 Pillow==11.2.1 -sentencepiece==0.2.0 +sentencepiece==0.2.1 protobuf==6.30.2 pytesseract==0.3.13 aiohttp==3.11.18 From 04091e04dd8f999ada1e3f1cf3625d260ab286f1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 13 Feb 2026 10:33:56 -0800 Subject: [PATCH 02/27] chore(deps): bump cryptography from 44.0.2 to 46.0.5 (#119) Bumps [cryptography](https://github.com/pyca/cryptography) from 44.0.2 to 46.0.5. - [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pyca/cryptography/compare/44.0.2...46.0.5) --- updated-dependencies: - dependency-name: cryptography dependency-version: 46.0.5 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c531c681..3fdc78cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ setuptools>=68.0.0 pydantic-settings==2.3.4 typer==0.12.3 sphinx==7.2.6 -cryptography==44.0.2 +cryptography==46.0.5 # Testing dependencies pytest==7.4.0 From 1ab0b95d40275107adaa2bacb2cb9dee7c343cd7 Mon Sep 17 00:00:00 2001 From: sid mohan Date: Fri, 13 Feb 2026 10:35:41 -0800 Subject: [PATCH 03/27] chore(deps): bump pillow and protobuf pins --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3fdc78cd..2078e115 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,9 @@ pandas==2.2.3 requests==2.32.3 spacy==3.7.5 pydantic==2.11.4 -Pillow==11.2.1 +Pillow==12.1.1 sentencepiece==0.2.1 -protobuf==6.30.2 +protobuf==6.33.5 pytesseract==0.3.13 aiohttp==3.11.18 numpy==1.26.4 @@ -20,4 +20,4 @@ cryptography==46.0.5 # Testing dependencies pytest==7.4.0 pytest-asyncio==0.21.0 -pytest-cov==4.1.0 \ No newline at end of file +pytest-cov==4.1.0 From 8163e80ab68d14b99fc1f113580b6434cbe86860 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 16 Feb 2026 03:14:35 +0000 Subject: [PATCH 04/27] chore: bump version to 4.3.0a1 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index 111dc917..67aab652 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0" +__version__ = "4.3.0a1" From 33fae4ed6d82956e39e7a22df474eb2eb7af62d7 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Thu, 19 Feb 2026 03:11:15 +0000 Subject: [PATCH 05/27] chore: bump version to 4.3.0b2 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index 67aab652..7f5ba1d2 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0a1" +__version__ = "4.3.0b2" From 744370b08358de72772ff22f6efb55e08cac23a8 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 23 Feb 2026 03:16:15 +0000 Subject: [PATCH 06/27] chore: bump version to 4.3.0a2 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index 7f5ba1d2..9ef0c4a9 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0b2" +__version__ = "4.3.0a2" From 84d52dbba36d54b13ab42a91240784ff1edfa362 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Thu, 26 Feb 2026 03:09:33 +0000 Subject: [PATCH 07/27] chore: bump version to 4.3.0b3 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index 9ef0c4a9..31595a63 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0a2" +__version__ = "4.3.0b3" From 4108984ec03fbfa099d56c9fae950de294bd98f8 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 2 Mar 2026 03:12:17 +0000 Subject: [PATCH 08/27] chore: bump version to 4.3.0a3 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index 31595a63..6a0400b0 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0b3" +__version__ = "4.3.0a3" From cb7d951fce8f50be816e9c7599604f6ef26b3bbb Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Thu, 5 Mar 2026 03:08:13 +0000 Subject: [PATCH 09/27] chore: bump version to 4.3.0b4 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index 6a0400b0..10d2475c 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0a3" +__version__ = "4.3.0b4" From 107df2805856e8c3986641b3fe58e0de24608ea8 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 9 Mar 2026 03:13:14 +0000 Subject: [PATCH 10/27] chore: bump version to 4.3.0a4 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index 10d2475c..1e1c3b3b 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0b4" +__version__ = "4.3.0a4" From 0788e82dc15249f3613dee9b9ad471c69b23dd9e Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Thu, 12 Mar 2026 03:08:47 +0000 Subject: [PATCH 11/27] chore: bump version to 4.3.0b5 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index 1e1c3b3b..f74eef4c 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0a4" +__version__ = "4.3.0b5" From 14ae62e9bec2089111a5ce04c29ce2bb0ff2a255 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 16 Mar 2026 03:17:24 +0000 Subject: [PATCH 12/27] chore: bump version to 4.3.0a5 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index f74eef4c..6a3dae89 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0b5" +__version__ = "4.3.0a5" From fac558d19083cab0811a4af8356177db3908aee6 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Thu, 19 Mar 2026 03:13:18 +0000 Subject: [PATCH 13/27] chore: bump version to 4.3.0b6 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index 6a3dae89..8923bd60 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0a5" +__version__ = "4.3.0b6" From 14347bec43fb76e68ca93fa643f12d45299134ae Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 23 Mar 2026 03:15:54 +0000 Subject: [PATCH 14/27] chore: bump version to 4.3.0a6 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index 8923bd60..12f598de 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0b6" +__version__ = "4.3.0a6" From 189b77de38e3bb8db79e941788d2893cdc134048 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Thu, 26 Mar 2026 03:13:33 +0000 Subject: [PATCH 15/27] chore: bump version to 4.3.0b7 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index 12f598de..b2380e72 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0a6" +__version__ = "4.3.0b7" From d4602d0d2b014986f02d4b3083b4e3ca1c3245f8 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 30 Mar 2026 03:18:21 +0000 Subject: [PATCH 16/27] chore: bump version to 4.3.0a7 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index b2380e72..9dcb5920 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0b7" +__version__ = "4.3.0a7" From 879ae7047cda7daaf4901cca6b50b7a544f2e005 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Thu, 2 Apr 2026 03:14:22 +0000 Subject: [PATCH 17/27] chore: bump version to 4.3.0b8 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index 9dcb5920..7c19e895 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0a7" +__version__ = "4.3.0b8" From 3b1a5e8b4149ecd8245b9fe8c54fdb6ef661916d Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 6 Apr 2026 03:17:32 +0000 Subject: [PATCH 18/27] chore: bump version to 4.3.0a8 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index 7c19e895..ff818429 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0b8" +__version__ = "4.3.0a8" From 372d3080b2fd5ac2ee99d8e28dcacb080ded3fef Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Thu, 9 Apr 2026 03:14:19 +0000 Subject: [PATCH 19/27] chore: bump version to 4.3.0b9 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index ff818429..f02f7efd 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0a8" +__version__ = "4.3.0b9" From 936ca144544a640f7049479f85463aef17af36be Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 13 Apr 2026 03:18:37 +0000 Subject: [PATCH 20/27] chore: bump version to 4.3.0a9 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index f02f7efd..5e5421fd 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0b9" +__version__ = "4.3.0a9" From 22e9820ca638a70df8e0ad5e9b4e9f393ba31677 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Thu, 16 Apr 2026 03:19:57 +0000 Subject: [PATCH 21/27] chore: bump version to 4.3.0b10 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index 5e5421fd..6a48ac4f 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0a9" +__version__ = "4.3.0b10" From 0a67ad3d33f0fd4107723e1292e9b403682f88db Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 20 Apr 2026 03:17:54 +0000 Subject: [PATCH 22/27] chore: bump version to 4.3.0a10 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index 6a48ac4f..635b146e 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0b10" +__version__ = "4.3.0a10" From aa72c2e171e84d84a807baf823482d97b90cf868 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Thu, 23 Apr 2026 03:15:13 +0000 Subject: [PATCH 23/27] chore: bump version to 4.3.0b11 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index 635b146e..bc4f11a2 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0a10" +__version__ = "4.3.0b11" From 47520244a44a8dd9ef955af572dd3cff6ce6e12f Mon Sep 17 00:00:00 2001 From: Sid Mohan <61345237+sidmohan0@users.noreply.github.com> Date: Sat, 25 Apr 2026 19:53:12 -0700 Subject: [PATCH 24/27] Add v4.4 bridge release runway (#130) * feat: add v4.4 bridge release runway * docs: clarify contributor workflow * style: apply lint formatting --- .github/ISSUE_TEMPLATE/bug_report.yml | 62 +++++++ .github/ISSUE_TEMPLATE/config.yml | 5 + .github/ISSUE_TEMPLATE/feature_request.yml | 41 +++++ .github/PULL_REQUEST_TEMPLATE.md | 37 ++++ .github/workflows/ci.yml | 10 +- .github/workflows/release.yml | 34 +++- CONTRIBUTING.md | 112 ++++++++++--- SECURITY.md | 18 ++ datafog/__init__.py | 104 +++++++++++- datafog/config.py | 8 +- datafog/models/anonymizer.py | 7 +- .../text_processing/spacy_pii_annotator.py | 7 +- docs/conf.py | 4 + docs/important-concepts.rst | 12 +- docs/index.rst | 8 +- docs/roadmap.rst | 65 ++++++- docs/v44-bridge-release.rst | 145 ++++++++++++++++ docs/v5-compatibility-matrix.rst | 158 ++++++++++++++++++ docs/v5-cut-line.rst | 142 ++++++++++++++++ docs/v5-product-brief.rst | 114 +++++++++++++ setup.py | 3 +- tests/test_telemetry.py | 9 +- tests/test_v44_bridge_api.py | 82 +++++++++ 23 files changed, 1141 insertions(+), 46 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yml create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .github/ISSUE_TEMPLATE/feature_request.yml create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 SECURITY.md create mode 100644 docs/v44-bridge-release.rst create mode 100644 docs/v5-compatibility-matrix.rst create mode 100644 docs/v5-cut-line.rst create mode 100644 docs/v5-product-brief.rst create mode 100644 tests/test_v44_bridge_api.py diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 00000000..39ba49ac --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,62 @@ +name: Bug report +description: Report something that is broken or surprising. +title: "bug: " +labels: ["bug"] +body: + - type: markdown + attributes: + value: Thanks for reporting a DataFog issue. + - type: textarea + id: summary + attributes: + label: Summary + description: What happened? + validations: + required: true + - type: textarea + id: reproduce + attributes: + label: Reproduction + description: Minimal code, command, or steps to reproduce. + render: python + validations: + required: true + - type: textarea + id: expected + attributes: + label: Expected behavior + validations: + required: true + - type: input + id: version + attributes: + label: DataFog version + placeholder: "4.3.0" + - type: input + id: python + attributes: + label: Python version + placeholder: "3.12.4" + - type: dropdown + id: profile + attributes: + label: Install profile + options: + - core + - cli + - nlp + - nlp-advanced + - ocr + - distributed + - all + - not sure + - type: textarea + id: environment + attributes: + label: Environment details + description: OS, package manager, relevant dependency versions, or CI link. + - type: textarea + id: extra + attributes: + label: Additional context + description: Logs, screenshots, or related issues. diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..3505f60f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,5 @@ +blank_issues_enabled: true +contact_links: + - name: Security report + url: https://github.com/DataFog/datafog-python/security/advisories/new + about: Please report security vulnerabilities privately. diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 00000000..8d64b92e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,41 @@ +name: Feature request +description: Suggest an improvement or new workflow. +title: "feat: " +labels: ["enhancement"] +body: + - type: textarea + id: problem + attributes: + label: Problem + description: What user problem would this solve? + validations: + required: true + - type: textarea + id: proposal + attributes: + label: Proposed solution + description: What would you like DataFog to do? + validations: + required: true + - type: dropdown + id: area + attributes: + label: Area + options: + - Core scan/redaction + - CLI + - LLM guardrails + - NLP engines + - OCR/image processing + - Spark/distributed processing + - Packaging/install + - Documentation + - Other + - type: textarea + id: alternatives + attributes: + label: Alternatives considered + - type: textarea + id: extra + attributes: + label: Additional context diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 00000000..c3a6b2d3 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,37 @@ +## Summary + +Describe the change and why it is needed. + +## Type + +- [ ] Bug fix +- [ ] Feature +- [ ] Docs +- [ ] Tests +- [ ] Chore + +## Target Branch + +- [ ] This PR targets `dev` +- [ ] This PR targets `main` for a release/hotfix + +## Validation + +Commands run: + +```bash + +``` + +Optional profiles tested: + +- [ ] core +- [ ] cli +- [ ] nlp +- [ ] nlp-advanced +- [ ] ocr +- [ ] distributed + +## Notes For Reviewers + +Mention API changes, migrations, warnings, or release-note needs. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 03df4cb1..4ffb6fda 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,8 +29,16 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12", "3.13"] install-profile: ["core", "nlp", "nlp-advanced"] + exclude: + # v4.4.0 claims Python 3.13 support for core + CLI first. + # Optional heavyweight profiles remain validated separately before + # we advertise Python 3.13 support for them. + - python-version: "3.13" + install-profile: "nlp" + - python-version: "3.13" + install-profile: "nlp-advanced" steps: - uses: actions/checkout@v4 - name: Set up Python diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 091f9329..f99b874c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -139,8 +139,40 @@ jobs: OMP_NUM_THREADS=4 MKL_NUM_THREADS=4 OPENBLAS_NUM_THREADS=4 python tests/simple_performance_test.py # ── 3. Build & Publish ──────────────────────────────────────────────── + python313-core: + needs: determine-release + if: needs.determine-release.outputs.has_changes == 'true' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ needs.determine-release.outputs.target_branch }} + + - name: Set up Python 3.13 + uses: actions/setup-python@v5 + with: + python-version: "3.13" + cache: "pip" + + - name: Install core + CLI dependencies + run: | + python -m pip install --upgrade pip + pip install pytest pytest-cov coverage + pip install -e ".[dev,cli]" + + - name: Run Python 3.13 core + CLI tests + run: | + pytest tests/ \ + -m "not slow" \ + --ignore=tests/test_gliner_annotator.py \ + --ignore=tests/test_image_service.py \ + --ignore=tests/test_ocr_integration.py \ + --ignore=tests/test_spark_integration.py \ + --ignore=tests/test_text_service_integration.py + publish: - needs: [determine-release, test] + needs: [determine-release, test, python313-core] runs-on: ubuntu-latest outputs: version: ${{ steps.version.outputs.version }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0ec54416..285dd51e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,29 +1,99 @@ -# Contributing guidelines +# Contributing to DataFog Python -# Contributors +Thanks for helping improve DataFog. The project welcomes issues, bug reports, +documentation fixes, tests, and pull requests. -- sroy9675 -- pselvana -- sidmohan0 +Please follow the [Code of Conduct](CODE_OF_CONDUCT.md) in all project spaces. + +## Branch And PR Policy + +DataFog uses `dev` as the default development branch and `main` as the stable +release branch. + +Use this workflow for normal contributions: + +1. Fork the repository or create a topic branch from `dev`. +2. Name branches with a GitHub username prefix when practical, for example + `sidmohan0/dfpy-v44-bridge` or `yourname/fix-cli-redaction`. +3. Open pull requests into `dev`. +4. Keep pull requests focused and include tests or docs when behavior changes. + +Use `main` only for stable release promotion or urgent release hotfixes. +Do not use `dev` or `main` as working branches. + +Maintainers should prefer pull requests even for small changes. Protected branch +rules should prevent branch deletion, require CI before merge, and avoid direct +pushes except for explicit emergency maintenance. + +## Local Development + +```bash +git clone https://github.com/datafog/datafog-python +cd datafog-python +python -m venv .venv +source .venv/bin/activate # Windows: .venv\Scripts\activate +python -m pip install --upgrade pip +pip install -e ".[dev,cli]" +``` + +For optional NLP or OCR work, install the relevant extras: + +```bash +pip install -e ".[dev,cli,nlp]" +pip install -e ".[dev,cli,nlp,nlp-advanced]" +pip install -e ".[all,dev]" +``` -for their help +## Tests -The datafog community appreciates your contributions via issues and -pull requests. Note that the [code of conduct](CODE_OF_CONDUCT.md) -applies to all interactions with the datafog project, including -issues and pull requests. +Run the core test suite before opening a pull request: -When submitting pull requests, please follow the style guidelines of -the project, ensure that your code is tested and documented, and write -good commit messages, e.g., following [these -guidelines](https://chris.beams.io/posts/git-commit/). +```bash +pytest tests/ -m "not slow" \ + --ignore=tests/test_gliner_annotator.py \ + --ignore=tests/test_image_service.py \ + --ignore=tests/test_ocr_integration.py \ + --ignore=tests/test_spark_integration.py \ + --ignore=tests/test_text_service_integration.py +``` -By submitting a pull request, you are licensing your code under the -project [license](LICENSE) and affirming that you either own copyright -(automatic for most individuals) or are authorized to distribute under -the project license (e.g., in case your employer retains copyright on -your work). +Run the focused test file for the area you changed whenever possible. For +documentation-only changes, build the docs: -### Legal Notice +```bash +sphinx-build -b html docs docs/_build/html +``` -When contributing to this project, you must agree that you have authored 100% of the content, that you have the necessary rights to the content and that the content you contribute may be provided under the project license. +## Pull Request Checklist + +Before requesting review: + +- Rebase or merge the latest `dev`. +- Add or update tests for behavior changes. +- Update docs for user-facing changes. +- Keep public API changes explicit in the PR description. +- Note any optional dependency profile you tested, such as `core`, `nlp`, or + `nlp-advanced`. + +## Commit Messages + +Use clear, descriptive commit messages. Conventional-style prefixes are welcome +but not required, for example: + +- `fix: handle empty scan input` +- `docs: clarify branch policy` +- `test: cover v5 preview redaction wrapper` + +## Legal + +By submitting a pull request, you license your contribution under the project +[license](LICENSE). You also affirm that you authored the contribution or have +the right to submit it under the project license. + +## Contributors + +Thanks to early contributors including: + +- sroy9675 +- pselvana +- sidmohan0 diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..2821acd0 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,18 @@ +# Security Policy + +Please do not report security vulnerabilities in public issues. + +Use GitHub private vulnerability reporting when available: + +https://github.com/DataFog/datafog-python/security/advisories/new + +If private vulnerability reporting is unavailable, contact the maintainers +directly and include: + +- affected versions; +- a minimal reproduction or proof of concept; +- expected impact; +- any known mitigations. + +We will acknowledge valid reports as quickly as practical and coordinate fixes +before public disclosure. diff --git a/datafog/__init__.py b/datafog/__init__.py index b3ca498e..e3974ad7 100644 --- a/datafog/__init__.py +++ b/datafog/__init__.py @@ -8,11 +8,17 @@ - pip install datafog[all] - for all features """ +import warnings + from .__about__ import __version__ from .agent import create_guardrail, filter_output, sanitize, scan_prompt # Core API functions - always available (lightweight) from .core import anonymize_text, detect_pii, get_supported_entities, scan_text +from .engine import Entity, RedactResult, ScanResult +from .engine import redact as _redact_entities +from .engine import scan as _scan +from .engine import scan_and_redact as _scan_and_redact # Essential models - always available from .models.common import EntityTypes @@ -134,6 +140,88 @@ def _missing_dependency(*args, **kwargs): ) +_REDACT_PRESETS = { + "default": "token", + "llm": "token", + "mask": "mask", + "hash": "hash", + "replace": "pseudonymize", + "pseudonymize": "pseudonymize", +} + + +def _warn_v5_replacement(old_api: str, replacement: str) -> None: + warnings.warn( + f"datafog.{old_api}() is deprecated for v5. Use {replacement} instead. " + "This compatibility shim will remain through the v5.x line.", + FutureWarning, + stacklevel=3, + ) + + +def scan( + text: str, + engine: str = "regex", + entity_types: list[str] | None = None, +) -> ScanResult: + """ + v5-preview scan entrypoint. + + Defaults to the lightweight regex engine so the core install works without + optional dependency fallback warnings. + """ + return _scan(text=text, engine=engine, entity_types=entity_types) + + +def redact( + text: str, + entities: list[Entity] | None = None, + engine: str = "regex", + entity_types: list[str] | None = None, + strategy: str = "token", + preset: str | None = None, +) -> RedactResult: + """ + v5-preview redaction entrypoint. + + If entities are provided, redact those spans. Otherwise, scan text first + using the selected engine and redact the detected entities. + """ + if preset is not None: + try: + strategy = _REDACT_PRESETS[preset] + except KeyError as exc: + allowed = ", ".join(sorted(_REDACT_PRESETS)) + raise ValueError(f"preset must be one of: {allowed}") from exc + + if entities is not None: + return _redact_entities(text=text, entities=entities, strategy=strategy) + + return _scan_and_redact( + text=text, + engine=engine, + entity_types=entity_types, + strategy=strategy, + ) + + +def protect( + entity_types: list[str] | None = None, + engine: str = "regex", + strategy: str = "token", + on_detect: str = "redact", +): + """ + v5-preview guardrail factory. + """ + return create_guardrail( + entity_types=entity_types, + engine=engine, + strategy=strategy, + on_detect=on_detect, + ) + + # Simple API for core functionality (backward compatibility) def detect(text: str) -> list: """ @@ -150,6 +238,12 @@ def detect(text: str) -> list: >>> detect("Contact john@example.com") [{'type': 'EMAIL', 'value': 'john@example.com', 'start': 8, 'end': 24}] """ + _warn_v5_replacement("detect", "datafog.scan()") + + return _detect_impl(text) + + +def _detect_impl(text: str) -> list: import time as _time _start = _time.monotonic() @@ -217,11 +311,13 @@ def process(text: str, anonymize: bool = False, method: str = "redact") -> dict: 'findings': [{'type': 'EMAIL', 'value': 'john@example.com', ...}] } """ + _warn_v5_replacement("process", "datafog.scan() or datafog.redact()") + import time as _time _start = _time.monotonic() - findings = detect(text) + findings = _detect_impl(text) result = {"original": text, "findings": findings} @@ -268,6 +364,12 @@ def process(text: str, anonymize: bool = False, method: str = "redact") -> dict: # Core exports __all__ = [ "__version__", + "Entity", + "ScanResult", + "RedactResult", + "scan", + "redact", + "protect", "detect", "process", "detect_pii", diff --git a/datafog/config.py b/datafog/config.py index 36a6b3d8..b195893b 100644 --- a/datafog/config.py +++ b/datafog/config.py @@ -9,7 +9,7 @@ from enum import Enum from typing import Optional -from pydantic_settings import BaseSettings +from pydantic_settings import BaseSettings, SettingsConfigDict class DataFogConfig(BaseSettings): @@ -25,6 +25,8 @@ class DataFogConfig(BaseSettings): uses Pydantic for data validation and settings management. """ + model_config = SettingsConfigDict(env_prefix="DATAFOG_", case_sensitive=False) + # API Keys and Authentication api_key: str = os.environ.get("DATAFOG_API_KEY", "") @@ -48,10 +50,6 @@ class DataFogConfig(BaseSettings): # Logging log_level: str = "INFO" - class Config: - env_prefix = "DATAFOG_" - case_sensitive = False - def update(self, **kwargs): """Update configuration with new values""" for key, value in kwargs.items(): diff --git a/datafog/models/anonymizer.py b/datafog/models/anonymizer.py index 79af53ca..6b81f5be 100644 --- a/datafog/models/anonymizer.py +++ b/datafog/models/anonymizer.py @@ -7,7 +7,7 @@ from enum import Enum from typing import List, Optional -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field from .annotator import AnnotationResult from .common import EntityTypes @@ -34,14 +34,13 @@ class AnonymizerRequest(BaseModel): class AnonymizationResult(BaseModel): + model_config = ConfigDict(populate_by_name=True) + anonymized_text: str anonymized_entities: List[dict] = Field( default_factory=list, alias="replaced_entities" ) - class Config: - populate_by_name = True - class Anonymizer(BaseModel): anonymizer_type: AnonymizerType = AnonymizerType.REPLACE diff --git a/datafog/processing/text_processing/spacy_pii_annotator.py b/datafog/processing/text_processing/spacy_pii_annotator.py index e7bc5732..e871db8a 100644 --- a/datafog/processing/text_processing/spacy_pii_annotator.py +++ b/datafog/processing/text_processing/spacy_pii_annotator.py @@ -1,7 +1,7 @@ import logging from typing import Any, Dict, List -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict PII_ANNOTATION_LABELS = [ "CARDINAL", @@ -27,6 +27,8 @@ class SpacyPIIAnnotator(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + nlp: Any @classmethod @@ -75,6 +77,3 @@ def annotate(self, text: str) -> Dict[str, List[str]]: return { label: [] for label in PII_ANNOTATION_LABELS } # Return empty annotations in case of error - - class Config: - arbitrary_types_allowed = True diff --git a/docs/conf.py b/docs/conf.py index 0c10e751..1cb1c895 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -30,3 +30,7 @@ napoleon_use_rtype = False napoleon_use_ivar = False napoleon_use_param = False + +# Keep API docs buildable from the lightweight core/dev install. These +# integrations are documented, but they live behind optional extras. +autodoc_mock_imports = ["PIL", "pytesseract", "spacy"] diff --git a/docs/important-concepts.rst b/docs/important-concepts.rst index 791fa0a0..d08c932c 100644 --- a/docs/important-concepts.rst +++ b/docs/important-concepts.rst @@ -1,6 +1,6 @@ -=========== +================== Important Concepts -=========== +================== Overview -------- @@ -20,6 +20,7 @@ Key data models to support PII annotation and OCR analysis. Processors ^^^^^^^^^^^ Main processors: + * SpacyAnnotator Text annotation with spaCy * DonutProcessor @@ -30,6 +31,7 @@ Main processors: Services ^^^^^^^^^^^ Core services: + * ImageService Image handling and OCR * SparkService @@ -47,6 +49,7 @@ Data Models .. autosummary:: :toctree: generated/ :template: class.rst + AnnotatorRequest AnnotationResult AnalysisExplanation @@ -57,6 +60,7 @@ Data Models .. autosummary:: :toctree: generated/ :template: class.rst + EntityTypes Pattern PatternRecognizer @@ -68,6 +72,7 @@ Data Models .. autosummary:: :toctree: generated/ :template: class.rst + SpacyAnnotator Processors @@ -98,6 +103,7 @@ Services .. autosummary:: :toctree: generated/ :template: class.rst + ImageDownloader ImageService @@ -107,6 +113,7 @@ Services .. autosummary:: :toctree: generated/ :template: class.rst + SparkService @@ -116,4 +123,5 @@ Services .. autosummary:: :toctree: generated/ :template: class.rst + TextService diff --git a/docs/index.rst b/docs/index.rst index e092c758..a22af1c5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,13 +12,17 @@ DataFog is an open-source tool for PII detection and anonymization of unstructur python-sdk definitions roadmap + v44-bridge-release + v5-product-brief + v5-compatibility-matrix + v5-cut-line ===================== Getting Started ===================== ---------------------- Installation +------------ Install DataFog via pip: @@ -102,5 +106,3 @@ Scan image for PII: asyncio.run(run_ocr_pipeline_demo()) For detailed information on the Python SDK, see :doc:`python-sdk`. - - diff --git a/docs/roadmap.rst b/docs/roadmap.rst index 63850b63..acf8b6a0 100644 --- a/docs/roadmap.rst +++ b/docs/roadmap.rst @@ -9,6 +9,69 @@ to a lightweight, modular architecture with optional extras. :local: :depth: 1 +v4.4.0 - Python 3.13 and v5 Migration Bridge +-------------------------------------------- + +Before v5.0.0, DataFog should ship a focused v4.4.0 bridge release. The +purpose is to give users a concrete compatibility win while introducing the +v5 direction gently. + +v4.4.0 should focus on: + +* Python 3.13 support for the core SDK and CLI. +* Dependency validation for optional profiles without blocking core/CLI. +* v5-style preview APIs where they can land safely. +* Targeted deprecation warnings with no warnings on import. +* Migration docs and release notes that announce the v5 path. + +Scope artifact: + +* :doc:`v44-bridge-release` + +v5.0.0 - Offline PII Firewall for AI Apps +----------------------------------------- + +The v5.0.0 release is scoped around a sharper adoption wedge: + + DataFog should be the fastest, easiest offline PII firewall for AI apps, + logs, and datasets. + +The release should prioritize trust and time-to-first-value over broad +enterprise surface area. The first path should be a core install, a simple +top-level API, no network surprises, and copy-pasteable workflows for LLM +prompts/outputs, logs, JSONL datasets, and CI checks. + +Scope artifacts: + +* :doc:`v5-product-brief` +* :doc:`v5-compatibility-matrix` +* :doc:`v5-cut-line` + +v5.0.0 must focus on: + +* Stable top-level APIs: ``scan``, ``redact``, ``protect``, and ``restore``. +* Privacy-safe defaults: no default network behavior, no runtime package + installation, and no implicit model downloads. +* Policy-based redaction with presets for LLMs, logs, strict workflows, and + datasets. +* Reversible token sessions that are explicit and opt-in. +* LLM guardrails, including sync, async, and streaming protection. +* CLI workflows for stdin, files, directories, CSV, JSONL, machine-readable + output, and CI-friendly exit codes. +* Custom recognizers and stronger structured detection for app/log/secrets + data. +* Modern packaging and release gates for install profiles, no-network + behavior, import time, wheel size, accuracy, coverage, and benchmarks. + +Deferred to v5.1+: + +* OCR overhaul. +* Spark overhaul. +* Cloud DLP integrations. +* Enterprise dashboards and analytics. +* Broad multilingual model tuning. +* Large Presidio-style framework expansion. + ✅ 4.1.0 (Released) -------------------- The ``4.1.0`` release represents a major architectural shift to a lightweight @@ -80,4 +143,4 @@ Version ``4.5.0`` will introduce: * **Performance monitoring** and metrics collection The lightweight core will remain unchanged, ensuring existing -integrations continue to work without modification. \ No newline at end of file +integrations continue to work without modification. diff --git a/docs/v44-bridge-release.rst b/docs/v44-bridge-release.rst new file mode 100644 index 00000000..981b517d --- /dev/null +++ b/docs/v44-bridge-release.rst @@ -0,0 +1,145 @@ +================================= +v4.4 Bridge Release Scope +================================= + +Status +------ + +Scope and implementation notes for the v4.4.0 transition release. + +Linear project: `DataFog Python v4.4.0: Python 3.13 + v5 Migration Bridge `_ + +Release Thesis +-------------- + +v4.4.0 should be a bridge release before v5. It should give users an +immediate compatibility win with Python 3.13 and introduce the v5 migration +path without forcing a disruptive API change. + +The release should feel positive: + +* Python 3.13 support for the core SDK and CLI. +* A preview of the v5-style API path. +* Targeted, actionable deprecation warnings. +* Migration docs that explain what is changing and why. + +Required Scope +-------------- + +Python 3.13 Core and CLI Support +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +v4.4.0 should support Python 3.13 for the lightweight core package and CLI +workflow. + +Required changes: + +* Package metadata allows Python 3.13. +* Package classifiers include Python 3.13. +* CI runs the core/CLI profile on Python 3.13. +* Release notes clearly state that Python 3.13 support is guaranteed for + core and CLI. + +Optional Dependency Stance +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Optional heavy profiles should be validated where practical, but they should +not block v4.4.0 unless they break the core or CLI install. + +.. list-table:: + :header-rows: 1 + :widths: 20 30 50 + + * - Profile + - v4.4.0 support stance + - Notes + * - ``core`` + - Required on Python 3.13 + - Must install and run scan/redaction tests. + * - ``cli`` + - Required on Python 3.13 + - Must install and run CLI smoke tests. + * - ``nlp`` + - Validate where practical + - spaCy advertises Python 3.13 support in current releases, but model + install behavior should be tested before claiming full support. + * - ``nlp-advanced`` + - Provisional + - GLiNER and PyTorch dependency behavior should be tested separately. + * - ``ocr`` + - Provisional + - OCR is not on the v4.4 or v5.0 critical path. + * - ``distributed`` + - Provisional + - Spark support should not block the bridge release. + +v5 Preview APIs +~~~~~~~~~~~~~~~ + +v4.4.0 should make the recommended v5 path available early where it can be +done without large internal churn. + +Preview path: + +* ``datafog.scan`` +* ``datafog.redact`` +* ``datafog.protect`` if it can be exposed through existing guardrail helpers + without pulling in the full v5 policy engine. + +The preview APIs should delegate through current stable internals and should +not require optional heavy dependencies. + +For the bridge release, preview APIs should default to the lightweight regex +engine so a core-only install does not emit optional dependency fallback +warnings. Users can still opt into ``engine="smart"`` explicitly. + +Deprecation Warning Policy +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +v4.4.0 should introduce migration signals, not warning noise. + +Rules: + +* No warnings on ``import datafog``. +* Warnings only fire when selected legacy convenience APIs are called. +* Warning messages name the v5-style replacement. +* Warnings use ``stacklevel`` so they point to user code where practical. +* ``DataFog`` and ``TextService`` should not warn by default in v4.4.0. + +Candidate warning targets: + +* ``datafog.detect`` -> ``datafog.scan`` +* ``datafog.process`` -> ``datafog.redact`` + +Deferred warning targets: + +* ``detect_pii`` -> ``scan`` +* ``anonymize_text`` -> ``redact`` +* ``scan_text`` -> ``scan`` +* Old CLI commands once v5-style CLI aliases exist. + +Non-Goals +--------- + +v4.4.0 should not attempt to complete v5. + +Out of scope: + +* Full v5 policy engine. +* Reversible token sessions. +* Streaming guardrails. +* OCR overhaul. +* Spark overhaul. +* Modern packaging migration to ``pyproject.toml``. +* Broad dependency cleanup beyond what Python 3.13 requires. + +Success Criteria +---------------- + +v4.4.0 is ready when: + +* Python 3.13 core/CLI CI is green. +* Package metadata advertises Python 3.13 support. +* v5-style API docs are visible. +* Deprecation warnings are targeted and tested. +* Release notes frame the release as a compatibility win and migration runway. diff --git a/docs/v5-compatibility-matrix.rst b/docs/v5-compatibility-matrix.rst new file mode 100644 index 00000000..41c9e339 --- /dev/null +++ b/docs/v5-compatibility-matrix.rst @@ -0,0 +1,158 @@ +======================= +v5 Compatibility Matrix +======================= + +Status +------ + +Scope lock draft for the v4 to v5 migration contract. + +Migration Stance +---------------- + +v5 should be adoption-first without being hostile to existing users. +The default posture is: + +* Make the new API obvious and stable. +* Keep existing public APIs working through compatibility shims where + practical. +* Warn only when a user is on a path that should move to the v5 API. +* Break behavior only when required for trust, privacy, or install + predictability. + +Compatibility Matrix +-------------------- + +.. list-table:: + :header-rows: 1 + :widths: 26 24 20 30 + + * - Surface + - v4 State + - v5 Status + - v5 Direction + * - ``datafog.scan`` + - v4.4 bridge preview top-level API. + - Stable public API. + - Keep as the obvious scan entrypoint returning typed scan results. + * - ``datafog.redact`` + - v4.4 bridge preview top-level API. + - Stable public API. + - Expand user-facing presets and policy support while preserving typed + redaction metadata. + * - ``datafog.protect`` + - v4.4 bridge preview alias around ``create_guardrail``. + - Stable public API. + - Add decorator/context API for sync, async, and streaming workflows. + * - ``datafog.restore`` + - No primary public API. + - Stable public API. + - Restore text from explicit reversible token sessions only. + * - ``datafog.sanitize`` + - Top-level LLM helper. + - Supported shim. + - Keep as alias-like convenience around ``redact``. + * - ``datafog.scan_prompt`` + - Top-level LLM helper. + - Supported shim. + - Keep for compatibility; docs should prefer ``scan`` or ``protect``. + * - ``datafog.filter_output`` + - Top-level LLM helper. + - Supported shim. + - Keep for compatibility; docs should prefer ``redact`` or ``protect``. + * - ``datafog.create_guardrail`` + - Top-level guardrail factory. + - Supported shim. + - Keep, but docs should make ``protect`` the first path. + * - ``datafog.detect`` + - Top-level convenience detection API. + - Deprecated shim. + - Delegate to ``scan`` and warn with replacement guidance. + * - ``datafog.process`` + - Top-level detect/process helper. + - Deprecated shim. + - Delegate to ``scan`` / ``redact`` and warn with replacement guidance. + * - ``detect_pii`` + - Core helper returning dicts. + - Deprecated shim. + - Keep temporarily for v4 users; prefer ``scan`` typed results. + * - ``anonymize_text`` + - Core anonymization helper. + - Deprecated shim. + - Keep temporarily; prefer ``redact`` with a policy or preset. + * - ``scan_text`` + - Boolean or dict helper. + - Deprecated shim. + - Keep temporarily; prefer ``scan``. + * - ``get_supported_entities`` + - Returns core regex entity names. + - Supported utility. + - Keep, but align with recognizer registry and locale packs. + * - ``DataFog`` + - Legacy class-based entrypoint. + - Compatibility shim. + - Keep through v5 with warnings for new construction patterns. + * - ``TextService`` + - Service class with engine selection and legacy return shapes. + - Compatibility shim. + - Keep for existing users; new docs should use top-level API. + * - ``TextPIIAnnotator`` + - Lightweight wrapper. + - Deprecated shim. + - Keep only if needed for compatibility; prefer ``scan``. + * - ``RegexAnnotator`` + - Exposed implementation detail. + - Advanced API. + - Keep available for advanced users, but docs should prefer registry. + * - ``datafog.engine`` + - Internal boundary introduced before v5. + - Internal implementation. + - Keep stable enough for wrappers, but public imports should use top-level + APIs or public model paths. + * - CLI ``scan-text``, ``redact-text``, ``replace-text``, ``hash-text`` + - Current command set. + - Deprecated command shims. + - Add v5 ``scan``, ``redact``, and ``audit`` commands; keep old commands + with warnings. + * - CLI ``scan-image`` + - OCR-oriented command. + - Optional legacy command. + - Keep under OCR extra; do not make it a v5.0 adoption path. + * - OCR services and processors + - Optional extras with heavyweight dependencies. + - Deferred for v5.1+ overhaul. + - Keep install hints clear; no runtime package installs. + * - Spark services and UDFs + - Optional distributed path. + - Deferred for v5.1+ overhaul. + - Keep compatibility where practical; no runtime package installs. + * - Telemetry + - Default-on opt-out telemetry. + - Trust-critical behavior change. + - Make opt-in or no-network-by-default. + * - ``*_lean`` and ``*_original`` modules + - Parallel historical implementations. + - Remove or make private after migration path. + - Consolidate around the v5 core and delete duplicate runtime surfaces. + +Warning Policy +-------------- + +Deprecation warnings should be specific and actionable. They should name the +old API, the v5 replacement, and the planned support window. For example: + +.. code-block:: text + + datafog.detect() is deprecated in v5. Use datafog.scan() instead. + This compatibility shim will be supported through the v5.x line. + +Breaking-Change Criteria +------------------------ + +A v5 breaking change is acceptable only when it materially improves one of: + +* Privacy or trust defaults. +* Core install reliability. +* No-network behavior. +* Public API clarity. +* Correctness of redaction metadata. diff --git a/docs/v5-cut-line.rst b/docs/v5-cut-line.rst new file mode 100644 index 00000000..71a66a5a --- /dev/null +++ b/docs/v5-cut-line.rst @@ -0,0 +1,142 @@ +================= +v5.0 Release Line +================= + +Status +------ + +Scope lock draft for what belongs in v5.0.0 versus v5.1+. + +v5.0 Release Promise +-------------------- + +v5.0.0 should ship a sharp, adoption-focused SDK: + + The fastest, easiest offline PII firewall for AI apps, logs, and + datasets. + +The release is not judged by how much surface area it adds. It is judged by +whether a new user can trust it quickly, understand it quickly, and protect +real application data quickly. + +Must Ship in v5.0 +----------------- + +Core API +~~~~~~~~ + +* Stable top-level ``scan``, ``redact``, ``protect``, and ``restore`` APIs. +* Public typed result objects and entity models. +* Compatibility shims for important v4 APIs. +* API snapshot tests and executable quickstart smoke tests. + +Trust Defaults +~~~~~~~~~~~~~~ + +* Telemetry opt-in or no-network-by-default. +* No runtime package installation. +* No implicit model downloads. +* Safe redaction metadata by default. +* No-network import and core API tests. + +Policy and Redaction +~~~~~~~~~~~~~~~~~~~~ + +* ``RedactionPolicy`` with entity filters, thresholds, locale, mapping + behavior, and actions per entity type. +* Built-in presets for ``llm``, ``logs``, ``strict``, and ``dataset``. +* Reversible ``TokenSession`` only when explicitly requested. +* HMAC hashing and format-preserving masking strategies. + +LLM and App Workflows +~~~~~~~~~~~~~~~~~~~~~ + +* ``protect`` decorator for sync and async functions. +* Streaming output filtering. +* Guardrail session counters and safe audit metadata. +* Examples for LLM calls, FastAPI middleware, logging filters, LangChain-style + adapters, and LlamaIndex-style adapters. + +CLI and Dataset Workflows +~~~~~~~~~~~~~~~~~~~~~~~~~ + +* v5 CLI commands: ``scan``, ``redact``, and ``audit``. +* Support for direct text, stdin, files, directories, CSV, and JSONL. +* Machine-readable ``--json`` and ``--jsonl`` output. +* CI-friendly ``--fail-on-detect`` behavior and exit codes. +* Dataset audit summaries that avoid raw PII by default. + +Detection Quality +~~~~~~~~~~~~~~~~~ + +* Custom recognizer registry with regex and validator hooks. +* Common app/log recognizers for secrets, tokens, API keys, IPv6, IBAN, + crypto wallets, and account identifiers. +* ``us`` and ``global`` locale packs. +* Deterministic overlap resolution and confidence scoring. +* Expanded corpus and enforced accuracy thresholds in CI. + +Packaging and Release +~~~~~~~~~~~~~~~~~~~~~ + +* ``pyproject.toml`` as primary packaging metadata. +* ``py.typed`` marker. +* Clean runtime dependencies and optional extras. +* CI gates for install profiles, import time, wheel size, no-network behavior, + coverage, accuracy, and benchmark regressions. +* v4 to v5 migration guide and launch assets. + +Defer to v5.1+ +-------------- + +OCR Overhaul +~~~~~~~~~~~~ + +OCR is useful, but it is not the adoption wedge for v5.0. Keep current OCR +compatibility where practical and move OCR quality, preprocessing, and +processor architecture to v5.1+. + +Spark and Distributed Processing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Spark support should not block the first v5 release. Keep install errors and +compatibility sane, then revisit distributed workflows after the core API and +CLI are stable. + +Cloud Integrations +~~~~~~~~~~~~~~~~~~ + +AWS, GCP, Azure, hosted APIs, and cloud DLP bridges are deferred. v5.0 should +win first as an offline local SDK. + +Enterprise Analytics +~~~~~~~~~~~~~~~~~~~~ + +Dashboards, organization-wide reporting, and advanced analytics are deferred. +The v5.0 audit surface should remain CLI/API friendly and privacy-safe. + +Broad Multilingual Expansion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +v5.0 may introduce locale structure, but broad multilingual tuning should wait +until the v5 core is stable and measured. + +Large Presidio-Style Framework Expansion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +DataFog should not try to clone Presidio in v5.0. Custom recognizers are in +scope, but a large enterprise recognizer framework is not. + +Scope Creep Test +---------------- + +A proposed v5.0 item should pass at least one of these tests: + +* Does it make first use faster or clearer? +* Does it improve trust for a PII SDK? +* Does it protect LLM, log, CLI, or dataset workflows? +* Does it make the core install smaller, safer, or more predictable? +* Does it reduce migration risk for existing users? + +If the answer is no, the item probably belongs in v5.1+. + diff --git a/docs/v5-product-brief.rst b/docs/v5-product-brief.rst new file mode 100644 index 00000000..84fd51c7 --- /dev/null +++ b/docs/v5-product-brief.rst @@ -0,0 +1,114 @@ +================ +v5 Product Brief +================ + +Status +------ + +Scope lock draft for DataFog Python v5.0.0. + +Linear project: `DataFog Python v5.0.0: Offline PII Firewall for AI Apps `_ + +Product Thesis +-------------- + +DataFog v5 should be the fastest, easiest offline PII firewall for AI +apps, logs, and datasets. + +The release should optimize for adoption over breadth. The first user +experience should be a small install, a clear one-liner, and strong +privacy defaults: + +.. code-block:: python + + import datafog + + result = datafog.redact("Email john@example.com", preset="llm") + safe_text = result.redacted_text + +In the v4.4 bridge release, ``scan``, ``redact``, and ``protect`` should default +to the lightweight regex engine so the core install remains quiet. Higher-recall +NLP paths can stay explicit through ``engine="smart"`` or optional extras. + +Primary Users +------------- + +v5 is built first for developers who need to keep PII out of: + +* LLM prompts and model outputs. +* Agent tool calls and MCP-style tool results. +* Application logs, traces, analytics events, and support dumps. +* JSONL, CSV, and text datasets used for evals or fine-tuning. +* CI checks that should fail when accidental PII is present. + +Adoption Principles +------------------- + +* **Time to first value under 60 seconds.** A new user should be able to + install DataFog and redact text without reading architecture docs. +* **No network surprises.** A core install should not send telemetry, + download models, install packages, or open network connections by + default. +* **One obvious API path.** The primary path is ``scan``, ``redact``, + ``protect``, and ``restore``. +* **Safe metadata by default.** Redaction results should not expose + original PII mappings unless the user explicitly asks for a reversible + session. +* **Workflow-first docs.** The first examples should be LLM protection, + logging filters, FastAPI middleware, CLI JSONL redaction, and custom + recognizers. +* **Honest positioning.** DataFog should not try to out-Presidio + Presidio. It should win on lightweight offline protection for AI apps, + logs, and datasets. + +v5.0 Must-Haves +--------------- + +* Stable top-level APIs: ``datafog.scan``, ``datafog.redact``, + ``datafog.protect``, and ``datafog.restore``. +* Public typed result objects for scans, entities, redaction results, + and reversible sessions. +* Privacy-safe defaults: telemetry opt-in or no-network-by-default, + no runtime package installation, no implicit model downloads. +* Policy-based redaction with presets for ``llm``, ``logs``, + ``strict``, and ``dataset``. +* Reversible token sessions for workflows that need explicit + restore support. +* LLM guardrails for sync, async, and streaming responses. +* CLI support for stdin, files, directories, CSV, JSONL, machine-readable + output, and CI-friendly exit codes. +* Custom recognizer registry for domain-specific PII and secrets. +* Expanded corpus tests for app, log, and secret-like data. +* Modern packaging, typed package marker, install-profile tests, and + release gates for accuracy, coverage, import time, wheel size, and + no-network behavior. + +Non-Goals for v5.0 +------------------ + +The following work is intentionally outside the v5.0 critical path: + +* OCR overhaul. +* Spark overhaul. +* Cloud DLP integrations. +* Enterprise dashboards or analytics products. +* Broad multilingual model tuning. +* A large Presidio-style recognizer framework. +* Hosted service features. + +Success Metrics +--------------- + +* A clean core install can redact text in under 60 seconds from a fresh + environment. +* ``pip install datafog`` stays lightweight and does not require spaCy, + PyTorch, OCR, Spark, or model downloads. +* ``import datafog`` and core ``scan`` / ``redact`` calls make no network + requests by default. +* The README quickstart uses the v5 API first and does not require legacy + classes. +* At least five adoption workflows are documented and copy-pasteable: + LLM prompts/outputs, streaming outputs, logging, FastAPI, and JSONL CLI. +* Accuracy and regression checks run in CI and produce readable artifacts. +* Existing v4 users have a clear migration path with compatibility shims + and warnings instead of surprising breakage. diff --git a/setup.py b/setup.py index 97b5c909..75284180 100644 --- a/setup.py +++ b/setup.py @@ -95,7 +95,7 @@ packages=find_packages(exclude=["tests", "tests.*"]), install_requires=core_deps, extras_require=extras_require, - python_requires=">=3.10,<3.13", + python_requires=">=3.10,<3.14", entry_points={ "console_scripts": [ "datafog=datafog.client:app [cli]", # Requires cli extra @@ -109,6 +109,7 @@ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Text Processing", "Topic :: Security", diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py index bd20e21f..9c69e178 100644 --- a/tests/test_telemetry.py +++ b/tests/test_telemetry.py @@ -356,7 +356,8 @@ class TestIntegration: def test_detect_triggers_telemetry(self, mock_urlopen): from datafog import detect - detect("Contact john@example.com") + with pytest.warns(FutureWarning, match=r"Use datafog\.scan\(\) instead"): + detect("Contact john@example.com") time.sleep(0.3) events = [] @@ -369,7 +370,11 @@ def test_detect_triggers_telemetry(self, mock_urlopen): def test_process_triggers_telemetry(self, mock_urlopen): from datafog import process - process("Contact john@example.com", anonymize=True) + with pytest.warns( + FutureWarning, + match=r"datafog\.scan\(\) or datafog\.redact\(\)", + ): + process("Contact john@example.com", anonymize=True) time.sleep(0.3) events = [] diff --git a/tests/test_v44_bridge_api.py b/tests/test_v44_bridge_api.py new file mode 100644 index 00000000..a26099a9 --- /dev/null +++ b/tests/test_v44_bridge_api.py @@ -0,0 +1,82 @@ +"""Tests for v4.4 bridge APIs that preview the v5 surface.""" + +from __future__ import annotations + +import importlib +import warnings + +import pytest + +import datafog + + +def test_top_level_scan_is_available_without_optional_extras() -> None: + with warnings.catch_warnings(record=True) as captured: + warnings.simplefilter("always") + result = datafog.scan("Email jane@example.com") + + assert result.engine_used == "regex" + assert any(entity.type == "EMAIL" for entity in result.entities) + assert not captured + + +def test_top_level_redact_scans_by_default() -> None: + with warnings.catch_warnings(record=True) as captured: + warnings.simplefilter("always") + result = datafog.redact("Email jane@example.com") + + assert result.redacted_text != "Email jane@example.com" + assert "[EMAIL_1]" in result.redacted_text + assert result.mapping + assert not captured + + +def test_top_level_redact_accepts_precomputed_entities() -> None: + text = "Email jane@example.com" + scan_result = datafog.scan(text, engine="regex") + + result = datafog.redact(text, entities=scan_result.entities, strategy="mask") + + assert "jane@example.com" not in result.redacted_text + assert "*" in result.redacted_text + + +def test_top_level_redact_supports_preview_presets() -> None: + result = datafog.redact("Email jane@example.com", preset="llm") + + assert "[EMAIL_1]" in result.redacted_text + + +def test_top_level_protect_returns_guardrail() -> None: + guardrail = datafog.protect(on_detect="redact") + + with warnings.catch_warnings(record=True) as captured: + warnings.simplefilter("always") + filtered = guardrail.filter("Email jane@example.com") + + assert filtered.redacted_text != "Email jane@example.com" + assert not captured + + +def test_legacy_detect_warns_with_replacement() -> None: + with pytest.warns(FutureWarning, match=r"Use datafog\.scan\(\) instead"): + result = datafog.detect("Email jane@example.com") + + assert result + + +def test_legacy_process_warns_with_replacement() -> None: + with pytest.warns(FutureWarning, match=r"datafog\.scan\(\) or datafog\.redact\(\)"): + result = datafog.process("Email jane@example.com", anonymize=True) + + assert result["anonymized"] != result["original"] + + +def test_import_does_not_emit_migration_warnings() -> None: + with warnings.catch_warnings(record=True) as captured: + warnings.simplefilter("always") + importlib.reload(datafog) + + assert not [ + warning for warning in captured if issubclass(warning.category, FutureWarning) + ] From 4f22b6a12e6b80431f32514ddaa6e6304e9594f5 Mon Sep 17 00:00:00 2001 From: Sid Mohan <61345237+sidmohan0@users.noreply.github.com> Date: Sat, 25 Apr 2026 20:12:55 -0700 Subject: [PATCH 25/27] ci: allow prerelease base override (#131) --- .github/workflows/release.yml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f99b874c..98034d7e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -27,7 +27,7 @@ on: default: false type: boolean version_override: - description: "Override version (e.g. 4.4.0) — stable only" + description: "Override stable version or prerelease base (e.g. 4.4.0)" required: false type: string @@ -214,6 +214,13 @@ jobs: # Strip any pre-release suffix to get base version BASE=$(echo "$CURRENT" | sed -E 's/(a|b)[0-9]+([.][0-9A-Za-z]+)?$//') + if [ -n "${{ inputs.version_override }}" ]; then + BASE="${{ inputs.version_override }}" + if echo "$BASE" | grep -Eq '(a|b)[0-9]+([.][0-9A-Za-z]+)?$'; then + echo "version_override must be a stable base version like 4.4.0, not a prerelease" + exit 1 + fi + fi echo "Base version: $BASE" if [ "$TYPE" = "alpha" ]; then @@ -231,12 +238,7 @@ jobs: VERSION="${BASE}b${BETA_NUM}" else - # Stable: use override or base version - if [ -n "${{ inputs.version_override }}" ]; then - VERSION="${{ inputs.version_override }}" - else - VERSION="$BASE" - fi + VERSION="$BASE" fi echo "version=$VERSION" >> $GITHUB_OUTPUT From 6845ef7f38987508ccdb715d1e4db65f4fb3ca4a Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Sun, 26 Apr 2026 03:31:05 +0000 Subject: [PATCH 26/27] chore: bump version to 4.4.0b1 [skip ci] --- datafog/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafog/__about__.py b/datafog/__about__.py index bc4f11a2..e8957a0b 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.3.0b11" +__version__ = "4.4.0b1" From 8ff0618bda62f3d497484aaf1c80214336fa9dca Mon Sep 17 00:00:00 2001 From: Sid Mohan <61345237+sidmohan0@users.noreply.github.com> Date: Sun, 26 Apr 2026 14:40:21 -0700 Subject: [PATCH 27/27] Make telemetry opt-in for v4.4 (#132) --- README.md | 14 ++++- datafog/telemetry.py | 20 +++++-- docs/v5-compatibility-matrix.rst | 4 +- scripts/generate_changelog.py | 37 +++++++++++++ tests/test_telemetry.py | 95 +++++++++++++++++++++++--------- 5 files changed, 135 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 794defcb..e3a211b3 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,10 @@ pip install datafog[nlp-advanced] pip install datafog[all] ``` +Python 3.13 support is certified for the core SDK and CLI. Optional extras such +as `nlp`, `nlp-advanced`, `ocr`, `distributed`, and `all` are available but not +yet certified on Python 3.13. + ## Quick Start ```python @@ -132,9 +136,15 @@ datafog hash-text "john@example.com" ## Telemetry -DataFog includes anonymous telemetry by default. +DataFog telemetry is disabled by default. + +To opt in: + +```bash +export DATAFOG_TELEMETRY=1 +``` -To opt out: +To force telemetry off: ```bash export DATAFOG_NO_TELEMETRY=1 diff --git a/datafog/telemetry.py b/datafog/telemetry.py index fb7e3137..6b3885a3 100644 --- a/datafog/telemetry.py +++ b/datafog/telemetry.py @@ -1,10 +1,13 @@ """ -Anonymous, opt-out usage telemetry for DataFog. +Anonymous, opt-in usage telemetry for DataFog. Collects anonymous usage data to help the DataFog team understand which engines, functions, and features are actually used. No text content is ever sent. -Opt out by setting either environment variable: +Telemetry is disabled by default. Opt in by setting: + DATAFOG_TELEMETRY=1 + +Force telemetry off by setting either environment variable: DATAFOG_NO_TELEMETRY=1 DO_NOT_TRACK=1 """ @@ -29,13 +32,18 @@ _scope = threading.local() +def _env_truthy(name: str) -> bool: + """Return True when an environment variable explicitly opts in/out.""" + return os.environ.get(name, "").strip().lower() in {"1", "true", "yes", "on"} + + def _is_telemetry_enabled() -> bool: - """Check if telemetry is enabled (opt-out via env vars).""" - if os.environ.get("DATAFOG_NO_TELEMETRY", "").strip() == "1": + """Check if telemetry is enabled (opt-in, with opt-out overrides).""" + if _env_truthy("DATAFOG_NO_TELEMETRY"): return False - if os.environ.get("DO_NOT_TRACK", "").strip() == "1": + if _env_truthy("DO_NOT_TRACK"): return False - return True + return _env_truthy("DATAFOG_TELEMETRY") def _get_anonymous_id() -> str: diff --git a/docs/v5-compatibility-matrix.rst b/docs/v5-compatibility-matrix.rst index 41c9e339..b95483e6 100644 --- a/docs/v5-compatibility-matrix.rst +++ b/docs/v5-compatibility-matrix.rst @@ -127,9 +127,9 @@ Compatibility Matrix - Deferred for v5.1+ overhaul. - Keep compatibility where practical; no runtime package installs. * - Telemetry - - Default-on opt-out telemetry. + - Opt-in telemetry. - Trust-critical behavior change. - - Make opt-in or no-network-by-default. + - Keep no-network-by-default. * - ``*_lean`` and ``*_original`` modules - Parallel historical implementations. - Remove or make private after migration path. diff --git a/scripts/generate_changelog.py b/scripts/generate_changelog.py index 293ac5b8..23babcf8 100755 --- a/scripts/generate_changelog.py +++ b/scripts/generate_changelog.py @@ -7,6 +7,16 @@ from datetime import datetime +def get_current_version(): + """Read the current package version from datafog/__about__.py.""" + try: + with open("datafog/__about__.py") as f: + match = re.search(r'^__version__ = "([^"]+)"', f.read(), re.M) + return match.group(1) if match else None + except OSError: + return None + + def get_latest_tag(): """Get the latest git tag.""" try: @@ -65,6 +75,7 @@ def generate_changelog(beta=False, alpha=False): """Generate changelog content.""" latest_tag = get_latest_tag() commits = get_commits_since_tag(latest_tag) + current_version = get_current_version() if not commits: return "No changes since last release." @@ -85,6 +96,32 @@ def generate_changelog(beta=False, alpha=False): changelog = "# What's New\n\n" changelog += f"*Released: {datetime.now().strftime('%Y-%m-%d')}*\n\n" + if not alpha and not beta and current_version == "4.4.0": + changelog += "## Python 3.13 Support Scope\n\n" + changelog += ( + "Python 3.13 support is certified for the core SDK and CLI: " + "`pip install datafog` and `pip install datafog[cli]`.\n\n" + ) + changelog += ( + "Optional extras including `nlp`, `nlp-advanced`, `ocr`, " + "`distributed`, and `all` are available but not yet certified on " + "Python 3.13. They will be validated separately based on user " + "demand.\n\n" + ) + changelog += "## v5 Migration Bridge\n\n" + changelog += ( + "This release adds the v5-preview top-level APIs `datafog.scan`, " + "`datafog.redact`, and `datafog.protect` while keeping the legacy " + "`datafog.detect` and `datafog.process` APIs working with targeted " + "migration warnings.\n\n" + ) + changelog += "## Privacy Defaults\n\n" + changelog += ( + "Telemetry is now opt-in. DataFog does not send telemetry unless " + "`DATAFOG_TELEMETRY=1` is explicitly set. `DATAFOG_NO_TELEMETRY=1` " + "and `DO_NOT_TRACK=1` continue to force telemetry off.\n\n" + ) + if categories["features"]: changelog += "## 🚀 New Features\n" for commit in categories["features"]: diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py index 9c69e178..3886a1dc 100644 --- a/tests/test_telemetry.py +++ b/tests/test_telemetry.py @@ -28,13 +28,20 @@ def _reset_telemetry_state(): def _clean_state(monkeypatch): """Ensure clean telemetry state for every test and disable network.""" _reset_telemetry_state() - # Default: telemetry enabled but network mocked + # Default: telemetry disabled unless a test opts in explicitly. + monkeypatch.delenv("DATAFOG_TELEMETRY", raising=False) monkeypatch.delenv("DATAFOG_NO_TELEMETRY", raising=False) monkeypatch.delenv("DO_NOT_TRACK", raising=False) yield _reset_telemetry_state() +@pytest.fixture +def enable_telemetry(monkeypatch): + """Opt telemetry in for tests that assert payload behavior.""" + monkeypatch.setenv("DATAFOG_TELEMETRY", "1") + + @pytest.fixture def mock_urlopen(): """Mock urllib.request.urlopen to capture payloads without network.""" @@ -51,26 +58,62 @@ class TestOptOut: def test_datafog_no_telemetry_disables(self, monkeypatch): from datafog.telemetry import _is_telemetry_enabled + monkeypatch.setenv("DATAFOG_TELEMETRY", "1") monkeypatch.setenv("DATAFOG_NO_TELEMETRY", "1") assert _is_telemetry_enabled() is False def test_do_not_track_disables(self, monkeypatch): from datafog.telemetry import _is_telemetry_enabled + monkeypatch.setenv("DATAFOG_TELEMETRY", "1") monkeypatch.setenv("DO_NOT_TRACK", "1") assert _is_telemetry_enabled() is False - def test_enabled_by_default(self): + def test_disabled_by_default(self): + from datafog.telemetry import _is_telemetry_enabled + + assert _is_telemetry_enabled() is False + + def test_datafog_telemetry_enables(self, monkeypatch): from datafog.telemetry import _is_telemetry_enabled + monkeypatch.setenv("DATAFOG_TELEMETRY", "1") assert _is_telemetry_enabled() is True - def test_non_one_value_does_not_disable(self, monkeypatch): + @pytest.mark.parametrize("value", ["true", "yes", "on"]) + def test_truthy_values_enable(self, monkeypatch, value): from datafog.telemetry import _is_telemetry_enabled - monkeypatch.setenv("DATAFOG_NO_TELEMETRY", "true") + monkeypatch.setenv("DATAFOG_TELEMETRY", value) assert _is_telemetry_enabled() is True + def test_falsey_value_does_not_enable(self, monkeypatch): + from datafog.telemetry import _is_telemetry_enabled + + monkeypatch.setenv("DATAFOG_TELEMETRY", "0") + assert _is_telemetry_enabled() is False + + def test_truthy_opt_out_overrides_opt_in(self, monkeypatch): + from datafog.telemetry import _is_telemetry_enabled + + monkeypatch.setenv("DATAFOG_TELEMETRY", "1") + monkeypatch.setenv("DATAFOG_NO_TELEMETRY", "true") + assert _is_telemetry_enabled() is False + + def test_send_event_noop_by_default(self, mock_urlopen): + from datafog.telemetry import _send_event + + _send_event("test_event", {"key": "value"}) + time.sleep(0.1) + mock_urlopen.assert_not_called() + + def test_track_function_call_noop_by_default(self, mock_urlopen): + from datafog.telemetry import track_function_call + + track_function_call("test_fn", "test_module") + time.sleep(0.1) + mock_urlopen.assert_not_called() + def test_send_event_noop_when_disabled(self, monkeypatch, mock_urlopen): from datafog.telemetry import _send_event @@ -166,7 +209,7 @@ def test_anonymous_id_persisted(self, tmp_path, monkeypatch): id2 = tel._get_anonymous_id() assert id1 == id2 - def test_payload_never_contains_text_content(self, mock_urlopen): + def test_payload_never_contains_text_content(self, mock_urlopen, enable_telemetry): """Verify that tracked events don't leak text content.""" from datafog.telemetry import track_function_call @@ -197,7 +240,7 @@ def test_payload_never_contains_text_content(self, mock_urlopen): class TestNonBlocking: - def test_send_event_returns_immediately(self, mock_urlopen): + def test_send_event_returns_immediately(self, mock_urlopen, enable_telemetry): from datafog.telemetry import _send_event # Make urlopen block @@ -210,7 +253,9 @@ def test_send_event_returns_immediately(self, mock_urlopen): # Should return in <100ms even though urlopen blocks for 10s assert elapsed < 0.1 - def test_track_function_call_returns_immediately(self, mock_urlopen): + def test_track_function_call_returns_immediately( + self, mock_urlopen, enable_telemetry + ): from datafog.telemetry import track_function_call mock_urlopen.side_effect = lambda *a, **k: time.sleep(10) @@ -221,7 +266,7 @@ def test_track_function_call_returns_immediately(self, mock_urlopen): assert elapsed < 0.1 - def test_network_failure_is_silent(self, mock_urlopen): + def test_network_failure_is_silent(self, mock_urlopen, enable_telemetry): from datafog.telemetry import track_function_call mock_urlopen.side_effect = Exception("Network down") @@ -229,7 +274,7 @@ def test_network_failure_is_silent(self, mock_urlopen): track_function_call("fn", "mod") time.sleep(0.3) - def test_urlopen_timeout_is_bounded(self, mock_urlopen): + def test_urlopen_timeout_is_bounded(self, mock_urlopen, enable_telemetry): """Verify we pass a timeout to urlopen.""" from datafog.telemetry import _send_event @@ -248,7 +293,7 @@ def test_urlopen_timeout_is_bounded(self, mock_urlopen): class TestPayloadCorrectness: - def test_init_event_sent_once(self, mock_urlopen): + def test_init_event_sent_once(self, mock_urlopen, enable_telemetry): from datafog.telemetry import _ensure_initialized _ensure_initialized() @@ -259,7 +304,7 @@ def test_init_event_sent_once(self, mock_urlopen): # Should only create one thread/call for init assert mock_urlopen.call_count <= 1 - def test_init_event_has_required_properties(self, mock_urlopen): + def test_init_event_has_required_properties(self, mock_urlopen, enable_telemetry): from datafog.telemetry import _ensure_initialized _ensure_initialized() @@ -281,7 +326,7 @@ def test_init_event_has_required_properties(self, mock_urlopen): assert "is_ci" in props assert "distinct_id" in props - def test_function_call_event_properties(self, mock_urlopen): + def test_function_call_event_properties(self, mock_urlopen, enable_telemetry): from datafog.telemetry import track_function_call track_function_call( @@ -308,7 +353,7 @@ def test_function_call_event_properties(self, mock_urlopen): found = True assert found, "datafog_function_called event not found" - def test_error_event_properties(self, mock_urlopen): + def test_error_event_properties(self, mock_urlopen, enable_telemetry): from datafog.telemetry import track_error track_error("detect", "ValueError", engine="regex") @@ -326,7 +371,7 @@ def test_error_event_properties(self, mock_urlopen): found = True assert found, "datafog_error event not found" - def test_posthog_endpoint_url(self, mock_urlopen): + def test_posthog_endpoint_url(self, mock_urlopen, enable_telemetry): from datafog.telemetry import _send_event _send_event("test_event", {"k": "v"}) @@ -336,7 +381,7 @@ def test_posthog_endpoint_url(self, mock_urlopen): req = mock_urlopen.call_args[0][0] assert req.full_url == "https://us.i.posthog.com/capture/" - def test_content_type_is_json(self, mock_urlopen): + def test_content_type_is_json(self, mock_urlopen, enable_telemetry): from datafog.telemetry import _send_event _send_event("test_event", {"k": "v"}) @@ -353,7 +398,7 @@ def test_content_type_is_json(self, mock_urlopen): class TestIntegration: - def test_detect_triggers_telemetry(self, mock_urlopen): + def test_detect_triggers_telemetry(self, mock_urlopen, enable_telemetry): from datafog import detect with pytest.warns(FutureWarning, match=r"Use datafog\.scan\(\) instead"): @@ -367,7 +412,7 @@ def test_detect_triggers_telemetry(self, mock_urlopen): events.append(body["event"]) assert "datafog_function_called" in events - def test_process_triggers_telemetry(self, mock_urlopen): + def test_process_triggers_telemetry(self, mock_urlopen, enable_telemetry): from datafog import process with pytest.warns( @@ -384,7 +429,7 @@ def test_process_triggers_telemetry(self, mock_urlopen): events.append(body["event"]) assert "datafog_function_called" in events - def test_datafog_class_triggers_telemetry(self, mock_urlopen): + def test_datafog_class_triggers_telemetry(self, mock_urlopen, enable_telemetry): from datafog.main import DataFog df = DataFog() @@ -398,7 +443,7 @@ def test_datafog_class_triggers_telemetry(self, mock_urlopen): events.append(body["event"]) assert "datafog_function_called" in events - def test_text_service_triggers_telemetry(self, mock_urlopen): + def test_text_service_triggers_telemetry(self, mock_urlopen, enable_telemetry): try: from datafog.services.text_service import TextService except ImportError: @@ -415,7 +460,7 @@ def test_text_service_triggers_telemetry(self, mock_urlopen): events.append(body["event"]) assert "datafog_function_called" in events - def test_core_detect_pii_triggers_telemetry(self, mock_urlopen): + def test_core_detect_pii_triggers_telemetry(self, mock_urlopen, enable_telemetry): try: from datafog.core import detect_pii @@ -440,7 +485,7 @@ def test_core_detect_pii_triggers_telemetry(self, mock_urlopen): class TestEdgeCases: - def test_empty_text(self, mock_urlopen): + def test_empty_text(self, mock_urlopen, enable_telemetry): from datafog.telemetry import _get_text_length_bucket, track_function_call track_function_call( @@ -456,7 +501,7 @@ def test_large_text_bucket(self, mock_urlopen): assert _get_text_length_bucket(10_000_000) == "100k+" - def test_concurrent_init(self, mock_urlopen): + def test_concurrent_init(self, mock_urlopen, enable_telemetry): """Multiple threads calling _ensure_initialized should only init once.""" from datafog.telemetry import _ensure_initialized @@ -492,7 +537,7 @@ def fake_home(): anon_id = tel._get_anonymous_id() assert len(anon_id) == 64 - def test_dedup_nested_calls(self, mock_urlopen): + def test_dedup_nested_calls(self, mock_urlopen, enable_telemetry): """Nested track_function_call should only record the outer call.""" from datafog.telemetry import track_function_call @@ -531,7 +576,7 @@ def test_services_init_does_not_require_aiohttp(self): ts = TextService(engine="regex") assert ts.engine == "regex" - def test_track_error_sent_on_exception(self, mock_urlopen): + def test_track_error_sent_on_exception(self, mock_urlopen, enable_telemetry): """track_error should fire a datafog_error event.""" from datafog.telemetry import track_error @@ -550,7 +595,7 @@ def test_track_error_sent_on_exception(self, mock_urlopen): assert error_events[0]["error_type"] == "ValueError" assert error_events[0]["engine"] == "regex" - def test_pipeline_error_triggers_track_error(self, mock_urlopen): + def test_pipeline_error_triggers_track_error(self, mock_urlopen, enable_telemetry): """DataFog.run_text_pipeline_sync should fire datafog_error on failure.""" from datafog.main import DataFog