Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions MIGRATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -802,3 +802,68 @@ Unlike `AsyncPipeline.run()`, `Pipeline.run()` does not raise when called inside
- `Pipeline.run` runs components sequentially and does not accept `concurrency_limit`; only `run_async` / `run_async_generator` run components concurrently.
- Only `run` supports breakpoints (`break_point` / `pipeline_snapshot`).
- Both run paths are traced under a single `haystack.pipeline.run` operation name, distinguished by a `haystack.pipeline.execution_mode` tag (`sync` or `async`); previously asynchronous runs used `haystack.async_pipeline.run`.

### Auto-generated `Document.id` changes for documents with non-empty `meta`

**What changed:** The hash used to auto-generate `Document.id` is now computed from a canonical (key-sorted) JSON serialization of `meta` instead of the dict's `repr`. Documents with empty `meta` keep the same IDs as before, but documents with non-empty `meta` get different IDs in v3.0. Non-JSON-serializable `meta` values (e.g. `datetime` or custom classes) are now serialized via `str(...)` rather than `repr(...)`, which also changes their IDs. See [#11446](https://github.com/deepset-ai/haystack/pull/11446).

**Why:** Previously the hash reflected the insertion order of keys in `meta`, so two documents with the same content and the same metadata could end up with different IDs depending on how the `meta` dict was constructed. This silently broke `DuplicatePolicy.SKIP` / `FAIL` and any cache or dedup table keyed on the document ID. Sorting the keys before hashing makes the ID order-independent.

**How to migrate:**

If you rely on auto-generated IDs to match documents already persisted in a `DocumentStore` written by Haystack v2.x, re-ingest the affected documents so the new IDs are used consistently, or pass the previous `id` explicitly when constructing the `Document`.

Before (v2.x):
```python
from haystack.dataclasses import Document

# ID was derived from meta's dict repr, so it depended on key insertion order:
# these two documents could end up with different IDs.
doc1 = Document(content="Berlin is the capital of Germany.", meta={"source": "wiki", "lang": "en"})
doc2 = Document(content="Berlin is the capital of Germany.", meta={"lang": "en", "source": "wiki"})
```

After (v3.0):
```python
from haystack.dataclasses import Document

# Same content + meta now always yields the same ID, regardless of key order,
# but that ID differs from the one v2.x produced for documents with non-empty meta.
doc1 = Document(content="Berlin is the capital of Germany.", meta={"source": "wiki", "lang": "en"})
doc2 = Document(content="Berlin is the capital of Germany.", meta={"lang": "en", "source": "wiki"})
assert doc1.id == doc2.id
```

It is possible to migrate an existing index without rerunning your indexing pipeline, for example to avoid recalculating embeddings. To do that, read stored documents, regenerate their IDs using Haystack 3.0, write the updated documents, and delete the documents stored under their old IDs.

```python
from dataclasses import replace

from haystack import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

# Example DocumentStore with IDs generated with Haystack 2.x
store = InMemoryDocumentStore()
store.write_documents(
[
Document(
id="b51c3ee6b892f52bf28af01f5d823a254e438356ec335a20133ad940ef7b8cd7",
content="Berlin is the capital of Germany.",
meta={"source": "wiki", "lang": "en"},
),
Document(
id="f022d8d89a99f89547215f8adcfed92f41518f2bb3e11d14e27987bd9d265ead",
content="Paris is the capital of France.",
meta={"source": "wiki", "lang": "en"},
),
],
policy=DuplicatePolicy.OVERWRITE,
)

# Example steps to recalculate IDs. Note that all documents are retrieved at once in this example; larger indices require pagination.
old_documents = store.filter_documents()
new_documents = [replace(doc, id="") for doc in old_documents]
store.write_documents(new_documents, policy=DuplicatePolicy.OVERWRITE)
new_ids = {doc.id for doc in new_documents}
store.delete_documents([doc.id for doc in old_documents if doc.id not in new_ids])
```
4 changes: 3 additions & 1 deletion haystack/dataclasses/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

import hashlib
import json
from dataclasses import asdict, dataclass, field, fields
from typing import Any

Expand Down Expand Up @@ -113,7 +114,8 @@ def _create_id(self) -> str:
dataframe = None # this allows the ID creation to remain unchanged even if the dataframe field has been removed
blob = self.blob.data if self.blob is not None else None
mime_type = self.blob.mime_type if self.blob is not None else None
meta = self.meta or {}
# Sort keys so meta order doesn't affect the ID. Keep "{}" for empty meta so existing IDs stay stable.
meta = json.dumps(self.meta, sort_keys=True, default=str) if self.meta else "{}"
embedding = self.embedding if self.embedding is not None else None
sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else ""
data = f"{text}{dataframe}{blob!r}{mime_type}{meta}{embedding}{sparse_embedding}"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
upgrade:
- |
The hash used to auto-generate ``Document.id`` is now computed from a
canonical (key-sorted) JSON serialization of ``meta``. Documents with
empty ``meta`` are unaffected, but all other documents will get different
IDs than they did before:

* documents with non-empty ``meta`` (the serialization changes from
``dict``'s repr to JSON);
* documents whose ``meta`` contains non-JSON-serializable values such as
``datetime`` or custom classes (these are now serialized via ``str(...)``
rather than ``repr(...)``, e.g. ``"2024-01-01 00:00:00"`` instead of
``"datetime.datetime(2024, 1, 1, 0, 0)"``).

If you rely on auto-generated IDs to match documents already persisted in a
``DocumentStore``, you will need to re-ingest the affected documents (or
pass the previous ``id`` explicitly when constructing the ``Document``).
fixes:
- |
``Document.id`` is now deterministic regardless of the insertion order of
keys in ``meta``. Previously the hash was built from ``dict``'s repr, which
reflects insertion order, so two documents with the same content and the
same ``meta`` could get different IDs depending on how the ``meta`` dict was
constructed. This silently broke ``DuplicatePolicy.SKIP`` / ``FAIL`` and
any cache or dedup table keyed on the document ID whenever upstream code
produced ``meta`` in different orders.
8 changes: 4 additions & 4 deletions test/core/pipeline/features/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -3097,7 +3097,7 @@ def run(self, query: str) -> dict[str, list[Document]]:
("rag_prompt", 1): {
"documents": [
Document(
id="969664d0cf76e52b0ffb719d00d3e5a6b1c90bb29e56f6107dfd87bf2f5388ed",
id="366a10745500c26f1177f434c74513daacaa7f9d2e09ba892cfcd48652eb80c1",
content="This is a document potentially answering the question.",
meta={"access_group": 1},
)
Expand Down Expand Up @@ -4376,7 +4376,7 @@ def pipeline_that_converts_files():
content="Some test content",
meta={
"file_type": "json",
"source_id": "0c6c5951d18da2935c7af3e24d417a9f94ca85403866dcfee1de93922504e1e5",
"source_id": "7eead7200d4ecead81a174a1da6512d8955f3a23acdc3f8431885d4793a63a74",
"page_number": 1,
"split_id": 0,
"split_idx_start": 0,
Expand All @@ -4386,7 +4386,7 @@ def pipeline_that_converts_files():
content="Text file content ",
meta={
"file_type": "txt",
"source_id": "41cb91740f6e64ab542122936ea746c238ae0a92fd29b698efabbe23d0ba4c42",
"source_id": "696d5c046b58b24bf806ff94f6b529fb3a08f068b6bf39e572683537736a0c27",
"page_number": 1,
"split_id": 0,
"split_idx_start": 0,
Expand All @@ -4396,7 +4396,7 @@ def pipeline_that_converts_files():
content="for testing this.",
meta={
"file_type": "txt",
"source_id": "41cb91740f6e64ab542122936ea746c238ae0a92fd29b698efabbe23d0ba4c42",
"source_id": "696d5c046b58b24bf806ff94f6b529fb3a08f068b6bf39e572683537736a0c27",
"page_number": 1,
"split_id": 1,
"split_idx_start": 18,
Expand Down
19 changes: 17 additions & 2 deletions test/dataclasses/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def test_init_with_parameters():
embedding=[0.1, 0.2, 0.3],
sparse_embedding=sparse_embedding,
)
assert doc.id == "1aa43af57c1dbc317241bf55d3067049f334d3b458d95dc72f71a7111f6c1a56"
assert doc.id == "c31efd4986b1f2424e5058482c6f668ccad2309043c2346524cd81d255e159fe"
assert doc.content == "test text"
assert doc.blob is not None
assert doc.blob.data == blob_data
Expand Down Expand Up @@ -95,7 +95,7 @@ def test_init_with_legacy_field():
embedding=[0.1, 0.2, 0.3],
meta={"date": "10-10-2023", "type": "article"},
)
assert doc.id == "a2c0321b34430cc675294611e55529fceb56140ca3202f1c59a43a8cecac1f43"
assert doc.id == "dcd4914f727544e89ce8082f6f2e298d244dd0803a4dc167f19d24e7d43b28ac"
assert doc.content == "test text"
assert doc.meta == {"date": "10-10-2023", "type": "article"}
assert doc.score == 0.812
Expand Down Expand Up @@ -123,6 +123,21 @@ def test_basic_equality_id():
assert doc1 != doc2


def test_id_is_independent_of_meta_key_order():
    """Documents whose meta differs only in top-level key order must share one ID."""
    ordered = Document(content="hello", meta={"a": 1, "b": 2})
    reordered = Document(content="hello", meta={"b": 2, "a": 1})

    # Dict equality ignores insertion order, so the metas compare equal;
    # the auto-generated IDs are expected to agree as well.
    assert ordered.meta == reordered.meta
    assert ordered.id == reordered.id


def test_id_is_independent_of_nested_meta_key_order():
    """Key order inside a nested meta dict must not influence the auto-generated ID."""
    inner_ordered = Document(content="hello", meta={"outer": {"a": 1, "b": 2}})
    inner_reordered = Document(content="hello", meta={"outer": {"b": 2, "a": 1}})

    # Only the insertion order of the nested dict differs between the two documents.
    assert inner_ordered.id == inner_reordered.id


def test_to_dict():
doc = Document()
assert doc.to_dict() == {
Expand Down
Loading