deepset-ai · julian-risch · Jun 17, 2026 · May 31, 2026 · May 31, 2026 · Jun 2, 2026
@@ -802,3 +802,68 @@ Unlike `AsyncPipeline.run()`, `Pipeline.run()` does not raise when called inside
 - `Pipeline.run` runs components sequentially and does not accept `concurrency_limit`; only `run_async` / `run_async_generator` run components concurrently.
 - Only `run` supports breakpoints (`break_point` / `pipeline_snapshot`).
 - Both run paths are traced under a single `haystack.pipeline.run` operation name, distinguished by a `haystack.pipeline.execution_mode` tag (`sync` or `async`); previously asynchronous runs used `haystack.async_pipeline.run`.
+
+### Auto-generated `Document.id` changes for documents with non-empty `meta`
+
+**What changed:** The hash used to auto-generate `Document.id` is now computed from a canonical (key-sorted) JSON serialization of `meta` instead of the dict's `repr`. Documents with empty `meta` keep the same IDs as before, but documents with non-empty `meta` get different IDs in v3.0. Non-JSON-serializable `meta` values (e.g. `datetime` or custom classes) are now serialized via `str(...)` rather than `repr(...)`, which also changes their IDs. See [#11446](https://github.com/deepset-ai/haystack/pull/11446).
+
+**Why:** Previously the hash reflected the insertion order of keys in `meta`, so two documents with the same content and the same metadata could end up with different IDs depending on how the `meta` dict was constructed. This silently broke `DuplicatePolicy.SKIP` / `FAIL` and any cache or dedup table keyed on the document ID. Sorting the keys before hashing makes the ID order-independent.
+
+**How to migrate:**
+
+If you rely on auto-generated IDs to match documents already persisted in a `DocumentStore` written by Haystack v2.x, re-ingest the affected documents so the new IDs are used consistently, or pass the previous `id` explicitly when constructing the `Document`.
+
+Before (v2.x):
+```python
+from haystack.dataclasses import Document
+
+# ID was derived from meta's dict repr, so it depended on key insertion order:
+# these two documents could end up with different IDs.
+doc1 = Document(content="Berlin is the capital of Germany.", meta={"source": "wiki", "lang": "en"})
+doc2 = Document(content="Berlin is the capital of Germany.", meta={"lang": "en", "source": "wiki"})
+```
+
+After (v3.0):
+```python
+from haystack.dataclasses import Document
+
+# Same content + meta now always yields the same ID, regardless of key order,
+# but that ID differs from the one v2.x produced for documents with non-empty meta.
+doc1 = Document(content="Berlin is the capital of Germany.", meta={"source": "wiki", "lang": "en"})
+doc2 = Document(content="Berlin is the capital of Germany.", meta={"lang": "en", "source": "wiki"})
+assert doc1.id == doc2.id
+```
+
+It is possible to migrate an existing index without rerunning your indexing pipeline, for example to avoid recalculating embeddings. To do that, read the stored documents, regenerate their IDs using Haystack 3.0, write the updated documents, and delete the documents stored under their old IDs.
+
+```python
+from dataclasses import replace
+
+from haystack import Document
+from haystack.document_stores.in_memory import InMemoryDocumentStore
+from haystack.document_stores.types import DuplicatePolicy
+
+# Example DocumentStore with IDs generated with Haystack 2.x
+store = InMemoryDocumentStore()
+store.write_documents(
+    [
+        Document(
+            id="b51c3ee6b892f52bf28af01f5d823a254e438356ec335a20133ad940ef7b8cd7",
+            content="Berlin is the capital of Germany.",
+            meta={"source": "wiki", "lang": "en"},
+        ),
+        Document(
+            id="f022d8d89a99f89547215f8adcfed92f41518f2bb3e11d14e27987bd9d265ead",
+            content="Paris is the capital of France.",
+            meta={"source": "wiki", "lang": "en"},
+        ),
+    ],
+    policy=DuplicatePolicy.OVERWRITE,
+)
+
+# Example steps to recalculate IDs. Note that all documents are retrieved at once in this example; larger indices require pagination.
+old_documents = store.filter_documents()
+new_documents = [replace(doc, id="") for doc in old_documents]
+store.write_documents(new_documents, policy=DuplicatePolicy.OVERWRITE)
+new_ids = {doc.id for doc in new_documents}
+store.delete_documents([doc.id for doc in old_documents if doc.id not in new_ids])
+```
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import hashlib
+import json
 from dataclasses import asdict, dataclass, field, fields
 from typing import Any
 
@@ -113,7 +114,8 @@ def _create_id(self) -> str:
         dataframe = None  # this allows the ID creation to remain unchanged even if the dataframe field has been removed
         blob = self.blob.data if self.blob is not None else None
         mime_type = self.blob.mime_type if self.blob is not None else None
-        meta = self.meta or {}
+        # Sort keys so meta order doesn't affect the ID. Keep "{}" for empty meta so existing IDs stay stable.
+        meta = json.dumps(self.meta, sort_keys=True, default=str) if self.meta else "{}"
         embedding = self.embedding if self.embedding is not None else None
         sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else ""
         data = f"{text}{dataframe}{blob!r}{mime_type}{meta}{embedding}{sparse_embedding}"

@@ -0,0 +1,27 @@
+---
+upgrade:
+  - |
+    The hash used to auto-generate ``Document.id`` is now computed from a
+    canonical (key-sorted) JSON serialization of ``meta``. Documents with
+    empty ``meta`` are unaffected, but most other documents will get different
+    IDs than they did before:
+
+    * documents with non-empty ``meta`` (the serialization changes from
+      ``dict``'s repr to JSON);
+    * documents whose ``meta`` contains non-JSON-serializable values such as
+      ``datetime`` or custom classes (these are now serialized via ``str(...)``
+      rather than ``repr(...)``, e.g. ``"2024-01-01 00:00:00"`` instead of
+      ``"datetime.datetime(2024, 1, 1, 0, 0)"``).
+
+    If you rely on auto-generated IDs to match documents already persisted in a
+    ``DocumentStore``, you will need to re-ingest the affected documents (or
+    pass the previous ``id`` explicitly when constructing the ``Document``).
+fixes:
+  - |
+    ``Document.id`` is now deterministic regardless of the insertion order of
+    keys in ``meta``. Previously the hash was built from ``dict``'s repr, which
+    reflects insertion order, so two documents with the same content and the
+    same ``meta`` could get different IDs depending on how the ``meta`` dict was
+    constructed. This silently broke ``DuplicatePolicy.SKIP`` / ``FAIL`` and
+    any cache or dedup table keyed on the document ID whenever upstream code
+    produced ``meta`` in different orders.
@@ -3097,7 +3097,7 @@ def run(self, query: str) -> dict[str, list[Document]]:
                     ("rag_prompt", 1): {
                         "documents": [
                             Document(
-                                id="969664d0cf76e52b0ffb719d00d3e5a6b1c90bb29e56f6107dfd87bf2f5388ed",
+                                id="366a10745500c26f1177f434c74513daacaa7f9d2e09ba892cfcd48652eb80c1",
                                 content="This is a document potentially answering the question.",
                                 meta={"access_group": 1},
                             )
@@ -4376,7 +4376,7 @@ def pipeline_that_converts_files():
             content="Some test content",
             meta={
                 "file_type": "json",
-                "source_id": "0c6c5951d18da2935c7af3e24d417a9f94ca85403866dcfee1de93922504e1e5",
+                "source_id": "7eead7200d4ecead81a174a1da6512d8955f3a23acdc3f8431885d4793a63a74",
                 "page_number": 1,
                 "split_id": 0,
                 "split_idx_start": 0,
@@ -4386,7 +4386,7 @@ def pipeline_that_converts_files():
             content="Text file content ",
             meta={
                 "file_type": "txt",
-                "source_id": "41cb91740f6e64ab542122936ea746c238ae0a92fd29b698efabbe23d0ba4c42",
+                "source_id": "696d5c046b58b24bf806ff94f6b529fb3a08f068b6bf39e572683537736a0c27",
                 "page_number": 1,
                 "split_id": 0,
                 "split_idx_start": 0,
@@ -4396,7 +4396,7 @@ def pipeline_that_converts_files():
             content="for testing this.",
             meta={
                 "file_type": "txt",
-                "source_id": "41cb91740f6e64ab542122936ea746c238ae0a92fd29b698efabbe23d0ba4c42",
+                "source_id": "696d5c046b58b24bf806ff94f6b529fb3a08f068b6bf39e572683537736a0c27",
                 "page_number": 1,
                 "split_id": 1,
                 "split_idx_start": 18,

@@ -52,7 +52,7 @@ def test_init_with_parameters():
         embedding=[0.1, 0.2, 0.3],
         sparse_embedding=sparse_embedding,
     )
-    assert doc.id == "1aa43af57c1dbc317241bf55d3067049f334d3b458d95dc72f71a7111f6c1a56"
+    assert doc.id == "c31efd4986b1f2424e5058482c6f668ccad2309043c2346524cd81d255e159fe"
     assert doc.content == "test text"
     assert doc.blob is not None
     assert doc.blob.data == blob_data
@@ -95,7 +95,7 @@ def test_init_with_legacy_field():
         embedding=[0.1, 0.2, 0.3],
         meta={"date": "10-10-2023", "type": "article"},
     )
-    assert doc.id == "a2c0321b34430cc675294611e55529fceb56140ca3202f1c59a43a8cecac1f43"
+    assert doc.id == "dcd4914f727544e89ce8082f6f2e298d244dd0803a4dc167f19d24e7d43b28ac"
     assert doc.content == "test text"
     assert doc.meta == {"date": "10-10-2023", "type": "article"}
     assert doc.score == 0.812
@@ -123,6 +123,21 @@ def test_basic_equality_id():
     assert doc1 != doc2
 
 
+def test_id_is_independent_of_meta_key_order():
+    doc1 = Document(content="hello", meta={"a": 1, "b": 2})
+    doc2 = Document(content="hello", meta={"b": 2, "a": 1})
+
+    assert doc1.meta == doc2.meta
+    assert doc1.id == doc2.id
+
+
+def test_id_is_independent_of_nested_meta_key_order():
+    doc1 = Document(content="hello", meta={"outer": {"a": 1, "b": 2}})
+    doc2 = Document(content="hello", meta={"outer": {"b": 2, "a": 1}})
+
+    assert doc1.id == doc2.id
+
+
 def test_to_dict():
     doc = Document()
     assert doc.to_dict() == {