From 987d02d9bb11eeb55c3099cbbf51c9dbe5e49225 Mon Sep 17 00:00:00 2001 From: Aarkin7 Date: Sun, 31 May 2026 23:28:25 +0530 Subject: [PATCH 01/11] fix: make Document.id deterministic regardless of meta key order The hash was built from dict's repr, which reflects insertion order, so two Documents with equal meta could get different IDs. Serialize meta with sort_keys=True before hashing. Empty-meta IDs are unchanged. --- haystack/dataclasses/document.py | 5 ++++- ...cross-meta-key-order-f0293d51712e82be.yaml | 20 +++++++++++++++++++ test/dataclasses/test_document.py | 19 ++++++++++++++++-- 3 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml diff --git a/haystack/dataclasses/document.py b/haystack/dataclasses/document.py index 6f6853d8e1..d1b6d09ebe 100644 --- a/haystack/dataclasses/document.py +++ b/haystack/dataclasses/document.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import hashlib +import json from dataclasses import asdict, dataclass, field, fields from typing import Any @@ -113,7 +114,9 @@ def _create_id(self) -> str: dataframe = None # this allows the ID creation to remain unchanged even if the dataframe field has been removed blob = self.blob.data if self.blob is not None else None mime_type = self.blob.mime_type if self.blob is not None else None - meta = self.meta or {} + # Sort keys so meta order doesn't affect the hash. Keep "{}" for empty meta + # so existing IDs stay stable. 
+ meta = json.dumps(self.meta, sort_keys=True, default=str) if self.meta else "{}" embedding = self.embedding if self.embedding is not None else None sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else "" data = f"{text}{dataframe}{blob!r}{mime_type}{meta}{embedding}{sparse_embedding}" diff --git a/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml b/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml new file mode 100644 index 0000000000..fa37d23489 --- /dev/null +++ b/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml @@ -0,0 +1,20 @@ +--- +upgrade: + - | + The hash used to auto-generate ``Document.id`` is now computed from a + canonical (key-sorted) serialization of ``meta``. Documents created with + non-empty ``meta`` will therefore get different IDs than they did before. + Documents with empty ``meta`` are unaffected. + + If you rely on auto-generated IDs to match documents already persisted in a + ``DocumentStore``, you will need to re-ingest the affected documents (or + pass the previous ``id`` explicitly when constructing the ``Document``). +fixes: + - | + ``Document.id`` is now deterministic regardless of the insertion order of + keys in ``meta``. Previously the hash was built from ``dict``'s repr, which + reflects insertion order, so two documents with the same content and the + same ``meta`` could get different IDs depending on how the ``meta`` dict was + constructed. This silently broke ``DuplicatePolicy.SKIP`` / ``FAIL`` and + any cache or dedup table keyed on the document ID whenever upstream code + produced ``meta`` in different orders. 
diff --git a/test/dataclasses/test_document.py b/test/dataclasses/test_document.py index 9d7774db5a..18e6566328 100644 --- a/test/dataclasses/test_document.py +++ b/test/dataclasses/test_document.py @@ -52,7 +52,7 @@ def test_init_with_parameters(): embedding=[0.1, 0.2, 0.3], sparse_embedding=sparse_embedding, ) - assert doc.id == "1aa43af57c1dbc317241bf55d3067049f334d3b458d95dc72f71a7111f6c1a56" + assert doc.id == "c31efd4986b1f2424e5058482c6f668ccad2309043c2346524cd81d255e159fe" assert doc.content == "test text" assert doc.blob is not None assert doc.blob.data == blob_data @@ -95,7 +95,7 @@ def test_init_with_legacy_field(): embedding=[0.1, 0.2, 0.3], meta={"date": "10-10-2023", "type": "article"}, ) - assert doc.id == "a2c0321b34430cc675294611e55529fceb56140ca3202f1c59a43a8cecac1f43" + assert doc.id == "dcd4914f727544e89ce8082f6f2e298d244dd0803a4dc167f19d24e7d43b28ac" assert doc.content == "test text" assert doc.meta == {"date": "10-10-2023", "type": "article"} assert doc.score == 0.812 @@ -123,6 +123,21 @@ def test_basic_equality_id(): assert doc1 != doc2 +def test_id_is_independent_of_meta_key_order(): + doc1 = Document(content="hello", meta={"a": 1, "b": 2}) + doc2 = Document(content="hello", meta={"b": 2, "a": 1}) + + assert doc1.meta == doc2.meta + assert doc1.id == doc2.id + + +def test_id_is_independent_of_nested_meta_key_order(): + doc1 = Document(content="hello", meta={"outer": {"a": 1, "b": 2}}) + doc2 = Document(content="hello", meta={"outer": {"b": 2, "a": 1}}) + + assert doc1.id == doc2.id + + def test_to_dict(): doc = Document() assert doc.to_dict() == { From e3d1ffe8077683d5e8ecf915283ed1b15fd8cb29 Mon Sep 17 00:00:00 2001 From: Aarkin7 Date: Mon, 1 Jun 2026 00:24:54 +0530 Subject: [PATCH 02/11] test: update stale Document IDs in pipeline BDD scenarios Two BDD scenarios pinned IDs that were computed from documents with non-empty meta, so the deterministic-id fix changes them. Recompute and update the expected values; no behavior change. 
--- test/core/pipeline/features/test_run.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/core/pipeline/features/test_run.py b/test/core/pipeline/features/test_run.py index 57383b53be..51ec43c61c 100644 --- a/test/core/pipeline/features/test_run.py +++ b/test/core/pipeline/features/test_run.py @@ -3104,7 +3104,7 @@ def run(self, query: str) -> dict[str, list[Document]]: ("rag_prompt", 1): { "documents": [ Document( - id="969664d0cf76e52b0ffb719d00d3e5a6b1c90bb29e56f6107dfd87bf2f5388ed", + id="366a10745500c26f1177f434c74513daacaa7f9d2e09ba892cfcd48652eb80c1", content="This is a document potentially answering the question.", meta={"access_group": 1}, ) @@ -4381,7 +4381,7 @@ def pipeline_that_converts_files(pipeline_class): content="Some test content", meta={ "file_type": "json", - "source_id": "0c6c5951d18da2935c7af3e24d417a9f94ca85403866dcfee1de93922504e1e5", + "source_id": "7eead7200d4ecead81a174a1da6512d8955f3a23acdc3f8431885d4793a63a74", "page_number": 1, "split_id": 0, "split_idx_start": 0, @@ -4391,7 +4391,7 @@ def pipeline_that_converts_files(pipeline_class): content="Text file content ", meta={ "file_type": "txt", - "source_id": "41cb91740f6e64ab542122936ea746c238ae0a92fd29b698efabbe23d0ba4c42", + "source_id": "696d5c046b58b24bf806ff94f6b529fb3a08f068b6bf39e572683537736a0c27", "page_number": 1, "split_id": 0, "split_idx_start": 0, @@ -4401,7 +4401,7 @@ def pipeline_that_converts_files(pipeline_class): content="for testing this.", meta={ "file_type": "txt", - "source_id": "41cb91740f6e64ab542122936ea746c238ae0a92fd29b698efabbe23d0ba4c42", + "source_id": "696d5c046b58b24bf806ff94f6b529fb3a08f068b6bf39e572683537736a0c27", "page_number": 1, "split_id": 1, "split_idx_start": 18, From ff2fe994db3e593a0b8ef78a91fdf0b6d0547fef Mon Sep 17 00:00:00 2001 From: Aarkin7 Date: Tue, 2 Jun 2026 20:42:07 +0530 Subject: [PATCH 03/11] docs: clarify Document.id upgrade note for non-JSON-serializable meta --- 
...stic-across-meta-key-order-f0293d51712e82be.yaml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml b/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml index fa37d23489..e2d55f8484 100644 --- a/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml +++ b/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml @@ -2,9 +2,16 @@ upgrade: - | The hash used to auto-generate ``Document.id`` is now computed from a - canonical (key-sorted) serialization of ``meta``. Documents created with - non-empty ``meta`` will therefore get different IDs than they did before. - Documents with empty ``meta`` are unaffected. + canonical (key-sorted) JSON serialization of ``meta``. Documents with + empty ``meta`` are unaffected, but most other documents will get different + IDs than they did before: + + * documents with non-empty ``meta`` (the serialization changes from + ``dict``'s repr to JSON); + * documents whose ``meta`` contains non-JSON-serializable values such as + ``datetime`` or custom classes (these are now serialized via ``str(...)`` + rather than ``repr(...)``, e.g. ``"2024-01-01 00:00:00"`` instead of + ``"datetime.datetime(2024, 1, 1, 0, 0)"``). If you rely on auto-generated IDs to match documents already persisted in a ``DocumentStore``, you will need to re-ingest the affected documents (or From fb2aede74559aa2bd6c797ea6d4095ef3e3e0bd8 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Wed, 17 Jun 2026 17:12:04 +0200 Subject: [PATCH 04/11] docs: document Document.id meta-order breaking change in MIGRATION.md Add a Breaking Changes entry covering the new key-sorted JSON hashing of meta when auto-generating Document.id. 
Documents with non-empty meta get different IDs in v3.0; the entry explains why and how to migrate (re-ingest or pass the previous id explicitly). Co-Authored-By: Claude Opus 4.8 (1M context) --- MIGRATION.md | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/MIGRATION.md b/MIGRATION.md index 21722c72e6..d05fe29ea3 100644 --- a/MIGRATION.md +++ b/MIGRATION.md @@ -617,3 +617,43 @@ from haystack.components.generators import OpenAIImageGenerator generator = OpenAIImageGenerator(model="gpt-image-2") result = generator.run("A photo of a red apple") ``` + +### Auto-generated `Document.id` changes for documents with non-empty `meta` + +**What changed:** The hash used to auto-generate `Document.id` is now computed from a canonical (key-sorted) JSON serialization of `meta` instead of the dict's `repr`. Documents with empty `meta` keep the same IDs as before, but documents with non-empty `meta` get different IDs in v3.0. Non-JSON-serializable `meta` values (e.g. `datetime` or custom classes) are now serialized via `str(...)` rather than `repr(...)`, which also changes their IDs. See [#11446](https://github.com/deepset-ai/haystack/pull/11446). + +**Why:** Previously the hash reflected the insertion order of keys in `meta`, so two documents with the same content and the same metadata could end up with different IDs depending on how the `meta` dict was constructed. This silently broke `DuplicatePolicy.SKIP` / `FAIL` and any cache or dedup table keyed on the document ID. Sorting the keys before hashing makes the ID order-independent. + +**How to migrate:** + +If you do not rely on the exact value of auto-generated IDs, no action is needed — IDs are now simply stable regardless of `meta` key order. 
+ +If you rely on auto-generated IDs to match documents already persisted in a `DocumentStore` written by Haystack v2.x, re-ingest the affected documents so the new IDs are used consistently, or pass the previous `id` explicitly when constructing the `Document`. + +Before (v2.x): +```python +from haystack.dataclasses import Document + +# ID was derived from meta's dict repr, so it depended on key insertion order: +# these two documents could end up with different IDs. +doc1 = Document(content="Berlin is the capital of Germany.", meta={"source": "wiki", "lang": "en"}) +doc2 = Document(content="Berlin is the capital of Germany.", meta={"lang": "en", "source": "wiki"}) +``` + +After (v3.0): +```python +from haystack.dataclasses import Document + +# Same content + meta now always yields the same ID, regardless of key order, +# but that ID differs from the one v2.x produced for documents with non-empty meta. +doc1 = Document(content="Berlin is the capital of Germany.", meta={"source": "wiki", "lang": "en"}) +doc2 = Document(content="Berlin is the capital of Germany.", meta={"lang": "en", "source": "wiki"}) +assert doc1.id == doc2.id + +# To keep an ID that already exists in a DocumentStore written by v2.x, pass it explicitly: +doc = Document( + content="Berlin is the capital of Germany.", + meta={"source": "wiki", "lang": "en"}, + id="", +) +``` From 7c100724a5cc25218ce76c699c47edd7bbf2d5c0 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Wed, 17 Jun 2026 17:19:44 +0200 Subject: [PATCH 05/11] Update migration guide for ID stability changes Clarified migration instructions for auto-generated IDs in DocumentStore. 
--- MIGRATION.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/MIGRATION.md b/MIGRATION.md index d05fe29ea3..c4f9ad235d 100644 --- a/MIGRATION.md +++ b/MIGRATION.md @@ -626,8 +626,6 @@ result = generator.run("A photo of a red apple") **How to migrate:** -If you do not rely on the exact value of auto-generated IDs, no action is needed — IDs are now simply stable regardless of `meta` key order. - If you rely on auto-generated IDs to match documents already persisted in a `DocumentStore` written by Haystack v2.x, re-ingest the affected documents so the new IDs are used consistently, or pass the previous `id` explicitly when constructing the `Document`. Before (v2.x): @@ -649,11 +647,4 @@ from haystack.dataclasses import Document doc1 = Document(content="Berlin is the capital of Germany.", meta={"source": "wiki", "lang": "en"}) doc2 = Document(content="Berlin is the capital of Germany.", meta={"lang": "en", "source": "wiki"}) assert doc1.id == doc2.id - -# To keep an ID that already exists in a DocumentStore written by v2.x, pass it explicitly: -doc = Document( - content="Berlin is the capital of Germany.", - meta={"source": "wiki", "lang": "en"}, - id="", -) ``` From 4c25a0cc9fd063084bd0f66f9ca81dfa0ae2fe90 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Wed, 17 Jun 2026 17:38:02 +0200 Subject: [PATCH 06/11] docs: add in-place Document.id migration example to MIGRATION.md Show a runnable InMemoryDocumentStore example that seeds the store with the IDs Haystack 2.x generated and a migrate_document_ids() helper that recomputes each id with the 3.x hashing and overwrites the stored documents. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- MIGRATION.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/MIGRATION.md b/MIGRATION.md index c4f9ad235d..b5b2fa3592 100644 --- a/MIGRATION.md +++ b/MIGRATION.md @@ -648,3 +648,51 @@ doc1 = Document(content="Berlin is the capital of Germany.", meta={"source": "wi doc2 = Document(content="Berlin is the capital of Germany.", meta={"lang": "en", "source": "wiki"}) assert doc1.id == doc2.id ``` + +To migrate an existing index in place — recomputing the stored IDs with the 3.0 hashing instead of re-ingesting from source — read every document back, regenerate its `id`, and overwrite it. The example below seeds an `InMemoryDocumentStore` with 2.x-style IDs and then migrates them: + +```python +from dataclasses import replace + +from haystack import Document +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.document_stores.types import DuplicatePolicy + + +def migrate_document_ids(document_store) -> None: + """Recompute every stored Document.id with 3.x hashing and rewrite the index.""" + old_documents = document_store.filter_documents() + # Rebuilding each Document with id="" lets 3.x regenerate the id from its fields. + new_documents = [replace(doc, id="") for doc in old_documents] + document_store.delete_documents([doc.id for doc in old_documents]) + document_store.write_documents(new_documents, policy=DuplicatePolicy.OVERWRITE) + + +# Seed a store with the IDs Haystack 2.x generated for these documents (derived +# from the repr of the meta dict). Your real store already contains such IDs. 
+store = InMemoryDocumentStore() +store.write_documents( + [ + Document( + id="b51c3ee6b892f52bf28af01f5d823a254e438356ec335a20133ad940ef7b8cd7", + content="Berlin is the capital of Germany.", + meta={"source": "wiki", "lang": "en"}, + ), + Document( + id="f022d8d89a99f89547215f8adcfed92f41518f2bb3e11d14e27987bd9d265ead", + content="Paris is the capital of France.", + meta={"source": "wiki", "lang": "en"}, + ), + ], + policy=DuplicatePolicy.OVERWRITE, +) + +before = {doc.content: doc.id for doc in store.filter_documents()} +migrate_document_ids(store) +after = {doc.content: doc.id for doc in store.filter_documents()} + +for content in before: + print(content) + print(" 2.x id:", before[content]) + print(" 3.x id:", after[content]) +``` From 4c21a106d3bece70837c00f9c6c27dacb760f5ef Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Wed, 17 Jun 2026 17:44:48 +0200 Subject: [PATCH 07/11] Clarify migration process for document IDs Updated migration instructions to clarify the process of migrating document IDs without rerunning the indexing pipeline. Removed outdated comments and provided a clearer example. --- MIGRATION.md | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/MIGRATION.md b/MIGRATION.md index b5b2fa3592..14d161e6c5 100644 --- a/MIGRATION.md +++ b/MIGRATION.md @@ -649,7 +649,7 @@ doc2 = Document(content="Berlin is the capital of Germany.", meta={"lang": "en", assert doc1.id == doc2.id ``` -To migrate an existing index in place — recomputing the stored IDs with the 3.0 hashing instead of re-ingesting from source — read every document back, regenerate its `id`, and overwrite it. The example below seeds an `InMemoryDocumentStore` with 2.x-style IDs and then migrates them: +It is possible to migrate an existing index without rerunning your indexing pipeline, for example to avoid recalculating embeddings. To achieve that, you need to read every stored document, regenerate its `id`, and overwrite it. 
```python from dataclasses import replace @@ -660,16 +660,13 @@ from haystack.document_stores.types import DuplicatePolicy def migrate_document_ids(document_store) -> None: - """Recompute every stored Document.id with 3.x hashing and rewrite the index.""" old_documents = document_store.filter_documents() - # Rebuilding each Document with id="" lets 3.x regenerate the id from its fields. new_documents = [replace(doc, id="") for doc in old_documents] document_store.delete_documents([doc.id for doc in old_documents]) document_store.write_documents(new_documents, policy=DuplicatePolicy.OVERWRITE) -# Seed a store with the IDs Haystack 2.x generated for these documents (derived -# from the repr of the meta dict). Your real store already contains such IDs. +# Example DocumentStore with IDs generated with Haystack 2.x store = InMemoryDocumentStore() store.write_documents( [ @@ -687,12 +684,5 @@ store.write_documents( policy=DuplicatePolicy.OVERWRITE, ) -before = {doc.content: doc.id for doc in store.filter_documents()} migrate_document_ids(store) -after = {doc.content: doc.id for doc in store.filter_documents()} - -for content in before: - print(content) - print(" 2.x id:", before[content]) - print(" 3.x id:", after[content]) ``` From 5a640c5fb4c95e63831bfdd76970ba2f2d7c3105 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Wed, 17 Jun 2026 17:53:10 +0200 Subject: [PATCH 08/11] docs: make Document.id migration memory-safe and crash-safe Process documents in batches to bound extra memory, write all new documents before deleting any, and delete only the IDs that actually changed (empty-meta documents keep their ID). Note that the DocumentStore API has no pagination, so very large indexes should be read in chunks via the backend's scroll API. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- MIGRATION.md | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/MIGRATION.md b/MIGRATION.md index 14d161e6c5..eaf5112c87 100644 --- a/MIGRATION.md +++ b/MIGRATION.md @@ -649,7 +649,9 @@ doc2 = Document(content="Berlin is the capital of Germany.", meta={"lang": "en", assert doc1.id == doc2.id ``` -It is possible to migrate an existing index without rerunning your indexing pipeline, for example to avoid recalculating embeddings. To achieve that, you need to read every stored document, regenerate its `id`, and overwrite it. +It is possible to migrate an existing index without rerunning your indexing pipeline, for example to avoid recalculating embeddings. To do that, read every stored document, regenerate its `id`, write the updated documents, and only then delete the documents stored under their old IDs. Processing the documents in batches keeps the extra memory bounded, and deleting only after all new documents are written means the index is never left incomplete if the migration is interrupted. + +Note that the `DocumentStore` API has no pagination: `filter_documents()` returns all matching documents in a single call. If your index is too large to hold in memory at once, read it in chunks using your backend's native batched retrieval (for example a scroll/cursor API) and apply the same regenerate-write-then-delete steps to each chunk. 
```python from dataclasses import replace @@ -659,11 +661,20 @@ from haystack.document_stores.in_memory import InMemoryDocumentStore from haystack.document_stores.types import DuplicatePolicy -def migrate_document_ids(document_store) -> None: - old_documents = document_store.filter_documents() - new_documents = [replace(doc, id="") for doc in old_documents] - document_store.delete_documents([doc.id for doc in old_documents]) - document_store.write_documents(new_documents, policy=DuplicatePolicy.OVERWRITE) +def migrate_document_ids(document_store, batch_size: int = 10_000) -> None: + documents = document_store.filter_documents() + ids_to_delete = [] + for start in range(0, len(documents), batch_size): + batch = documents[start : start + batch_size] + migrated = [replace(doc, id="") for doc in batch] + document_store.write_documents(migrated, policy=DuplicatePolicy.OVERWRITE) + migrated_ids = {doc.id for doc in migrated} + # Documents with empty meta keep their ID and were overwritten in place; + # only the IDs that actually changed are stale and need deleting. + ids_to_delete.extend(doc.id for doc in batch if doc.id not in migrated_ids) + # Delete stale IDs only after all new documents have been written, so the + # index is never left incomplete if the migration is interrupted. + document_store.delete_documents(ids_to_delete) # Example DocumentStore with IDs generated with Haystack 2.x From 41399b16e4ee6fb452246988c1855d7f5308a025 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Wed, 17 Jun 2026 18:33:51 +0200 Subject: [PATCH 09/11] simplify code example regenerating document IDs Updated migration instructions and code examples for regenerating document IDs without interrupting the index. 
--- MIGRATION.md | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/MIGRATION.md b/MIGRATION.md index eaf5112c87..f5bdb895bc 100644 --- a/MIGRATION.md +++ b/MIGRATION.md @@ -649,9 +649,7 @@ doc2 = Document(content="Berlin is the capital of Germany.", meta={"lang": "en", assert doc1.id == doc2.id ``` -It is possible to migrate an existing index without rerunning your indexing pipeline, for example to avoid recalculating embeddings. To do that, read every stored document, regenerate its `id`, write the updated documents, and only then delete the documents stored under their old IDs. Processing the documents in batches keeps the extra memory bounded, and deleting only after all new documents are written means the index is never left incomplete if the migration is interrupted. - -Note that the `DocumentStore` API has no pagination: `filter_documents()` returns all matching documents in a single call. If your index is too large to hold in memory at once, read it in chunks using your backend's native batched retrieval (for example a scroll/cursor API) and apply the same regenerate-write-then-delete steps to each chunk. +It is possible to migrate an existing index without rerunning your indexing pipeline, for example to avoid recalculating embeddings. To do that, read stored documents, regenerate their IDs, write the updated documents, and delete the documents stored under their old IDs. 
```python from dataclasses import replace @@ -660,23 +658,6 @@ from haystack import Document from haystack.document_stores.in_memory import InMemoryDocumentStore from haystack.document_stores.types import DuplicatePolicy - -def migrate_document_ids(document_store, batch_size: int = 10_000) -> None: - documents = document_store.filter_documents() - ids_to_delete = [] - for start in range(0, len(documents), batch_size): - batch = documents[start : start + batch_size] - migrated = [replace(doc, id="") for doc in batch] - document_store.write_documents(migrated, policy=DuplicatePolicy.OVERWRITE) - migrated_ids = {doc.id for doc in migrated} - # Documents with empty meta keep their ID and were overwritten in place; - # only the IDs that actually changed are stale and need deleting. - ids_to_delete.extend(doc.id for doc in batch if doc.id not in migrated_ids) - # Delete stale IDs only after all new documents have been written, so the - # index is never left incomplete if the migration is interrupted. - document_store.delete_documents(ids_to_delete) - - # Example DocumentStore with IDs generated with Haystack 2.x store = InMemoryDocumentStore() store.write_documents( @@ -695,5 +676,10 @@ store.write_documents( policy=DuplicatePolicy.OVERWRITE, ) -migrate_document_ids(store) +# Example steps to recalculate IDs. Note that all documents are retrieved at once in this example, but larger indices require pagination. 
+old_documents = store.filter_documents() +new_documents = [replace(doc, id="") for doc in old_documents] +store.write_documents(new_documents, policy=DuplicatePolicy.OVERWRITE) +new_ids = {doc.id for doc in new_documents} +store.delete_documents([doc.id for doc in old_documents if doc.id not in new_ids]) ``` From 6588e26e40e0983c260c53f906038422d2bb172f Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Wed, 17 Jun 2026 18:36:50 +0200 Subject: [PATCH 10/11] Clarify regenerating IDs requires Haystack 3.0 Updated migration instructions to include Haystack 3.0 for ID regeneration. --- MIGRATION.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MIGRATION.md b/MIGRATION.md index f5bdb895bc..82aa8f4362 100644 --- a/MIGRATION.md +++ b/MIGRATION.md @@ -649,7 +649,7 @@ doc2 = Document(content="Berlin is the capital of Germany.", meta={"lang": "en", assert doc1.id == doc2.id ``` -It is possible to migrate an existing index without rerunning your indexing pipeline, for example to avoid recalculating embeddings. To do that, read stored documents, regenerate their IDs, write the updated documents, and delete the documents stored under their old IDs. +It is possible to migrate an existing index without rerunning your indexing pipeline, for example to avoid recalculating embeddings. To do that, read stored documents, regenerate their IDs using Haystack 3.0, write the updated documents, and delete the documents stored under their old IDs. 
```python from dataclasses import replace From a07abcb3cff866cbac77f9910303aac5b9ba0376 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Wed, 17 Jun 2026 18:41:35 +0200 Subject: [PATCH 11/11] Update haystack/dataclasses/document.py --- haystack/dataclasses/document.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/haystack/dataclasses/document.py b/haystack/dataclasses/document.py index d1b6d09ebe..7792eaee61 100644 --- a/haystack/dataclasses/document.py +++ b/haystack/dataclasses/document.py @@ -114,8 +114,7 @@ def _create_id(self) -> str: dataframe = None # this allows the ID creation to remain unchanged even if the dataframe field has been removed blob = self.blob.data if self.blob is not None else None mime_type = self.blob.mime_type if self.blob is not None else None - # Sort keys so meta order doesn't affect the hash. Keep "{}" for empty meta - # so existing IDs stay stable. + # Sort keys so meta order doesn't affect the ID. Keep "{}" for empty meta so existing IDs stay stable. meta = json.dumps(self.meta, sort_keys=True, default=str) if self.meta else "{}" embedding = self.embedding if self.embedding is not None else None sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else ""