From 548bb67409fea9eec838a52d4b7537eb30603ae4 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 12 Jun 2026 17:04:20 +0200 Subject: [PATCH] chore: deprecate Spacy NamedEntityExtractor and add docs --- .../docs/pipeline-components/extractors.mdx | 2 + .../extractors/namedentityextractor.mdx | 21 ++-- .../extractors/spacynamedentityextractor.mdx | 100 ++++++++++++++++++ .../transformersnamedentityextractor.mdx | 15 +-- .../pipeline-components/extractors.mdx | 2 + .../extractors/namedentityextractor.mdx | 21 ++-- .../extractors/spacynamedentityextractor.mdx | 100 ++++++++++++++++++ .../transformersnamedentityextractor.mdx | 15 +-- .../extractors/named_entity_extractor.py | 13 ++- ...med-entity-extractor-cc4e374632cd817a.yaml | 10 ++ 10 files changed, 264 insertions(+), 35 deletions(-) create mode 100644 docs-website/docs/pipeline-components/extractors/spacynamedentityextractor.mdx create mode 100644 docs-website/versioned_docs/version-2.30/pipeline-components/extractors/spacynamedentityextractor.mdx create mode 100644 releasenotes/notes/deprecate-spacy-named-entity-extractor-cc4e374632cd817a.yaml diff --git a/docs-website/docs/pipeline-components/extractors.mdx b/docs-website/docs/pipeline-components/extractors.mdx index d5a13da690..24575836dd 100644 --- a/docs-website/docs/pipeline-components/extractors.mdx +++ b/docs-website/docs/pipeline-components/extractors.mdx @@ -13,3 +13,5 @@ slug: "/extractors" | [NamedEntityExtractor](extractors/namedentityextractor.mdx) | Extracts predefined entities out of a piece of text and writes them into documents' meta field. | | [PresidioEntityExtractor](extractors/presidioentityextractor.mdx) | Detects PII in Documents and stores entities as structured metadata, without modifying the text. Powered by Microsoft Presidio. | | [RegexTextExtractor](extractors/regextextextractor.mdx) | Extracts text from chat messages or strings using a regular expression pattern. 
| +| [SpacyNamedEntityExtractor](extractors/spacynamedentityextractor.mdx) | Extracts predefined entities out of a piece of text and writes them into documents' meta field. Uses a spaCy model. | +| [TransformersNamedEntityExtractor](extractors/transformersnamedentityextractor.mdx) | Extracts predefined entities out of a piece of text and writes them into documents' meta field. Uses a Hugging Face model. | diff --git a/docs-website/docs/pipeline-components/extractors/namedentityextractor.mdx b/docs-website/docs/pipeline-components/extractors/namedentityextractor.mdx index dd00c9f5ad..6516cab7b8 100644 --- a/docs-website/docs/pipeline-components/extractors/namedentityextractor.mdx +++ b/docs-website/docs/pipeline-components/extractors/namedentityextractor.mdx @@ -11,7 +11,10 @@ This component extracts predefined entities out of a piece of text and writes th :::warning[Deprecated] -`NamedEntityExtractor` is deprecated and will be removed in Haystack 3.0. It has moved to the `transformers-haystack` package and was renamed to `TransformersNamedEntityExtractor`. See [TransformersNamedEntityExtractor](transformersnamedentityextractor.mdx) for the updated documentation. +`NamedEntityExtractor` is deprecated and will be removed in Haystack 3.0. It has moved to dedicated Core Integrations packages depending on the backend: + +- Hugging Face backend: `transformers-haystack` package, renamed to `TransformersNamedEntityExtractor`. See [TransformersNamedEntityExtractor](transformersnamedentityextractor.mdx) for the updated documentation. +- spaCy backend: `spacy-haystack` package, renamed to `SpacyNamedEntityExtractor`. See [SpacyNamedEntityExtractor](spacynamedentityextractor.mdx) for the updated documentation. 
::: @@ -65,16 +68,16 @@ documents = [ Document(content="New York State is home to the Empire State Building."), ] -extractor.run(documents) -print(documents) +result = extractor.run(documents) +print(result["documents"]) ``` Here is the example result: ```python -[Document(id=aec840d1b6c85609f4f16c3e222a5a25fd8c4c53bd981a40c1268ab9c72cee10, content: 'My name is Clara and I live in Berkeley, California.', meta: {'named_entities': [NamedEntityAnnotation(entity='PER', start=11, end=16, score=0.99641764), NamedEntityAnnotation(entity='LOC', start=31, end=39, score=0.996198), NamedEntityAnnotation(entity='LOC', start=41, end=51, score=0.9990196)]}), -Document(id=98f1dc5d0ccd9d9950cd191d1076db0f7af40c401dd7608f11c90cb3fc38c0c2, content: 'I'm Merlin, the happy pig!', meta: {'named_entities': [NamedEntityAnnotation(entity='PER', start=4, end=10, score=0.99054915)]}), -Document(id=44948ea0eec018b33aceaaedde4616eb9e93ce075e0090ec1613fc145f84b4a9, content: 'New York State is home to the Empire State Building.', meta: {'named_entities': [NamedEntityAnnotation(entity='LOC', start=0, end=14, score=0.9989541), NamedEntityAnnotation(entity='LOC', start=30, end=51, score=0.95746297)]})] +[Document(id=aec840d1b6c85609f4f16c3e222a5a25fd8c4c53bd981a40c1268ab9c72cee10, content: 'My name is Clara and I live in Berkeley, California.', meta: {'named_entities': [NamedEntityAnnotation(entity='PER', start=11, end=16, score=np.float32(0.99641764)), NamedEntityAnnotation(entity='LOC', start=31, end=39, score=np.float32(0.996198)), NamedEntityAnnotation(entity='LOC', start=41, end=51, score=np.float32(0.9990196))]}), +Document(id=98f1dc5d0ccd9d9950cd191d1076db0f7af40c401dd7608f11c90cb3fc38c0c2, content: 'I'm Merlin, the happy pig!', meta: {'named_entities': [NamedEntityAnnotation(entity='PER', start=4, end=10, score=np.float32(0.99054915))]}), +Document(id=44948ea0eec018b33aceaaedde4616eb9e93ce075e0090ec1613fc145f84b4a9, content: 'New York State is home to the Empire State Building.', meta: 
{'named_entities': [NamedEntityAnnotation(entity='LOC', start=0, end=14, score=np.float32(0.9989541)), NamedEntityAnnotation(entity='LOC', start=30, end=51, score=np.float32(0.9574631))]})] ``` ### Get stored annotations @@ -93,9 +96,11 @@ documents = [ Document(content="New York State is home to the Empire State Building."), ] -extractor.run(documents) +result = extractor.run(documents) -annotations = [NamedEntityExtractor.get_stored_annotations(doc) for doc in documents] +annotations = [ + NamedEntityExtractor.get_stored_annotations(doc) for doc in result["documents"] +] print(annotations) # If a Document doesn't contain any annotations, this returns None. diff --git a/docs-website/docs/pipeline-components/extractors/spacynamedentityextractor.mdx b/docs-website/docs/pipeline-components/extractors/spacynamedentityextractor.mdx new file mode 100644 index 0000000000..298edd0219 --- /dev/null +++ b/docs-website/docs/pipeline-components/extractors/spacynamedentityextractor.mdx @@ -0,0 +1,100 @@ +--- +title: "SpacyNamedEntityExtractor" +id: spacynamedentityextractor +slug: "/spacynamedentityextractor" +description: "This component extracts predefined entities out of a piece of text and writes them into documents’ meta field." +--- + +# SpacyNamedEntityExtractor + +This component extracts predefined entities out of a piece of text and writes them into documents’ meta field. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | After the [PreProcessor](../preprocessors.mdx) in an indexing pipeline or after a [Retriever](../retrievers.mdx) in a query pipeline | +| **Mandatory init variables** | `model`: Name or path of the spaCy model to use | +| **Mandatory run variables** | `documents`: A list of documents | +| **Output variables** | `documents`: A list of documents | +| **API reference** | [Spacy](/reference/integrations-spacy) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/spacy | +| **Package name** | `spacy-haystack` | + +
+ +## Overview + +`SpacyNamedEntityExtractor` looks for entities, which are spans in the text. The extractor automatically recognizes and groups them depending on their class, such as people's names, organizations, locations, and other types. The exact classes are determined by the model that you initialize the component with. + +`SpacyNamedEntityExtractor` takes a list of documents as input and returns a list of the same documents with their `meta` data enriched with `NamedEntityAnnotations`. A `NamedEntityAnnotation` consists of the type of the entity, the start and end of the span, and a `score` field (`None` in the examples on this page), for example: `NamedEntityAnnotation(entity='PERSON', start=11, end=16, score=None)`. + +When the `SpacyNamedEntityExtractor` is initialized, you need to set a `model`. Optionally, you can set `pipeline_kwargs`, which are then passed on to the spaCy pipeline. You can additionally set the `device` that is used to run the component. + +## Usage + +Install the `spacy-haystack` package to use the `SpacyNamedEntityExtractor`: + +```shell +pip install spacy-haystack +``` + +The component works with any [spaCy model](https://spacy.io/models) that contains an NER component. + +`SpacyNamedEntityExtractor` accepts a list of `Documents` as its input. The extractor annotates the raw text in the documents and stores the annotations in the document's `meta` dictionary under the `named_entities` key. 
+ +```python +from haystack.dataclasses import Document +from haystack_integrations.components.extractors.spacy import ( + SpacyNamedEntityExtractor, +) + +extractor = SpacyNamedEntityExtractor(model="en_core_web_sm") + +documents = [ + Document(content="My name is Clara and I live in Berkeley, California."), + Document(content="I'm Merlin, the happy pig!"), + Document(content="New York State is home to the Empire State Building."), +] + +result = extractor.run(documents) +print(result["documents"]) +``` + +Here is the example result: + +```python +[Document(id=aec840d1b6c85609f4f16c3e222a5a25fd8c4c53bd981a40c1268ab9c72cee10, content: 'My name is Clara and I live in Berkeley, California.', meta: {'named_entities': [NamedEntityAnnotation(entity='PERSON', start=11, end=16, score=None), NamedEntityAnnotation(entity='GPE', start=31, end=39, score=None), NamedEntityAnnotation(entity='GPE', start=41, end=51, score=None)]}), +Document(id=98f1dc5d0ccd9d9950cd191d1076db0f7af40c401dd7608f11c90cb3fc38c0c2, content: 'I'm Merlin, the happy pig!', meta: {'named_entities': [NamedEntityAnnotation(entity='PERSON', start=4, end=10, score=None)]}), +Document(id=44948ea0eec018b33aceaaedde4616eb9e93ce075e0090ec1613fc145f84b4a9, content: 'New York State is home to the Empire State Building.', meta: {'named_entities': [NamedEntityAnnotation(entity='GPE', start=0, end=14, score=None), NamedEntityAnnotation(entity='ORG', start=26, end=51, score=None)]})] +``` + +### Get stored annotations + +This component includes the `get_stored_annotations` helper class method that allows you to retrieve the annotations stored in a `Document` transparently: + +```python +from haystack.dataclasses import Document +from haystack_integrations.components.extractors.spacy import ( + SpacyNamedEntityExtractor, +) + +extractor = SpacyNamedEntityExtractor(model="en_core_web_sm") + +documents = [ + Document(content="My name is Clara and I live in Berkeley, California."), + Document(content="I'm Merlin, the happy 
pig!"), + Document(content="New York State is home to the Empire State Building."), +] + +result = extractor.run(documents) + +annotations = [ + SpacyNamedEntityExtractor.get_stored_annotations(doc) for doc in result["documents"] +] +print(annotations) + +# If a Document doesn't contain any annotations, this returns None. +new_doc = Document(content="In one of many possible worlds...") +assert SpacyNamedEntityExtractor.get_stored_annotations(new_doc) is None +``` diff --git a/docs-website/docs/pipeline-components/extractors/transformersnamedentityextractor.mdx b/docs-website/docs/pipeline-components/extractors/transformersnamedentityextractor.mdx index db03df935c..cf10f02613 100644 --- a/docs-website/docs/pipeline-components/extractors/transformersnamedentityextractor.mdx +++ b/docs-website/docs/pipeline-components/extractors/transformersnamedentityextractor.mdx @@ -59,16 +59,16 @@ documents = [ Document(content="New York State is home to the Empire State Building."), ] -extractor.run(documents) -print(documents) +result = extractor.run(documents) +print(result["documents"]) ``` Here is the example result: ```python -[Document(id=aec840d1b6c85609f4f16c3e222a5a25fd8c4c53bd981a40c1268ab9c72cee10, content: 'My name is Clara and I live in Berkeley, California.', meta: {'named_entities': [NamedEntityAnnotation(entity='PER', start=11, end=16, score=0.99641764), NamedEntityAnnotation(entity='LOC', start=31, end=39, score=0.996198), NamedEntityAnnotation(entity='LOC', start=41, end=51, score=0.9990196)]}), -Document(id=98f1dc5d0ccd9d9950cd191d1076db0f7af40c401dd7608f11c90cb3fc38c0c2, content: 'I'm Merlin, the happy pig!', meta: {'named_entities': [NamedEntityAnnotation(entity='PER', start=4, end=10, score=0.99054915)]}), -Document(id=44948ea0eec018b33aceaaedde4616eb9e93ce075e0090ec1613fc145f84b4a9, content: 'New York State is home to the Empire State Building.', meta: {'named_entities': [NamedEntityAnnotation(entity='LOC', start=0, end=14, score=0.9989541), 
NamedEntityAnnotation(entity='LOC', start=30, end=51, score=0.95746297)]})] +[Document(id=aec840d1b6c85609f4f16c3e222a5a25fd8c4c53bd981a40c1268ab9c72cee10, content: 'My name is Clara and I live in Berkeley, California.', meta: {'named_entities': [NamedEntityAnnotation(entity='PER', start=11, end=16, score=np.float32(0.99641764)), NamedEntityAnnotation(entity='LOC', start=31, end=39, score=np.float32(0.996198)), NamedEntityAnnotation(entity='LOC', start=41, end=51, score=np.float32(0.9990196))]}), +Document(id=98f1dc5d0ccd9d9950cd191d1076db0f7af40c401dd7608f11c90cb3fc38c0c2, content: 'I'm Merlin, the happy pig!', meta: {'named_entities': [NamedEntityAnnotation(entity='PER', start=4, end=10, score=np.float32(0.99054915))]}), +Document(id=44948ea0eec018b33aceaaedde4616eb9e93ce075e0090ec1613fc145f84b4a9, content: 'New York State is home to the Empire State Building.', meta: {'named_entities': [NamedEntityAnnotation(entity='LOC', start=0, end=14, score=np.float32(0.9989541)), NamedEntityAnnotation(entity='LOC', start=30, end=51, score=np.float32(0.9574631))]})] ``` ### Get stored annotations @@ -89,10 +89,11 @@ documents = [ Document(content="New York State is home to the Empire State Building."), ] -extractor.run(documents) +result = extractor.run(documents) annotations = [ - TransformersNamedEntityExtractor.get_stored_annotations(doc) for doc in documents + TransformersNamedEntityExtractor.get_stored_annotations(doc) + for doc in result["documents"] ] print(annotations) diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/extractors.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/extractors.mdx index d5a13da690..24575836dd 100644 --- a/docs-website/versioned_docs/version-2.30/pipeline-components/extractors.mdx +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/extractors.mdx @@ -13,3 +13,5 @@ slug: "/extractors" | [NamedEntityExtractor](extractors/namedentityextractor.mdx) | Extracts predefined entities out of a 
piece of text and writes them into documents' meta field. | | [PresidioEntityExtractor](extractors/presidioentityextractor.mdx) | Detects PII in Documents and stores entities as structured metadata, without modifying the text. Powered by Microsoft Presidio. | | [RegexTextExtractor](extractors/regextextextractor.mdx) | Extracts text from chat messages or strings using a regular expression pattern. | +| [SpacyNamedEntityExtractor](extractors/spacynamedentityextractor.mdx) | Extracts predefined entities out of a piece of text and writes them into documents' meta field. Uses a spaCy model. | +| [TransformersNamedEntityExtractor](extractors/transformersnamedentityextractor.mdx) | Extracts predefined entities out of a piece of text and writes them into documents' meta field. Uses a Hugging Face model. | diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/extractors/namedentityextractor.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/extractors/namedentityextractor.mdx index dd00c9f5ad..6516cab7b8 100644 --- a/docs-website/versioned_docs/version-2.30/pipeline-components/extractors/namedentityextractor.mdx +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/extractors/namedentityextractor.mdx @@ -11,7 +11,10 @@ This component extracts predefined entities out of a piece of text and writes th :::warning[Deprecated] -`NamedEntityExtractor` is deprecated and will be removed in Haystack 3.0. It has moved to the `transformers-haystack` package and was renamed to `TransformersNamedEntityExtractor`. See [TransformersNamedEntityExtractor](transformersnamedentityextractor.mdx) for the updated documentation. +`NamedEntityExtractor` is deprecated and will be removed in Haystack 3.0. It has moved to dedicated Core Integrations packages depending on the backend: + +- Hugging Face backend: `transformers-haystack` package, renamed to `TransformersNamedEntityExtractor`. 
See [TransformersNamedEntityExtractor](transformersnamedentityextractor.mdx) for the updated documentation. +- spaCy backend: `spacy-haystack` package, renamed to `SpacyNamedEntityExtractor`. See [SpacyNamedEntityExtractor](spacynamedentityextractor.mdx) for the updated documentation. ::: @@ -65,16 +68,16 @@ documents = [ Document(content="New York State is home to the Empire State Building."), ] -extractor.run(documents) -print(documents) +result = extractor.run(documents) +print(result["documents"]) ``` Here is the example result: ```python -[Document(id=aec840d1b6c85609f4f16c3e222a5a25fd8c4c53bd981a40c1268ab9c72cee10, content: 'My name is Clara and I live in Berkeley, California.', meta: {'named_entities': [NamedEntityAnnotation(entity='PER', start=11, end=16, score=0.99641764), NamedEntityAnnotation(entity='LOC', start=31, end=39, score=0.996198), NamedEntityAnnotation(entity='LOC', start=41, end=51, score=0.9990196)]}), -Document(id=98f1dc5d0ccd9d9950cd191d1076db0f7af40c401dd7608f11c90cb3fc38c0c2, content: 'I'm Merlin, the happy pig!', meta: {'named_entities': [NamedEntityAnnotation(entity='PER', start=4, end=10, score=0.99054915)]}), -Document(id=44948ea0eec018b33aceaaedde4616eb9e93ce075e0090ec1613fc145f84b4a9, content: 'New York State is home to the Empire State Building.', meta: {'named_entities': [NamedEntityAnnotation(entity='LOC', start=0, end=14, score=0.9989541), NamedEntityAnnotation(entity='LOC', start=30, end=51, score=0.95746297)]})] +[Document(id=aec840d1b6c85609f4f16c3e222a5a25fd8c4c53bd981a40c1268ab9c72cee10, content: 'My name is Clara and I live in Berkeley, California.', meta: {'named_entities': [NamedEntityAnnotation(entity='PER', start=11, end=16, score=np.float32(0.99641764)), NamedEntityAnnotation(entity='LOC', start=31, end=39, score=np.float32(0.996198)), NamedEntityAnnotation(entity='LOC', start=41, end=51, score=np.float32(0.9990196))]}), +Document(id=98f1dc5d0ccd9d9950cd191d1076db0f7af40c401dd7608f11c90cb3fc38c0c2, content: 'I'm 
Merlin, the happy pig!', meta: {'named_entities': [NamedEntityAnnotation(entity='PER', start=4, end=10, score=np.float32(0.99054915))]}), +Document(id=44948ea0eec018b33aceaaedde4616eb9e93ce075e0090ec1613fc145f84b4a9, content: 'New York State is home to the Empire State Building.', meta: {'named_entities': [NamedEntityAnnotation(entity='LOC', start=0, end=14, score=np.float32(0.9989541)), NamedEntityAnnotation(entity='LOC', start=30, end=51, score=np.float32(0.9574631))]})] ``` ### Get stored annotations @@ -93,9 +96,11 @@ documents = [ Document(content="New York State is home to the Empire State Building."), ] -extractor.run(documents) +result = extractor.run(documents) -annotations = [NamedEntityExtractor.get_stored_annotations(doc) for doc in documents] +annotations = [ + NamedEntityExtractor.get_stored_annotations(doc) for doc in result["documents"] +] print(annotations) # If a Document doesn't contain any annotations, this returns None. diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/extractors/spacynamedentityextractor.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/extractors/spacynamedentityextractor.mdx new file mode 100644 index 0000000000..298edd0219 --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/extractors/spacynamedentityextractor.mdx @@ -0,0 +1,100 @@ +--- +title: "SpacyNamedEntityExtractor" +id: spacynamedentityextractor +slug: "/spacynamedentityextractor" +description: "This component extracts predefined entities out of a piece of text and writes them into documents’ meta field." +--- + +# SpacyNamedEntityExtractor + +This component extracts predefined entities out of a piece of text and writes them into documents’ meta field. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | After the [PreProcessor](../preprocessors.mdx) in an indexing pipeline or after a [Retriever](../retrievers.mdx) in a query pipeline | +| **Mandatory init variables** | `model`: Name or path of the spaCy model to use | +| **Mandatory run variables** | `documents`: A list of documents | +| **Output variables** | `documents`: A list of documents | +| **API reference** | [Spacy](/reference/integrations-spacy) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/spacy | +| **Package name** | `spacy-haystack` | + +
+ +## Overview + +`SpacyNamedEntityExtractor` looks for entities, which are spans in the text. The extractor automatically recognizes and groups them depending on their class, such as people's names, organizations, locations, and other types. The exact classes are determined by the model that you initialize the component with. + +`SpacyNamedEntityExtractor` takes a list of documents as input and returns a list of the same documents with their `meta` data enriched with `NamedEntityAnnotations`. A `NamedEntityAnnotation` consists of the type of the entity, the start and end of the span, and a `score` field (`None` in the examples on this page), for example: `NamedEntityAnnotation(entity='PERSON', start=11, end=16, score=None)`. + +When the `SpacyNamedEntityExtractor` is initialized, you need to set a `model`. Optionally, you can set `pipeline_kwargs`, which are then passed on to the spaCy pipeline. You can additionally set the `device` that is used to run the component. + +## Usage + +Install the `spacy-haystack` package to use the `SpacyNamedEntityExtractor`: + +```shell +pip install spacy-haystack +``` + +The component works with any [spaCy model](https://spacy.io/models) that contains an NER component. + +`SpacyNamedEntityExtractor` accepts a list of `Documents` as its input. The extractor annotates the raw text in the documents and stores the annotations in the document's `meta` dictionary under the `named_entities` key. 
+ +```python +from haystack.dataclasses import Document +from haystack_integrations.components.extractors.spacy import ( + SpacyNamedEntityExtractor, +) + +extractor = SpacyNamedEntityExtractor(model="en_core_web_sm") + +documents = [ + Document(content="My name is Clara and I live in Berkeley, California."), + Document(content="I'm Merlin, the happy pig!"), + Document(content="New York State is home to the Empire State Building."), +] + +result = extractor.run(documents) +print(result["documents"]) +``` + +Here is the example result: + +```python +[Document(id=aec840d1b6c85609f4f16c3e222a5a25fd8c4c53bd981a40c1268ab9c72cee10, content: 'My name is Clara and I live in Berkeley, California.', meta: {'named_entities': [NamedEntityAnnotation(entity='PERSON', start=11, end=16, score=None), NamedEntityAnnotation(entity='GPE', start=31, end=39, score=None), NamedEntityAnnotation(entity='GPE', start=41, end=51, score=None)]}), +Document(id=98f1dc5d0ccd9d9950cd191d1076db0f7af40c401dd7608f11c90cb3fc38c0c2, content: 'I'm Merlin, the happy pig!', meta: {'named_entities': [NamedEntityAnnotation(entity='PERSON', start=4, end=10, score=None)]}), +Document(id=44948ea0eec018b33aceaaedde4616eb9e93ce075e0090ec1613fc145f84b4a9, content: 'New York State is home to the Empire State Building.', meta: {'named_entities': [NamedEntityAnnotation(entity='GPE', start=0, end=14, score=None), NamedEntityAnnotation(entity='ORG', start=26, end=51, score=None)]})] +``` + +### Get stored annotations + +This component includes the `get_stored_annotations` helper class method that allows you to retrieve the annotations stored in a `Document` transparently: + +```python +from haystack.dataclasses import Document +from haystack_integrations.components.extractors.spacy import ( + SpacyNamedEntityExtractor, +) + +extractor = SpacyNamedEntityExtractor(model="en_core_web_sm") + +documents = [ + Document(content="My name is Clara and I live in Berkeley, California."), + Document(content="I'm Merlin, the happy 
pig!"), + Document(content="New York State is home to the Empire State Building."), +] + +result = extractor.run(documents) + +annotations = [ + SpacyNamedEntityExtractor.get_stored_annotations(doc) for doc in result["documents"] +] +print(annotations) + +# If a Document doesn't contain any annotations, this returns None. +new_doc = Document(content="In one of many possible worlds...") +assert SpacyNamedEntityExtractor.get_stored_annotations(new_doc) is None +``` diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/extractors/transformersnamedentityextractor.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/extractors/transformersnamedentityextractor.mdx index db03df935c..cf10f02613 100644 --- a/docs-website/versioned_docs/version-2.30/pipeline-components/extractors/transformersnamedentityextractor.mdx +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/extractors/transformersnamedentityextractor.mdx @@ -59,16 +59,16 @@ documents = [ Document(content="New York State is home to the Empire State Building."), ] -extractor.run(documents) -print(documents) +result = extractor.run(documents) +print(result["documents"]) ``` Here is the example result: ```python -[Document(id=aec840d1b6c85609f4f16c3e222a5a25fd8c4c53bd981a40c1268ab9c72cee10, content: 'My name is Clara and I live in Berkeley, California.', meta: {'named_entities': [NamedEntityAnnotation(entity='PER', start=11, end=16, score=0.99641764), NamedEntityAnnotation(entity='LOC', start=31, end=39, score=0.996198), NamedEntityAnnotation(entity='LOC', start=41, end=51, score=0.9990196)]}), -Document(id=98f1dc5d0ccd9d9950cd191d1076db0f7af40c401dd7608f11c90cb3fc38c0c2, content: 'I'm Merlin, the happy pig!', meta: {'named_entities': [NamedEntityAnnotation(entity='PER', start=4, end=10, score=0.99054915)]}), -Document(id=44948ea0eec018b33aceaaedde4616eb9e93ce075e0090ec1613fc145f84b4a9, content: 'New York State is home to the Empire State Building.', meta: 
{'named_entities': [NamedEntityAnnotation(entity='LOC', start=0, end=14, score=0.9989541), NamedEntityAnnotation(entity='LOC', start=30, end=51, score=0.95746297)]})] +[Document(id=aec840d1b6c85609f4f16c3e222a5a25fd8c4c53bd981a40c1268ab9c72cee10, content: 'My name is Clara and I live in Berkeley, California.', meta: {'named_entities': [NamedEntityAnnotation(entity='PER', start=11, end=16, score=np.float32(0.99641764)), NamedEntityAnnotation(entity='LOC', start=31, end=39, score=np.float32(0.996198)), NamedEntityAnnotation(entity='LOC', start=41, end=51, score=np.float32(0.9990196))]}), +Document(id=98f1dc5d0ccd9d9950cd191d1076db0f7af40c401dd7608f11c90cb3fc38c0c2, content: 'I'm Merlin, the happy pig!', meta: {'named_entities': [NamedEntityAnnotation(entity='PER', start=4, end=10, score=np.float32(0.99054915))]}), +Document(id=44948ea0eec018b33aceaaedde4616eb9e93ce075e0090ec1613fc145f84b4a9, content: 'New York State is home to the Empire State Building.', meta: {'named_entities': [NamedEntityAnnotation(entity='LOC', start=0, end=14, score=np.float32(0.9989541)), NamedEntityAnnotation(entity='LOC', start=30, end=51, score=np.float32(0.9574631))]})] ``` ### Get stored annotations @@ -89,10 +89,11 @@ documents = [ Document(content="New York State is home to the Empire State Building."), ] -extractor.run(documents) +result = extractor.run(documents) annotations = [ - TransformersNamedEntityExtractor.get_stored_annotations(doc) for doc in documents + TransformersNamedEntityExtractor.get_stored_annotations(doc) + for doc in result["documents"] ] print(annotations) diff --git a/haystack/components/extractors/named_entity_extractor.py b/haystack/components/extractors/named_entity_extractor.py index 8bbeffc4be..791bdb913f 100644 --- a/haystack/components/extractors/named_entity_extractor.py +++ b/haystack/components/extractors/named_entity_extractor.py @@ -142,11 +142,14 @@ def __init__( The API token to download private models from Hugging Face. 
""" warnings.warn( - "`NamedEntityExtractor` will be removed from Haystack in version 3.0, as it is moving to the " - "`transformers-haystack` package and being renamed to `TransformersNamedEntityExtractor`. " - "To continue using it, install that package with " - "`pip install transformers-haystack` and update your import to " - "`from haystack_integrations.components.extractors.transformers import TransformersNamedEntityExtractor`.", + "`NamedEntityExtractor` will be removed from Haystack in version 3.0, as it is moving to dedicated " + "Core Integrations packages depending on the backend. " + "For the Hugging Face backend, install `transformers-haystack` with `pip install transformers-haystack` " + "and update your import to " + "`from haystack_integrations.components.extractors.transformers import TransformersNamedEntityExtractor`. " + "For the spaCy backend, install `spacy-haystack` with `pip install spacy-haystack` " + "and update your import to " + "`from haystack_integrations.components.extractors.spacy import SpacyNamedEntityExtractor`.", FutureWarning, stacklevel=2, ) diff --git a/releasenotes/notes/deprecate-spacy-named-entity-extractor-cc4e374632cd817a.yaml b/releasenotes/notes/deprecate-spacy-named-entity-extractor-cc4e374632cd817a.yaml new file mode 100644 index 0000000000..f7ba998610 --- /dev/null +++ b/releasenotes/notes/deprecate-spacy-named-entity-extractor-cc4e374632cd817a.yaml @@ -0,0 +1,10 @@ +--- +deprecations: + - | + The spacy backend of ``NamedEntityExtractor`` is deprecated and will be removed from Haystack in version 3.0. + It is moving to the ``spacy-haystack`` package and being renamed to ``SpacyNamedEntityExtractor``. To continue + using it, install the package with ``pip install spacy-haystack`` and update your import as follows: + + .. code-block:: python + + from haystack_integrations.components.extractors.spacy import SpacyNamedEntityExtractor