diff --git a/docs-website/docs/pipeline-components/converters/markdowntodocument.mdx b/docs-website/docs/pipeline-components/converters/markdowntodocument.mdx index c911328c0a9..41bdfdd9167 100644 --- a/docs-website/docs/pipeline-components/converters/markdowntodocument.mdx +++ b/docs-website/docs/pipeline-components/converters/markdowntodocument.mdx @@ -28,6 +28,8 @@ The `MarkdownToDocument` component converts Markdown files into documents. You c When you initialize the component, you can optionally turn off progress bars by setting `progress_bar` to `False`. If you want to convert the contents of tables into a single line, you can enable that through the `table_to_single_line` parameter. +If your Markdown files start with YAML frontmatter, set `extract_frontmatter=True` to move that data into `Document.meta` and remove it from the converted document content. Metadata passed through the `meta` input takes precedence over frontmatter keys. + ## Usage You need to install `markdown-it-py` and `mdit_plain packages` to use the `MarkdownToDocument` component: @@ -46,6 +48,31 @@ converter = MarkdownToDocument() docs = converter.run(sources=Path("my_file.md")) ``` +### With YAML frontmatter + +Given `equity_note.md`: + +```markdown +--- +ticker: AAPL +source: earnings_call +date: 2026-06-12 +--- + +# Thesis +Revenue guidance improved. 
+``` + +```python +from haystack.components.converters import MarkdownToDocument + +converter = MarkdownToDocument(extract_frontmatter=True) + +docs = converter.run(sources=["equity_note.md"])["documents"] +print(docs[0].meta["ticker"]) +print(docs[0].content) +``` + ### In a pipeline ```python diff --git a/haystack/components/converters/markdown.py b/haystack/components/converters/markdown.py index 063c7c5b3a6..d7f15bbc5c3 100644 --- a/haystack/components/converters/markdown.py +++ b/haystack/components/converters/markdown.py @@ -2,10 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 +import json import os +import re from pathlib import Path from typing import Any +import yaml from tqdm import tqdm from haystack import Document, component, logging @@ -20,6 +23,8 @@ logger = logging.getLogger(__name__) +_FRONTMATTER_PATTERN = re.compile(r"\A---[ \t]*\r?\n(?P<frontmatter>.*?)(?:\r?\n)---[ \t]*(?:\r?\n|$)", re.DOTALL) + @component class MarkdownToDocument: @@ -43,7 +48,12 @@ class MarkdownToDocument: """ def __init__( - self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = False + self, + table_to_single_line: bool = False, + progress_bar: bool = True, + store_full_path: bool = False, + *, + extract_frontmatter: bool = False, ) -> None: """ Create a MarkdownToDocument component. @@ -55,12 +65,16 @@ def __init__( :param store_full_path: If True, the full path of the file is stored in the metadata of the document. If False, only the file name is stored. + :param extract_frontmatter: + If True, YAML frontmatter at the beginning of the Markdown file is + removed from the document content and added to the document metadata. 
""" markdown_conversion_imports.check() self.table_to_single_line = table_to_single_line self.progress_bar = progress_bar self.store_full_path = store_full_path + self.extract_frontmatter = extract_frontmatter @component.output_types(documents=list[Document]) def run( @@ -103,6 +117,7 @@ def run( continue try: file_content = bytestream.data.decode("utf-8") + file_content, frontmatter = self._extract_frontmatter(file_content, source) text = parser.render(file_content) except Exception as conversion_e: logger.warning( @@ -112,7 +127,7 @@ def run( ) continue - merged_metadata = {**bytestream.meta, **metadata} + merged_metadata = {**bytestream.meta, **frontmatter, **metadata} if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): merged_metadata["file_path"] = os.path.basename(file_path) @@ -121,3 +136,39 @@ def run( documents.append(document) return {"documents": documents} + + def _extract_frontmatter(self, file_content: str, source: str | Path | ByteStream) -> tuple[str, dict[str, Any]]: + if not self.extract_frontmatter: + return file_content, {} + + match = _FRONTMATTER_PATTERN.match(file_content) + if not match: + return file_content, {} + + frontmatter_text = match.group("frontmatter") + try: + frontmatter = json.loads(json.dumps(yaml.safe_load(frontmatter_text), default=str)) or {} + except yaml.YAMLError as error: + logger.warning( + "Could not parse YAML frontmatter in {source}. Keeping it as content. Error: {error}", + source=source, + error=error, + ) + return file_content, {} + except (TypeError, ValueError) as error: + logger.warning( + "Could not convert YAML frontmatter in {source}. Keeping it as content. 
Error: {error}", + source=source, + error=error, + ) + return file_content, {} + + if not isinstance(frontmatter, dict): + logger.warning( + "Ignoring YAML frontmatter in {source}: expected a mapping, got {kind}.", + source=source, + kind=type(frontmatter).__name__, + ) + return file_content, {} + + return file_content[match.end() :], frontmatter diff --git a/releasenotes/notes/markdown-frontmatter-metadata-52ad49f535c44311.yaml b/releasenotes/notes/markdown-frontmatter-metadata-52ad49f535c44311.yaml new file mode 100644 index 00000000000..68598b71953 --- /dev/null +++ b/releasenotes/notes/markdown-frontmatter-metadata-52ad49f535c44311.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + Added optional YAML frontmatter extraction to ``MarkdownToDocument``. When initialized with + ``extract_frontmatter=True``, YAML frontmatter at the beginning of a Markdown file is removed from + the converted content and added to ``Document.meta``. diff --git a/test/components/converters/test_markdown_to_document.py b/test/components/converters/test_markdown_to_document.py index 712dd57b3de..a427d7c6550 100644 --- a/test/components/converters/test_markdown_to_document.py +++ b/test/components/converters/test_markdown_to_document.py @@ -16,6 +16,7 @@ def test_init_params_default(self): converter = MarkdownToDocument() assert converter.table_to_single_line is False assert converter.progress_bar is True + assert converter.extract_frontmatter is False def test_init_params_custom(self): converter = MarkdownToDocument(table_to_single_line=True, progress_bar=False, store_full_path=False) @@ -79,6 +80,82 @@ def test_run_with_meta(self, test_files_path): assert output["documents"][0].meta["language"] == "it" assert output["documents"][1].meta["language"] == "it" + def test_run_extracts_yaml_frontmatter_into_metadata(self): + bytestream = ByteStream( + data=( + b"---\n" + b"ticker: AAPL\n" + b"date: 2026-06-12\n" + b"rating_score: 4\n" + b"source: earnings_call\n" + b"tags:\n" + b" - guidance\n" + 
b"---\n" + b"# Thesis\n" + b"Revenue guidance improved.\n" + ), + meta={"file_path": "/tmp/aapl.md"}, + ) + + converter = MarkdownToDocument(progress_bar=False, extract_frontmatter=True) + output = converter.run(sources=[bytestream]) + document = output["documents"][0] + + assert "Revenue guidance improved." in document.content + assert "ticker: AAPL" not in document.content + assert document.meta["ticker"] == "AAPL" + assert document.meta["date"] == "2026-06-12" + assert document.meta["rating_score"] == 4 + assert document.meta["source"] == "earnings_call" + assert document.meta["tags"] == ["guidance"] + assert document.meta["file_path"] == "aapl.md" + + def test_run_keeps_frontmatter_as_content_by_default(self): + bytestream = ByteStream(data=b"---\nticker: AAPL\n---\n# Thesis\n") + + converter = MarkdownToDocument(progress_bar=False) + output = converter.run(sources=[bytestream]) + document = output["documents"][0] + + assert "ticker: AAPL" in document.content + assert "ticker" not in document.meta + + def test_run_meta_overrides_frontmatter_metadata(self): + bytestream = ByteStream( + data=b"---\nticker: AAPL\nsource: filing\n---\n# Thesis\n", meta={"source": "bytestream"} + ) + + converter = MarkdownToDocument(progress_bar=False, extract_frontmatter=True) + output = converter.run(sources=[bytestream], meta={"ticker": "MSFT"}) + document = output["documents"][0] + + assert document.meta["ticker"] == "MSFT" + assert document.meta["source"] == "filing" + + def test_run_keeps_malformed_frontmatter_as_content_and_logs_warning(self, caplog): + bytestream = ByteStream(data=b"---\nticker: [AAPL\n---\n# Thesis\n") + + converter = MarkdownToDocument(progress_bar=False, extract_frontmatter=True) + with caplog.at_level(logging.WARNING): + output = converter.run(sources=[bytestream]) + + document = output["documents"][0] + assert "ticker: [AAPL" in document.content + assert "ticker" not in document.meta + assert "Could not parse YAML frontmatter" in caplog.text + + def 
test_run_keeps_unserializable_frontmatter_as_content_and_logs_warning(self, caplog): + bytestream = ByteStream(data=b"---\ncycle: &cycle\n - *cycle\n---\n# Thesis\n") + + converter = MarkdownToDocument(progress_bar=False, extract_frontmatter=True) + with caplog.at_level(logging.WARNING): + output = converter.run(sources=[bytestream]) + + document = output["documents"][0] + assert "cycle:" in document.content + assert "cycle" not in document.meta + assert "Could not convert YAML frontmatter" in caplog.text + @pytest.mark.integration def test_run_wrong_file_type(self, test_files_path, caplog): """