Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ The `MarkdownToDocument` component converts Markdown files into documents. You c

When you initialize the component, you can optionally turn off progress bars by setting `progress_bar` to `False`. If you want to convert the contents of tables into a single line, you can enable that through the `table_to_single_line` parameter.

If your Markdown files start with YAML frontmatter, set `extract_frontmatter=True` to move that data into `Document.meta` and remove it from the converted document content. Metadata passed through the `meta` input takes precedence over frontmatter keys.

## Usage

You need to install the `markdown-it-py` and `mdit_plain` packages to use the `MarkdownToDocument` component:
Expand All @@ -46,6 +48,31 @@ converter = MarkdownToDocument()
docs = converter.run(sources=Path("my_file.md"))
```

### With YAML frontmatter

Given `equity_note.md`:

```markdown
---
ticker: AAPL
source: earnings_call
date: 2026-06-12
---

# Thesis
Revenue guidance improved.
```

```python
from haystack.components.converters import MarkdownToDocument

converter = MarkdownToDocument(extract_frontmatter=True)

Comment thread
anakin87 marked this conversation as resolved.
docs = converter.run(sources=["equity_note.md"])["documents"]
print(docs[0].meta["ticker"])
print(docs[0].content)
```

### In a pipeline

```python
Expand Down
55 changes: 53 additions & 2 deletions haystack/components/converters/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@
#
# SPDX-License-Identifier: Apache-2.0

import json
import os
import re
from pathlib import Path
from typing import Any

import yaml
from tqdm import tqdm

from haystack import Document, component, logging
Expand All @@ -20,6 +23,8 @@

logger = logging.getLogger(__name__)

_FRONTMATTER_PATTERN = re.compile(r"\A---[ \t]*\r?\n(?P<frontmatter>.*?)(?:\r?\n)---[ \t]*(?:\r?\n|$)", re.DOTALL)


@component
class MarkdownToDocument:
Expand All @@ -43,7 +48,12 @@ class MarkdownToDocument:
"""

def __init__(
self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = False
self,
table_to_single_line: bool = False,
progress_bar: bool = True,
store_full_path: bool = False,
*,
extract_frontmatter: bool = False,
) -> None:
"""
Create a MarkdownToDocument component.
Expand All @@ -55,12 +65,16 @@ def __init__(
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
:param extract_frontmatter:
If True, YAML frontmatter at the beginning of the Markdown file is
removed from the document content and added to the document metadata.
"""
markdown_conversion_imports.check()

self.table_to_single_line = table_to_single_line
self.progress_bar = progress_bar
self.store_full_path = store_full_path
self.extract_frontmatter = extract_frontmatter

@component.output_types(documents=list[Document])
def run(
Expand Down Expand Up @@ -103,6 +117,7 @@ def run(
continue
try:
file_content = bytestream.data.decode("utf-8")
file_content, frontmatter = self._extract_frontmatter(file_content, source)
text = parser.render(file_content)
except Exception as conversion_e:
logger.warning(
Expand All @@ -112,7 +127,7 @@ def run(
)
continue

merged_metadata = {**bytestream.meta, **metadata}
merged_metadata = {**bytestream.meta, **frontmatter, **metadata}

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
Expand All @@ -121,3 +136,39 @@ def run(
documents.append(document)

return {"documents": documents}

def _extract_frontmatter(self, file_content: str, source: str | Path | ByteStream) -> tuple[str, dict[str, Any]]:
    """
    Split a leading YAML frontmatter block off the Markdown text.

    :param file_content:
        The decoded text of the Markdown file.
    :param source:
        The source the text came from. It is only used in warning messages.
    :returns:
        A tuple `(content, frontmatter)`. If extraction is disabled, no frontmatter block is found,
        or the block cannot be parsed, converted, or is not a mapping, `content` is the unchanged
        input and `frontmatter` is an empty dictionary. Otherwise `content` is the text after the
        frontmatter block and `frontmatter` holds the parsed values.
    """
    if not self.extract_frontmatter:
        return file_content, {}

    match = _FRONTMATTER_PATTERN.match(file_content)
    if not match:
        return file_content, {}

    try:
        parsed = yaml.safe_load(match.group("frontmatter"))
        # Round-trip through JSON so the metadata only contains JSON-compatible values;
        # `default=str` stringifies values JSON cannot encode natively (for example dates).
        frontmatter = json.loads(json.dumps(parsed, default=str))
    except yaml.YAMLError as error:
        logger.warning(
            "Could not parse YAML frontmatter in {source}. Keeping it as content. Error: {error}",
            source=source,
            error=error,
        )
        return file_content, {}
    except (TypeError, ValueError) as error:
        logger.warning(
            "Could not convert YAML frontmatter in {source}. Keeping it as content. Error: {error}",
            source=source,
            error=error,
        )
        return file_content, {}

    # Only "no value at all" counts as empty metadata. Other falsy values such as `[]`, `0` or
    # `false` are not mappings and must reach the warning below instead of being stripped silently.
    if frontmatter is None:
        frontmatter = {}

    if not isinstance(frontmatter, dict):
        logger.warning(
            "Ignoring YAML frontmatter in {source}: expected a mapping, got {kind}.",
            source=source,
            kind=type(frontmatter).__name__,
        )
        return file_content, {}

    return file_content[match.end() :], frontmatter
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
features:
- |
Added optional YAML frontmatter extraction to ``MarkdownToDocument``. When initialized with
``extract_frontmatter=True``, YAML frontmatter at the beginning of a Markdown file is removed from
the converted content and added to ``Document.meta``.
77 changes: 77 additions & 0 deletions test/components/converters/test_markdown_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def test_init_params_default(self):
converter = MarkdownToDocument()
assert converter.table_to_single_line is False
assert converter.progress_bar is True
assert converter.extract_frontmatter is False

def test_init_params_custom(self):
converter = MarkdownToDocument(table_to_single_line=True, progress_bar=False, store_full_path=False)
Expand Down Expand Up @@ -79,6 +80,82 @@ def test_run_with_meta(self, test_files_path):
assert output["documents"][0].meta["language"] == "it"
assert output["documents"][1].meta["language"] == "it"

def test_run_extracts_yaml_frontmatter_into_metadata(self):
    """Frontmatter values land in the document metadata and the block is removed from the content."""
    markdown_bytes = (
        b"---\n"
        b"ticker: AAPL\n"
        b"date: 2026-06-12\n"
        b"rating_score: 4\n"
        b"source: earnings_call\n"
        b"tags:\n"
        b"  - guidance\n"
        b"---\n"
        b"# Thesis\n"
        b"Revenue guidance improved.\n"
    )
    stream = ByteStream(data=markdown_bytes, meta={"file_path": "/tmp/aapl.md"})

    component_under_test = MarkdownToDocument(progress_bar=False, extract_frontmatter=True)
    result = component_under_test.run(sources=[stream])
    doc = result["documents"][0]

    # The body survives, the frontmatter block does not.
    assert "Revenue guidance improved." in doc.content
    assert "ticker: AAPL" not in doc.content

    # Every frontmatter key is available as metadata.
    meta = doc.meta
    assert meta["ticker"] == "AAPL"
    assert meta["date"] == "2026-06-12"
    assert meta["rating_score"] == 4
    assert meta["source"] == "earnings_call"
    assert meta["tags"] == ["guidance"]

    # Only the file name is kept because `store_full_path` defaults to False.
    assert meta["file_path"] == "aapl.md"

def test_run_keeps_frontmatter_as_content_by_default(self):
    """Without `extract_frontmatter`, the frontmatter stays in the content and adds no metadata."""
    stream = ByteStream(data=b"---\nticker: AAPL\n---\n# Thesis\n")

    result = MarkdownToDocument(progress_bar=False).run(sources=[stream])
    doc = result["documents"][0]

    assert "ticker: AAPL" in doc.content
    assert "ticker" not in doc.meta

def test_run_meta_overrides_frontmatter_metadata(self):
    """The `meta` input wins over frontmatter, and frontmatter wins over the ByteStream metadata."""
    stream = ByteStream(
        data=b"---\nticker: AAPL\nsource: filing\n---\n# Thesis\n",
        meta={"source": "bytestream"},
    )

    component_under_test = MarkdownToDocument(progress_bar=False, extract_frontmatter=True)
    result = component_under_test.run(sources=[stream], meta={"ticker": "MSFT"})
    meta = result["documents"][0].meta

    # `ticker` comes from the `meta` input, `source` from the frontmatter.
    assert meta["ticker"] == "MSFT"
    assert meta["source"] == "filing"

Comment thread
anakin87 marked this conversation as resolved.
def test_run_keeps_malformed_frontmatter_as_content_and_logs_warning(self, caplog):
    """Frontmatter that is not valid YAML is left in the content and reported with a warning."""
    stream = ByteStream(data=b"---\nticker: [AAPL\n---\n# Thesis\n")
    component_under_test = MarkdownToDocument(progress_bar=False, extract_frontmatter=True)

    with caplog.at_level(logging.WARNING):
        result = component_under_test.run(sources=[stream])

    doc = result["documents"][0]
    assert "ticker: [AAPL" in doc.content
    assert "ticker" not in doc.meta
    assert "Could not parse YAML frontmatter" in caplog.text

def test_run_keeps_unserializable_frontmatter_as_content_and_logs_warning(self, caplog):
    """Frontmatter that cannot be converted (a self-referencing alias) stays in the content and is reported."""
    stream = ByteStream(data=b"---\ncycle: &cycle\n  - *cycle\n---\n# Thesis\n")
    component_under_test = MarkdownToDocument(progress_bar=False, extract_frontmatter=True)

    with caplog.at_level(logging.WARNING):
        result = component_under_test.run(sources=[stream])

    doc = result["documents"][0]
    assert "cycle:" in doc.content
    assert "cycle" not in doc.meta
    assert "Could not convert YAML frontmatter" in caplog.text

@pytest.mark.integration
def test_run_wrong_file_type(self, test_files_path, caplog):
"""
Expand Down
Loading