Export API¶

ragcrawl provides exporters and publishers for outputting crawled content.

Overview¶

Exporters¶

Export data to structured formats:

Exporter	Format	Description
`JSONExporter`	.json	Single JSON array
`JSONLExporter`	.jsonl	JSON Lines format

Publishers¶

Output markdown files:

Publisher	Output	Description
`SinglePagePublisher`	One file	Combined markdown
`MultiPagePublisher`	Directory	Preserves structure

JSONExporter¶

Export documents to a single JSON file.

Python

from ragcrawl.export import JSONExporter
from pathlib import Path

# indent=2 is the default; pass indent=None for compact output
exporter = JSONExporter(indent=2)

# Export documents as one JSON array. `documents` is a list[Document]
# obtained elsewhere; missing parent directories of the path are created.
exporter.export_documents(documents, Path("output.json"))

# Export chunks as one JSON array. `chunks` is a list[Chunk].
exporter.export_chunks(chunks, Path("chunks.json"))

Output Format¶

JSON

[
  {
    "doc_id": "abc123",
    "source_url": "https://example.com/page",
    "title": "Page Title",
    "markdown": "# Page Title\n\nContent...",
    "status_code": 200,
    "word_count": 500
  }
]

JSONLExporter¶

Export documents to JSON Lines format (one JSON object per line).

Python

from ragcrawl.export import JSONLExporter
from pathlib import Path

exporter = JSONLExporter()

# Export documents: one JSON object per line.
# `documents` is a list[Document] obtained elsewhere.
exporter.export_documents(documents, Path("output.jsonl"))

Output Format¶

Text Only

{"doc_id":"abc123","source_url":"https://example.com/page1",...}
{"doc_id":"def456","source_url":"https://example.com/page2",...}

SinglePagePublisher¶

Combine all documents into a single markdown file.

Python

from ragcrawl.output import SinglePagePublisher
from ragcrawl.config import OutputConfig, OutputMode

# NOTE(review): SinglePagePublisher.publish reads `config.generate_toc`;
# confirm that `include_toc` below is the option that controls the TOC.
config = OutputConfig(
    mode=OutputMode.SINGLE,
    root_dir="./output",
    single_file_name="knowledge_base.md",
    include_toc=True,
    include_metadata=True,
)

publisher = SinglePagePublisher(config)
# publish() returns a one-element list holding the output file path,
# or an empty list when `documents` is empty.
files = publisher.publish(documents)

# files[0] assumes at least one document was published.
print(f"Created: {files[0]}")

Configuration¶

Option	Type	Default	Description
`single_file_name`	str	"output.md"	Output filename
`include_toc`	bool	True	Add table of contents
`include_metadata`	bool	True	Add source URLs

MultiPagePublisher¶

Output documents preserving URL structure.

Python

from ragcrawl.output import MultiPagePublisher
from ragcrawl.config import OutputConfig, OutputMode

config = OutputConfig(
    mode=OutputMode.MULTI,
    root_dir="./output",
    rewrite_links=True,
    generate_index=True,
    include_metadata=True,
)

publisher = MultiPagePublisher(config)
# publish() returns the paths of the files it created; the index file is
# appended to that list when generate_index is enabled.
files = publisher.publish(documents)

print(f"Created {len(files)} files")

Output Structure¶

Text Only

output/
├── index.md
├── example.com/
│   ├── docs/
│   │   ├── getting-started.md
│   │   └── api-reference.md
│   └── blog/
│       └── post-1.md

Configuration¶

Option	Type	Default	Description
`root_dir`	str	"./output"	Output directory
`rewrite_links`	bool	True	Rewrite internal links
`generate_index`	bool	True	Create index file
`include_metadata`	bool	True	Add frontmatter

Output Configuration¶

Python

from ragcrawl.config import OutputConfig, OutputMode

config = OutputConfig(
    mode=OutputMode.MULTI,  # or SINGLE
    root_dir="./output",

    # Single-page options
    single_file_name="combined.md",
    # NOTE(review): SinglePagePublisher.publish checks `generate_toc`;
    # confirm `include_toc` is the matching option name.
    include_toc=True,

    # Multi-page options
    rewrite_links=True,
    generate_index=True,

    # Common options
    include_metadata=True,
)

Module Reference¶

Export functionality for ragcrawl.

JSONExporter ¶

Python

JSONExporter(
    indent: int | None = 2,
    include_html: bool = False,
    include_diagnostics: bool = True,
)

Bases: Exporter

Exports documents and chunks as JSON.

Initialize JSON exporter.

PARAMETER	DESCRIPTION
`indent`	JSON indentation (None for compact). TYPE: `int \| None` DEFAULT: `2`
`include_html`	Include HTML content in export. TYPE: `bool` DEFAULT: `False`
`include_diagnostics`	Include diagnostic info. TYPE: `bool` DEFAULT: `True`

Source code in src/ragcrawl/export/json_exporter.py

Python
def __init__(
    self,
    indent: int | None = 2,
    include_html: bool = False,
    include_diagnostics: bool = True,
) -> None:
    """
    Initialize JSON exporter.

    Args:
        indent: JSON indentation (None for compact).
        include_html: Include HTML content in export.
        include_diagnostics: Include diagnostic info.
    """
    self.indent = indent
    self.include_html = include_html
    self.include_diagnostics = include_diagnostics

export_document ¶

Python

export_document(
    document: Document, path: Path | None = None
) -> str | None

Export a document as JSON.

Source code in src/ragcrawl/export/json_exporter.py

Python
def export_document(
    self, document: Document, path: Path | None = None
) -> str | None:
    """
    Serialize one document to JSON.

    Args:
        document: Document to serialize.
        path: Optional destination file. Missing parent directories are
            created before the file is written.

    Returns:
        The JSON text when no path is given; ``None`` once the text has
        been written to ``path``.
    """
    payload = json.dumps(
        self._document_to_dict(document),
        indent=self.indent,
        default=self._json_serializer,
    )

    # Without a destination the caller receives the text itself.
    if not path:
        return payload

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(payload)
    return None

export_documents ¶

Python

export_documents(
    documents: list[Document], path: Path
) -> None

Export documents as JSON array.

Source code in src/ragcrawl/export/json_exporter.py

Python
def export_documents(self, documents: list[Document], path: Path) -> None:
    """
    Write the given documents to ``path`` as a single JSON array.

    Serialization happens before the destination is touched; missing
    parent directories are then created and the file is written.
    """
    records = list(map(self._document_to_dict, documents))
    serialized = json.dumps(
        records,
        indent=self.indent,
        default=self._json_serializer,
    )

    destination_dir = path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    path.write_text(serialized)

export_chunk ¶

Python

export_chunk(
    chunk: Chunk, path: Path | None = None
) -> str | None

Export a chunk as JSON.

Source code in src/ragcrawl/export/json_exporter.py

Python
def export_chunk(self, chunk: Chunk, path: Path | None = None) -> str | None:
    """
    Serialize one chunk to JSON.

    Args:
        chunk: Chunk to serialize.
        path: Optional destination file. Missing parent directories are
            created before the file is written.

    Returns:
        The JSON text when no path is given; ``None`` once the text has
        been written to ``path``.
    """
    payload = json.dumps(
        self._chunk_to_dict(chunk),
        indent=self.indent,
        default=self._json_serializer,
    )

    # Without a destination the caller receives the text itself.
    if not path:
        return payload

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(payload)
    return None

export_chunks ¶

Python

export_chunks(chunks: list[Chunk], path: Path) -> None

Export chunks as JSON array.

Source code in src/ragcrawl/export/json_exporter.py

Python
def export_chunks(self, chunks: list[Chunk], path: Path) -> None:
    """
    Write the given chunks to ``path`` as a single JSON array.

    Serialization happens before the destination is touched; missing
    parent directories are then created and the file is written.
    """
    records = list(map(self._chunk_to_dict, chunks))
    serialized = json.dumps(
        records,
        indent=self.indent,
        default=self._json_serializer,
    )

    destination_dir = path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    path.write_text(serialized)

JSONLExporter ¶

Python

JSONLExporter(
    include_html: bool = False,
    include_diagnostics: bool = True,
)

Bases: Exporter

Exports documents and chunks as JSONL (one JSON object per line).

JSONL is better for streaming and large datasets.

Initialize JSONL exporter.

PARAMETER	DESCRIPTION
`include_html`	Include HTML content. TYPE: `bool` DEFAULT: `False`
`include_diagnostics`	Include diagnostics. TYPE: `bool` DEFAULT: `True`

Source code in src/ragcrawl/export/json_exporter.py

Python
def __init__(
    self,
    include_html: bool = False,
    include_diagnostics: bool = True,
) -> None:
    """
    Create a JSONL exporter backed by a compact JSON exporter.

    Args:
        include_html: Include HTML content.
        include_diagnostics: Include diagnostics.
    """
    self.include_html = include_html
    self.include_diagnostics = include_diagnostics

    # The wrapped exporter gets the same content options but no
    # indentation, so each serialized record is compact.
    content_options = {
        "include_html": include_html,
        "include_diagnostics": include_diagnostics,
    }
    self._json_exporter = JSONExporter(indent=None, **content_options)

export_document ¶

Python

export_document(
    document: Document, path: Path | None = None
) -> str | None

Export a document as JSONL line.

Source code in src/ragcrawl/export/json_exporter.py

Python
def export_document(
    self, document: Document, path: Path | None = None
) -> str | None:
    """
    Serialize one document as a JSONL record.

    The call is forwarded unchanged to the wrapped JSON exporter, whose
    return value is passed back to the caller.
    """
    delegate = self._json_exporter
    return delegate.export_document(document, path)

export_documents ¶

Python

export_documents(
    documents: list[Document], path: Path
) -> None

Export documents as JSONL file.

Source code in src/ragcrawl/export/json_exporter.py

Python
def export_documents(self, documents: list[Document], path: Path) -> None:
    """
    Write the given documents to ``path``, one JSON record per line.

    Missing parent directories are created first. Every record, including
    the last, is terminated by a newline.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    with path.open("w") as sink:
        sink.writelines(
            self._json_exporter.export_document(document) + "\n"
            for document in documents
        )

export_chunk ¶

Python

export_chunk(
    chunk: Chunk, path: Path | None = None
) -> str | None

Export a chunk as JSONL line.

Source code in src/ragcrawl/export/json_exporter.py

Python
def export_chunk(self, chunk: Chunk, path: Path | None = None) -> str | None:
    """
    Serialize one chunk as a JSONL record.

    The call is forwarded unchanged to the wrapped JSON exporter, whose
    return value is passed back to the caller.
    """
    delegate = self._json_exporter
    return delegate.export_chunk(chunk, path)

export_chunks ¶

Python

export_chunks(chunks: list[Chunk], path: Path) -> None

Export chunks as JSONL file.

Source code in src/ragcrawl/export/json_exporter.py

Python
def export_chunks(self, chunks: list[Chunk], path: Path) -> None:
    """
    Write the given chunks to ``path``, one JSON record per line.

    Missing parent directories are created first. Every record, including
    the last, is terminated by a newline.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    with path.open("w") as sink:
        sink.writelines(
            self._json_exporter.export_chunk(chunk) + "\n"
            for chunk in chunks
        )

Output publishing for ragcrawl.

SinglePagePublisher ¶

Python

SinglePagePublisher(config: OutputConfig)

Bases: MarkdownPublisher

Publishes all documents to a single markdown file.

Features:

- Auto-generated table of contents
- Per-page anchors for navigation
- Configurable page separators

Source code in src/ragcrawl/output/publisher.py

Python
def __init__(self, config: OutputConfig) -> None:
    """
    Set up the publisher from an output configuration.

    Args:
        config: Output configuration. It is kept as ``self.config`` and
            its ``root_dir`` becomes ``self.output_path`` as a ``Path``.
    """
    self.config = config
    root_dir = config.root_dir
    self.output_path = Path(root_dir)

publish ¶

Python

publish(documents: list[Document]) -> list[Path]

Publish all documents to a single file.

PARAMETER	DESCRIPTION
`documents`	Documents to publish. TYPE: `list[Document]`

RETURNS	DESCRIPTION
`list[Path]`	List containing the single output file path.

Source code in src/ragcrawl/output/single_page.py

Python
def publish(self, documents: list[Document]) -> list[Path]:
    """
    Publish all documents to a single file.

    Documents are ordered by depth and then by normalized URL. When
    ``config.generate_toc`` is set, a table of contents comes first.
    ``config.page_separator`` is appended after the table of contents and
    after every page, including the last one. The file is written as
    UTF-8 to ``output_path / config.single_file_name``.

    Args:
        documents: Documents to publish.

    Returns:
        List containing the single output file path, or an empty list
        (no file is written) when ``documents`` is empty.
    """
    if not documents:
        return []

    self.ensure_output_dir()

    # Sort documents by depth, then URL
    sorted_docs = sorted(documents, key=lambda d: (d.depth, d.normalized_url))

    # Build content
    content_parts = []

    # Generate TOC if enabled
    # NOTE(review): the usage examples for OutputConfig pass `include_toc`;
    # confirm it maps to the `generate_toc` attribute read here.
    if self.config.generate_toc:
        content_parts.append(self._generate_toc(sorted_docs))
        content_parts.append(self.config.page_separator)

    # Add each document
    for doc in sorted_docs:
        content_parts.append(self._format_document(doc))
        content_parts.append(self.config.page_separator)

    # Write file. Crawled pages routinely contain non-ASCII text, so the
    # encoding is pinned instead of inherited from the platform locale
    # (a non-UTF-8 default can raise UnicodeEncodeError or yield a file
    # that markdown tooling misreads).
    output_file = self.output_path / self.config.single_file_name
    output_file.write_text("".join(content_parts), encoding="utf-8")

    return [output_file]

publish_single ¶

Python

publish_single(document: Document) -> Path | None

Single page mode doesn't support individual publishing.

Source code in src/ragcrawl/output/single_page.py

Python
def publish_single(self, document: Document) -> Path | None:
    """
    Single page mode doesn't support individual publishing.

    Args:
        document: Ignored.

    Returns:
        Always ``None``; nothing is written.
    """
    return None

MultiPagePublisher ¶

Python

MultiPagePublisher(config: OutputConfig)

Bases: MarkdownPublisher

Publishes documents as individual markdown files.

Features:

- Preserves site folder structure
- Rewrites internal links to local markdown files
- Generates navigation aids (index, breadcrumbs, prev/next)
- Handles deleted pages via tombstones or redirects

Initialize multi-page publisher.

Source code in src/ragcrawl/output/multi_page.py

Python
def __init__(self, config: OutputConfig) -> None:
    """
    Initialize multi-page publisher.

    Runs the base publisher initialization, then creates the link
    rewriter and the navigation generator from the same configuration.

    Args:
        config: Output configuration, also handed to the link rewriter
            and the navigation generator.
    """
    super().__init__(config)
    self.link_rewriter = LinkRewriter(config)
    self.nav_generator = NavigationGenerator(config)

publish ¶

Python

publish(documents: list[Document]) -> list[Path]

Publish documents as individual files.

PARAMETER	DESCRIPTION
`documents`	Documents to publish. TYPE: `list[Document]`

RETURNS	DESCRIPTION
`list[Path]`	List of created file paths.

Source code in src/ragcrawl/output/multi_page.py

Python
def publish(self, documents: list[Document]) -> list[Path]:
    """
    Write every document to its own file.

    Documents are processed ordered by depth and then by normalized URL;
    each one is published together with its predecessor and successor in
    that order. When ``config.generate_index`` is set, the index path is
    appended after the document paths.

    Args:
        documents: Documents to publish.

    Returns:
        List of created file paths (empty when ``documents`` is empty).
    """
    if not documents:
        return []

    self.ensure_output_dir()

    # The link rewriter needs to know where every crawled URL will live.
    self.link_rewriter.set_url_mapping(
        {
            doc.normalized_url: self._url_to_path(doc.normalized_url)
            for doc in documents
        }
    )

    ordered = sorted(documents, key=lambda d: (d.depth, d.normalized_url))

    # Line each document up with its neighbours; the ends get None.
    predecessors = [None, *ordered[:-1]]
    successors = [*ordered[1:], None]

    written = []
    for before, current, after in zip(predecessors, ordered, successors):
        target = self._publish_document(current, before, after)
        if target:
            written.append(target)

    if self.config.generate_index:
        written.append(self._generate_index(ordered))

    return written

publish_single ¶

Python

publish_single(document: Document) -> Path | None

Publish a single document.

PARAMETER	DESCRIPTION
`document`	Document to publish. TYPE: `Document`

RETURNS	DESCRIPTION
`Path \| None`	Created file path.

Source code in src/ragcrawl/output/multi_page.py

Python
def publish_single(self, document: Document) -> Path | None:
    """
    Write one document on its own, without previous/next neighbours.

    Args:
        document: Document to publish.

    Returns:
        Whatever the document publishing step returns: the created file
        path, or ``None``.
    """
    self.ensure_output_dir()
    written = self._publish_document(document, None, None)
    return written