Skip to content

Export API

ragcrawl provides exporters and publishers for outputting crawled content.

Overview

Exporters

Export data to structured formats:

| Exporter | Format | Description |
|---|---|---|
| `JSONExporter` | `.json` | Single JSON array |
| `JSONLExporter` | `.jsonl` | JSON Lines format |

Publishers

Output markdown files:

| Publisher | Output | Description |
|---|---|---|
| `SinglePagePublisher` | One file | Combined markdown |
| `MultiPagePublisher` | Directory | Preserves structure |

JSONExporter

Export documents to a single JSON file.

Python
from ragcrawl.export import JSONExporter
from pathlib import Path

exporter = JSONExporter(indent=2)

# Export documents
exporter.export_documents(documents, Path("output.json"))

# Export chunks
exporter.export_chunks(chunks, Path("chunks.json"))

Output Format

JSON
[
  {
    "doc_id": "abc123",
    "source_url": "https://example.com/page",
    "title": "Page Title",
    "markdown": "# Page Title\n\nContent...",
    "status_code": 200,
    "word_count": 500
  }
]

JSONLExporter

Export documents to JSON Lines format (one JSON object per line).

Python
from ragcrawl.export import JSONLExporter
from pathlib import Path

exporter = JSONLExporter()

# Export documents
exporter.export_documents(documents, Path("output.jsonl"))

Output Format

Text Only
{"doc_id":"abc123","source_url":"https://example.com/page1",...}
{"doc_id":"def456","source_url":"https://example.com/page2",...}

SinglePagePublisher

Combine all documents into a single markdown file.

Python
from ragcrawl.output import SinglePagePublisher
from ragcrawl.config import OutputConfig, OutputMode

config = OutputConfig(
    mode=OutputMode.SINGLE,
    root_dir="./output",
    single_file_name="knowledge_base.md",
    include_toc=True,
    include_metadata=True,
)

publisher = SinglePagePublisher(config)
files = publisher.publish(documents)

print(f"Created: {files[0]}")

Configuration

| Option | Type | Default | Description |
|---|---|---|---|
| `single_file_name` | `str` | `"output.md"` | Output filename |
| `include_toc` | `bool` | `True` | Add table of contents |
| `include_metadata` | `bool` | `True` | Add source URLs |

MultiPagePublisher

Output documents preserving URL structure.

Python
from ragcrawl.output import MultiPagePublisher
from ragcrawl.config import OutputConfig, OutputMode

config = OutputConfig(
    mode=OutputMode.MULTI,
    root_dir="./output",
    rewrite_links=True,
    generate_index=True,
    include_metadata=True,
)

publisher = MultiPagePublisher(config)
files = publisher.publish(documents)

print(f"Created {len(files)} files")

Output Structure

Text Only
output/
├── index.md
├── example.com/
│   ├── docs/
│   │   ├── getting-started.md
│   │   └── api-reference.md
│   └── blog/
│       └── post-1.md

Configuration

| Option | Type | Default | Description |
|---|---|---|---|
| `root_dir` | `str` | `"./output"` | Output directory |
| `rewrite_links` | `bool` | `True` | Rewrite internal links |
| `generate_index` | `bool` | `True` | Create index file |
| `include_metadata` | `bool` | `True` | Add frontmatter |

Output Configuration

Python
from ragcrawl.config import OutputConfig, OutputMode

config = OutputConfig(
    mode=OutputMode.MULTI,  # or SINGLE
    root_dir="./output",

    # Single-page options
    single_file_name="combined.md",
    include_toc=True,

    # Multi-page options
    rewrite_links=True,
    generate_index=True,

    # Common options
    include_metadata=True,
)

Module Reference

Export functionality for ragcrawl.

JSONExporter

Python
JSONExporter(
    indent: int | None = 2,
    include_html: bool = False,
    include_diagnostics: bool = True,
)

Bases: Exporter

Exports documents and chunks as JSON.

Initialize JSON exporter.

PARAMETER DESCRIPTION
indent

JSON indentation (None for compact).

TYPE: int | None DEFAULT: 2

include_html

Include HTML content in export.

TYPE: bool DEFAULT: False

include_diagnostics

Include diagnostic info.

TYPE: bool DEFAULT: True

Source code in src/ragcrawl/export/json_exporter.py
Python
def __init__(
    self,
    indent: int | None = 2,
    include_html: bool = False,
    include_diagnostics: bool = True,
) -> None:
    """
    Set up the exporter's serialization options.

    Args:
        indent: JSON indentation width; ``None`` produces compact output.
        include_html: Whether HTML content is included in the export.
        include_diagnostics: Whether diagnostic info is included.
    """
    # All three options are stored unchanged as plain instance attributes:
    # the content toggles first, then the formatting option.
    self.include_diagnostics = include_diagnostics
    self.include_html = include_html
    self.indent = indent

export_document

Python
export_document(
    document: Document, path: Path | None = None
) -> str | None

Export a document as JSON.

Source code in src/ragcrawl/export/json_exporter.py
Python
def export_document(
    self, document: Document, path: Path | None = None
) -> str | None:
    """
    Serialize one document to JSON, optionally writing it to disk.

    Args:
        document: Document to serialize.
        path: Destination file. When given, missing parent directories
            are created and the JSON text is written there.

    Returns:
        The JSON text when no path is given, otherwise ``None``.
    """
    serialized = json.dumps(
        self._document_to_dict(document),
        indent=self.indent,
        default=self._json_serializer,
    )

    # No destination: hand the text back to the caller.
    if not path:
        return serialized

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(serialized)
    return None

export_documents

Python
export_documents(
    documents: list[Document], path: Path
) -> None

Export documents as JSON array.

Source code in src/ragcrawl/export/json_exporter.py
Python
def export_documents(self, documents: list[Document], path: Path) -> None:
    """
    Write all documents to ``path`` as a single JSON array.

    Args:
        documents: Documents to serialize, in output order.
        path: Destination file; missing parent directories are created.
    """
    records = list(map(self._document_to_dict, documents))
    payload = json.dumps(
        records,
        indent=self.indent,
        default=self._json_serializer,
    )

    # Serialization happens first, so nothing is created on disk if it fails.
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(payload)

export_chunk

Python
export_chunk(
    chunk: Chunk, path: Path | None = None
) -> str | None

Export a chunk as JSON.

Source code in src/ragcrawl/export/json_exporter.py
Python
def export_chunk(self, chunk: Chunk, path: Path | None = None) -> str | None:
    """
    Serialize one chunk to JSON, optionally writing it to disk.

    Args:
        chunk: Chunk to serialize.
        path: Destination file. When given, missing parent directories
            are created and the JSON text is written there.

    Returns:
        The JSON text when no path is given, otherwise ``None``.
    """
    serialized = json.dumps(
        self._chunk_to_dict(chunk),
        indent=self.indent,
        default=self._json_serializer,
    )

    # No destination: hand the text back to the caller.
    if not path:
        return serialized

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(serialized)
    return None

export_chunks

Python
export_chunks(chunks: list[Chunk], path: Path) -> None

Export chunks as JSON array.

Source code in src/ragcrawl/export/json_exporter.py
Python
def export_chunks(self, chunks: list[Chunk], path: Path) -> None:
    """
    Write all chunks to ``path`` as a single JSON array.

    Args:
        chunks: Chunks to serialize, in output order.
        path: Destination file; missing parent directories are created.
    """
    records = list(map(self._chunk_to_dict, chunks))
    payload = json.dumps(
        records,
        indent=self.indent,
        default=self._json_serializer,
    )

    # Serialization happens first, so nothing is created on disk if it fails.
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(payload)

JSONLExporter

Python
JSONLExporter(
    include_html: bool = False,
    include_diagnostics: bool = True,
)

Bases: Exporter

Exports documents and chunks as JSONL (one JSON object per line).

JSONL is better for streaming and large datasets.

Initialize JSONL exporter.

PARAMETER DESCRIPTION
include_html

Include HTML content.

TYPE: bool DEFAULT: False

include_diagnostics

Include diagnostics.

TYPE: bool DEFAULT: True

Source code in src/ragcrawl/export/json_exporter.py
Python
def __init__(
    self,
    include_html: bool = False,
    include_diagnostics: bool = True,
) -> None:
    """
    Set up the JSONL exporter and its internal JSON serializer.

    Args:
        include_html: Whether HTML content is included.
        include_diagnostics: Whether diagnostics are included.
    """
    # Each record is serialized by a JSONExporter with indentation disabled
    # (indent=None), using the same content options as this exporter.
    self._json_exporter = JSONExporter(
        include_diagnostics=include_diagnostics,
        include_html=include_html,
        indent=None,
    )
    self.include_diagnostics = include_diagnostics
    self.include_html = include_html

export_document

Python
export_document(
    document: Document, path: Path | None = None
) -> str | None

Export a document as JSONL line.

Source code in src/ragcrawl/export/json_exporter.py
Python
def export_document(
    self, document: Document, path: Path | None = None
) -> str | None:
    """
    Export a single document as one JSONL line.

    Delegates to the internal JSON exporter, passing ``document`` and
    ``path`` through unchanged, and returns whatever that call returns.
    """
    delegate = self._json_exporter
    outcome = delegate.export_document(document, path)
    return outcome

export_documents

Python
export_documents(
    documents: list[Document], path: Path
) -> None

Export documents as JSONL file.

Source code in src/ragcrawl/export/json_exporter.py
Python
def export_documents(self, documents: list[Document], path: Path) -> None:
    """
    Write documents to ``path`` in JSON Lines form.

    Each document is serialized by the internal JSON exporter and written
    as its own newline-terminated line. Missing parent directories are
    created, and the file is opened in write mode.

    Args:
        documents: Documents to write, in output order.
        path: Destination ``.jsonl`` file.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    with path.open("w") as handle:
        handle.writelines(
            self._json_exporter.export_document(doc) + "\n" for doc in documents
        )

export_chunk

Python
export_chunk(
    chunk: Chunk, path: Path | None = None
) -> str | None

Export a chunk as JSONL line.

Source code in src/ragcrawl/export/json_exporter.py
Python
def export_chunk(self, chunk: Chunk, path: Path | None = None) -> str | None:
    """
    Export a single chunk as one JSONL line.

    Delegates to the internal JSON exporter, passing ``chunk`` and ``path``
    through unchanged, and returns whatever that call returns.
    """
    delegate = self._json_exporter
    outcome = delegate.export_chunk(chunk, path)
    return outcome

export_chunks

Python
export_chunks(chunks: list[Chunk], path: Path) -> None

Export chunks as JSONL file.

Source code in src/ragcrawl/export/json_exporter.py
Python
def export_chunks(self, chunks: list[Chunk], path: Path) -> None:
    """
    Write chunks to ``path`` in JSON Lines form.

    Each chunk is serialized by the internal JSON exporter and written as
    its own newline-terminated line. Missing parent directories are
    created, and the file is opened in write mode.

    Args:
        chunks: Chunks to write, in output order.
        path: Destination ``.jsonl`` file.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    with path.open("w") as handle:
        handle.writelines(
            self._json_exporter.export_chunk(chunk) + "\n" for chunk in chunks
        )

Output publishing for ragcrawl.

SinglePagePublisher

Python
SinglePagePublisher(config: OutputConfig)

Bases: MarkdownPublisher

Publishes all documents to a single markdown file.

Features:

- Auto-generated table of contents
- Per-page anchors for navigation
- Configurable page separators

Source code in src/ragcrawl/output/publisher.py
Python
def __init__(self, config: OutputConfig) -> None:
    """
    Store the output configuration and derive the output directory.

    Args:
        config: Output configuration; its ``root_dir`` becomes
            ``self.output_path`` as a ``Path``.
    """
    root = Path(config.root_dir)
    self.config = config
    self.output_path = root

publish

Python
publish(documents: list[Document]) -> list[Path]

Publish all documents to a single file.

PARAMETER DESCRIPTION
documents

Documents to publish.

TYPE: list[Document]

RETURNS DESCRIPTION
list[Path]

List containing the single output file path.

Source code in src/ragcrawl/output/single_page.py
Python
def publish(self, documents: list[Document]) -> list[Path]:
    """
    Publish all documents to a single file.

    Documents are ordered by depth, then by normalized URL. Each formatted
    document is followed by the configured page separator, and a table of
    contents (also followed by a separator) is placed first when enabled.
    The file is always written as UTF-8.

    Args:
        documents: Documents to publish.

    Returns:
        List containing the single output file path, or an empty list
        when there are no documents (nothing is written in that case).
    """
    if not documents:
        return []

    self.ensure_output_dir()

    # Sort documents by depth, then URL
    sorted_docs = sorted(documents, key=lambda d: (d.depth, d.normalized_url))

    content_parts = []

    # NOTE(review): the usage examples name this option `include_toc`,
    # while this code reads `generate_toc` -- confirm which attribute
    # OutputConfig actually defines.
    if self.config.generate_toc:
        content_parts.append(self._generate_toc(sorted_docs))
        content_parts.append(self.config.page_separator)

    for doc in sorted_docs:
        content_parts.append(self._format_document(doc))
        content_parts.append(self.config.page_separator)

    output_file = self.output_path / self.config.single_file_name
    # Crawled markdown routinely contains non-ASCII text. Relying on the
    # locale's default encoding can raise UnicodeEncodeError or produce
    # mojibake on non-UTF-8 systems, so write UTF-8 explicitly.
    output_file.write_text("".join(content_parts), encoding="utf-8")

    return [output_file]

publish_single

Python
publish_single(document: Document) -> Path | None

Single page mode doesn't support individual publishing.

Source code in src/ragcrawl/output/single_page.py
Python
def publish_single(self, document: Document) -> Path | None:
    """
    Decline to publish an individual document.

    Single page mode doesn't support individual publishing, so the
    document is ignored and nothing is written.

    Returns:
        Always ``None``.
    """
    return None

MultiPagePublisher

Python
MultiPagePublisher(config: OutputConfig)

Bases: MarkdownPublisher

Publishes documents as individual markdown files.

Features:

- Preserves site folder structure
- Rewrites internal links to local markdown files
- Generates navigation aids (index, breadcrumbs, prev/next)
- Handles deleted pages via tombstones or redirects

Initialize multi-page publisher.

Source code in src/ragcrawl/output/multi_page.py
Python
def __init__(self, config: OutputConfig) -> None:
    """
    Initialize multi-page publisher.

    Runs the base-class initializer with ``config``, then creates the
    ``LinkRewriter`` and ``NavigationGenerator`` helpers from the same
    configuration.

    Args:
        config: Output configuration shared by the publisher and its helpers.
    """
    super().__init__(config)
    self.link_rewriter = LinkRewriter(config)
    self.nav_generator = NavigationGenerator(config)

publish

Python
publish(documents: list[Document]) -> list[Path]

Publish documents as individual files.

PARAMETER DESCRIPTION
documents

Documents to publish.

TYPE: list[Document]

RETURNS DESCRIPTION
list[Path]

List of created file paths.

Source code in src/ragcrawl/output/multi_page.py
Python
def publish(self, documents: list[Document]) -> list[Path]:
    """
    Publish documents as individual files.

    The link rewriter first receives a mapping from every document's
    normalized URL to its output path. Documents are then published in
    (depth, normalized URL) order, each with its neighbours in that order
    as prev/next. An index is generated last when the configuration
    enables it.

    Args:
        documents: Documents to publish.

    Returns:
        List of created file paths (the index path last, if generated);
        empty when there are no documents.
    """
    if not documents:
        return []

    self.ensure_output_dir()

    # The link rewriter needs the complete URL -> path table up front.
    self.link_rewriter.set_url_mapping(
        {doc.normalized_url: self._url_to_path(doc.normalized_url) for doc in documents}
    )

    ordered = sorted(documents, key=lambda d: (d.depth, d.normalized_url))

    # Pair each document with its neighbours; the two ends get None.
    predecessors = [None, *ordered[:-1]]
    successors = [*ordered[1:], None]

    written = []
    for before, current, after in zip(predecessors, ordered, successors):
        target = self._publish_document(current, before, after)
        if target:
            written.append(target)

    if self.config.generate_index:
        written.append(self._generate_index(ordered))

    return written

publish_single

Python
publish_single(document: Document) -> Path | None

Publish a single document.

PARAMETER DESCRIPTION
document

Document to publish.

TYPE: Document

RETURNS DESCRIPTION
Path | None

Created file path.

Source code in src/ragcrawl/output/multi_page.py
Python
def publish_single(self, document: Document) -> Path | None:
    """
    Publish one document on its own.

    Ensures the output directory exists, then publishes the document with
    no previous or next neighbour.

    Args:
        document: Document to publish.

    Returns:
        Whatever ``_publish_document`` returns for the document (the
        created file path, or ``None``).
    """
    self.ensure_output_dir()

    # A standalone page has no prev/next neighbours.
    created = self._publish_document(document, None, None)
    return created