VectifyAI · KylinMountain · May 31, 2026 · May 31, 2026 · May 31, 2026 · May 31, 2026
diff --git a/README.md b/README.md
@@ -266,6 +266,7 @@ Settings are initialized by `openkb init`, and stored in `.openkb/config.yaml`:
 model: gpt-5.4                   # LLM model (any LiteLLM-supported provider)
 language: en                     # Wiki output language
 pageindex_threshold: 20          # PDF pages threshold for PageIndex
+parser: local                    # Document parser: local | mineru | mistral | vlm
 ```
 
 Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/providers) (OpenAI models can omit the prefix):
@@ -276,6 +277,50 @@ Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/p
 | Anthropic | `anthropic/claude-sonnet-4-6` |
 | Gemini | `gemini/gemini-3.1-pro-preview` |
 
+### Document parsers
+
+By default OpenKB extracts Markdown locally (pymupdf for PDFs, markitdown for
+Office/HTML) — no extra dependencies, unchanged behavior. For higher accuracy on
+complex documents you can route the file → Markdown step through an online or
+self-hosted parser:
+
+```yaml
+# .openkb/config.yaml
+parser: mineru          # local (default) | mineru | mistral | vlm
+parsers:
+  mineru:
+    mode: cloud         # cloud | self_hosted
+    base_url: http://localhost:8000   # required when mode is self_hosted
+  vlm:
+    model: gemini/gemini-2.5-pro      # any LiteLLM vision model (Gemini, GPT-4o, Claude, …)
+```
+
+Install the optional dependency for your parser:
+
+```bash
+pip install openkb[mistral]   # Mistral OCR
+pip install openkb[mineru]    # MinerU (HTTP)
+pip install openkb[parsers]   # all online parsers
+# vlm uses the existing LiteLLM dependency — no extra needed
+```
+
+Set the parser's API key via an environment variable: `MINERU_API_KEY` (MinerU
+cloud mode) or `MISTRAL_API_KEY` (Mistral OCR); the `vlm` parser reuses the
+existing `LLM_API_KEY`. Override the configured parser for a single run with
+`openkb add --parser mistral file.pdf` (`local | mineru | mistral | vlm`).
+
+Each parser handles a subset of formats — `mineru` covers PDF, Word, PPT, Excel,
+and HTML; `mistral` and `vlm` cover PDF. `.md` and any unsupported format always
+fall back to the local parser.
+
+The `vlm` parser is **text-only**: it transcribes a document's text via a vision
+LLM but does **not** extract embedded figures/images. Use `mineru`, `mistral`, or
+`local` if you need image extraction.
+
+> **Note:** Long PDFs (≥ `pageindex_threshold` pages, default 20) continue to be
+> indexed with PageIndex and are **not** affected by the `parser` setting. The
+> parser governs the file → Markdown step for shorter documents and non-PDF files.
+
 ### PageIndex Integration
 
 Long documents are challenging for LLMs due to context limits, context rot, and summarization loss.

diff --git a/openkb/cli.py b/openkb/cli.py
@@ -43,6 +43,7 @@ def filter(self, record: logging.LogRecord) -> bool:
 from openkb.config import DEFAULT_CONFIG, load_config, save_config, load_global_config, register_kb
 from openkb.converter import convert_document
 from openkb.log import append_log
+from openkb.parsers.registry import VALID_PARSERS
 from openkb.schema import AGENTS_MD
 
 # Suppress warnings after all imports — markitdown overrides filters at import time
@@ -124,17 +125,19 @@ def _setup_llm_key(kb_dir: Path | None = None) -> None:
     else:
         litellm.api_key = api_key
 
-        # Dynamically set the provider-specific env var when possible
         if provider:
+            # Active provider is known — set only its key, so LLM_API_KEY is not
+            # sprayed into unrelated provider keys (e.g. MISTRAL_API_KEY, which the
+            # Mistral parser treats as a real Mistral credential).
             provider_env = f"{provider.upper()}_API_KEY"
             if not os.environ.get(provider_env):
                 os.environ[provider_env] = api_key
-
-        # Fallback: also set common provider keys so multi-provider
-        # configs (e.g. PageIndex Cloud) still work
-        for env_var in _KNOWN_PROVIDER_KEYS:
-            if not os.environ.get(env_var):
-                os.environ[env_var] = api_key
+        else:
+            # Provider couldn't be determined — fall back to setting the common
+            # provider keys so multi-provider configs still work.
+            for env_var in _KNOWN_PROVIDER_KEYS:
+                if not os.environ.get(env_var):
+                    os.environ[env_var] = api_key
 
 # Supported document extensions for the `add` command
 SUPPORTED_EXTENSIONS = {
@@ -259,7 +262,7 @@ def _clear_existing_skill_dir(kb_dir: Path, name: str) -> None:
         shutil.rmtree(target)
 
 
-def add_single_file(file_path: Path, kb_dir: Path) -> Literal["added", "skipped", "failed"]:
+def add_single_file(file_path: Path, kb_dir: Path, parser_override: str | None = None) -> Literal["added", "skipped", "failed"]:
     """Convert, index, and compile a single document into the knowledge base.
 
     Steps:
@@ -289,7 +292,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> Literal["added", "skipped"
     # 2. Convert document
     click.echo(f"Adding: {file_path.name}")
     try:
-        result = convert_document(file_path, kb_dir)
+        result = convert_document(file_path, kb_dir, parser_override=parser_override)
     except Exception as exc:
         click.echo(f"  [ERROR] Conversion failed: {exc}")
         logger.debug("Conversion traceback:", exc_info=True)
@@ -575,8 +578,11 @@ def init(model, language):
 
 @cli.command()
 @click.argument("path")
+@click.option("--parser", "parser_override", default=None,
+              type=click.Choice(VALID_PARSERS),
+              help="Override the configured parser for this run.")
 @click.pass_context
-def add(ctx, path):
+def add(ctx, path, parser_override):
     """Add a document or directory of documents at PATH to the knowledge base.
 
     PATH may be a local file, a local directory (which is walked
@@ -600,7 +606,7 @@ def add(ctx, path):
         fetched = fetch_url_to_raw(path, kb_dir)
         if fetched is None:
             return
-        outcome = add_single_file(fetched, kb_dir)
+        outcome = add_single_file(fetched, kb_dir, parser_override=parser_override)
         # Only clean up on dedup-skip. On "failed" we keep the file so
         # the user can retry (e.g. transient LLM error during compile)
         # without re-downloading — and so they don't lose data when
@@ -626,15 +632,15 @@ def add(ctx, path):
         click.echo(f"Found {total} supported file(s) in {path}.")
         for i, f in enumerate(files, 1):
             click.echo(f"\n[{i}/{total}] ", nl=False)
-            add_single_file(f, kb_dir)
+            add_single_file(f, kb_dir, parser_override=parser_override)
     else:
         if target.suffix.lower() not in SUPPORTED_EXTENSIONS:
             click.echo(
                 f"Unsupported file type: {target.suffix}. "
                 f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
             )
             return
-        add_single_file(target, kb_dir)
+        add_single_file(target, kb_dir, parser_override=parser_override)
 
 
 def _stream_to_tty() -> bool:

diff --git a/openkb/config.py b/openkb/config.py
@@ -9,6 +9,7 @@
     "model": "gpt-5.4-mini",
     "language": "en",
     "pageindex_threshold": 20,
+    "parser": "local",
 }
 
 GLOBAL_CONFIG_DIR = Path.home() / ".config" / "openkb"

diff --git a/openkb/converter.py b/openkb/converter.py
@@ -7,10 +7,11 @@
 from pathlib import Path
 
 import pymupdf
-from markitdown import MarkItDown
 
 from openkb.config import load_config
-from openkb.images import copy_relative_images, extract_base64_images, convert_pdf_with_images
+from openkb.images import localize_images
+from openkb.parsers import get_parser
+from openkb.parsers.local import LocalParser
 from openkb.state import HashRegistry
 
 logger = logging.getLogger(__name__)
@@ -33,16 +34,17 @@ def get_pdf_page_count(path: Path) -> int:
         return doc.page_count
 
 
-def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
+def convert_document(src: Path, kb_dir: Path, parser_override: str | None = None) -> ConvertResult:
     """Convert a document and integrate it into the knowledge base.
 
     Steps:
     1. Hash-check — skip if already known.
     2. Copy source to ``raw/``.
     3. If PDF and page count >= threshold → return :attr:`ConvertResult.is_long_doc`.
-    4. If ``.md`` — read, process relative images, save to ``wiki/sources/``.
-    5. Otherwise — run MarkItDown, extract base64 images, save to ``wiki/sources/``.
-    6. Register hash in the registry.
+    4. Select a parser via :func:`get_parser` (falling back to
+       :class:`LocalParser` for unsupported suffixes like ``.md``), parse the
+       file to Markdown, localize images, and save to ``wiki/sources/``.
+    5. Register hash in the registry.
     """
     # ------------------------------------------------------------------
     # Load config & state
@@ -84,7 +86,7 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
             return ConvertResult(raw_path=raw_dest, is_long_doc=True, file_hash=file_hash)
 
     # ------------------------------------------------------------------
-    # 4/5. Convert to Markdown
+    # 4. Select parser, convert to Markdown, localize images
     # ------------------------------------------------------------------
     sources_dir = kb_dir / "wiki" / "sources"
     sources_dir.mkdir(parents=True, exist_ok=True)
@@ -93,18 +95,27 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
 
     doc_name = src.stem
 
-    if src.suffix.lower() == ".md":
-        markdown = src.read_text(encoding="utf-8")
-        markdown = copy_relative_images(markdown, src.parent, doc_name, images_dir)
-    elif src.suffix.lower() == ".pdf":
-        # Use pymupdf dict-mode for PDFs: text + images inline at correct positions
-        markdown = convert_pdf_with_images(src, doc_name, images_dir)
+    parser = get_parser(
+        config,
+        override=parser_override,
+        doc_name=doc_name,
+        images_dir=images_dir,
+        source_dir=src.parent,
+    )
+    if not parser.supports(src.suffix):
+        if parser.name != "local":
+            logger.warning(
+                "Parser %r does not support %r; falling back to the local parser for %s.",
+                parser.name, src.suffix, src.name,
+            )
+        parser = LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=src.parent)
+
+    parse_result = parser.parse(src)
+    if parser.name == "local":
+        # LocalParser already persisted images and produced canonical links.
+        markdown = parse_result.markdown
     else:
-        # Non-PDF, non-MD: use markitdown (docx, pptx, html, etc.)
-        mid = MarkItDown()
-        result = mid.convert(str(src))
-        markdown = result.text_content
-        markdown = extract_base64_images(markdown, doc_name, images_dir)
+        markdown = localize_images(parse_result.markdown, parse_result.images, doc_name, images_dir)
 
     dest_md = sources_dir / f"{doc_name}.md"
     dest_md.write_text(markdown, encoding="utf-8")

diff --git a/openkb/images.py b/openkb/images.py
@@ -17,6 +17,10 @@
 # Matches: ![alt](relative/path) — excludes http(s):// and data: URIs
 _RELATIVE_RE = re.compile(r'!\[([^\]]*)\]\((?!https?://|data:)([^)]+)\)')
 
+# Matches an image link, capturing: (prefix `![alt](` + ws)(target)(optional
+# title + ws)(closing `)`). Used to rewrite links by their target's basename.
+_IMG_LINK_RE = re.compile(r'(!\[[^\]]*\]\(\s*)([^)\s]+)(\s*(?:"[^"]*"|\'[^\']*\')?\s*)(\))')
+
 
 # Minimum pixel dimension — skip icons, bullets, and tiny artifacts
 _MIN_IMAGE_DIM = 32
@@ -211,6 +215,44 @@ def extract_base64_images(markdown: str, doc_name: str, images_dir: Path) -> str
     return result
 
 
+def localize_images(
+    markdown: str,
+    images: dict[str, bytes],
+    doc_name: str,
+    images_dir: Path,
+) -> str:
+    """Persist parser-supplied images and normalize image links.
+
+    1. Write every ``images`` entry to ``images_dir`` under its basename
+       (``Path(filename).name``), so a name with ``/`` directory components or
+       an absolute path can never write outside ``images_dir``.
+    2. Rewrite markdown image links whose target's basename matches a written
+       image to the canonical ``sources/images/{doc_name}/{basename}`` path —
+       this handles bare names, directory-prefixed targets (e.g.
+       ``images/fig.png``), and links carrying a title attribute.
+    3. Localize any inline base64 images via :func:`extract_base64_images`.
+
+    Returns the normalized markdown.
+    """
+    images_dir.mkdir(parents=True, exist_ok=True)  # created even when ``images`` is empty
+    safe_names: set[str] = set()  # basenames actually written; drives the link rewrite below
+    for filename, data in images.items():
+        safe = Path(filename).name or "image"  # an empty basename falls back to "image"
+        (images_dir / safe).write_bytes(data)  # NOTE(review): entries sharing a basename overwrite each other
+        safe_names.add(safe)
+
+    def _rewrite(m: "re.Match[str]") -> str:
+        pre, target, title, close = m.group(1), m.group(2), m.group(3), m.group(4)
+        base = Path(target).name  # compare by basename so directory-prefixed targets match
+        if base in safe_names:
+            return f"{pre}sources/images/{doc_name}/{base}{title}{close}"
+        return m.group(0)  # basename is not among the written images — leave the link untouched
+
+    result = _IMG_LINK_RE.sub(_rewrite, markdown)
+    result = extract_base64_images(result, doc_name, images_dir)  # runs after the rewrite, on its output
+    return result
+
+
 def copy_relative_images(
     markdown: str, source_dir: Path, doc_name: str, images_dir: Path
 ) -> str:

diff --git a/openkb/parsers/__init__.py b/openkb/parsers/__init__.py
@@ -0,0 +1,5 @@
+"""Pluggable document parsers for the file → Markdown step."""
+from openkb.parsers.base import ParseResult, Parser
+from openkb.parsers.registry import get_parser
+
+__all__ = ["ParseResult", "Parser", "get_parser"]
diff --git a/openkb/parsers/base.py b/openkb/parsers/base.py
@@ -0,0 +1,33 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+@dataclass
+class ParseResult:
+    """Normalized output of a parser.
+
+    ``markdown`` references images either as bare filenames present in
+    ``images`` or as inline base64 data URIs. ``images`` maps a filename to
+    its raw bytes; the caller persists them and rewrites links via
+    :func:`openkb.images.localize_images`.
+    """
+
+    markdown: str  # Markdown text of the parsed document (required)
+    images: dict[str, bytes] = field(default_factory=dict)  # filename -> raw bytes; fresh empty dict per instance
+
+
+class Parser(ABC):
+    """Converts a source document to Markdown."""
+
+    name: str  # annotation only — concrete subclasses assign the identifier (e.g. "local")
+
+    @abstractmethod
+    def supports(self, suffix: str) -> bool:
+        """Return True if this parser handles files with ``suffix`` (e.g. ``.pdf``)."""
+
+    @abstractmethod
+    def parse(self, src: Path) -> ParseResult:
+        """Parse ``src`` and return a :class:`ParseResult`."""
diff --git a/openkb/parsers/local.py b/openkb/parsers/local.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from markitdown import MarkItDown
+
+from openkb.images import (
+    convert_pdf_with_images,
+    copy_relative_images,
+    extract_base64_images,
+)
+from openkb.parsers.base import ParseResult, Parser
+
+_LOCAL_EXTENSIONS = {  # lowercase dotted suffixes accepted by LocalParser.supports()
+    ".pdf", ".md", ".markdown", ".docx", ".pptx", ".xlsx", ".xls",
+    ".html", ".htm", ".txt", ".csv",
+}
+
+
+class LocalParser(Parser):
+    """Default parser: pymupdf for PDF, markitdown for office/html, direct read for md."""
+
+    name = "local"  # identifier callers compare against to detect the local parser
+
+    def __init__(self, doc_name: str = "", images_dir: Path | None = None,
+                 source_dir: Path | None = None):
+        self.doc_name = doc_name  # forwarded to the image helpers called in parse()
+        self.images_dir = images_dir  # NOTE(review): None is forwarded as-is; callers presumably always pass a directory
+        self.source_dir = source_dir  # base for relative Markdown image links; None means "use src.parent"
+
+    def supports(self, suffix: str) -> bool:  # case-insensitive membership test
+        return suffix.lower() in _LOCAL_EXTENSIONS
+
+    def parse(self, src: Path) -> ParseResult:  # returns markdown only; ``images`` keeps its empty default
+        suffix = src.suffix.lower()
+        if suffix in {".md", ".markdown"}:
+            markdown = src.read_text(encoding="utf-8")
+            markdown = copy_relative_images(  # honor the configured source_dir rather than ignoring it
+                markdown, self.source_dir or src.parent, self.doc_name, self.images_dir
+            )
+        elif suffix == ".pdf":
+            markdown = convert_pdf_with_images(src, self.doc_name, self.images_dir)
+        else:  # any other suffix goes through MarkItDown, then inline base64 images are extracted
+            mid = MarkItDown()
+            markdown = mid.convert(str(src)).text_content
+            markdown = extract_base64_images(markdown, self.doc_name, self.images_dir)
+        return ParseResult(markdown=markdown)