From 7a5f78151db9bb172f3320ed4fc1cd58f6415fc6 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 10:35:28 +0800
Subject: [PATCH 01/26] feat(parsers): add ParseResult and Parser ABC (#77)

---
 openkb/parsers/__init__.py |  4 ++++
 openkb/parsers/base.py     | 33 +++++++++++++++++++++++++++++++++
 tests/test_parsers_base.py | 24 ++++++++++++++++++++++++
 3 files changed, 61 insertions(+)
 create mode 100644 openkb/parsers/__init__.py
 create mode 100644 openkb/parsers/base.py
 create mode 100644 tests/test_parsers_base.py

diff --git a/openkb/parsers/__init__.py b/openkb/parsers/__init__.py
new file mode 100644
index 00000000..0656d733
--- /dev/null
+++ b/openkb/parsers/__init__.py
@@ -0,0 +1,4 @@
+"""Pluggable document parsers for the file → Markdown step."""
+from openkb.parsers.base import ParseResult, Parser
+
+__all__ = ["ParseResult", "Parser"]
diff --git a/openkb/parsers/base.py b/openkb/parsers/base.py
new file mode 100644
index 00000000..deb07d60
--- /dev/null
+++ b/openkb/parsers/base.py
@@ -0,0 +1,33 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+@dataclass
+class ParseResult:
+    """Normalized output of a parser.
+
+    ``markdown`` references images either as bare filenames present in
+    ``images`` or as inline base64 data URIs. ``images`` maps a filename to
+    its raw bytes; the caller persists them and rewrites links via
+    :func:`openkb.images.localize_images`.
+    """
+
+    markdown: str
+    images: dict[str, bytes] = field(default_factory=dict)
+
+
+class Parser(ABC):
+    """Converts a source document to Markdown."""
+
+    name: str
+
+    @abstractmethod
+    def supports(self, suffix: str) -> bool:
+        """Return True if this parser handles files with ``suffix`` (e.g. ``.pdf``)."""
+
+    @abstractmethod
+    def parse(self, src: Path) -> ParseResult:
+        """Parse ``src`` and return a :class:`ParseResult`."""
diff --git a/tests/test_parsers_base.py b/tests/test_parsers_base.py
new file mode 100644
index 00000000..1c119a32
--- /dev/null
+++ b/tests/test_parsers_base.py
@@ -0,0 +1,24 @@
+"""Tests for the parser abstraction base types."""
+from __future__ import annotations
+
+import pytest
+
+from openkb.parsers.base import ParseResult, Parser
+
+
+def test_parse_result_defaults_to_empty_images():
+    pr = ParseResult(markdown="# Hi")
+    assert pr.markdown == "# Hi"
+    assert pr.images == {}
+
+
+def test_parser_is_abstract():
+    with pytest.raises(TypeError):
+        Parser()  # cannot instantiate abstract base
+
+
+def test_concrete_parser_must_implement_parse_and_supports():
+    class Incomplete(Parser):
+        name = "incomplete"
+    with pytest.raises(TypeError):
+        Incomplete()

From 592d11dab320e5b6ccf5b5c292e3e9c6ce5faae7 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 10:38:17 +0800
Subject: [PATCH 02/26] feat(images): add localize_images helper for parser
 output (#77)

---
 openkb/images.py     | 29 +++++++++++++++++++++++++++++
 tests/test_images.py | 32 +++++++++++++++++++++++++++++++-
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/openkb/images.py b/openkb/images.py
index 76284148..84ed6160 100644
--- a/openkb/images.py
+++ b/openkb/images.py
@@ -211,6 +211,35 @@ def extract_base64_images(markdown: str, doc_name: str, images_dir: Path) -> str
     return result
 
 
+def localize_images(
+    markdown: str,
+    images: dict[str, bytes],
+    doc_name: str,
+    images_dir: Path,
+) -> str:
+    """Persist parser-supplied images and normalize all image links.
+
+    1. Write every ``images`` entry (filename -> bytes) into ``images_dir``.
+    2. Rewrite bare-filename references ``![alt](filename)`` (filename present
+       in ``images``) to the canonical ``sources/images/{doc_name}/{filename}``.
+    3. Run :func:`extract_base64_images` to localize any inline base64 images.
+
+    Returns the normalized markdown.
+    """
+    images_dir.mkdir(parents=True, exist_ok=True)
+    result = markdown
+    for filename, data in images.items():
+        (images_dir / filename).write_bytes(data)
+        # Rewrite a bare ![alt](filename) reference to the canonical KB path.
+        result = re.sub(
+            r"(!\[[^\]]*\]\()" + re.escape(filename) + r"(\))",
+            r"\g<1>" + f"sources/images/{doc_name}/{filename}" + r"\g<2>",
+            result,
+        )
+    result = extract_base64_images(result, doc_name, images_dir)
+    return result
+
+
 def copy_relative_images(
     markdown: str, source_dir: Path, doc_name: str, images_dir: Path
 ) -> str:
diff --git a/tests/test_images.py b/tests/test_images.py
index 9abb3ec2..97d98fee 100644
--- a/tests/test_images.py
+++ b/tests/test_images.py
@@ -4,7 +4,7 @@
 import base64
 
 
-from openkb.images import copy_relative_images, extract_base64_images
+from openkb.images import copy_relative_images, extract_base64_images, localize_images
 
 
 # ---------------------------------------------------------------------------
@@ -164,3 +164,33 @@ def test_multiple_relative_images_all_copied(self, tmp_path):
         assert "![b](sources/images/doc/b.jpg)" in result
         assert (images_dir / "a.png").exists()
         assert (images_dir / "b.jpg").exists()
+
+
+# ---------------------------------------------------------------------------
+# localize_images
+# ---------------------------------------------------------------------------
+
+
+def test_localize_images_writes_bytes_and_rewrites_bare_refs(tmp_path):
+    images_dir = tmp_path / "wiki" / "sources" / "images" / "doc"
+    md = "Before\n\n![fig](p1_img1.png)\n\nAfter"
+    out = localize_images(md, {"p1_img1.png": b"PNGDATA"}, "doc", images_dir)
+    assert "![fig](sources/images/doc/p1_img1.png)" in out
+    assert (images_dir / "p1_img1.png").read_bytes() == b"PNGDATA"
+
+
+def test_localize_images_handles_inline_base64(tmp_path):
+    import base64
+    images_dir = tmp_path / "wiki" / "sources" / "images" / "doc"
+    payload = base64.b64encode(b"JPEGDATA").decode()
+    md = f"![x](data:image/jpeg;base64,{payload})"
+    out = localize_images(md, {}, "doc", images_dir)
+    assert "sources/images/doc/img_001.jpeg" in out
+    assert (images_dir / "img_001.jpeg").read_bytes() == b"JPEGDATA"
+
+
+def test_localize_images_leaves_unreferenced_bytes_on_disk(tmp_path):
+    images_dir = tmp_path / "wiki" / "sources" / "images" / "doc"
+    out = localize_images("no images here", {"orphan.png": b"X"}, "doc", images_dir)
+    assert out == "no images here"
+    assert (images_dir / "orphan.png").read_bytes() == b"X"

From cec36183a0cc6b1fde840d28227244341a1c5a60 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 10:41:31 +0800
Subject: [PATCH 03/26] fix(images): use replacement function in
 localize_images to handle arbitrary filenames (#77)

---
 openkb/images.py     | 11 ++++++-----
 tests/test_images.py |  9 +++++++++
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/openkb/images.py b/openkb/images.py
index 84ed6160..9c6a424c 100644
--- a/openkb/images.py
+++ b/openkb/images.py
@@ -231,11 +231,12 @@ def localize_images(
     for filename, data in images.items():
         (images_dir / filename).write_bytes(data)
         # Rewrite a bare ![alt](filename) reference to the canonical KB path.
-        result = re.sub(
-            r"(!\[[^\]]*\]\()" + re.escape(filename) + r"(\))",
-            r"\g<1>" + f"sources/images/{doc_name}/{filename}" + r"\g<2>",
-            result,
-        )
+        # Use a replacement *function* (not a replacement string) so a filename
+        # containing regex-escape sequences (e.g. "\g<1>") can't corrupt the
+        # substitution — localize_images handles arbitrary parser-supplied names.
+        canonical = f"sources/images/{doc_name}/{filename}"
+        pattern = re.compile(r"(!\[[^\]]*\]\()" + re.escape(filename) + r"(\))")
+        result = pattern.sub(lambda m, c=canonical: m.group(1) + c + m.group(2), result)
     result = extract_base64_images(result, doc_name, images_dir)
     return result
 
diff --git a/tests/test_images.py b/tests/test_images.py
index 97d98fee..53906443 100644
--- a/tests/test_images.py
+++ b/tests/test_images.py
@@ -194,3 +194,12 @@ def test_localize_images_leaves_unreferenced_bytes_on_disk(tmp_path):
     out = localize_images("no images here", {"orphan.png": b"X"}, "doc", images_dir)
     assert out == "no images here"
     assert (images_dir / "orphan.png").read_bytes() == b"X"
+
+
+def test_localize_images_filename_with_regex_metachars(tmp_path):
+    images_dir = tmp_path / "wiki" / "sources" / "images" / "doc"
+    weird = r"img\g<9>.png"  # backslash-escape-like name must not crash re.sub
+    md = f"![f]({weird})"
+    out = localize_images(md, {weird: b"DATA"}, "doc", images_dir)
+    assert f"sources/images/doc/{weird}" in out
+    assert (images_dir / weird).read_bytes() == b"DATA"

From 455c74671bc71e2aed40967f935c5456475c43db Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 10:43:22 +0800
Subject: [PATCH 04/26] feat(parsers): add LocalParser wrapping legacy
 extraction (#77)

---
 openkb/parsers/local.py     | 47 ++++++++++++++++++++++++++++++++++
 tests/test_parsers_local.py | 51 +++++++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+)
 create mode 100644 openkb/parsers/local.py
 create mode 100644 tests/test_parsers_local.py

diff --git a/openkb/parsers/local.py b/openkb/parsers/local.py
new file mode 100644
index 00000000..d714d0ce
--- /dev/null
+++ b/openkb/parsers/local.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from markitdown import MarkItDown
+
+from openkb.images import (
+    convert_pdf_with_images,
+    copy_relative_images,
+    extract_base64_images,
+)
+from openkb.parsers.base import ParseResult, Parser
+
+_LOCAL_EXTENSIONS = {
+    ".pdf", ".md", ".markdown", ".docx", ".pptx", ".xlsx", ".xls",
+    ".html", ".htm", ".txt", ".csv",
+}
+
+
+class LocalParser(Parser):
+    """Default parser: pymupdf for PDF, markitdown for office/html, direct read for md."""
+
+    name = "local"
+
+    def __init__(self, doc_name: str = "", images_dir: Path | None = None,
+                 source_dir: Path | None = None):
+        self.doc_name = doc_name
+        self.images_dir = images_dir
+        self.source_dir = source_dir
+
+    def supports(self, suffix: str) -> bool:
+        return suffix.lower() in _LOCAL_EXTENSIONS
+
+    def parse(self, src: Path) -> ParseResult:
+        suffix = src.suffix.lower()
+        if suffix in {".md", ".markdown"}:
+            markdown = src.read_text(encoding="utf-8")
+            markdown = copy_relative_images(
+                markdown, src.parent, self.doc_name, self.images_dir
+            )
+        elif suffix == ".pdf":
+            markdown = convert_pdf_with_images(src, self.doc_name, self.images_dir)
+        else:
+            mid = MarkItDown()
+            markdown = mid.convert(str(src)).text_content
+            markdown = extract_base64_images(markdown, self.doc_name, self.images_dir)
+        return ParseResult(markdown=markdown)
diff --git a/tests/test_parsers_local.py b/tests/test_parsers_local.py
new file mode 100644
index 00000000..e682c42e
--- /dev/null
+++ b/tests/test_parsers_local.py
@@ -0,0 +1,51 @@
+"""Tests for LocalParser — preserves legacy md/pdf/markitdown behavior."""
+from __future__ import annotations
+
+from pathlib import Path
+from unittest.mock import patch
+
+from openkb.parsers.local import LocalParser
+from openkb.parsers.base import ParseResult
+
+
+def test_supports_all_known_extensions():
+    p = LocalParser()
+    for ext in [".pdf", ".md", ".markdown", ".docx", ".pptx", ".xlsx", ".html", ".txt", ".csv"]:
+        assert p.supports(ext) is True
+
+
+def test_parse_md_reads_text(tmp_path):
+    src = tmp_path / "n.md"
+    src.write_text("# Title\n\nbody", encoding="utf-8")
+    images_dir = tmp_path / "img" / "n"
+    p = LocalParser(doc_name="n", images_dir=images_dir, source_dir=tmp_path)
+    result = p.parse(src)
+    assert isinstance(result, ParseResult)
+    assert result.markdown.startswith("# Title")
+
+
+def test_parse_pdf_delegates_to_convert_pdf_with_images(tmp_path):
+    src = tmp_path / "doc.pdf"
+    src.write_bytes(b"%PDF-1.4 fake")
+    images_dir = tmp_path / "img" / "doc"
+    with patch("openkb.parsers.local.convert_pdf_with_images", return_value="PDF MD") as m:
+        p = LocalParser(doc_name="doc", images_dir=images_dir, source_dir=tmp_path)
+        result = p.parse(src)
+    m.assert_called_once_with(src, "doc", images_dir)
+    assert result.markdown == "PDF MD"
+
+
+def test_parse_other_uses_markitdown_and_extracts_base64(tmp_path):
+    src = tmp_path / "deck.pptx"
+    src.write_bytes(b"PK fake")
+    images_dir = tmp_path / "img" / "deck"
+    fake_mid = patch("openkb.parsers.local.MarkItDown").start()
+    fake_mid.return_value.convert.return_value.text_content = "MARKITDOWN MD"
+    try:
+        with patch("openkb.parsers.local.extract_base64_images", return_value="CLEANED") as ex:
+            p = LocalParser(doc_name="deck", images_dir=images_dir, source_dir=tmp_path)
+            result = p.parse(src)
+        ex.assert_called_once_with("MARKITDOWN MD", "deck", images_dir)
+        assert result.markdown == "CLEANED"
+    finally:
+        patch.stopall()

From 0978cbf58f6fbe91f906e3b044d993560441087a Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 10:47:01 +0800
Subject: [PATCH 05/26] feat(parsers): add registry + get_parser factory (#77)

---
 openkb/parsers/__init__.py     |  3 ++-
 openkb/parsers/registry.py     | 38 ++++++++++++++++++++++++++++++++++
 tests/test_parsers_registry.py | 33 +++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 openkb/parsers/registry.py
 create mode 100644 tests/test_parsers_registry.py

diff --git a/openkb/parsers/__init__.py b/openkb/parsers/__init__.py
index 0656d733..aeeeb100 100644
--- a/openkb/parsers/__init__.py
+++ b/openkb/parsers/__init__.py
@@ -1,4 +1,5 @@
 """Pluggable document parsers for the file → Markdown step."""
 from openkb.parsers.base import ParseResult, Parser
+from openkb.parsers.registry import get_parser
 
-__all__ = ["ParseResult", "Parser"]
+__all__ = ["ParseResult", "Parser", "get_parser"]
diff --git a/openkb/parsers/registry.py b/openkb/parsers/registry.py
new file mode 100644
index 00000000..c87ab111
--- /dev/null
+++ b/openkb/parsers/registry.py
@@ -0,0 +1,38 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from openkb.parsers.base import Parser
+from openkb.parsers.local import LocalParser
+
+_VALID = ("local", "mineru", "mistral", "vlm")
+
+
+def get_parser(
+    config: dict[str, Any],
+    override: str | None = None,
+    *,
+    doc_name: str = "",
+    images_dir: Path | None = None,
+    source_dir: Path | None = None,
+) -> Parser:
+    """Resolve the configured parser. ``override`` (e.g. CLI ``--parser``) wins."""
+    name = (override or config.get("parser") or "local").lower()
+    if name == "local":
+        return LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=source_dir)
+
+    parsers_cfg = config.get("parsers", {}) or {}
+    opts = parsers_cfg.get(name, {}) or {}
+    if name == "mistral":
+        from openkb.parsers.mistral import MistralParser
+        return MistralParser(opts)
+    if name == "vlm":
+        from openkb.parsers.vlm import VLMParser
+        return VLMParser(opts, model=config.get("model"))
+    if name == "mineru":
+        from openkb.parsers.mineru import MineruParser
+        return MineruParser(opts)
+    raise ValueError(
+        f"Unknown parser {name!r}. Valid options: {', '.join(_VALID)}."
+    )
diff --git a/tests/test_parsers_registry.py b/tests/test_parsers_registry.py
new file mode 100644
index 00000000..0a500ce9
--- /dev/null
+++ b/tests/test_parsers_registry.py
@@ -0,0 +1,33 @@
+"""Tests for parser selection / registry."""
+from __future__ import annotations
+
+import pytest
+
+from openkb.parsers.registry import get_parser
+from openkb.parsers.local import LocalParser
+
+
+def _kwargs():
+    return {"doc_name": "d", "images_dir": None, "source_dir": None}
+
+
+def test_default_is_local():
+    p = get_parser({}, **_kwargs())
+    assert isinstance(p, LocalParser)
+
+
+def test_explicit_local():
+    p = get_parser({"parser": "local"}, **_kwargs())
+    assert isinstance(p, LocalParser)
+
+
+def test_override_wins_over_config():
+    p = get_parser({"parser": "mistral"}, override="local", **_kwargs())
+    assert isinstance(p, LocalParser)
+
+
+def test_unknown_name_raises_with_valid_options():
+    with pytest.raises(ValueError) as exc:
+        get_parser({"parser": "nope"}, **_kwargs())
+    assert "nope" in str(exc.value)
+    assert "local" in str(exc.value)

From ed3368d0e386fe87dd10859fd152e96d29b5277f Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 10:52:01 +0800
Subject: [PATCH 06/26] =?UTF-8?q?refactor(converter):=20route=20file?=
 =?UTF-8?q?=E2=86=92markdown=20through=20parser=20abstraction=20(#77)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 openkb/converter.py     | 31 ++++++++++++++++---------------
 tests/test_converter.py | 36 +++++++++++++++++++++++++++++++++++-
 2 files changed, 51 insertions(+), 16 deletions(-)

diff --git a/openkb/converter.py b/openkb/converter.py
index 352c22b3..9d684c8d 100644
--- a/openkb/converter.py
+++ b/openkb/converter.py
@@ -7,10 +7,11 @@
 from pathlib import Path
 
 import pymupdf
-from markitdown import MarkItDown
 
 from openkb.config import load_config
-from openkb.images import copy_relative_images, extract_base64_images, convert_pdf_with_images
+from openkb.images import localize_images
+from openkb.parsers import get_parser
+from openkb.parsers.local import LocalParser
 from openkb.state import HashRegistry
 
 logger = logging.getLogger(__name__)
@@ -33,7 +34,7 @@ def get_pdf_page_count(path: Path) -> int:
         return doc.page_count
 
 
-def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
+def convert_document(src: Path, kb_dir: Path, parser_override: str | None = None) -> ConvertResult:
     """Convert a document and integrate it into the knowledge base.
 
     Steps:
@@ -93,18 +94,18 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
 
     doc_name = src.stem
 
-    if src.suffix.lower() == ".md":
-        markdown = src.read_text(encoding="utf-8")
-        markdown = copy_relative_images(markdown, src.parent, doc_name, images_dir)
-    elif src.suffix.lower() == ".pdf":
-        # Use pymupdf dict-mode for PDFs: text + images inline at correct positions
-        markdown = convert_pdf_with_images(src, doc_name, images_dir)
-    else:
-        # Non-PDF, non-MD: use markitdown (docx, pptx, html, etc.)
-        mid = MarkItDown()
-        result = mid.convert(str(src))
-        markdown = result.text_content
-        markdown = extract_base64_images(markdown, doc_name, images_dir)
+    parser = get_parser(
+        config,
+        override=parser_override,
+        doc_name=doc_name,
+        images_dir=images_dir,
+        source_dir=src.parent,
+    )
+    if not parser.supports(src.suffix):
+        parser = LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=src.parent)
+
+    parse_result = parser.parse(src)
+    markdown = localize_images(parse_result.markdown, parse_result.images, doc_name, images_dir)
 
     dest_md = sources_dir / f"{doc_name}.md"
     dest_md.write_text(markdown, encoding="utf-8")
diff --git a/tests/test_converter.py b/tests/test_converter.py
index d7475b09..391dbc18 100644
--- a/tests/test_converter.py
+++ b/tests/test_converter.py
@@ -85,7 +85,7 @@ def test_short_pdf_converted_via_pymupdf(self, kb_dir, tmp_path):
 
         with (
             patch("openkb.converter.pymupdf.open") as mock_mu,
-            patch("openkb.converter.convert_pdf_with_images", return_value="# Short PDF\n\nConverted.") as mock_cpwi,
+            patch("openkb.parsers.local.convert_pdf_with_images", return_value="# Short PDF\n\nConverted.") as mock_cpwi,
         ):
             fake_doc = MagicMock()
             fake_doc.page_count = 5  # below default threshold of 20
@@ -128,3 +128,37 @@ def test_long_pdf_returns_is_long_doc(self, kb_dir, tmp_path):
         assert result.source_path is None
         assert result.skipped is False
         assert result.raw_path is not None
+
+
+from openkb.parsers.base import ParseResult
+
+
+class TestConvertDocumentParserSelection:
+    def test_uses_get_parser_and_localizes(self, kb_dir):
+        src = kb_dir / "raw" / "paper.pdf"
+        src.write_bytes(b"%PDF-1.4 fake")
+
+        fake = MagicMock()
+        fake.supports.return_value = True
+        fake.parse.return_value = ParseResult(markdown="HELLO", images={"a.png": b"X"})
+
+        with patch("openkb.converter.get_pdf_page_count", return_value=1), \
+             patch("openkb.converter.get_parser", return_value=fake) as gp, \
+             patch("openkb.converter.localize_images", return_value="HELLO-LOCALIZED") as li:
+            result = convert_document(src, kb_dir)
+
+        gp.assert_called_once()
+        li.assert_called_once()
+        assert result.source_path.read_text(encoding="utf-8") == "HELLO-LOCALIZED"
+
+    def test_falls_back_to_local_for_unsupported_suffix(self, kb_dir):
+        src = kb_dir / "raw" / "notes.md"
+        src.write_text("# md", encoding="utf-8")
+
+        online = MagicMock()
+        online.supports.return_value = False  # online parser can't do .md
+        with patch("openkb.converter.get_parser", return_value=online), \
+             patch("openkb.converter.LocalParser") as LP:
+            LP.return_value.parse.return_value = ParseResult(markdown="# md")
+            convert_document(src, kb_dir)
+        LP.assert_called_once()  # fell back to LocalParser

From 27d314c9ad327d338bb6bbc13e1460b67b1859d7 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 10:55:42 +0800
Subject: [PATCH 07/26] docs(converter): refresh convert_document docstring for
 parser flow; tighten tests (#77)

---
 openkb/converter.py     | 9 +++++----
 tests/test_converter.py | 2 ++
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/openkb/converter.py b/openkb/converter.py
index 9d684c8d..d6c37fc7 100644
--- a/openkb/converter.py
+++ b/openkb/converter.py
@@ -41,9 +41,10 @@ def convert_document(src: Path, kb_dir: Path, parser_override: str | None = None
     1. Hash-check — skip if already known.
     2. Copy source to ``raw/``.
     3. If PDF and page count >= threshold → return :attr:`ConvertResult.is_long_doc`.
-    4. If ``.md`` — read, process relative images, save to ``wiki/sources/``.
-    5. Otherwise — run MarkItDown, extract base64 images, save to ``wiki/sources/``.
-    6. Register hash in the registry.
+    4. Select a parser via :func:`get_parser` (falling back to
+       :class:`LocalParser` for unsupported suffixes like ``.md``), parse the
+       file to Markdown, localize images, and save to ``wiki/sources/``.
+    5. Register hash in the registry.
     """
     # ------------------------------------------------------------------
     # Load config & state
@@ -85,7 +86,7 @@ def convert_document(src: Path, kb_dir: Path, parser_override: str | None = None
             return ConvertResult(raw_path=raw_dest, is_long_doc=True, file_hash=file_hash)
 
     # ------------------------------------------------------------------
-    # 4/5. Convert to Markdown
+    # 4. Select parser, convert to Markdown, localize images
     # ------------------------------------------------------------------
     sources_dir = kb_dir / "wiki" / "sources"
     sources_dir.mkdir(parents=True, exist_ok=True)
diff --git a/tests/test_converter.py b/tests/test_converter.py
index 391dbc18..90b26bf4 100644
--- a/tests/test_converter.py
+++ b/tests/test_converter.py
@@ -148,6 +148,8 @@ def test_uses_get_parser_and_localizes(self, kb_dir):
             result = convert_document(src, kb_dir)
 
         gp.assert_called_once()
+        assert gp.call_args.kwargs["doc_name"] == "paper"
+        assert gp.call_args.kwargs["images_dir"] is not None
         li.assert_called_once()
         assert result.source_path.read_text(encoding="utf-8") == "HELLO-LOCALIZED"
 

From 36e8b4f45a1e5fd5a7683bbc8d7db9724293f272 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 10:57:29 +0800
Subject: [PATCH 08/26] feat(parsers): add reusable litellm vision client (#77)

---
 openkb/parsers/vlm_client.py     | 29 ++++++++++++++++++++++++++
 tests/test_parsers_vlm_client.py | 35 ++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)
 create mode 100644 openkb/parsers/vlm_client.py
 create mode 100644 tests/test_parsers_vlm_client.py

diff --git a/openkb/parsers/vlm_client.py b/openkb/parsers/vlm_client.py
new file mode 100644
index 00000000..5979fd9e
--- /dev/null
+++ b/openkb/parsers/vlm_client.py
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+import base64
+import mimetypes
+from pathlib import Path
+
+import litellm
+
+_DEFAULT_MODEL = "gemini/gemini-2.5-pro"
+
+_PROMPT = (
+    "Transcribe this document to clean GitHub-flavored Markdown. Preserve headings, "
+    "lists, tables (as Markdown or HTML tables), and math (as LaTeX). Output only the "
+    "Markdown content, no commentary."
+)
+
+
+def transcribe_to_markdown(src: Path, model: str | None = None, prompt: str | None = None) -> str:
+    """Send ``src`` (PDF or image) to a vision-capable LLM via litellm; return Markdown."""
+    model = model or _DEFAULT_MODEL
+    mime = mimetypes.guess_type(src.name)[0] or "application/octet-stream"
+    b64 = base64.b64encode(src.read_bytes()).decode()
+    data_uri = f"data:{mime};base64,{b64}"
+    content = [
+        {"type": "text", "text": prompt or _PROMPT},
+        {"type": "image_url", "image_url": {"url": data_uri}},
+    ]
+    resp = litellm.completion(model=model, messages=[{"role": "user", "content": content}])
+    return resp.choices[0].message.content or ""
diff --git a/tests/test_parsers_vlm_client.py b/tests/test_parsers_vlm_client.py
new file mode 100644
index 00000000..09e38256
--- /dev/null
+++ b/tests/test_parsers_vlm_client.py
@@ -0,0 +1,35 @@
+"""Tests for the reusable litellm vision client."""
+from __future__ import annotations
+
+from unittest.mock import MagicMock, patch
+
+from openkb.parsers.vlm_client import transcribe_to_markdown
+
+
+def _fake_response(text):
+    resp = MagicMock()
+    resp.choices = [MagicMock(message=MagicMock(content=text))]
+    return resp
+
+
+def test_transcribe_pdf_sends_data_uri_and_returns_content(tmp_path):
+    src = tmp_path / "doc.pdf"
+    src.write_bytes(b"%PDF-1.4 data")
+    with patch("openkb.parsers.vlm_client.litellm.completion",
+               return_value=_fake_response("# Parsed")) as comp:
+        out = transcribe_to_markdown(src, model="gemini/gemini-2.5-pro")
+    assert out == "# Parsed"
+    _, kwargs = comp.call_args
+    assert kwargs["model"] == "gemini/gemini-2.5-pro"
+    content = kwargs["messages"][0]["content"]
+    assert any("base64" in str(part) for part in content)
+
+
+def test_default_model_used_when_none(tmp_path):
+    src = tmp_path / "img.png"
+    src.write_bytes(b"PNG")
+    with patch("openkb.parsers.vlm_client.litellm.completion",
+               return_value=_fake_response("desc")) as comp:
+        transcribe_to_markdown(src, model=None)
+    _, kwargs = comp.call_args
+    assert kwargs["model"]  # some non-empty default

From 8b1d4eb1e69831f510404b48550c2fd9ca0bc513 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 11:00:10 +0800
Subject: [PATCH 09/26] fix(parsers): use litellm file content part for PDFs in
 vlm_client (#77)

---
 openkb/parsers/vlm_client.py     |  7 ++++++-
 tests/test_parsers_vlm_client.py | 26 ++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/openkb/parsers/vlm_client.py b/openkb/parsers/vlm_client.py
index 5979fd9e..1f2774f8 100644
--- a/openkb/parsers/vlm_client.py
+++ b/openkb/parsers/vlm_client.py
@@ -21,9 +21,14 @@ def transcribe_to_markdown(src: Path, model: str | None = None, prompt: str | No
     mime = mimetypes.guess_type(src.name)[0] or "application/octet-stream"
     b64 = base64.b64encode(src.read_bytes()).decode()
     data_uri = f"data:{mime};base64,{b64}"
+    if mime == "application/pdf":
+        # litellm's document/file content part (image_url is only for raster images).
+        media_part = {"type": "file", "file": {"file_data": data_uri}}
+    else:
+        media_part = {"type": "image_url", "image_url": {"url": data_uri}}
     content = [
         {"type": "text", "text": prompt or _PROMPT},
-        {"type": "image_url", "image_url": {"url": data_uri}},
+        media_part,
     ]
     resp = litellm.completion(model=model, messages=[{"role": "user", "content": content}])
     return resp.choices[0].message.content or ""
diff --git a/tests/test_parsers_vlm_client.py b/tests/test_parsers_vlm_client.py
index 09e38256..3703c179 100644
--- a/tests/test_parsers_vlm_client.py
+++ b/tests/test_parsers_vlm_client.py
@@ -33,3 +33,29 @@ def test_default_model_used_when_none(tmp_path):
         transcribe_to_markdown(src, model=None)
     _, kwargs = comp.call_args
     assert kwargs["model"]  # some non-empty default
+
+
+def test_pdf_uses_file_content_part(tmp_path):
+    src = tmp_path / "doc.pdf"
+    src.write_bytes(b"%PDF-1.4 data")
+    with patch("openkb.parsers.vlm_client.litellm.completion",
+               return_value=_fake_response("x")) as comp:
+        transcribe_to_markdown(src, model="some/model")
+    content = comp.call_args.kwargs["messages"][0]["content"]
+    file_parts = [p for p in content if p.get("type") == "file"]
+    assert len(file_parts) == 1
+    assert file_parts[0]["file"]["file_data"].startswith("data:application/pdf;base64,")
+    assert not any(p.get("type") == "image_url" for p in content)
+
+
+def test_image_uses_image_url_content_part(tmp_path):
+    src = tmp_path / "fig.png"
+    src.write_bytes(b"\x89PNG\r\n")
+    with patch("openkb.parsers.vlm_client.litellm.completion",
+               return_value=_fake_response("x")) as comp:
+        transcribe_to_markdown(src, model="some/model")
+    content = comp.call_args.kwargs["messages"][0]["content"]
+    image_parts = [p for p in content if p.get("type") == "image_url"]
+    assert len(image_parts) == 1
+    assert image_parts[0]["image_url"]["url"].startswith("data:image/png;base64,")
+    assert not any(p.get("type") == "file" for p in content)

From 2a93eec753d97d4ec81bfe83f33a48080fac3e6d Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 11:01:23 +0800
Subject: [PATCH 10/26] feat(parsers): add VLMParser (vision LLM via litellm)
 (#77)

---
 openkb/parsers/vlm.py     | 27 +++++++++++++++++++++++++++
 tests/test_parsers_vlm.py | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)
 create mode 100644 openkb/parsers/vlm.py
 create mode 100644 tests/test_parsers_vlm.py

diff --git a/openkb/parsers/vlm.py b/openkb/parsers/vlm.py
new file mode 100644
index 00000000..834125fc
--- /dev/null
+++ b/openkb/parsers/vlm.py
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from openkb.parsers.base import ParseResult, Parser
+from openkb.parsers.vlm_client import transcribe_to_markdown
+
+_SUPPORTED = {".pdf"}
+
+
+class VLMParser(Parser):
+    """Parse via a vision-capable LLM (litellm). Covers Gemini, GPT-4o, Claude, etc."""
+
+    name = "vlm"
+
+    def __init__(self, opts: dict[str, Any] | None = None, model: str | None = None):
+        opts = opts or {}
+        # parsers.vlm.model overrides the global model; else use the global model.
+        self.model = opts.get("model") or model
+
+    def supports(self, suffix: str) -> bool:
+        return suffix.lower() in _SUPPORTED
+
+    def parse(self, src: Path) -> ParseResult:
+        markdown = transcribe_to_markdown(src, model=self.model)
+        return ParseResult(markdown=markdown)
diff --git a/tests/test_parsers_vlm.py b/tests/test_parsers_vlm.py
new file mode 100644
index 00000000..e1d78683
--- /dev/null
+++ b/tests/test_parsers_vlm.py
@@ -0,0 +1,33 @@
+from __future__ import annotations
+
+from unittest.mock import patch
+
+from openkb.parsers.vlm import VLMParser
+from openkb.parsers.base import ParseResult
+
+
+def test_supports_pdf_only_for_v1():
+    p = VLMParser({}, model="gemini/gemini-2.5-pro")
+    assert p.supports(".pdf") is True
+    assert p.supports(".md") is False
+    assert p.supports(".docx") is False
+
+
+def test_parse_calls_transcribe_with_configured_model(tmp_path):
+    src = tmp_path / "doc.pdf"
+    src.write_bytes(b"%PDF")
+    p = VLMParser({"model": "gpt-4o"}, model="fallback-model")
+    with patch("openkb.parsers.vlm.transcribe_to_markdown", return_value="# MD") as t:
+        result = p.parse(src)
+    t.assert_called_once_with(src, model="gpt-4o")
+    assert isinstance(result, ParseResult)
+    assert result.markdown == "# MD"
+
+
+def test_parse_falls_back_to_global_model(tmp_path):
+    src = tmp_path / "doc.pdf"
+    src.write_bytes(b"%PDF")
+    p = VLMParser({}, model="global-model")
+    with patch("openkb.parsers.vlm.transcribe_to_markdown", return_value="x") as t:
+        p.parse(src)
+    t.assert_called_once_with(src, model="global-model")

From 2c7e6932bdf3291a5cc1512f5ad9576db9e7cda1 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 11:04:10 +0800
Subject: [PATCH 11/26] feat(parsers): add MistralParser via mistralai SDK
 (#77)

---
 openkb/parsers/mistral.py     | 63 ++++++++++++++++++++++++++++++++
 tests/test_parsers_mistral.py | 69 +++++++++++++++++++++++++++++++++++
 2 files changed, 132 insertions(+)
 create mode 100644 openkb/parsers/mistral.py
 create mode 100644 tests/test_parsers_mistral.py

diff --git a/openkb/parsers/mistral.py b/openkb/parsers/mistral.py
new file mode 100644
index 00000000..2f0e5622
--- /dev/null
+++ b/openkb/parsers/mistral.py
@@ -0,0 +1,63 @@
+from __future__ import annotations
+
+import base64
+import os
+import re
+from pathlib import Path
+from typing import Any
+
+from openkb.parsers.base import ParseResult, Parser
+
+_SUPPORTED = {".pdf"}
+_DATA_URI_RE = re.compile(r"^data:[^;]+;base64,", re.IGNORECASE)
+
+
+class MistralParser(Parser):
+    """Mistral OCR (Document AI). Synchronous; markdown + base64 images."""
+
+    name = "mistral"
+
+    def __init__(self, opts: dict[str, Any] | None = None):
+        self.opts = opts or {}
+        self.model = self.opts.get("model", "mistral-ocr-latest")
+
+    def supports(self, suffix: str) -> bool:
+        return suffix.lower() in _SUPPORTED
+
+    def parse(self, src: Path) -> ParseResult:
+        api_key = os.environ.get("MISTRAL_API_KEY")
+        if not api_key:
+            raise RuntimeError(
+                "Mistral parser requires the MISTRAL_API_KEY environment variable."
+            )
+        try:
+            from mistralai import Mistral
+        except ImportError as exc:
+            raise RuntimeError(
+                "Mistral parser requires the 'mistralai' package. "
+                "Install with: pip install openkb[mistral]"
+            ) from exc
+
+        client = Mistral(api_key=api_key)
+        uploaded = client.files.upload(
+            file={"file_name": src.name, "content": src.read_bytes()}, purpose="ocr"
+        )
+        signed = client.files.get_signed_url(file_id=uploaded.id)
+        resp = client.ocr.process(
+            model=self.model,
+            document={"type": "document_url", "document_url": signed.url},
+            include_image_base64=True,
+        )
+
+        parts: list[str] = []
+        images: dict[str, bytes] = {}
+        for page in resp.pages:
+            parts.append(page.markdown or "")
+            for img in getattr(page, "images", None) or []:
+                raw = img.image_base64 or ""
+                raw = _DATA_URI_RE.sub("", raw)
+                try:
+                    images[img.id] = base64.b64decode(raw, validate=True)
+                except Exception:
+                    continue
+        return ParseResult(markdown="\n\n".join(parts), images=images)
diff --git a/tests/test_parsers_mistral.py b/tests/test_parsers_mistral.py
new file mode 100644
index 00000000..d72651d0
--- /dev/null
+++ b/tests/test_parsers_mistral.py
@@ -0,0 +1,69 @@
+from __future__ import annotations
+
+import base64
+import sys
+import types
+from unittest.mock import MagicMock
+
+import pytest
+
+from openkb.parsers.base import ParseResult
+
+
+def _install_fake_mistralai(monkeypatch, client_instance):
+    mod = types.ModuleType("mistralai")
+    mod.Mistral = MagicMock(return_value=client_instance)
+    monkeypatch.setitem(sys.modules, "mistralai", mod)
+    return mod
+
+
+def test_supports_pdf():
+    from openkb.parsers.mistral import MistralParser
+    p = MistralParser({})
+    assert p.supports(".pdf") is True
+    assert p.supports(".docx") is False
+
+
+def test_missing_key_raises_actionable(monkeypatch, tmp_path):
+    monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
+    from openkb.parsers.mistral import MistralParser
+    p = MistralParser({})
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    with pytest.raises(RuntimeError) as exc:
+        p.parse(src)
+    assert "MISTRAL_API_KEY" in str(exc.value)
+
+
+def test_parse_collects_markdown_and_decodes_images(monkeypatch, tmp_path):
+    monkeypatch.setenv("MISTRAL_API_KEY", "k")
+    img_bytes = b"IMGDATA"
+    img_b64 = base64.b64encode(img_bytes).decode()
+
+    client = MagicMock()
+    client.files.upload.return_value = MagicMock(id="file-1")
+    client.files.get_signed_url.return_value = MagicMock(url="https://signed")
+    page = MagicMock()
+    page.markdown = "Text ![img-0.png](img-0.png)"
+    page.images = [MagicMock(id="img-0.png", image_base64=f"data:image/png;base64,{img_b64}")]
+    client.ocr.process.return_value = MagicMock(pages=[page])
+
+    _install_fake_mistralai(monkeypatch, client)
+    from openkb.parsers.mistral import MistralParser
+    p = MistralParser({})
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    result = p.parse(src)
+
+    assert isinstance(result, ParseResult)
+    assert "img-0.png" in result.markdown
+    assert result.images["img-0.png"] == img_bytes
+
+
+def test_missing_package_raises_install_hint(monkeypatch, tmp_path):
+    monkeypatch.setenv("MISTRAL_API_KEY", "k")
+    monkeypatch.setitem(sys.modules, "mistralai", None)  # force ImportError
+    from openkb.parsers.mistral import MistralParser
+    p = MistralParser({})
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    with pytest.raises(RuntimeError) as exc:
+        p.parse(src)
+    assert "openkb[mistral]" in str(exc.value)

From 50b83bb14179babac19e9d264c3a82b33936ac6f Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 11:06:48 +0800
Subject: [PATCH 12/26] feat(parsers): log skipped undecodable Mistral images
 (#77)

---
 openkb/parsers/mistral.py     |  4 ++++
 tests/test_parsers_mistral.py | 19 +++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/openkb/parsers/mistral.py b/openkb/parsers/mistral.py
index 2f0e5622..a494f149 100644
--- a/openkb/parsers/mistral.py
+++ b/openkb/parsers/mistral.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import base64
+import logging
 import os
 import re
 from pathlib import Path
@@ -8,6 +9,8 @@
 
 from openkb.parsers.base import ParseResult, Parser
 
+logger = logging.getLogger(__name__)
+
 _SUPPORTED = {".pdf"}
 _DATA_URI_RE = re.compile(r"^data:[^;]+;base64,", re.IGNORECASE)
 
@@ -59,5 +62,6 @@ def parse(self, src: Path) -> ParseResult:
                 try:
                     images[img.id] = base64.b64decode(raw, validate=True)
                 except Exception:
+                    logger.warning("Skipping undecodable Mistral image: %s", getattr(img, "id", "?"))
                     continue
         return ParseResult(markdown="\n\n".join(parts), images=images)
diff --git a/tests/test_parsers_mistral.py b/tests/test_parsers_mistral.py
index d72651d0..e95c1858 100644
--- a/tests/test_parsers_mistral.py
+++ b/tests/test_parsers_mistral.py
@@ -67,3 +67,22 @@ def test_missing_package_raises_install_hint(monkeypatch, tmp_path):
     with pytest.raises(RuntimeError) as exc:
         p.parse(src)
     assert "openkb[mistral]" in str(exc.value)
+
+
+def test_undecodable_image_logged_and_skipped(monkeypatch, tmp_path, caplog):
+    import logging as _logging
+    monkeypatch.setenv("MISTRAL_API_KEY", "k")
+    client = MagicMock()
+    client.files.upload.return_value = MagicMock(id="file-1")
+    client.files.get_signed_url.return_value = MagicMock(url="https://signed")
+    page = MagicMock()
+    page.markdown = "Text ![bad.png](bad.png)"
+    page.images = [MagicMock(id="bad.png", image_base64="!!!not-base64!!!")]
+    client.ocr.process.return_value = MagicMock(pages=[page])
+    _install_fake_mistralai(monkeypatch, client)
+    from openkb.parsers.mistral import MistralParser
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    with caplog.at_level(_logging.WARNING):
+        result = MistralParser({}).parse(src)
+    assert "bad.png" not in result.images
+    assert any("bad.png" in r.message for r in caplog.records)

From e452a3aa1ce1dc23e8c79bbde0ff5aaa1b53cbfc Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 11:08:40 +0800
Subject: [PATCH 13/26] feat(parsers): add MineruParser (cloud + self-hosted
 HTTP) (#77)

---
 openkb/parsers/mineru.py     | 120 +++++++++++++++++++++++++++++++++++
 tests/test_parsers_mineru.py |  68 ++++++++++++++++++++
 2 files changed, 188 insertions(+)
 create mode 100644 openkb/parsers/mineru.py
 create mode 100644 tests/test_parsers_mineru.py

diff --git a/openkb/parsers/mineru.py b/openkb/parsers/mineru.py
new file mode 100644
index 00000000..243bca3d
--- /dev/null
+++ b/openkb/parsers/mineru.py
@@ -0,0 +1,120 @@
+from __future__ import annotations
+
+import io
+import os
+import time
+import zipfile
+from pathlib import Path
+from typing import Any
+
+from openkb.parsers.base import ParseResult, Parser
+
+_SUPPORTED = {".pdf", ".docx", ".pptx", ".xlsx", ".xls", ".html", ".htm"}
+_CLOUD_BASE = "https://mineru.net/api/v4"
+
+
+def _httpx():
+    try:
+        import httpx
+    except ImportError as exc:
+        raise RuntimeError(
+            "MinerU parser requires 'httpx'. Install with: pip install openkb[mineru]"
+        ) from exc
+    return httpx
+
+
+def _result_from_zip(zip_bytes: bytes) -> ParseResult:
+    """Extract the markdown file + images from a MinerU result zip."""
+    images: dict[str, bytes] = {}
+    markdown = ""
+    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
+        md_names = [n for n in zf.namelist() if n.lower().endswith(".md")]
+        if md_names:
+            chosen = next((n for n in md_names if n.endswith("full.md")), md_names[0])
+            markdown = zf.read(chosen).decode("utf-8", errors="replace")
+        for name in zf.namelist():
+            if name.lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".webp")):
+                images[Path(name).name] = zf.read(name)
+    # Markdown references images as 'images/<file>'; localize_images matches on
+    # the bare filename, so rewrite 'images/fig.png' -> 'fig.png'.
+    for fname in images:
+        markdown = markdown.replace(f"images/{fname}", fname)
+    return ParseResult(markdown=markdown, images=images)
+
+
+class MineruParser(Parser):
+    """MinerU via HTTP — self-hosted server or hosted cloud API."""
+
+    name = "mineru"
+
+    def __init__(self, opts: dict[str, Any] | None = None):
+        self.opts = opts or {}
+        self.mode = self.opts.get("mode", "cloud")
+        self.base_url = self.opts.get("base_url")
+        self.poll_interval = self.opts.get("poll_interval", 3)
+        self.timeout = self.opts.get("timeout", 600)
+
+    def supports(self, suffix: str) -> bool:
+        return suffix.lower() in _SUPPORTED
+
+    def parse(self, src: Path) -> ParseResult:
+        if self.mode == "self_hosted":
+            return self._parse_self_hosted(src)
+        return self._parse_cloud(src)
+
+    def _parse_self_hosted(self, src: Path) -> ParseResult:
+        if not self.base_url:
+            raise RuntimeError(
+                "MinerU self_hosted mode requires 'base_url' in parsers.mineru config."
+            )
+        httpx = _httpx()
+        url = self.base_url.rstrip("/") + "/file_parse"
+        with httpx.Client(timeout=self.timeout) as client:
+            resp = client.post(
+                url,
+                files={"file": (src.name, src.read_bytes())},
+                data={"return_format": "zip"},
+            )
+            resp.raise_for_status()
+            return _result_from_zip(resp.content)
+
+    def _parse_cloud(self, src: Path) -> ParseResult:
+        api_key = os.environ.get("MINERU_API_KEY")
+        if not api_key:
+            raise RuntimeError(
+                "MinerU cloud mode requires the MINERU_API_KEY environment variable."
+            )
+        httpx = _httpx()
+        headers = {"Authorization": f"Bearer {api_key}"}
+        with httpx.Client(timeout=self.timeout) as client:
+            r = client.post(
+                f"{_CLOUD_BASE}/file-urls/batch",
+                headers=headers,
+                json={"files": [{"name": src.name, "is_ocr": True}]},
+            )
+            r.raise_for_status()
+            data = r.json()["data"]
+            batch_id = data["batch_id"]
+            upload_url = data["file_urls"][0]
+            client.put(upload_url, content=src.read_bytes()).raise_for_status()
+            elapsed = 0
+            zip_url = None
+            while elapsed < self.timeout:
+                pr = client.get(
+                    f"{_CLOUD_BASE}/extract-results/batch/{batch_id}", headers=headers
+                )
+                pr.raise_for_status()
+                results = pr.json()["data"]["extract_result"]
+                state = results[0].get("state")
+                if state == "done":
+                    zip_url = results[0]["full_zip_url"]
+                    break
+                if state == "failed":
+                    raise RuntimeError(f"MinerU extraction failed: {results[0]}")
+                time.sleep(self.poll_interval)
+                elapsed += self.poll_interval
+            if zip_url is None:
+                raise RuntimeError("MinerU extraction timed out.")
+            zr = client.get(zip_url)
+            zr.raise_for_status()
+            return _result_from_zip(zr.content)
diff --git a/tests/test_parsers_mineru.py b/tests/test_parsers_mineru.py
new file mode 100644
index 00000000..be63e9b4
--- /dev/null
+++ b/tests/test_parsers_mineru.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+import io
+import sys
+import types
+import zipfile
+from unittest.mock import MagicMock
+
+import pytest
+
+from openkb.parsers.base import ParseResult
+
+
+def test_supports_office_and_pdf():
+    from openkb.parsers.mineru import MineruParser
+    p = MineruParser({})
+    assert p.supports(".pdf") is True
+    assert p.supports(".docx") is True
+    assert p.supports(".md") is False
+
+
+def test_self_hosted_requires_base_url(tmp_path):
+    from openkb.parsers.mineru import MineruParser
+    p = MineruParser({"mode": "self_hosted"})
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    with pytest.raises(RuntimeError) as exc:
+        p.parse(src)
+    assert "base_url" in str(exc.value)
+
+
+def test_cloud_requires_api_key(monkeypatch, tmp_path):
+    monkeypatch.delenv("MINERU_API_KEY", raising=False)
+    from openkb.parsers.mineru import MineruParser
+    p = MineruParser({"mode": "cloud"})
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    with pytest.raises(RuntimeError) as exc:
+        p.parse(src)
+    assert "MINERU_API_KEY" in str(exc.value)
+
+
+def test_self_hosted_parses_zip(monkeypatch, tmp_path):
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, "w") as zf:
+        zf.writestr("full.md", "# Mineru\n\n![p](images/fig.png)")
+        zf.writestr("images/fig.png", b"PNGBYTES")
+    zip_bytes = buf.getvalue()
+
+    fake_resp = MagicMock(status_code=200, content=zip_bytes)
+    fake_resp.raise_for_status = MagicMock()
+    fake_client = MagicMock()
+    fake_client.__enter__ = MagicMock(return_value=fake_client)
+    fake_client.__exit__ = MagicMock(return_value=False)
+    fake_client.post.return_value = fake_resp
+
+    httpx_mod = types.ModuleType("httpx")
+    httpx_mod.Client = MagicMock(return_value=fake_client)
+    monkeypatch.setitem(sys.modules, "httpx", httpx_mod)
+
+    from openkb.parsers.mineru import MineruParser
+    p = MineruParser({"mode": "self_hosted", "base_url": "http://localhost:8000"})
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    result = p.parse(src)
+    assert isinstance(result, ParseResult)
+    assert "Mineru" in result.markdown
+    assert result.images["fig.png"] == b"PNGBYTES"
+    # the images/ prefix should be rewritten to the bare filename for localize_images
+    assert "images/fig.png" not in result.markdown
+    assert "![p](fig.png)" in result.markdown

From a6074f68341d7d1a78e6e572a9085ded1308ab38 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 11:12:03 +0800
Subject: [PATCH 14/26] test(parsers): cover MinerU cloud poll+download flow
 (#77)

---
 tests/test_parsers_mineru.py | 62 ++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/tests/test_parsers_mineru.py b/tests/test_parsers_mineru.py
index be63e9b4..5e0c9b2c 100644
--- a/tests/test_parsers_mineru.py
+++ b/tests/test_parsers_mineru.py
@@ -66,3 +66,65 @@ def test_self_hosted_parses_zip(monkeypatch, tmp_path):
     # the images/ prefix should be rewritten to the bare filename for localize_images
     assert "images/fig.png" not in result.markdown
     assert "![p](fig.png)" in result.markdown
+
+
+def test_cloud_flow_polls_then_downloads(monkeypatch, tmp_path):
+    monkeypatch.setenv("MINERU_API_KEY", "key")
+    monkeypatch.setattr("openkb.parsers.mineru.time.sleep", lambda *a, **k: None)
+
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, "w") as zf:
+        zf.writestr("full.md", "# Cloud\n\n![p](images/fig.png)")
+        zf.writestr("images/fig.png", b"ZBYTES")
+    zip_bytes = buf.getvalue()
+
+    def _resp(json_data=None, content=None):
+        r = MagicMock()
+        r.raise_for_status = MagicMock()
+        if json_data is not None:
+            r.json.return_value = json_data
+        if content is not None:
+            r.content = content
+        return r
+
+    client = MagicMock()
+    client.__enter__ = MagicMock(return_value=client)
+    client.__exit__ = MagicMock(return_value=False)
+    client.post.return_value = _resp(
+        json_data={"data": {"batch_id": "b1", "file_urls": ["https://upload"]}}
+    )
+    client.put.return_value = _resp()
+
+    poll_url = "https://mineru.net/api/v4/extract-results/batch/b1"
+    poll_running = _resp(json_data={"data": {"extract_result": [{"state": "running"}]}})
+    poll_done = _resp(
+        json_data={"data": {"extract_result": [{"state": "done", "full_zip_url": "https://zip"}]}}
+    )
+    zip_resp = _resp(content=zip_bytes)
+
+    def _get(url, *a, **k):
+        if url == "https://zip":
+            return zip_resp
+        assert url == poll_url
+        _get.calls += 1
+        return poll_running if _get.calls == 1 else poll_done
+
+    _get.calls = 0
+    client.get.side_effect = _get
+
+    httpx_mod = types.ModuleType("httpx")
+    httpx_mod.Client = MagicMock(return_value=client)
+    monkeypatch.setitem(sys.modules, "httpx", httpx_mod)
+
+    from openkb.parsers.mineru import MineruParser
+    p = MineruParser({"mode": "cloud", "poll_interval": 0})
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    result = p.parse(src)
+
+    assert isinstance(result, ParseResult)
+    assert "Cloud" in result.markdown
+    assert result.images["fig.png"] == b"ZBYTES"
+    assert "images/fig.png" not in result.markdown
+    assert "![p](fig.png)" in result.markdown
+    # drove the full poll loop: running once, then done
+    assert _get.calls == 2

From 82958e41c9503091d9e54f7890310770d36ad586 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 11:14:11 +0800
Subject: [PATCH 15/26] feat(cli): add --parser override and default parser
 config (#77)

---
 openkb/cli.py             | 14 ++++++++------
 openkb/config.py          |  1 +
 tests/test_add_command.py | 19 ++++++++++++++++++-
 tests/test_config.py      |  5 +++++
 4 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/openkb/cli.py b/openkb/cli.py
index 1a2761d8..13fa89cb 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -259,7 +259,7 @@ def _clear_existing_skill_dir(kb_dir: Path, name: str) -> None:
         shutil.rmtree(target)
 
 
-def add_single_file(file_path: Path, kb_dir: Path) -> Literal["added", "skipped", "failed"]:
+def add_single_file(file_path: Path, kb_dir: Path, parser_override: str | None = None) -> Literal["added", "skipped", "failed"]:
     """Convert, index, and compile a single document into the knowledge base.
 
     Steps:
@@ -289,7 +289,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> Literal["added", "skipped"
     # 2. Convert document
     click.echo(f"Adding: {file_path.name}")
     try:
-        result = convert_document(file_path, kb_dir)
+        result = convert_document(file_path, kb_dir, parser_override=parser_override)
     except Exception as exc:
         click.echo(f"  [ERROR] Conversion failed: {exc}")
         logger.debug("Conversion traceback:", exc_info=True)
@@ -575,8 +575,10 @@ def init(model, language):
 
 @cli.command()
 @click.argument("path")
+@click.option("--parser", "parser_override", default=None,
+              help="Override the configured parser for this run (local|mineru|mistral|vlm).")
 @click.pass_context
-def add(ctx, path):
+def add(ctx, path, parser_override):
     """Add a document or directory of documents at PATH to the knowledge base.
 
     PATH may be a local file, a local directory (which is walked
@@ -600,7 +602,7 @@ def add(ctx, path):
         fetched = fetch_url_to_raw(path, kb_dir)
         if fetched is None:
             return
-        outcome = add_single_file(fetched, kb_dir)
+        outcome = add_single_file(fetched, kb_dir, parser_override=parser_override)
         # Only clean up on dedup-skip. On "failed" we keep the file so
         # the user can retry (e.g. transient LLM error during compile)
         # without re-downloading — and so they don't lose data when
@@ -626,7 +628,7 @@ def add(ctx, path):
         click.echo(f"Found {total} supported file(s) in {path}.")
         for i, f in enumerate(files, 1):
             click.echo(f"\n[{i}/{total}] ", nl=False)
-            add_single_file(f, kb_dir)
+            add_single_file(f, kb_dir, parser_override=parser_override)
     else:
         if target.suffix.lower() not in SUPPORTED_EXTENSIONS:
             click.echo(
@@ -634,7 +636,7 @@ def add(ctx, path):
                 f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
             )
             return
-        add_single_file(target, kb_dir)
+        add_single_file(target, kb_dir, parser_override=parser_override)
 
 
 def _stream_to_tty() -> bool:
diff --git a/openkb/config.py b/openkb/config.py
index b83e1346..dea9d482 100644
--- a/openkb/config.py
+++ b/openkb/config.py
@@ -9,6 +9,7 @@
     "model": "gpt-5.4-mini",
     "language": "en",
     "pageindex_threshold": 20,
+    "parser": "local",
 }
 
 GLOBAL_CONFIG_DIR = Path.home() / ".config" / "openkb"
diff --git a/tests/test_add_command.py b/tests/test_add_command.py
index 1fb4d87f..e03c5b7f 100644
--- a/tests/test_add_command.py
+++ b/tests/test_add_command.py
@@ -70,7 +70,7 @@ def test_add_single_file_calls_helper(self, tmp_path):
         with patch("openkb.cli.add_single_file") as mock_add, \
              patch("openkb.cli._find_kb_dir", return_value=kb_dir):
             runner.invoke(cli, ["add", str(doc)])
-            mock_add.assert_called_once_with(doc, kb_dir)
+            mock_add.assert_called_once_with(doc, kb_dir, parser_override=None)
 
     def test_add_directory_calls_helper_for_each_file(self, tmp_path):
         kb_dir = self._setup_kb(tmp_path)
@@ -147,3 +147,20 @@ def test_add_short_doc_runs_compiler(self, tmp_path):
             result = runner.invoke(cli, ["add", str(doc)])
             mock_arun.assert_called_once()
             assert "OK" in result.output
+
+
+def test_add_single_file_threads_parser_override(tmp_path):
+    from unittest.mock import patch
+    from pathlib import Path
+    from openkb.cli import add_single_file
+
+    fake_result = type("R", (), {"skipped": True, "is_long_doc": False,
+                                 "file_hash": None, "raw_path": None,
+                                 "source_path": None})()
+    with patch("openkb.cli.convert_document", return_value=fake_result) as cd, \
+         patch("openkb.cli._setup_llm_key"), \
+         patch("openkb.cli.load_config", return_value={"model": "m"}):
+        add_single_file(Path("x.pdf"), tmp_path, parser_override="mistral")
+    # parser_override must reach convert_document
+    assert cd.call_args.kwargs.get("parser_override") == "mistral" \
+        or (len(cd.call_args.args) >= 3 and cd.call_args.args[2] == "mistral")
diff --git a/tests/test_config.py b/tests/test_config.py
index 35704a6b..0d9aae36 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -45,3 +45,8 @@ def test_load_overrides_defaults(tmp_path):
     assert loaded["pageindex_threshold"] == 100
     # Non-overridden defaults still present
     assert loaded["language"] == "en"
+
+
+def test_default_parser_is_local():
+    from openkb.config import DEFAULT_CONFIG
+    assert DEFAULT_CONFIG["parser"] == "local"

From 995b90c5a6884a68b4525eaa07149b89602c47f9 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 11:17:36 +0800
Subject: [PATCH 16/26] feat(cli): validate --parser against valid set via
 click.Choice (#77)

---
 openkb/cli.py              | 4 +++-
 openkb/parsers/registry.py | 4 ++--
 tests/test_add_command.py  | 9 +++++++++
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/openkb/cli.py b/openkb/cli.py
index 13fa89cb..030e1133 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -43,6 +43,7 @@ def filter(self, record: logging.LogRecord) -> bool:
 from openkb.config import DEFAULT_CONFIG, load_config, save_config, load_global_config, register_kb
 from openkb.converter import convert_document
 from openkb.log import append_log
+from openkb.parsers.registry import VALID_PARSERS
 from openkb.schema import AGENTS_MD
 
 # Suppress warnings after all imports — markitdown overrides filters at import time
@@ -576,7 +577,8 @@ def init(model, language):
 @cli.command()
 @click.argument("path")
 @click.option("--parser", "parser_override", default=None,
-              help="Override the configured parser for this run (local|mineru|mistral|vlm).")
+              type=click.Choice(VALID_PARSERS),
+              help="Override the configured parser for this run.")
 @click.pass_context
 def add(ctx, path, parser_override):
     """Add a document or directory of documents at PATH to the knowledge base.
diff --git a/openkb/parsers/registry.py b/openkb/parsers/registry.py
index c87ab111..1dba2c01 100644
--- a/openkb/parsers/registry.py
+++ b/openkb/parsers/registry.py
@@ -6,7 +6,7 @@
 from openkb.parsers.base import Parser
 from openkb.parsers.local import LocalParser
 
-_VALID = ("local", "mineru", "mistral", "vlm")
+VALID_PARSERS = ("local", "mineru", "mistral", "vlm")
 
 
 def get_parser(
@@ -34,5 +34,5 @@ def get_parser(
         from openkb.parsers.mineru import MineruParser
         return MineruParser(opts)
     raise ValueError(
-        f"Unknown parser {name!r}. Valid options: {', '.join(_VALID)}."
+        f"Unknown parser {name!r}. Valid options: {', '.join(VALID_PARSERS)}."
     )
diff --git a/tests/test_add_command.py b/tests/test_add_command.py
index e03c5b7f..4bdf7be1 100644
--- a/tests/test_add_command.py
+++ b/tests/test_add_command.py
@@ -164,3 +164,12 @@ def test_add_single_file_threads_parser_override(tmp_path):
     # parser_override must reach convert_document
     assert cd.call_args.kwargs.get("parser_override") == "mistral" \
         or (len(cd.call_args.args) >= 3 and cd.call_args.args[2] == "mistral")
+
+
+def test_add_parser_option_rejects_invalid_choice(tmp_path):
+    from click.testing import CliRunner
+    from openkb.cli import cli
+    runner = CliRunner()
+    result = runner.invoke(cli, ["add", "--parser", "bogus", str(tmp_path / "x.pdf")])
+    assert result.exit_code != 0
+    assert "bogus" in result.output or "Invalid value" in result.output

From 2959a8d25cbd64f7be9035b3f3d193a472fbad49 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 11:19:29 +0800
Subject: [PATCH 17/26] build: add optional parser extras (mistral, mineru,
 parsers) (#77)

---
 pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 026dea23..5d1e241c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,6 +53,9 @@ testpaths = ["tests"]
 
 [project.optional-dependencies]
 dev = ["pytest", "pytest-asyncio"]
+mistral = ["mistralai"]
+mineru = ["httpx"]
+parsers = ["mistralai", "httpx"]
 
 [tool.hatch.version]
 source = "vcs"

From 33cee6874eea6d9a066b0e4ee727d721e24f1a35 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 11:21:55 +0800
Subject: [PATCH 18/26] docs(readme): document pluggable document parsers (#77)

---
 README.md | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/README.md b/README.md
index cc19188f..ccefa0e3 100644
--- a/README.md
+++ b/README.md
@@ -266,6 +266,7 @@ Settings are initialized by `openkb init`, and stored in `.openkb/config.yaml`:
 model: gpt-5.4                   # LLM model (any LiteLLM-supported provider)
 language: en                     # Wiki output language
 pageindex_threshold: 20          # PDF pages threshold for PageIndex
+parser: local                    # Document parser: local | mineru | mistral | vlm
 ```
 
 Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/providers) (OpenAI models can omit the prefix):
@@ -276,6 +277,46 @@ Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/p
 | Anthropic | `anthropic/claude-sonnet-4-6` |
 | Gemini | `gemini/gemini-3.1-pro-preview` |
 
+### Document parsers
+
+By default OpenKB extracts Markdown locally (pymupdf for PDFs, markitdown for
+Office/HTML) — no extra dependencies, unchanged behavior. For higher accuracy on
+complex documents you can route the file → Markdown step through an online or
+self-hosted parser:
+
+```yaml
+# .openkb/config.yaml
+parser: mineru          # local (default) | mineru | mistral | vlm
+parsers:
+  mineru:
+    mode: cloud         # cloud | self_hosted
+    base_url: http://localhost:8000   # required when mode is self_hosted
+  vlm:
+    model: gemini/gemini-2.5-pro      # any LiteLLM vision model (Gemini, GPT-4o, Claude, …)
+```
+
+Install the optional dependency for your parser:
+
+```bash
+pip install openkb[mistral]   # Mistral OCR
+pip install openkb[mineru]    # MinerU (HTTP)
+pip install openkb[parsers]   # all online parsers
+# vlm uses the existing LiteLLM dependency — no extra needed
+```
+
+Set the API key via an environment variable: `MINERU_API_KEY` for MinerU cloud
+mode, `MISTRAL_API_KEY` for Mistral; the `vlm` parser reuses the existing
+`LLM_API_KEY`. Override the parser for a single run with
+`openkb add --parser mistral file.pdf` (`local | mineru | mistral | vlm`).
+
+Each parser handles a subset of formats — `mineru` covers PDF, Word (`.docx`),
+PowerPoint (`.pptx`), Excel, and HTML; `mistral` and `vlm` cover PDF only. `.md`
+and any unsupported format always fall back to the local parser.
+
+> **Note:** Long PDFs (≥ `pageindex_threshold` pages, default 20) continue to be
+> indexed with PageIndex and are **not** affected by the `parser` setting. The
+> parser governs the file → Markdown step for shorter documents and non-PDF files.
+
 ### PageIndex Integration
 
 Long documents are challenging for LLMs due to context limits, context rot, and summarization loss.

From 526db30cd9cc57903a64f40efa76f901cd1db0d0 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 11:38:30 +0800
Subject: [PATCH 19/26] fix(parsers): harden MinerU poll loop and anchor
 image-link rewrite (#77)

---
 openkb/parsers/mineru.py     | 14 +++++++--
 tests/test_parsers_mineru.py | 58 ++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/openkb/parsers/mineru.py b/openkb/parsers/mineru.py
index 243bca3d..a0e28b26 100644
--- a/openkb/parsers/mineru.py
+++ b/openkb/parsers/mineru.py
@@ -2,6 +2,7 @@
 
 import io
 import os
+import re
 import time
 import zipfile
 from pathlib import Path
@@ -38,7 +39,11 @@ def _result_from_zip(zip_bytes: bytes) -> ParseResult:
     # Markdown references images as 'images/<file>'; localize_images matches on
     # the bare filename, so rewrite 'images/fig.png' -> 'fig.png'.
     for fname in images:
-        markdown = markdown.replace(f"images/{fname}", fname)
+        # Rewrite only `![alt](images/<fname>)` links (anchored on markdown image
+        # syntax) to the bare filename, for localize_images to canonicalize. A
+        # replacement function avoids regex-escape injection from arbitrary names.
+        pattern = re.compile(r"(!\[[^\]]*\]\()" + re.escape("images/" + fname) + r"(\))")
+        markdown = pattern.sub(lambda m, f=fname: m.group(1) + f + m.group(2), markdown)
     return ParseResult(markdown=markdown, images=images)
 
 
@@ -51,7 +56,8 @@ def __init__(self, opts: dict[str, Any] | None = None):
         self.opts = opts or {}
         self.mode = self.opts.get("mode", "cloud")
         self.base_url = self.opts.get("base_url")
-        self.poll_interval = self.opts.get("poll_interval", 3)
+        pi = self.opts.get("poll_interval", 3)
+        self.poll_interval = pi if isinstance(pi, (int, float)) and pi > 0 else 3
         self.timeout = self.opts.get("timeout", 600)
 
     def supports(self, suffix: str) -> bool:
@@ -105,6 +111,10 @@ def _parse_cloud(self, src: Path) -> ParseResult:
                 )
                 pr.raise_for_status()
                 results = pr.json()["data"]["extract_result"]
+                if not results:
+                    time.sleep(self.poll_interval)
+                    elapsed += self.poll_interval
+                    continue
                 state = results[0].get("state")
                 if state == "done":
                     zip_url = results[0]["full_zip_url"]
diff --git a/tests/test_parsers_mineru.py b/tests/test_parsers_mineru.py
index 5e0c9b2c..dc255254 100644
--- a/tests/test_parsers_mineru.py
+++ b/tests/test_parsers_mineru.py
@@ -128,3 +128,61 @@ def _get(url, *a, **k):
     assert "![p](fig.png)" in result.markdown
     # drove the full poll loop: running once, then done
     assert _get.calls == 2
+
+
+def test_poll_interval_zero_is_clamped_to_positive():
+    from openkb.parsers.mineru import MineruParser
+    assert MineruParser({"poll_interval": 0}).poll_interval > 0
+    assert MineruParser({"poll_interval": -5}).poll_interval > 0
+    assert MineruParser({"poll_interval": 2}).poll_interval == 2
+
+
+def test_image_prefix_rewrite_is_anchored(tmp_path):
+    import io, sys, types, zipfile
+    from unittest.mock import MagicMock
+    # markdown has a real image link AND an unrelated 'images/fig.png' substring in prose
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, "w") as zf:
+        zf.writestr("full.md", "See path other_images/fig.png in text.\n\n![p](images/fig.png)")
+        zf.writestr("images/fig.png", b"PNG")
+    from openkb.parsers.mineru import _result_from_zip
+    result = _result_from_zip(buf.getvalue())
+    assert "![p](fig.png)" in result.markdown          # link rewritten
+    assert "other_images/fig.png" in result.markdown    # unrelated prose untouched
+    assert result.images["fig.png"] == b"PNG"
+
+
+def test_cloud_empty_extract_result_then_done(monkeypatch, tmp_path):
+    import io, sys, types, zipfile
+    from unittest.mock import MagicMock
+    monkeypatch.setenv("MINERU_API_KEY", "key")
+    monkeypatch.setattr("openkb.parsers.mineru.time.sleep", lambda *a, **k: None)
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, "w") as zf:
+        zf.writestr("full.md", "# Ok")
+    zip_bytes = buf.getvalue()
+
+    def _resp(json_data=None, content=None):
+        r = MagicMock(); r.raise_for_status = MagicMock()
+        if json_data is not None: r.json.return_value = json_data
+        if content is not None: r.content = content
+        return r
+    client = MagicMock()
+    client.__enter__ = MagicMock(return_value=client); client.__exit__ = MagicMock(return_value=False)
+    client.post.return_value = _resp(json_data={"data": {"batch_id": "b1", "file_urls": ["https://up"]}})
+    client.put.return_value = _resp()
+    empty = _resp(json_data={"data": {"extract_result": []}})            # queued: empty list
+    done = _resp(json_data={"data": {"extract_result": [{"state": "done", "full_zip_url": "https://zip"}]}})
+    zipr = _resp(content=zip_bytes)
+    def _get(url, *a, **k):
+        if url == "https://zip": return zipr
+        _get.n += 1
+        return empty if _get.n == 1 else done
+    _get.n = 0
+    client.get.side_effect = _get
+    httpx_mod = types.ModuleType("httpx"); httpx_mod.Client = MagicMock(return_value=client)
+    monkeypatch.setitem(sys.modules, "httpx", httpx_mod)
+    from openkb.parsers.mineru import MineruParser
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    result = MineruParser({"mode": "cloud", "poll_interval": 1}).parse(src)
+    assert "Ok" in result.markdown   # survived the empty-list poll without crashing

From 8af174feac97ff315a2cde43e1de14469ea68995 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 11:39:47 +0800
Subject: [PATCH 20/26] fix(parsers): sanitize image filenames against path
 traversal; skip redundant localize for local parser (#77)

---
 openkb/converter.py     |  6 +++++-
 openkb/images.py        | 13 +++++++------
 tests/test_converter.py | 13 +++++++++++++
 tests/test_images.py    | 19 +++++++++++++++++++
 4 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/openkb/converter.py b/openkb/converter.py
index d6c37fc7..2bab3d1b 100644
--- a/openkb/converter.py
+++ b/openkb/converter.py
@@ -106,7 +106,11 @@ def convert_document(src: Path, kb_dir: Path, parser_override: str | None = None
         parser = LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=src.parent)
 
     parse_result = parser.parse(src)
-    markdown = localize_images(parse_result.markdown, parse_result.images, doc_name, images_dir)
+    if parser.name == "local":
+        # LocalParser already persisted images and produced canonical links.
+        markdown = parse_result.markdown
+    else:
+        markdown = localize_images(parse_result.markdown, parse_result.images, doc_name, images_dir)
 
     dest_md = sources_dir / f"{doc_name}.md"
     dest_md.write_text(markdown, encoding="utf-8")
diff --git a/openkb/images.py b/openkb/images.py
index 9c6a424c..39891b57 100644
--- a/openkb/images.py
+++ b/openkb/images.py
@@ -229,12 +229,13 @@ def localize_images(
     images_dir.mkdir(parents=True, exist_ok=True)
     result = markdown
     for filename, data in images.items():
-        (images_dir / filename).write_bytes(data)
-        # Rewrite a bare ![alt](filename) reference to the canonical KB path.
-        # Use a replacement *function* (not a replacement string) so a filename
-        # containing regex-escape sequences (e.g. "\g<1>") can't corrupt the
-        # substitution — localize_images handles arbitrary parser-supplied names.
-        canonical = f"sources/images/{doc_name}/{filename}"
+        # Strip any directory components from parser-supplied names so a
+        # malicious/odd filename (e.g. "../x.png", "/abs/x.png") can never
+        # write outside images_dir. The markdown still references the original
+        # `filename`, so rewrite that ref to the sanitized canonical path.
+        safe_name = Path(filename).name or "image"
+        (images_dir / safe_name).write_bytes(data)
+        canonical = f"sources/images/{doc_name}/{safe_name}"
         pattern = re.compile(r"(!\[[^\]]*\]\()" + re.escape(filename) + r"(\))")
         result = pattern.sub(lambda m, c=canonical: m.group(1) + c + m.group(2), result)
     result = extract_base64_images(result, doc_name, images_dir)
diff --git a/tests/test_converter.py b/tests/test_converter.py
index 90b26bf4..8e5ce77c 100644
--- a/tests/test_converter.py
+++ b/tests/test_converter.py
@@ -164,3 +164,16 @@ def test_falls_back_to_local_for_unsupported_suffix(self, kb_dir):
             LP.return_value.parse.return_value = ParseResult(markdown="# md")
             convert_document(src, kb_dir)
         LP.assert_called_once()  # fell back to LocalParser
+
+    def test_local_parser_skips_redundant_localize(self, kb_dir):
+        src = kb_dir / "raw" / "notes.md"
+        src.write_text("# md", encoding="utf-8")
+        local = MagicMock()
+        local.name = "local"
+        local.supports.return_value = True
+        local.parse.return_value = ParseResult(markdown="# md final")
+        with patch("openkb.converter.get_parser", return_value=local), \
+             patch("openkb.converter.localize_images") as li:
+            result = convert_document(src, kb_dir)
+        li.assert_not_called()                      # local path skips localize_images
+        assert result.source_path.read_text(encoding="utf-8") == "# md final"
diff --git a/tests/test_images.py b/tests/test_images.py
index 53906443..7292c5c1 100644
--- a/tests/test_images.py
+++ b/tests/test_images.py
@@ -203,3 +203,22 @@ def test_localize_images_filename_with_regex_metachars(tmp_path):
     out = localize_images(md, {weird: b"DATA"}, "doc", images_dir)
     assert f"sources/images/doc/{weird}" in out
     assert (images_dir / weird).read_bytes() == b"DATA"
+
+
+def test_localize_images_strips_path_traversal_in_filename(tmp_path):
+    images_dir = tmp_path / "wiki" / "sources" / "images" / "doc"
+    md = "![bad](../../evil.png)"
+    out = localize_images(md, {"../../evil.png": b"DATA"}, "doc", images_dir)
+    # bytes written INSIDE images_dir under the basename only — no escape
+    assert (images_dir / "evil.png").read_bytes() == b"DATA"
+    assert not (tmp_path / "evil.png").exists()
+    assert not (images_dir.parent.parent / "evil.png").exists()
+    # the original ref is rewritten to the sanitized canonical path
+    assert "sources/images/doc/evil.png" in out
+
+
+def test_localize_images_absolute_filename_stays_inside(tmp_path):
+    images_dir = tmp_path / "wiki" / "sources" / "images" / "doc"
+    out = localize_images("![x](/etc/x.png)", {"/etc/x.png": b"D"}, "doc", images_dir)
+    assert (images_dir / "x.png").read_bytes() == b"D"
+    assert "sources/images/doc/x.png" in out

From e424bb444a19bb73d96f6b6a15d21010ca87bcc7 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 11:41:25 +0800
Subject: [PATCH 21/26] fix(parsers): warn on VLM global-model fallback; unify
 parser dispatch/VALID_PARSERS (#77)

---
 openkb/parsers/registry.py     | 48 ++++++++++++++++++++++------------
 openkb/parsers/vlm.py          | 10 +++++++
 tests/test_parsers_registry.py |  7 +++++
 tests/test_parsers_vlm.py      | 14 ++++++++++
 4 files changed, 63 insertions(+), 16 deletions(-)

diff --git a/openkb/parsers/registry.py b/openkb/parsers/registry.py
index 1dba2c01..9a5a55b0 100644
--- a/openkb/parsers/registry.py
+++ b/openkb/parsers/registry.py
@@ -6,7 +6,31 @@
 from openkb.parsers.base import Parser
 from openkb.parsers.local import LocalParser
 
-VALID_PARSERS = ("local", "mineru", "mistral", "vlm")
+
+def _make_mistral(opts, config):
+    from openkb.parsers.mistral import MistralParser
+    return MistralParser(opts)
+
+
+def _make_vlm(opts, config):
+    from openkb.parsers.vlm import VLMParser
+    return VLMParser(opts, model=config.get("model"))
+
+
+def _make_mineru(opts, config):
+    from openkb.parsers.mineru import MineruParser
+    return MineruParser(opts)
+
+
+# Single source of truth: online-parser name -> lazy factory.
+_ONLINE_PARSERS = {
+    "mineru": _make_mineru,
+    "mistral": _make_mistral,
+    "vlm": _make_vlm,
+}
+
+# Valid parser names (drives the CLI --parser choice and error messages).
+VALID_PARSERS = ("local", *_ONLINE_PARSERS)
 
 
 def get_parser(
@@ -21,18 +45,10 @@ def get_parser(
     name = (override or config.get("parser") or "local").lower()
     if name == "local":
         return LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=source_dir)
-
-    parsers_cfg = config.get("parsers", {}) or {}
-    opts = parsers_cfg.get(name, {}) or {}
-    if name == "mistral":
-        from openkb.parsers.mistral import MistralParser
-        return MistralParser(opts)
-    if name == "vlm":
-        from openkb.parsers.vlm import VLMParser
-        return VLMParser(opts, model=config.get("model"))
-    if name == "mineru":
-        from openkb.parsers.mineru import MineruParser
-        return MineruParser(opts)
-    raise ValueError(
-        f"Unknown parser {name!r}. Valid options: {', '.join(VALID_PARSERS)}."
-    )
+    factory = _ONLINE_PARSERS.get(name)
+    if factory is None:
+        raise ValueError(
+            f"Unknown parser {name!r}. Valid options: {', '.join(VALID_PARSERS)}."
+        )
+    opts = (config.get("parsers", {}) or {}).get(name, {}) or {}
+    return factory(opts, config)
diff --git a/openkb/parsers/vlm.py b/openkb/parsers/vlm.py
index 834125fc..af6f5043 100644
--- a/openkb/parsers/vlm.py
+++ b/openkb/parsers/vlm.py
@@ -1,11 +1,14 @@
 from __future__ import annotations
 
+import logging
 from pathlib import Path
 from typing import Any
 
 from openkb.parsers.base import ParseResult, Parser
 from openkb.parsers.vlm_client import transcribe_to_markdown
 
+logger = logging.getLogger(__name__)
+
 _SUPPORTED = {".pdf"}
 
 
@@ -18,6 +21,13 @@ def __init__(self, opts: dict[str, Any] | None = None, model: str | None = None)
         opts = opts or {}
         # parsers.vlm.model overrides the global model; else use the global model.
         self.model = opts.get("model") or model
+        if not opts.get("model"):
+            logger.warning(
+                "VLM parser: 'parsers.vlm.model' is not set; using the global model "
+                "%r for vision parsing. If that model is not vision-capable, set "
+                "'parsers.vlm.model' to one (e.g. gemini/gemini-2.5-pro).",
+                self.model,
+            )
 
     def supports(self, suffix: str) -> bool:
         return suffix.lower() in _SUPPORTED
diff --git a/tests/test_parsers_registry.py b/tests/test_parsers_registry.py
index 0a500ce9..612ed259 100644
--- a/tests/test_parsers_registry.py
+++ b/tests/test_parsers_registry.py
@@ -31,3 +31,10 @@ def test_unknown_name_raises_with_valid_options():
         get_parser({"parser": "nope"}, **_kwargs())
     assert "nope" in str(exc.value)
     assert "local" in str(exc.value)
+
+
+def test_valid_parsers_matches_dispatch():
+    from openkb.parsers.registry import VALID_PARSERS, _ONLINE_PARSERS
+    # local + every online factory key, no drift
+    assert set(VALID_PARSERS) == {"local", *_ONLINE_PARSERS}
+    assert VALID_PARSERS[0] == "local"
diff --git a/tests/test_parsers_vlm.py b/tests/test_parsers_vlm.py
index e1d78683..eb2ede08 100644
--- a/tests/test_parsers_vlm.py
+++ b/tests/test_parsers_vlm.py
@@ -31,3 +31,17 @@ def test_parse_falls_back_to_global_model(tmp_path):
     with patch("openkb.parsers.vlm.transcribe_to_markdown", return_value="x") as t:
         p.parse(src)
     t.assert_called_once_with(src, model="global-model")
+
+
+def test_warns_when_falling_back_to_global_model(caplog):
+    import logging as _logging
+    with caplog.at_level(_logging.WARNING):
+        VLMParser({}, model="gpt-5.4-mini")
+    assert any("parsers.vlm.model" in r.message for r in caplog.records)
+
+
+def test_no_warning_when_vlm_model_set(caplog):
+    import logging as _logging
+    with caplog.at_level(_logging.WARNING):
+        VLMParser({"model": "gemini/gemini-2.5-pro"}, model="gpt-5.4-mini")
+    assert not any("parsers.vlm.model" in r.message for r in caplog.records)

From a981b91e6345027d3aa633be15c771836ac885e1 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 31 May 2026 11:43:09 +0800
Subject: [PATCH 22/26] fix(cli): only propagate LLM_API_KEY to the active
 provider key (#77)

---
 openkb/cli.py     | 16 +++++++++-------
 tests/test_cli.py | 14 ++++++++++++++
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/openkb/cli.py b/openkb/cli.py
index 030e1133..b19da6a8 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -125,17 +125,19 @@ def _setup_llm_key(kb_dir: Path | None = None) -> None:
     else:
         litellm.api_key = api_key
 
-        # Dynamically set the provider-specific env var when possible
         if provider:
+            # Active provider is known — set only its key, so LLM_API_KEY is not
+            # sprayed into unrelated provider keys (e.g. MISTRAL_API_KEY, which the
+            # Mistral parser treats as a real Mistral credential).
             provider_env = f"{provider.upper()}_API_KEY"
             if not os.environ.get(provider_env):
                 os.environ[provider_env] = api_key
-
-        # Fallback: also set common provider keys so multi-provider
-        # configs (e.g. PageIndex Cloud) still work
-        for env_var in _KNOWN_PROVIDER_KEYS:
-            if not os.environ.get(env_var):
-                os.environ[env_var] = api_key
+        else:
+            # Provider couldn't be determined — fall back to setting the common
+            # provider keys so multi-provider configs still work.
+            for env_var in _KNOWN_PROVIDER_KEYS:
+                if not os.environ.get(env_var):
+                    os.environ[env_var] = api_key
 
 # Supported document extensions for the `add` command
 SUPPORTED_EXTENSIONS = {
diff --git a/tests/test_cli.py b/tests/test_cli.py
index ab3378b1..e80e272f 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -365,3 +365,17 @@ async def fake_run_query(*_args, **_kwargs):
         assert "rnn" in saved
         assert "[[concepts/multi-head-attention]]" not in saved
         assert "multi head attention" in saved
+
+
+def test_setup_llm_key_does_not_spray_unrelated_provider_keys(tmp_path, monkeypatch):
+    import os
+    from openkb.cli import _setup_llm_key
+    # KB with an openai model (known provider)
+    openkb_dir = tmp_path / ".openkb"; openkb_dir.mkdir()
+    (openkb_dir / "config.yaml").write_text("model: openai/gpt-4o\n")
+    for k in ("MISTRAL_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY"):
+        monkeypatch.delenv(k, raising=False)
+    monkeypatch.setenv("LLM_API_KEY", "sk-test")
+    _setup_llm_key(tmp_path)
+    assert os.environ.get("OPENAI_API_KEY") == "sk-test"   # active provider set
+    assert os.environ.get("MISTRAL_API_KEY") is None        # unrelated provider NOT sprayed

From 6287deaa7a3e67cd5b753712f7f3dcf62603835a Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Mon, 1 Jun 2026 16:38:31 +0800
Subject: [PATCH 23/26] fix(images): match image links by basename
 (dir-prefixed, titled) in localize_images (#77)

---
 openkb/images.py     | 41 ++++++++++++++++++++++++++---------------
 tests/test_images.py | 22 ++++++++++++++++++++++
 2 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/openkb/images.py b/openkb/images.py
index 39891b57..9315a20e 100644
--- a/openkb/images.py
+++ b/openkb/images.py
@@ -17,6 +17,10 @@
 # Matches: ![alt](relative/path) — excludes http(s):// and data: URIs
 _RELATIVE_RE = re.compile(r'!\[([^\]]*)\]\((?!https?://|data:)([^)]+)\)')
 
+# Matches an image link, capturing: (prefix `![alt](` + ws)(target)(optional
+# title + ws)(closing `)`). Used to rewrite links by their target's basename.
+_IMG_LINK_RE = re.compile(r'(!\[[^\]]*\]\(\s*)([^)\s]+)(\s*(?:"[^"]*"|\'[^\']*\')?\s*)(\))')
+
 
 # Minimum pixel dimension — skip icons, bullets, and tiny artifacts
 _MIN_IMAGE_DIM = 32
@@ -217,27 +221,34 @@ def localize_images(
     doc_name: str,
     images_dir: Path,
 ) -> str:
-    """Persist parser-supplied images and normalize all image links.
+    """Persist parser-supplied images and normalize image links.
 
-    1. Write every ``images`` entry (filename -> bytes) into ``images_dir``.
-    2. Rewrite bare-filename references ``![alt](filename)`` (filename present
-       in ``images``) to the canonical ``sources/images/{doc_name}/{filename}``.
-    3. Run :func:`extract_base64_images` to localize any inline base64 images.
+    1. Write every ``images`` entry to ``images_dir`` under its basename
+       (``Path(filename).name``), so a name with ``/`` directory components or
+       an absolute path can never write outside ``images_dir``.
+    2. Rewrite markdown image links whose target's basename matches a written
+       image to the canonical ``sources/images/{doc_name}/{basename}`` path —
+       this handles bare names, directory-prefixed targets (e.g.
+       ``images/fig.png``), and links carrying a title attribute.
+    3. Localize any inline base64 images via :func:`extract_base64_images`.
 
     Returns the normalized markdown.
     """
     images_dir.mkdir(parents=True, exist_ok=True)
-    result = markdown
+    safe_names: set[str] = set()
     for filename, data in images.items():
-        # Strip any directory components from parser-supplied names so a
-        # malicious/odd filename (e.g. "../x.png", "/abs/x.png") can never
-        # write outside images_dir. The markdown still references the original
-        # `filename`, so rewrite that ref to the sanitized canonical path.
-        safe_name = Path(filename).name or "image"
-        (images_dir / safe_name).write_bytes(data)
-        canonical = f"sources/images/{doc_name}/{safe_name}"
-        pattern = re.compile(r"(!\[[^\]]*\]\()" + re.escape(filename) + r"(\))")
-        result = pattern.sub(lambda m, c=canonical: m.group(1) + c + m.group(2), result)
+        safe = Path(filename).name or "image"
+        (images_dir / safe).write_bytes(data)
+        safe_names.add(safe)
+
+    def _rewrite(m: "re.Match[str]") -> str:
+        pre, target, title, close = m.group(1), m.group(2), m.group(3), m.group(4)
+        base = Path(target).name
+        if base in safe_names:
+            return f"{pre}sources/images/{doc_name}/{base}{title}{close}"
+        return m.group(0)
+
+    result = _IMG_LINK_RE.sub(_rewrite, markdown)
     result = extract_base64_images(result, doc_name, images_dir)
     return result
 
diff --git a/tests/test_images.py b/tests/test_images.py
index 7292c5c1..26b8ed1b 100644
--- a/tests/test_images.py
+++ b/tests/test_images.py
@@ -222,3 +222,25 @@ def test_localize_images_absolute_filename_stays_inside(tmp_path):
     out = localize_images("![x](/etc/x.png)", {"/etc/x.png": b"D"}, "doc", images_dir)
     assert (images_dir / "x.png").read_bytes() == b"D"
     assert "sources/images/doc/x.png" in out
+
+
+def test_localize_images_rewrites_directory_prefixed_target(tmp_path):
+    images_dir = tmp_path / "wiki" / "sources" / "images" / "doc"
+    md = "![p](images/fig.png)\n\n![q](./sub/images/other.png)"
+    out = localize_images(md, {"fig.png": b"A", "other.png": b"B"}, "doc", images_dir)
+    assert "![p](sources/images/doc/fig.png)" in out
+    assert "![q](sources/images/doc/other.png)" in out
+    assert (images_dir / "fig.png").read_bytes() == b"A"
+    assert (images_dir / "other.png").read_bytes() == b"B"
+
+
+def test_localize_images_preserves_title_attribute(tmp_path):
+    images_dir = tmp_path / "wiki" / "sources" / "images" / "doc"
+    out = localize_images('![a](fig.png "Figure 1")', {"fig.png": b"X"}, "doc", images_dir)
+    assert '![a](sources/images/doc/fig.png "Figure 1")' in out
+
+
+def test_localize_images_inner_whitespace(tmp_path):
+    images_dir = tmp_path / "wiki" / "sources" / "images" / "doc"
+    out = localize_images("![a]( fig.png )", {"fig.png": b"X"}, "doc", images_dir)
+    assert "sources/images/doc/fig.png" in out

From 6e111fbd35a18ddcf9d5147514d9f17cb364b6a2 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Mon, 1 Jun 2026 16:42:23 +0800
Subject: [PATCH 24/26] fix(parsers): harden MinerU cloud response handling,
 timeout, md selection; drop redundant image rewrite (#77)

---
 openkb/parsers/mineru.py     | 65 +++++++++++++++---------
 tests/test_parsers_mineru.py | 95 +++++++++++++++++++++++++++++++-----
 2 files changed, 124 insertions(+), 36 deletions(-)

diff --git a/openkb/parsers/mineru.py b/openkb/parsers/mineru.py
index a0e28b26..e9356e19 100644
--- a/openkb/parsers/mineru.py
+++ b/openkb/parsers/mineru.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
 
 import io
+import logging
 import os
-import re
 import time
 import zipfile
 from pathlib import Path
@@ -10,6 +10,8 @@
 
 from openkb.parsers.base import ParseResult, Parser
 
+logger = logging.getLogger(__name__)
+
 _SUPPORTED = {".pdf", ".docx", ".pptx", ".xlsx", ".xls", ".html", ".htm"}
 _CLOUD_BASE = "https://mineru.net/api/v4"
 
@@ -29,24 +31,32 @@ def _result_from_zip(zip_bytes: bytes) -> ParseResult:
     images: dict[str, bytes] = {}
     markdown = ""
     with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
-        md_names = [n for n in zf.namelist() if n.lower().endswith(".md")]
+        names = zf.namelist()
+        md_names = sorted(n for n in names if n.lower().endswith(".md"))
         if md_names:
-            chosen = next((n for n in md_names if n.endswith("full.md")), md_names[0])
+            chosen = next((n for n in md_names if Path(n).name == "full.md"), md_names[0])
             markdown = zf.read(chosen).decode("utf-8", errors="replace")
-        for name in zf.namelist():
+        for name in names:
             if name.lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".webp")):
-                images[Path(name).name] = zf.read(name)
-    # Markdown references images as 'images/<file>'; localize_images matches on
-    # the bare filename, so rewrite 'images/fig.png' -> 'fig.png'.
-    for fname in images:
-        # Rewrite only `![alt](images/<fname>)` links (anchored on markdown image
-        # syntax) to the bare filename, for localize_images to canonicalize. A
-        # replacement function avoids regex-escape injection from arbitrary names.
-        pattern = re.compile(r"(!\[[^\]]*\]\()" + re.escape("images/" + fname) + r"(\))")
-        markdown = pattern.sub(lambda m, f=fname: m.group(1) + f + m.group(2), markdown)
+                base = Path(name).name
+                if base in images:
+                    logger.warning(
+                        "MinerU result has multiple images named %r in different "
+                        "folders; keeping the last. Earlier one may be lost.", base
+                    )
+                images[base] = zf.read(name)
     return ParseResult(markdown=markdown, images=images)
 
 
+def _mineru_body(resp):
+    """Return the 'data' dict from a MinerU v4 JSON response, raising on API errors."""
+    body = resp.json()
+    code = body.get("code")
+    if code not in (0, None):
+        raise RuntimeError(f"MinerU API error (code={code}): {body.get('msg')}")
+    return body.get("data") or {}
+
+
 class MineruParser(Parser):
     """MinerU via HTTP — self-hosted server or hosted cloud API."""
 
@@ -58,7 +68,8 @@ def __init__(self, opts: dict[str, Any] | None = None):
         self.base_url = self.opts.get("base_url")
         pi = self.opts.get("poll_interval", 3)
         self.poll_interval = pi if isinstance(pi, (int, float)) and pi > 0 else 3
-        self.timeout = self.opts.get("timeout", 600)
+        t = self.opts.get("timeout", 600)
+        self.timeout = t if isinstance(t, (int, float)) and t > 0 else 600
 
     def supports(self, suffix: str) -> bool:
         return suffix.lower() in _SUPPORTED
@@ -92,37 +103,43 @@ def _parse_cloud(self, src: Path) -> ParseResult:
             )
         httpx = _httpx()
         headers = {"Authorization": f"Bearer {api_key}"}
-        with httpx.Client(timeout=self.timeout) as client:
+        with httpx.Client(timeout=min(self.timeout, 120)) as client:
             r = client.post(
                 f"{_CLOUD_BASE}/file-urls/batch",
                 headers=headers,
                 json={"files": [{"name": src.name, "is_ocr": True}]},
             )
             r.raise_for_status()
-            data = r.json()["data"]
-            batch_id = data["batch_id"]
-            upload_url = data["file_urls"][0]
+            data = _mineru_body(r)
+            batch_id = data.get("batch_id")
+            file_urls = data.get("file_urls") or []
+            if not batch_id or not file_urls:
+                raise RuntimeError(f"MinerU returned no upload URL: {data}")
+            upload_url = file_urls[0]
             client.put(upload_url, content=src.read_bytes()).raise_for_status()
-            elapsed = 0
+            deadline = time.monotonic() + self.timeout
             zip_url = None
-            while elapsed < self.timeout:
+            while time.monotonic() < deadline:
                 pr = client.get(
                     f"{_CLOUD_BASE}/extract-results/batch/{batch_id}", headers=headers
                 )
                 pr.raise_for_status()
-                results = pr.json()["data"]["extract_result"]
+                data = _mineru_body(pr)
+                results = data.get("extract_result") or []
                 if not results:
                     time.sleep(self.poll_interval)
-                    elapsed += self.poll_interval
                     continue
                 state = results[0].get("state")
                 if state == "done":
-                    zip_url = results[0]["full_zip_url"]
+                    zip_url = results[0].get("full_zip_url")
+                    if not zip_url:
+                        raise RuntimeError(
+                            f"MinerU reported done but no full_zip_url: {results[0]}"
+                        )
                     break
                 if state == "failed":
                     raise RuntimeError(f"MinerU extraction failed: {results[0]}")
                 time.sleep(self.poll_interval)
-                elapsed += self.poll_interval
             if zip_url is None:
                 raise RuntimeError("MinerU extraction timed out.")
             zr = client.get(zip_url)
diff --git a/tests/test_parsers_mineru.py b/tests/test_parsers_mineru.py
index dc255254..f81c8033 100644
--- a/tests/test_parsers_mineru.py
+++ b/tests/test_parsers_mineru.py
@@ -63,9 +63,12 @@ def test_self_hosted_parses_zip(monkeypatch, tmp_path):
     assert isinstance(result, ParseResult)
     assert "Mineru" in result.markdown
     assert result.images["fig.png"] == b"PNGBYTES"
-    # the images/ prefix should be rewritten to the bare filename for localize_images
-    assert "images/fig.png" not in result.markdown
-    assert "![p](fig.png)" in result.markdown
+    # _result_from_zip no longer rewrites links; the raw 'images/fig.png' survives
+    assert "images/fig.png" in result.markdown
+    # localize_images (which now rewrites by basename) canonicalizes it
+    from openkb.images import localize_images
+    md2 = localize_images(result.markdown, result.images, "d", tmp_path / "imgs")
+    assert "sources/images/d/fig.png" in md2
 
 
 def test_cloud_flow_polls_then_downloads(monkeypatch, tmp_path):
@@ -74,7 +77,7 @@ def test_cloud_flow_polls_then_downloads(monkeypatch, tmp_path):
 
     buf = io.BytesIO()
     with zipfile.ZipFile(buf, "w") as zf:
-        zf.writestr("full.md", "# Cloud\n\n![p](images/fig.png)")
+        zf.writestr("full.md", "# Cloud")
         zf.writestr("images/fig.png", b"ZBYTES")
     zip_bytes = buf.getvalue()
 
@@ -124,8 +127,6 @@ def _get(url, *a, **k):
     assert isinstance(result, ParseResult)
     assert "Cloud" in result.markdown
     assert result.images["fig.png"] == b"ZBYTES"
-    assert "images/fig.png" not in result.markdown
-    assert "![p](fig.png)" in result.markdown
     # drove the full poll loop: running once, then done
     assert _get.calls == 2
 
@@ -137,19 +138,19 @@ def test_poll_interval_zero_is_clamped_to_positive():
     assert MineruParser({"poll_interval": 2}).poll_interval == 2
 
 
-def test_image_prefix_rewrite_is_anchored(tmp_path):
-    import io, sys, types, zipfile
-    from unittest.mock import MagicMock
-    # markdown has a real image link AND an unrelated 'images/fig.png' substring in prose
+def test_result_from_zip_does_not_rewrite_links(tmp_path):
+    import io, zipfile
+    # The images/ -> bare rewrite moved OUT of _result_from_zip into
+    # localize_images; _result_from_zip must leave the markdown link text intact.
     buf = io.BytesIO()
     with zipfile.ZipFile(buf, "w") as zf:
         zf.writestr("full.md", "See path other_images/fig.png in text.\n\n![p](images/fig.png)")
         zf.writestr("images/fig.png", b"PNG")
     from openkb.parsers.mineru import _result_from_zip
     result = _result_from_zip(buf.getvalue())
-    assert "![p](fig.png)" in result.markdown          # link rewritten
+    assert "![p](images/fig.png)" in result.markdown   # link text unchanged
     assert "other_images/fig.png" in result.markdown    # unrelated prose untouched
-    assert result.images["fig.png"] == b"PNG"
+    assert result.images["fig.png"] == b"PNG"           # images keyed by basename
 
 
 def test_cloud_empty_extract_result_then_done(monkeypatch, tmp_path):
@@ -186,3 +187,73 @@ def _get(url, *a, **k):
     src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
     result = MineruParser({"mode": "cloud", "poll_interval": 1}).parse(src)
     assert "Ok" in result.markdown   # survived the empty-list poll without crashing
+
+
+def test_timeout_invalid_is_clamped():
+    from openkb.parsers.mineru import MineruParser
+    assert MineruParser({"timeout": 0}).timeout == 600
+    assert MineruParser({"timeout": "x"}).timeout == 600
+    assert MineruParser({"timeout": 30}).timeout == 30
+
+
+def test_cloud_api_error_envelope_raises(monkeypatch, tmp_path):
+    import sys, types
+    from unittest.mock import MagicMock
+    monkeypatch.setenv("MINERU_API_KEY", "key")
+    r = MagicMock(); r.raise_for_status = MagicMock()
+    r.json.return_value = {"code": -10001, "msg": "token expired", "data": None}
+    client = MagicMock()
+    client.__enter__ = MagicMock(return_value=client); client.__exit__ = MagicMock(return_value=False)
+    client.post.return_value = r
+    httpx_mod = types.ModuleType("httpx"); httpx_mod.Client = MagicMock(return_value=client)
+    monkeypatch.setitem(sys.modules, "httpx", httpx_mod)
+    from openkb.parsers.mineru import MineruParser
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    import pytest
+    with pytest.raises(RuntimeError) as exc:
+        MineruParser({"mode": "cloud"}).parse(src)
+    assert "token expired" in str(exc.value) or "-10001" in str(exc.value)
+
+
+def test_cloud_empty_file_urls_raises(monkeypatch, tmp_path):
+    import sys, types
+    from unittest.mock import MagicMock
+    monkeypatch.setenv("MINERU_API_KEY", "key")
+    r = MagicMock(); r.raise_for_status = MagicMock()
+    r.json.return_value = {"code": 0, "data": {"batch_id": "b1", "file_urls": []}}
+    client = MagicMock()
+    client.__enter__ = MagicMock(return_value=client); client.__exit__ = MagicMock(return_value=False)
+    client.post.return_value = r
+    httpx_mod = types.ModuleType("httpx"); httpx_mod.Client = MagicMock(return_value=client)
+    monkeypatch.setitem(sys.modules, "httpx", httpx_mod)
+    from openkb.parsers.mineru import MineruParser
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    import pytest
+    with pytest.raises(RuntimeError) as exc:
+        MineruParser({"mode": "cloud"}).parse(src)
+    assert "upload URL" in str(exc.value)
+
+
+def test_full_md_basename_preferred_over_endswith(tmp_path):
+    """Exact basename ``full.md`` must win over a name that merely ends with it."""
+    import io, zipfile
+    from openkb.parsers.mineru import _result_from_zip
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, "w") as zf:
+        zf.writestr("notfull.md", "# WRONG")     # ends with 'full.md' but isn't it
+        zf.writestr("full.md", "# RIGHT")
+    markdown = _result_from_zip(buf.getvalue()).markdown
+    assert "RIGHT" in markdown and "WRONG" not in markdown
+
+
+def test_image_basename_collision_warns(tmp_path, caplog):
+    import io, zipfile, logging as _logging
+    from openkb.parsers.mineru import _result_from_zip
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, "w") as zf:
+        zf.writestr("full.md", "# x")
+        zf.writestr("images/fig.png", b"A")
+        zf.writestr("sub/fig.png", b"B")
+    with caplog.at_level(_logging.WARNING):
+        result = _result_from_zip(buf.getvalue())
+    assert any("fig.png" in r.message for r in caplog.records)

From 02daf5220a885bc2aae0b97cce524b8cfc55aeca Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Mon, 1 Jun 2026 16:44:26 +0800
Subject: [PATCH 25/26] fix(parsers): warn that VLM is text-only and on silent
 parser downgrade (#77)

---
 README.md                 |  4 ++++
 openkb/converter.py       |  5 +++++
 openkb/parsers/vlm.py     |  5 +++++
 tests/test_converter.py   | 15 +++++++++++++++
 tests/test_parsers_vlm.py | 11 +++++++++++
 5 files changed, 40 insertions(+)

diff --git a/README.md b/README.md
index ccefa0e3..c1ad36c5 100644
--- a/README.md
+++ b/README.md
@@ -313,6 +313,10 @@ Each parser handles a subset of formats — `mineru` covers PDF, Word, PPT, Exce
 and HTML; `mistral` and `vlm` cover PDF. `.md` and any unsupported format always
 fall back to the local parser.
 
+The `vlm` parser is **text-only**: it transcribes a document's text via a vision
+LLM but does **not** extract embedded figures/images. Use `mineru`, `mistral`, or
+`local` if you need image extraction.
+
 > **Note:** Long PDFs (≥ `pageindex_threshold` pages, default 20) continue to be
 > indexed with PageIndex and are **not** affected by the `parser` setting. The
 > parser governs the file → Markdown step for shorter documents and non-PDF files.
diff --git a/openkb/converter.py b/openkb/converter.py
index 2bab3d1b..2ac6abb1 100644
--- a/openkb/converter.py
+++ b/openkb/converter.py
@@ -103,6 +103,11 @@ def convert_document(src: Path, kb_dir: Path, parser_override: str | None = None
         source_dir=src.parent,
     )
     if not parser.supports(src.suffix):
+        if parser.name != "local":
+            logger.warning(
+                "Parser %r does not support %r; falling back to the local parser for %s.",
+                parser.name, src.suffix, src.name,
+            )
         parser = LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=src.parent)
 
     parse_result = parser.parse(src)
diff --git a/openkb/parsers/vlm.py b/openkb/parsers/vlm.py
index af6f5043..6467483f 100644
--- a/openkb/parsers/vlm.py
+++ b/openkb/parsers/vlm.py
@@ -34,4 +34,9 @@ def supports(self, suffix: str) -> bool:
 
     def parse(self, src: Path) -> ParseResult:
         markdown = transcribe_to_markdown(src, model=self.model)
+        logger.warning(
+            "VLM parser transcribes %s to text only; embedded figures/images are "
+            "not extracted. Use a parser like 'mineru' if you need figure extraction.",
+            src.name,
+        )
         return ParseResult(markdown=markdown)
diff --git a/tests/test_converter.py b/tests/test_converter.py
index 8e5ce77c..6b5f2e41 100644
--- a/tests/test_converter.py
+++ b/tests/test_converter.py
@@ -177,3 +177,18 @@ def test_local_parser_skips_redundant_localize(self, kb_dir):
             result = convert_document(src, kb_dir)
         li.assert_not_called()                      # local path skips localize_images
         assert result.source_path.read_text(encoding="utf-8") == "# md final"
+
+    def test_warns_on_silent_downgrade(self, kb_dir, caplog):
+        import logging as _logging
+        src = kb_dir / "raw" / "notes.md"
+        src.write_text("# md", encoding="utf-8")
+        online = MagicMock()
+        online.name = "mistral"
+        online.supports.return_value = False
+        with patch("openkb.converter.get_parser", return_value=online), \
+             patch("openkb.converter.LocalParser") as LP:
+            LP.return_value.name = "local"
+            LP.return_value.parse.return_value = ParseResult(markdown="# md")
+            with caplog.at_level(_logging.WARNING):
+                convert_document(src, kb_dir)
+        assert any("falling back to the local parser" in r.message for r in caplog.records)
diff --git a/tests/test_parsers_vlm.py b/tests/test_parsers_vlm.py
index eb2ede08..b5a99400 100644
--- a/tests/test_parsers_vlm.py
+++ b/tests/test_parsers_vlm.py
@@ -45,3 +45,14 @@ def test_no_warning_when_vlm_model_set(caplog):
     with caplog.at_level(_logging.WARNING):
         VLMParser({"model": "gemini/gemini-2.5-pro"}, model="gpt-5.4-mini")
     assert not any("parsers.vlm.model" in r.message for r in caplog.records)
+
+
+def test_parse_warns_text_only(tmp_path, caplog):
+    import logging as _logging
+    from unittest.mock import patch
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    p = VLMParser({"model": "gemini/gemini-2.5-pro"})
+    with patch("openkb.parsers.vlm.transcribe_to_markdown", return_value="# md"):
+        with caplog.at_level(_logging.WARNING):
+            p.parse(src)
+    assert any("text only" in r.message for r in caplog.records)

From b2435059e95869cf9553fe1052f88729a87c1085 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Mon, 1 Jun 2026 16:46:01 +0800
Subject: [PATCH 26/26] fix(parsers): delete uploaded Mistral OCR files; fix
 patch.stopall test hygiene (#77)

---
 openkb/parsers/mistral.py     | 51 +++++++++++++++++++++--------------
 tests/test_parsers_local.py   | 17 +++++-------
 tests/test_parsers_mistral.py | 34 +++++++++++++++++++++++
 3 files changed, 72 insertions(+), 30 deletions(-)

diff --git a/openkb/parsers/mistral.py b/openkb/parsers/mistral.py
index a494f149..9f5d0706 100644
--- a/openkb/parsers/mistral.py
+++ b/openkb/parsers/mistral.py
@@ -42,26 +42,37 @@ def parse(self, src: Path) -> ParseResult:
             ) from exc
 
         client = Mistral(api_key=api_key)
-        uploaded = client.files.upload(
-            file={"file_name": src.name, "content": src.read_bytes()}, purpose="ocr"
-        )
-        signed = client.files.get_signed_url(file_id=uploaded.id)
-        resp = client.ocr.process(
-            model=self.model,
-            document={"type": "document_url", "document_url": signed.url},
-            include_image_base64=True,
-        )
+        uploaded = None
+        try:
+            uploaded = client.files.upload(
+                file={"file_name": src.name, "content": src.read_bytes()}, purpose="ocr"
+            )
+            signed = client.files.get_signed_url(file_id=uploaded.id)
+            resp = client.ocr.process(
+                model=self.model,
+                document={"type": "document_url", "document_url": signed.url},
+                include_image_base64=True,
+            )
 
-        parts: list[str] = []
-        images: dict[str, bytes] = {}
-        for page in resp.pages:
-            parts.append(page.markdown or "")
-            for img in getattr(page, "images", None) or []:
-                raw = img.image_base64 or ""
-                raw = _DATA_URI_RE.sub("", raw)
+            parts: list[str] = []
+            images: dict[str, bytes] = {}
+            for page in resp.pages:
+                parts.append(page.markdown or "")
+                for img in getattr(page, "images", None) or []:
+                    raw = img.image_base64 or ""
+                    raw = _DATA_URI_RE.sub("", raw)
+                    try:
+                        images[img.id] = base64.b64decode(raw, validate=True)
+                    except Exception:
+                        logger.warning("Skipping undecodable Mistral image: %s", getattr(img, "id", "?"))
+                        continue
+            return ParseResult(markdown="\n\n".join(parts), images=images)
+        finally:
+            if uploaded is not None:
                 try:
-                    images[img.id] = base64.b64decode(raw, validate=True)
+                    client.files.delete(file_id=uploaded.id)
                 except Exception:
-                    logger.warning("Skipping undecodable Mistral image: %s", getattr(img, "id", "?"))
-                    continue
-        return ParseResult(markdown="\n\n".join(parts), images=images)
+                    logger.warning(
+                        "Failed to delete uploaded Mistral OCR file %s",
+                        getattr(uploaded, "id", "?"),
+                    )
diff --git a/tests/test_parsers_local.py b/tests/test_parsers_local.py
index e682c42e..af17ed38 100644
--- a/tests/test_parsers_local.py
+++ b/tests/test_parsers_local.py
@@ -39,13 +39,10 @@ def test_parse_other_uses_markitdown_and_extracts_base64(tmp_path):
     src = tmp_path / "deck.pptx"
     src.write_bytes(b"PK fake")
     images_dir = tmp_path / "img" / "deck"
-    fake_mid = patch("openkb.parsers.local.MarkItDown").start()
-    fake_mid.return_value.convert.return_value.text_content = "MARKITDOWN MD"
-    try:
-        with patch("openkb.parsers.local.extract_base64_images", return_value="CLEANED") as ex:
-            p = LocalParser(doc_name="deck", images_dir=images_dir, source_dir=tmp_path)
-            result = p.parse(src)
-        ex.assert_called_once_with("MARKITDOWN MD", "deck", images_dir)
-        assert result.markdown == "CLEANED"
-    finally:
-        patch.stopall()
+    with patch("openkb.parsers.local.MarkItDown") as fake_mid, \
+         patch("openkb.parsers.local.extract_base64_images", return_value="CLEANED") as ex:
+        fake_mid.return_value.convert.return_value.text_content = "MARKITDOWN MD"
+        p = LocalParser(doc_name="deck", images_dir=images_dir, source_dir=tmp_path)
+        result = p.parse(src)
+    ex.assert_called_once_with("MARKITDOWN MD", "deck", images_dir)
+    assert result.markdown == "CLEANED"
diff --git a/tests/test_parsers_mistral.py b/tests/test_parsers_mistral.py
index e95c1858..8283102d 100644
--- a/tests/test_parsers_mistral.py
+++ b/tests/test_parsers_mistral.py
@@ -86,3 +86,37 @@ def test_undecodable_image_logged_and_skipped(monkeypatch, tmp_path, caplog):
         result = MistralParser({}).parse(src)
     assert "bad.png" not in result.images
     assert any("bad.png" in r.message for r in caplog.records)
+
+
+def test_uploaded_file_is_deleted(monkeypatch, tmp_path):
+    import sys, types
+    from unittest.mock import MagicMock
+    monkeypatch.setenv("MISTRAL_API_KEY", "k")
+    client = MagicMock()
+    client.files.upload.return_value = MagicMock(id="file-1")
+    client.files.get_signed_url.return_value = MagicMock(url="https://signed")
+    client.ocr.process.return_value = MagicMock(pages=[])
+    mod = types.ModuleType("mistralai"); mod.Mistral = MagicMock(return_value=client)
+    monkeypatch.setitem(sys.modules, "mistralai", mod)
+    from openkb.parsers.mistral import MistralParser
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    MistralParser({}).parse(src)
+    client.files.delete.assert_called_once_with(file_id="file-1")
+
+
+def test_uploaded_file_deleted_even_on_ocr_error(monkeypatch, tmp_path):
+    import sys, types
+    from unittest.mock import MagicMock
+    import pytest
+    monkeypatch.setenv("MISTRAL_API_KEY", "k")
+    client = MagicMock()
+    client.files.upload.return_value = MagicMock(id="file-2")
+    client.files.get_signed_url.return_value = MagicMock(url="https://signed")
+    client.ocr.process.side_effect = RuntimeError("ocr boom")
+    mod = types.ModuleType("mistralai"); mod.Mistral = MagicMock(return_value=client)
+    monkeypatch.setitem(sys.modules, "mistralai", mod)
+    from openkb.parsers.mistral import MistralParser
+    src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
+    with pytest.raises(RuntimeError):
+        MistralParser({}).parse(src)
+    client.files.delete.assert_called_once_with(file_id="file-2")