From 7a5f78151db9bb172f3320ed4fc1cd58f6415fc6 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 10:35:28 +0800 Subject: [PATCH 01/26] feat(parsers): add ParseResult and Parser ABC (#77) --- openkb/parsers/__init__.py | 4 ++++ openkb/parsers/base.py | 33 +++++++++++++++++++++++++++++++++ tests/test_parsers_base.py | 24 ++++++++++++++++++++++++ 3 files changed, 61 insertions(+) create mode 100644 openkb/parsers/__init__.py create mode 100644 openkb/parsers/base.py create mode 100644 tests/test_parsers_base.py diff --git a/openkb/parsers/__init__.py b/openkb/parsers/__init__.py new file mode 100644 index 00000000..0656d733 --- /dev/null +++ b/openkb/parsers/__init__.py @@ -0,0 +1,4 @@ +"""Pluggable document parsers for the file → Markdown step.""" +from openkb.parsers.base import ParseResult, Parser + +__all__ = ["ParseResult", "Parser"] diff --git a/openkb/parsers/base.py b/openkb/parsers/base.py new file mode 100644 index 00000000..deb07d60 --- /dev/null +++ b/openkb/parsers/base.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from pathlib import Path + + +@dataclass +class ParseResult: + """Normalized output of a parser. + + ``markdown`` references images either as bare filenames present in + ``images`` or as inline base64 data URIs. ``images`` maps a filename to + its raw bytes; the caller persists them and rewrites links via + :func:`openkb.images.localize_images`. + """ + + markdown: str + images: dict[str, bytes] = field(default_factory=dict) + + +class Parser(ABC): + """Converts a source document to Markdown.""" + + name: str + + @abstractmethod + def supports(self, suffix: str) -> bool: + """Return True if this parser handles files with ``suffix`` (e.g. ``.pdf``).""" + + @abstractmethod + def parse(self, src: Path) -> ParseResult: + """Parse ``src`` and return a :class:`ParseResult`.""" diff --git a/tests/test_parsers_base.py b/tests/test_parsers_base.py new file mode 100644 index 00000000..1c119a32 --- /dev/null +++ b/tests/test_parsers_base.py @@ -0,0 +1,24 @@ +"""Tests for the parser abstraction base types.""" +from __future__ import annotations + +import pytest + +from openkb.parsers.base import ParseResult, Parser + + +def test_parse_result_defaults_to_empty_images(): + pr = ParseResult(markdown="# Hi") + assert pr.markdown == "# Hi" + assert pr.images == {} + + +def test_parser_is_abstract(): + with pytest.raises(TypeError): + Parser() # cannot instantiate abstract base + + +def test_concrete_parser_must_implement_parse_and_supports(): + class Incomplete(Parser): + name = "incomplete" + with pytest.raises(TypeError): + Incomplete() From 592d11dab320e5b6ccf5b5c292e3e9c6ce5faae7 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 10:38:17 +0800 Subject: [PATCH 02/26] feat(images): add localize_images helper for parser output (#77) --- openkb/images.py | 29 +++++++++++++++++++++++++++++ tests/test_images.py | 32 +++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/openkb/images.py b/openkb/images.py index 76284148..84ed6160 100644 --- a/openkb/images.py +++ b/openkb/images.py @@ -211,6 +211,35 @@ def extract_base64_images(markdown: str, doc_name: str, images_dir: Path) -> str return result +def localize_images( + markdown: str, + images: dict[str, bytes], + doc_name: str, + images_dir: Path, +) -> str: + """Persist parser-supplied images and normalize all image links. + + 1. Write every ``images`` entry (filename -> bytes) into ``images_dir``. + 2. Rewrite bare-filename references ``![alt](filename)`` (filename present + in ``images``) to the canonical ``sources/images/{doc_name}/{filename}``. + 3. Run :func:`extract_base64_images` to localize any inline base64 images. + + Returns the normalized markdown. + """ + images_dir.mkdir(parents=True, exist_ok=True) + result = markdown + for filename, data in images.items(): + (images_dir / filename).write_bytes(data) + # Rewrite a bare ![alt](filename) reference to the canonical KB path. + result = re.sub( + r"(!\[[^\]]*\]\()" + re.escape(filename) + r"(\))", + r"\g<1>" + f"sources/images/{doc_name}/{filename}" + r"\g<2>", + result, + ) + result = extract_base64_images(result, doc_name, images_dir) + return result + + def copy_relative_images( markdown: str, source_dir: Path, doc_name: str, images_dir: Path ) -> str: diff --git a/tests/test_images.py b/tests/test_images.py index 9abb3ec2..97d98fee 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -4,7 +4,7 @@ import base64 -from openkb.images import copy_relative_images, extract_base64_images +from openkb.images import copy_relative_images, extract_base64_images, localize_images # --------------------------------------------------------------------------- @@ -164,3 +164,33 @@ def test_multiple_relative_images_all_copied(self, tmp_path): assert "![b](sources/images/doc/b.jpg)" in result assert (images_dir / "a.png").exists() assert (images_dir / "b.jpg").exists() + + +# --------------------------------------------------------------------------- +# localize_images +# --------------------------------------------------------------------------- + + +def test_localize_images_writes_bytes_and_rewrites_bare_refs(tmp_path): + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + md = "Before\n\n![fig](p1_img1.png)\n\nAfter" + out = localize_images(md, {"p1_img1.png": b"PNGDATA"}, "doc", images_dir) + assert "![fig](sources/images/doc/p1_img1.png)" in out + assert (images_dir / "p1_img1.png").read_bytes() == b"PNGDATA" + + +def test_localize_images_handles_inline_base64(tmp_path): + import base64 + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + payload = base64.b64encode(b"JPEGDATA").decode() + md = f"![x](data:image/jpeg;base64,{payload})" + out = localize_images(md, {}, "doc", images_dir) + assert "sources/images/doc/img_001.jpeg" in out + assert (images_dir / "img_001.jpeg").read_bytes() == b"JPEGDATA" + + +def test_localize_images_leaves_unreferenced_bytes_on_disk(tmp_path): + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + out = localize_images("no images here", {"orphan.png": b"X"}, "doc", images_dir) + assert out == "no images here" + assert (images_dir / "orphan.png").read_bytes() == b"X" From cec36183a0cc6b1fde840d28227244341a1c5a60 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 10:41:31 +0800 Subject: [PATCH 03/26] fix(images): use replacement function in localize_images to handle arbitrary filenames (#77) --- openkb/images.py | 11 ++++++----- tests/test_images.py | 9 +++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/openkb/images.py b/openkb/images.py index 84ed6160..9c6a424c 100644 --- a/openkb/images.py +++ b/openkb/images.py @@ -231,11 +231,12 @@ def localize_images( for filename, data in images.items(): (images_dir / filename).write_bytes(data) # Rewrite a bare ![alt](filename) reference to the canonical KB path. - result = re.sub( - r"(!\[[^\]]*\]\()" + re.escape(filename) + r"(\))", - r"\g<1>" + f"sources/images/{doc_name}/{filename}" + r"\g<2>", - result, - ) + # Use a replacement *function* (not a replacement string) so a filename + # containing regex-escape sequences (e.g. "\g<1>") can't corrupt the + # substitution — localize_images handles arbitrary parser-supplied names. + canonical = f"sources/images/{doc_name}/{filename}" + pattern = re.compile(r"(!\[[^\]]*\]\()" + re.escape(filename) + r"(\))") + result = pattern.sub(lambda m, c=canonical: m.group(1) + c + m.group(2), result) result = extract_base64_images(result, doc_name, images_dir) return result diff --git a/tests/test_images.py b/tests/test_images.py index 97d98fee..53906443 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -194,3 +194,12 @@ def test_localize_images_leaves_unreferenced_bytes_on_disk(tmp_path): out = localize_images("no images here", {"orphan.png": b"X"}, "doc", images_dir) assert out == "no images here" assert (images_dir / "orphan.png").read_bytes() == b"X" + + +def test_localize_images_filename_with_regex_metachars(tmp_path): + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + weird = r"img\g<9>.png" # backslash-escape-like name must not crash re.sub + md = f"![f]({weird})" + out = localize_images(md, {weird: b"DATA"}, "doc", images_dir) + assert f"sources/images/doc/{weird}" in out + assert (images_dir / weird).read_bytes() == b"DATA" From 455c74671bc71e2aed40967f935c5456475c43db Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 10:43:22 +0800 Subject: [PATCH 04/26] feat(parsers): add LocalParser wrapping legacy extraction (#77) --- openkb/parsers/local.py | 47 ++++++++++++++++++++++++++++++++++ tests/test_parsers_local.py | 51 +++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 openkb/parsers/local.py create mode 100644 tests/test_parsers_local.py diff --git a/openkb/parsers/local.py b/openkb/parsers/local.py new file mode 100644 index 00000000..d714d0ce --- /dev/null +++ b/openkb/parsers/local.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from pathlib import Path + +from markitdown import MarkItDown + +from openkb.images import ( + convert_pdf_with_images, + copy_relative_images, + extract_base64_images, +) +from openkb.parsers.base import ParseResult, Parser + +_LOCAL_EXTENSIONS = { + ".pdf", ".md", ".markdown", ".docx", ".pptx", ".xlsx", ".xls", + ".html", ".htm", ".txt", ".csv", +} + + +class LocalParser(Parser): + """Default parser: pymupdf for PDF, markitdown for office/html, direct read for md.""" + + name = "local" + + def __init__(self, doc_name: str = "", images_dir: Path | None = None, + source_dir: Path | None = None): + self.doc_name = doc_name + self.images_dir = images_dir + self.source_dir = source_dir + + def supports(self, suffix: str) -> bool: + return suffix.lower() in _LOCAL_EXTENSIONS + + def parse(self, src: Path) -> ParseResult: + suffix = src.suffix.lower() + if suffix in {".md", ".markdown"}: + markdown = src.read_text(encoding="utf-8") + markdown = copy_relative_images( + markdown, src.parent, self.doc_name, self.images_dir + ) + elif suffix == ".pdf": + markdown = convert_pdf_with_images(src, self.doc_name, self.images_dir) + else: + mid = MarkItDown() + markdown = mid.convert(str(src)).text_content + markdown = extract_base64_images(markdown, self.doc_name, self.images_dir) + return ParseResult(markdown=markdown) diff --git a/tests/test_parsers_local.py b/tests/test_parsers_local.py new file mode 100644 index 00000000..e682c42e --- /dev/null +++ b/tests/test_parsers_local.py @@ -0,0 +1,51 @@ +"""Tests for LocalParser — preserves legacy md/pdf/markitdown behavior.""" +from __future__ import annotations + +from pathlib import Path +from unittest.mock import patch + +from openkb.parsers.local import LocalParser +from openkb.parsers.base import ParseResult + + +def test_supports_all_known_extensions(): + p = LocalParser() + for ext in [".pdf", ".md", ".markdown", ".docx", ".pptx", ".xlsx", ".html", ".txt", ".csv"]: + assert p.supports(ext) is True + + +def test_parse_md_reads_text(tmp_path): + src = tmp_path / "n.md" + src.write_text("# Title\n\nbody", encoding="utf-8") + images_dir = tmp_path / "img" / "n" + p = LocalParser(doc_name="n", images_dir=images_dir, source_dir=tmp_path) + result = p.parse(src) + assert isinstance(result, ParseResult) + assert result.markdown.startswith("# Title") + + +def test_parse_pdf_delegates_to_convert_pdf_with_images(tmp_path): + src = tmp_path / "doc.pdf" + src.write_bytes(b"%PDF-1.4 fake") + images_dir = tmp_path / "img" / "doc" + with patch("openkb.parsers.local.convert_pdf_with_images", return_value="PDF MD") as m: + p = LocalParser(doc_name="doc", images_dir=images_dir, source_dir=tmp_path) + result = p.parse(src) + m.assert_called_once_with(src, "doc", images_dir) + assert result.markdown == "PDF MD" + + +def test_parse_other_uses_markitdown_and_extracts_base64(tmp_path): + src = tmp_path / "deck.pptx" + src.write_bytes(b"PK fake") + images_dir = tmp_path / "img" / "deck" + fake_mid = patch("openkb.parsers.local.MarkItDown").start() + fake_mid.return_value.convert.return_value.text_content = "MARKITDOWN MD" + try: + with patch("openkb.parsers.local.extract_base64_images", return_value="CLEANED") as ex: + p = LocalParser(doc_name="deck", images_dir=images_dir, source_dir=tmp_path) + result = p.parse(src) + ex.assert_called_once_with("MARKITDOWN MD", "deck", images_dir) + assert result.markdown == "CLEANED" + finally: + patch.stopall() From 0978cbf58f6fbe91f906e3b044d993560441087a Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 10:47:01 +0800 Subject: [PATCH 05/26] feat(parsers): add registry + get_parser factory (#77) --- openkb/parsers/__init__.py | 3 ++- openkb/parsers/registry.py | 38 ++++++++++++++++++++++++++++++++++ tests/test_parsers_registry.py | 33 +++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 openkb/parsers/registry.py create mode 100644 tests/test_parsers_registry.py diff --git a/openkb/parsers/__init__.py b/openkb/parsers/__init__.py index 0656d733..aeeeb100 100644 --- a/openkb/parsers/__init__.py +++ b/openkb/parsers/__init__.py @@ -1,4 +1,5 @@ """Pluggable document parsers for the file → Markdown step.""" from openkb.parsers.base import ParseResult, Parser +from openkb.parsers.registry import get_parser -__all__ = ["ParseResult", "Parser"] +__all__ = ["ParseResult", "Parser", "get_parser"] diff --git a/openkb/parsers/registry.py b/openkb/parsers/registry.py new file mode 100644 index 00000000..c87ab111 --- /dev/null +++ b/openkb/parsers/registry.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from openkb.parsers.base import Parser +from openkb.parsers.local import LocalParser + +_VALID = ("local", "mineru", "mistral", "vlm") + + +def get_parser( + config: dict[str, Any], + override: str | None = None, + *, + doc_name: str = "", + images_dir: Path | None = None, + source_dir: Path | None = None, +) -> Parser: + """Resolve the configured parser. ``override`` (e.g. CLI ``--parser``) wins.""" + name = (override or config.get("parser") or "local").lower() + if name == "local": + return LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=source_dir) + + parsers_cfg = config.get("parsers", {}) or {} + opts = parsers_cfg.get(name, {}) or {} + if name == "mistral": + from openkb.parsers.mistral import MistralParser + return MistralParser(opts) + if name == "vlm": + from openkb.parsers.vlm import VLMParser + return VLMParser(opts, model=config.get("model")) + if name == "mineru": + from openkb.parsers.mineru import MineruParser + return MineruParser(opts) + raise ValueError( + f"Unknown parser {name!r}. Valid options: {', '.join(_VALID)}." + ) diff --git a/tests/test_parsers_registry.py b/tests/test_parsers_registry.py new file mode 100644 index 00000000..0a500ce9 --- /dev/null +++ b/tests/test_parsers_registry.py @@ -0,0 +1,33 @@ +"""Tests for parser selection / registry.""" +from __future__ import annotations + +import pytest + +from openkb.parsers.registry import get_parser +from openkb.parsers.local import LocalParser + + +def _kwargs(): + return {"doc_name": "d", "images_dir": None, "source_dir": None} + + +def test_default_is_local(): + p = get_parser({}, **_kwargs()) + assert isinstance(p, LocalParser) + + +def test_explicit_local(): + p = get_parser({"parser": "local"}, **_kwargs()) + assert isinstance(p, LocalParser) + + +def test_override_wins_over_config(): + p = get_parser({"parser": "mistral"}, override="local", **_kwargs()) + assert isinstance(p, LocalParser) + + +def test_unknown_name_raises_with_valid_options(): + with pytest.raises(ValueError) as exc: + get_parser({"parser": "nope"}, **_kwargs()) + assert "nope" in str(exc.value) + assert "local" in str(exc.value) From ed3368d0e386fe87dd10859fd152e96d29b5277f Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 10:52:01 +0800 Subject: [PATCH 06/26] =?UTF-8?q?refactor(converter):=20route=20file?= =?UTF-8?q?=E2=86=92markdown=20through=20parser=20abstraction=20(#77)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- openkb/converter.py | 31 ++++++++++++++++--------------- tests/test_converter.py | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/openkb/converter.py b/openkb/converter.py index 352c22b3..9d684c8d 100644 --- a/openkb/converter.py +++ b/openkb/converter.py @@ -7,10 +7,11 @@ from pathlib import Path import pymupdf -from markitdown import MarkItDown from openkb.config import load_config -from openkb.images import copy_relative_images, extract_base64_images, convert_pdf_with_images +from openkb.images import localize_images +from openkb.parsers import get_parser +from openkb.parsers.local import LocalParser from openkb.state import HashRegistry logger = logging.getLogger(__name__) @@ -33,7 +34,7 @@ def get_pdf_page_count(path: Path) -> int: return doc.page_count -def convert_document(src: Path, kb_dir: Path) -> ConvertResult: +def convert_document(src: Path, kb_dir: Path, parser_override: str | None = None) -> ConvertResult: """Convert a document and integrate it into the knowledge base. Steps: @@ -93,18 +94,18 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult: doc_name = src.stem - if src.suffix.lower() == ".md": - markdown = src.read_text(encoding="utf-8") - markdown = copy_relative_images(markdown, src.parent, doc_name, images_dir) - elif src.suffix.lower() == ".pdf": - # Use pymupdf dict-mode for PDFs: text + images inline at correct positions - markdown = convert_pdf_with_images(src, doc_name, images_dir) - else: - # Non-PDF, non-MD: use markitdown (docx, pptx, html, etc.) - mid = MarkItDown() - result = mid.convert(str(src)) - markdown = result.text_content - markdown = extract_base64_images(markdown, doc_name, images_dir) + parser = get_parser( + config, + override=parser_override, + doc_name=doc_name, + images_dir=images_dir, + source_dir=src.parent, + ) + if not parser.supports(src.suffix): + parser = LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=src.parent) + + parse_result = parser.parse(src) + markdown = localize_images(parse_result.markdown, parse_result.images, doc_name, images_dir) dest_md = sources_dir / f"{doc_name}.md" dest_md.write_text(markdown, encoding="utf-8") diff --git a/tests/test_converter.py b/tests/test_converter.py index d7475b09..391dbc18 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -85,7 +85,7 @@ def test_short_pdf_converted_via_pymupdf(self, kb_dir, tmp_path): with ( patch("openkb.converter.pymupdf.open") as mock_mu, - patch("openkb.converter.convert_pdf_with_images", return_value="# Short PDF\n\nConverted.") as mock_cpwi, + patch("openkb.parsers.local.convert_pdf_with_images", return_value="# Short PDF\n\nConverted.") as mock_cpwi, ): fake_doc = MagicMock() fake_doc.page_count = 5 # below default threshold of 20 @@ -128,3 +128,37 @@ def test_long_pdf_returns_is_long_doc(self, kb_dir, tmp_path): assert result.source_path is None assert result.skipped is False assert result.raw_path is not None + + +from openkb.parsers.base import ParseResult + + +class TestConvertDocumentParserSelection: + def test_uses_get_parser_and_localizes(self, kb_dir): + src = kb_dir / "raw" / "paper.pdf" + src.write_bytes(b"%PDF-1.4 fake") + + fake = MagicMock() + fake.supports.return_value = True + fake.parse.return_value = ParseResult(markdown="HELLO", images={"a.png": b"X"}) + + with patch("openkb.converter.get_pdf_page_count", return_value=1), \ + patch("openkb.converter.get_parser", return_value=fake) as gp, \ + patch("openkb.converter.localize_images", return_value="HELLO-LOCALIZED") as li: + result = convert_document(src, kb_dir) + + gp.assert_called_once() + li.assert_called_once() + assert result.source_path.read_text(encoding="utf-8") == "HELLO-LOCALIZED" + + def test_falls_back_to_local_for_unsupported_suffix(self, kb_dir): + src = kb_dir / "raw" / "notes.md" + src.write_text("# md", encoding="utf-8") + + online = MagicMock() + online.supports.return_value = False # online parser can't do .md + with patch("openkb.converter.get_parser", return_value=online), \ + patch("openkb.converter.LocalParser") as LP: + LP.return_value.parse.return_value = ParseResult(markdown="# md") + convert_document(src, kb_dir) + LP.assert_called_once() # fell back to LocalParser From 27d314c9ad327d338bb6bbc13e1460b67b1859d7 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 10:55:42 +0800 Subject: [PATCH 07/26] docs(converter): refresh convert_document docstring for parser flow; tighten tests (#77) --- openkb/converter.py | 9 +++++---- tests/test_converter.py | 2 ++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/openkb/converter.py b/openkb/converter.py index 9d684c8d..d6c37fc7 100644 --- a/openkb/converter.py +++ b/openkb/converter.py @@ -41,9 +41,10 @@ def convert_document(src: Path, kb_dir: Path, parser_override: str | None = None 1. Hash-check — skip if already known. 2. Copy source to ``raw/``. 3. If PDF and page count >= threshold → return :attr:`ConvertResult.is_long_doc`. - 4. If ``.md`` — read, process relative images, save to ``wiki/sources/``. - 5. Otherwise — run MarkItDown, extract base64 images, save to ``wiki/sources/``. - 6. Register hash in the registry. + 4. Select a parser via :func:`get_parser` (falling back to + :class:`LocalParser` for unsupported suffixes like ``.md``), parse the + file to Markdown, localize images, and save to ``wiki/sources/``. + 5. Register hash in the registry. """ # ------------------------------------------------------------------ # Load config & state @@ -85,7 +86,7 @@ def convert_document(src: Path, kb_dir: Path, parser_override: str | None = None return ConvertResult(raw_path=raw_dest, is_long_doc=True, file_hash=file_hash) # ------------------------------------------------------------------ - # 4/5. Convert to Markdown + # 4. Select parser, convert to Markdown, localize images # ------------------------------------------------------------------ sources_dir = kb_dir / "wiki" / "sources" sources_dir.mkdir(parents=True, exist_ok=True) diff --git a/tests/test_converter.py b/tests/test_converter.py index 391dbc18..90b26bf4 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -148,6 +148,8 @@ def test_uses_get_parser_and_localizes(self, kb_dir): result = convert_document(src, kb_dir) gp.assert_called_once() + assert gp.call_args.kwargs["doc_name"] == "paper" + assert gp.call_args.kwargs["images_dir"] is not None li.assert_called_once() assert result.source_path.read_text(encoding="utf-8") == "HELLO-LOCALIZED" From 36e8b4f45a1e5fd5a7683bbc8d7db9724293f272 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 10:57:29 +0800 Subject: [PATCH 08/26] feat(parsers): add reusable litellm vision client (#77) --- openkb/parsers/vlm_client.py | 29 ++++++++++++++++++++++++++ tests/test_parsers_vlm_client.py | 35 ++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 openkb/parsers/vlm_client.py create mode 100644 tests/test_parsers_vlm_client.py diff --git a/openkb/parsers/vlm_client.py b/openkb/parsers/vlm_client.py new file mode 100644 index 00000000..5979fd9e --- /dev/null +++ b/openkb/parsers/vlm_client.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +import base64 +import mimetypes +from pathlib import Path + +import litellm + +_DEFAULT_MODEL = "gemini/gemini-2.5-pro" + +_PROMPT = ( + "Transcribe this document to clean GitHub-flavored Markdown. Preserve headings, " + "lists, tables (as Markdown or HTML tables), and math (as LaTeX). Output only the " + "Markdown content, no commentary." +) + + +def transcribe_to_markdown(src: Path, model: str | None = None, prompt: str | None = None) -> str: + """Send ``src`` (PDF or image) to a vision-capable LLM via litellm; return Markdown.""" + model = model or _DEFAULT_MODEL + mime = mimetypes.guess_type(src.name)[0] or "application/octet-stream" + b64 = base64.b64encode(src.read_bytes()).decode() + data_uri = f"data:{mime};base64,{b64}" + content = [ + {"type": "text", "text": prompt or _PROMPT}, + {"type": "image_url", "image_url": {"url": data_uri}}, + ] + resp = litellm.completion(model=model, messages=[{"role": "user", "content": content}]) + return resp.choices[0].message.content or "" diff --git a/tests/test_parsers_vlm_client.py b/tests/test_parsers_vlm_client.py new file mode 100644 index 00000000..09e38256 --- /dev/null +++ b/tests/test_parsers_vlm_client.py @@ -0,0 +1,35 @@ +"""Tests for the reusable litellm vision client.""" +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +from openkb.parsers.vlm_client import transcribe_to_markdown + + +def _fake_response(text): + resp = MagicMock() + resp.choices = [MagicMock(message=MagicMock(content=text))] + return resp + + +def test_transcribe_pdf_sends_data_uri_and_returns_content(tmp_path): + src = tmp_path / "doc.pdf" + src.write_bytes(b"%PDF-1.4 data") + with patch("openkb.parsers.vlm_client.litellm.completion", + return_value=_fake_response("# Parsed")) as comp: + out = transcribe_to_markdown(src, model="gemini/gemini-2.5-pro") + assert out == "# Parsed" + _, kwargs = comp.call_args + assert kwargs["model"] == "gemini/gemini-2.5-pro" + content = kwargs["messages"][0]["content"] + assert any("base64" in str(part) for part in content) + + +def test_default_model_used_when_none(tmp_path): + src = tmp_path / "img.png" + src.write_bytes(b"PNG") + with patch("openkb.parsers.vlm_client.litellm.completion", + return_value=_fake_response("desc")) as comp: + transcribe_to_markdown(src, model=None) + _, kwargs = comp.call_args + assert kwargs["model"] # some non-empty default From 8b1d4eb1e69831f510404b48550c2fd9ca0bc513 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 11:00:10 +0800 Subject: [PATCH 09/26] fix(parsers): use litellm file content part for PDFs in vlm_client (#77) --- openkb/parsers/vlm_client.py | 7 ++++++- tests/test_parsers_vlm_client.py | 26 ++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/openkb/parsers/vlm_client.py b/openkb/parsers/vlm_client.py index 5979fd9e..1f2774f8 100644 --- a/openkb/parsers/vlm_client.py +++ b/openkb/parsers/vlm_client.py @@ -21,9 +21,14 @@ def transcribe_to_markdown(src: Path, model: str | None = None, prompt: str | No mime = mimetypes.guess_type(src.name)[0] or "application/octet-stream" b64 = base64.b64encode(src.read_bytes()).decode() data_uri = f"data:{mime};base64,{b64}" + if mime == "application/pdf": + # litellm's document/file content part (image_url is only for raster images). + media_part = {"type": "file", "file": {"file_data": data_uri}} + else: + media_part = {"type": "image_url", "image_url": {"url": data_uri}} content = [ {"type": "text", "text": prompt or _PROMPT}, - {"type": "image_url", "image_url": {"url": data_uri}}, + media_part, ] resp = litellm.completion(model=model, messages=[{"role": "user", "content": content}]) return resp.choices[0].message.content or "" diff --git a/tests/test_parsers_vlm_client.py b/tests/test_parsers_vlm_client.py index 09e38256..3703c179 100644 --- a/tests/test_parsers_vlm_client.py +++ b/tests/test_parsers_vlm_client.py @@ -33,3 +33,29 @@ def test_default_model_used_when_none(tmp_path): transcribe_to_markdown(src, model=None) _, kwargs = comp.call_args assert kwargs["model"] # some non-empty default + + +def test_pdf_uses_file_content_part(tmp_path): + src = tmp_path / "doc.pdf" + src.write_bytes(b"%PDF-1.4 data") + with patch("openkb.parsers.vlm_client.litellm.completion", + return_value=_fake_response("x")) as comp: + transcribe_to_markdown(src, model="some/model") + content = comp.call_args.kwargs["messages"][0]["content"] + file_parts = [p for p in content if p.get("type") == "file"] + assert len(file_parts) == 1 + assert file_parts[0]["file"]["file_data"].startswith("data:application/pdf;base64,") + assert not any(p.get("type") == "image_url" for p in content) + + +def test_image_uses_image_url_content_part(tmp_path): + src = tmp_path / "fig.png" + src.write_bytes(b"\x89PNG\r\n") + with patch("openkb.parsers.vlm_client.litellm.completion", + return_value=_fake_response("x")) as comp: + transcribe_to_markdown(src, model="some/model") + content = comp.call_args.kwargs["messages"][0]["content"] + image_parts = [p for p in content if p.get("type") == "image_url"] + assert len(image_parts) == 1 + assert image_parts[0]["image_url"]["url"].startswith("data:image/png;base64,") + assert not any(p.get("type") == "file" for p in content) From 2a93eec753d97d4ec81bfe83f33a48080fac3e6d Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 11:01:23 +0800 Subject: [PATCH 10/26] feat(parsers): add VLMParser (vision LLM via litellm) (#77) --- openkb/parsers/vlm.py | 27 +++++++++++++++++++++++++++ tests/test_parsers_vlm.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 openkb/parsers/vlm.py create mode 100644 tests/test_parsers_vlm.py diff --git a/openkb/parsers/vlm.py b/openkb/parsers/vlm.py new file mode 100644 index 00000000..834125fc --- /dev/null +++ b/openkb/parsers/vlm.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from openkb.parsers.base import ParseResult, Parser +from openkb.parsers.vlm_client import transcribe_to_markdown + +_SUPPORTED = {".pdf"} + + +class VLMParser(Parser): + """Parse via a vision-capable LLM (litellm). Covers Gemini, GPT-4o, Claude, etc.""" + + name = "vlm" + + def __init__(self, opts: dict[str, Any] | None = None, model: str | None = None): + opts = opts or {} + # parsers.vlm.model overrides the global model; else use the global model. + self.model = opts.get("model") or model + + def supports(self, suffix: str) -> bool: + return suffix.lower() in _SUPPORTED + + def parse(self, src: Path) -> ParseResult: + markdown = transcribe_to_markdown(src, model=self.model) + return ParseResult(markdown=markdown) diff --git a/tests/test_parsers_vlm.py b/tests/test_parsers_vlm.py new file mode 100644 index 00000000..e1d78683 --- /dev/null +++ b/tests/test_parsers_vlm.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +from unittest.mock import patch + +from openkb.parsers.vlm import VLMParser +from openkb.parsers.base import ParseResult + + +def test_supports_pdf_only_for_v1(): + p = VLMParser({}, model="gemini/gemini-2.5-pro") + assert p.supports(".pdf") is True + assert p.supports(".md") is False + assert p.supports(".docx") is False + + +def test_parse_calls_transcribe_with_configured_model(tmp_path): + src = tmp_path / "doc.pdf" + src.write_bytes(b"%PDF") + p = VLMParser({"model": "gpt-4o"}, model="fallback-model") + with patch("openkb.parsers.vlm.transcribe_to_markdown", return_value="# MD") as t: + result = p.parse(src) + t.assert_called_once_with(src, model="gpt-4o") + assert isinstance(result, ParseResult) + assert result.markdown == "# MD" + + +def test_parse_falls_back_to_global_model(tmp_path): + src = tmp_path / "doc.pdf" + src.write_bytes(b"%PDF") + p = VLMParser({}, model="global-model") + with patch("openkb.parsers.vlm.transcribe_to_markdown", return_value="x") as t: + p.parse(src) + t.assert_called_once_with(src, model="global-model") From 2c7e6932bdf3291a5cc1512f5ad9576db9e7cda1 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 11:04:10 +0800 Subject: [PATCH 11/26] feat(parsers): add MistralParser via mistralai SDK (#77) --- openkb/parsers/mistral.py | 63 ++++++++++++++++++++++++++++++++ tests/test_parsers_mistral.py | 69 +++++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+) create mode 100644 openkb/parsers/mistral.py create mode 100644 tests/test_parsers_mistral.py diff --git a/openkb/parsers/mistral.py b/openkb/parsers/mistral.py new file mode 100644 index 00000000..2f0e5622 --- /dev/null +++ b/openkb/parsers/mistral.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +import base64 +import os +import re +from pathlib import Path +from typing import Any + +from openkb.parsers.base import ParseResult, Parser + +_SUPPORTED = {".pdf"} +_DATA_URI_RE = re.compile(r"^data:[^;]+;base64,", re.IGNORECASE) + + +class MistralParser(Parser): + """Mistral OCR (Document AI). Synchronous; markdown + base64 images.""" + + name = "mistral" + + def __init__(self, opts: dict[str, Any] | None = None): + self.opts = opts or {} + self.model = self.opts.get("model", "mistral-ocr-latest") + + def supports(self, suffix: str) -> bool: + return suffix.lower() in _SUPPORTED + + def parse(self, src: Path) -> ParseResult: + api_key = os.environ.get("MISTRAL_API_KEY") + if not api_key: + raise RuntimeError( + "Mistral parser requires the MISTRAL_API_KEY environment variable." + ) + try: + from mistralai import Mistral + except ImportError as exc: + raise RuntimeError( + "Mistral parser requires the 'mistralai' package. " + "Install with: pip install openkb[mistral]" + ) from exc + + client = Mistral(api_key=api_key) + uploaded = client.files.upload( + file={"file_name": src.name, "content": src.read_bytes()}, purpose="ocr" + ) + signed = client.files.get_signed_url(file_id=uploaded.id) + resp = client.ocr.process( + model=self.model, + document={"type": "document_url", "document_url": signed.url}, + include_image_base64=True, + ) + + parts: list[str] = [] + images: dict[str, bytes] = {} + for page in resp.pages: + parts.append(page.markdown or "") + for img in getattr(page, "images", None) or []: + raw = img.image_base64 or "" + raw = _DATA_URI_RE.sub("", raw) + try: + images[img.id] = base64.b64decode(raw, validate=True) + except Exception: + continue + return ParseResult(markdown="\n\n".join(parts), images=images) diff --git a/tests/test_parsers_mistral.py b/tests/test_parsers_mistral.py new file mode 100644 index 00000000..d72651d0 --- /dev/null +++ b/tests/test_parsers_mistral.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +import base64 +import sys +import types +from unittest.mock import MagicMock + +import pytest + +from openkb.parsers.base import ParseResult + + +def _install_fake_mistralai(monkeypatch, client_instance): + mod = types.ModuleType("mistralai") + mod.Mistral = MagicMock(return_value=client_instance) + monkeypatch.setitem(sys.modules, "mistralai", mod) + return mod + + +def test_supports_pdf(): + from openkb.parsers.mistral import MistralParser + p = MistralParser({}) + assert p.supports(".pdf") is True + assert p.supports(".docx") is False + + +def test_missing_key_raises_actionable(monkeypatch, tmp_path): + monkeypatch.delenv("MISTRAL_API_KEY", raising=False) + from openkb.parsers.mistral import MistralParser + p = MistralParser({}) + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + with pytest.raises(RuntimeError) as exc: + p.parse(src) + assert "MISTRAL_API_KEY" in str(exc.value) + + +def test_parse_collects_markdown_and_decodes_images(monkeypatch, tmp_path): + monkeypatch.setenv("MISTRAL_API_KEY", "k") + img_bytes = b"IMGDATA" + img_b64 = base64.b64encode(img_bytes).decode() + + client = MagicMock() + client.files.upload.return_value = MagicMock(id="file-1") + client.files.get_signed_url.return_value = MagicMock(url="https://signed") + page = MagicMock() + page.markdown = "Text ![img-0.png](img-0.png)" + page.images = [MagicMock(id="img-0.png", image_base64=f"data:image/png;base64,{img_b64}")] + client.ocr.process.return_value = MagicMock(pages=[page]) + + _install_fake_mistralai(monkeypatch, client) + from openkb.parsers.mistral import MistralParser + p = MistralParser({}) + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + result = p.parse(src) + + assert isinstance(result, ParseResult) + assert "img-0.png" in result.markdown + assert result.images["img-0.png"] == img_bytes + + +def test_missing_package_raises_install_hint(monkeypatch, tmp_path): + monkeypatch.setenv("MISTRAL_API_KEY", "k") + monkeypatch.setitem(sys.modules, "mistralai", None) # force ImportError + from openkb.parsers.mistral import MistralParser + p = MistralParser({}) + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + with pytest.raises(RuntimeError) as exc: + p.parse(src) + assert "openkb[mistral]" in str(exc.value) From 50b83bb14179babac19e9d264c3a82b33936ac6f Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 11:06:48 +0800 Subject: [PATCH 12/26] feat(parsers): log skipped undecodable Mistral images (#77) --- openkb/parsers/mistral.py | 4 ++++ tests/test_parsers_mistral.py | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/openkb/parsers/mistral.py b/openkb/parsers/mistral.py index 2f0e5622..a494f149 100644 --- a/openkb/parsers/mistral.py +++ b/openkb/parsers/mistral.py @@ -1,6 +1,7 @@ from __future__ import annotations import base64 +import logging import os import re from pathlib import Path @@ -8,6 +9,8 @@ from openkb.parsers.base import ParseResult, Parser +logger = logging.getLogger(__name__) + _SUPPORTED = {".pdf"} _DATA_URI_RE = re.compile(r"^data:[^;]+;base64,", re.IGNORECASE) @@ -59,5 +62,6 @@ def parse(self, src: Path) -> ParseResult: try: images[img.id] = base64.b64decode(raw, validate=True) except Exception: + logger.warning("Skipping undecodable Mistral image: %s", getattr(img, "id", "?")) continue return ParseResult(markdown="\n\n".join(parts), images=images) diff --git a/tests/test_parsers_mistral.py b/tests/test_parsers_mistral.py index d72651d0..e95c1858 100644 --- a/tests/test_parsers_mistral.py +++ b/tests/test_parsers_mistral.py @@ -67,3 +67,22 @@ def test_missing_package_raises_install_hint(monkeypatch, tmp_path): with pytest.raises(RuntimeError) as exc: p.parse(src) assert "openkb[mistral]" in str(exc.value) + + +def test_undecodable_image_logged_and_skipped(monkeypatch, tmp_path, caplog): + import logging as _logging + monkeypatch.setenv("MISTRAL_API_KEY", "k") + client = MagicMock() + client.files.upload.return_value = MagicMock(id="file-1") + client.files.get_signed_url.return_value = MagicMock(url="https://signed") + page = MagicMock() + page.markdown = "Text ![bad.png](bad.png)" + page.images = [MagicMock(id="bad.png", image_base64="!!!not-base64!!!")] + client.ocr.process.return_value = MagicMock(pages=[page]) + _install_fake_mistralai(monkeypatch, client) + from openkb.parsers.mistral import MistralParser + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + with caplog.at_level(_logging.WARNING): + result = MistralParser({}).parse(src) + assert "bad.png" not in result.images + assert any("bad.png" in r.message for r in caplog.records) From e452a3aa1ce1dc23e8c79bbde0ff5aaa1b53cbfc Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 11:08:40 +0800 Subject: [PATCH 13/26] feat(parsers): add MineruParser (cloud + self-hosted HTTP) (#77) --- openkb/parsers/mineru.py | 120 +++++++++++++++++++++++++++++++++++ tests/test_parsers_mineru.py | 68 ++++++++++++++++++++ 2 files changed, 188 insertions(+) create mode 100644 openkb/parsers/mineru.py create mode 100644 tests/test_parsers_mineru.py diff --git a/openkb/parsers/mineru.py b/openkb/parsers/mineru.py new file mode 100644 index 00000000..243bca3d --- /dev/null +++ b/openkb/parsers/mineru.py @@ -0,0 +1,120 @@ +from __future__ import annotations + +import io +import os +import time +import zipfile +from pathlib import Path +from typing import Any + +from openkb.parsers.base import ParseResult, Parser + +_SUPPORTED = {".pdf", ".docx", ".pptx", ".xlsx", ".xls", ".html", ".htm"} +_CLOUD_BASE = "https://mineru.net/api/v4" + + +def _httpx(): + try: + import httpx + except ImportError as exc: + raise RuntimeError( + "MinerU parser requires 'httpx'. Install with: pip install openkb[mineru]" + ) from exc + return httpx + + +def _result_from_zip(zip_bytes: bytes) -> ParseResult: + """Extract the markdown file + images from a MinerU result zip.""" + images: dict[str, bytes] = {} + markdown = "" + with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf: + md_names = [n for n in zf.namelist() if n.lower().endswith(".md")] + if md_names: + chosen = next((n for n in md_names if n.endswith("full.md")), md_names[0]) + markdown = zf.read(chosen).decode("utf-8", errors="replace") + for name in zf.namelist(): + if name.lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".webp")): + images[Path(name).name] = zf.read(name) + # Markdown references images as 'images/'; localize_images matches on + # the bare filename, so rewrite 'images/fig.png' -> 'fig.png'. + for fname in images: + markdown = markdown.replace(f"images/{fname}", fname) + return ParseResult(markdown=markdown, images=images) + + +class MineruParser(Parser): + """MinerU via HTTP — self-hosted server or hosted cloud API.""" + + name = "mineru" + + def __init__(self, opts: dict[str, Any] | None = None): + self.opts = opts or {} + self.mode = self.opts.get("mode", "cloud") + self.base_url = self.opts.get("base_url") + self.poll_interval = self.opts.get("poll_interval", 3) + self.timeout = self.opts.get("timeout", 600) + + def supports(self, suffix: str) -> bool: + return suffix.lower() in _SUPPORTED + + def parse(self, src: Path) -> ParseResult: + if self.mode == "self_hosted": + return self._parse_self_hosted(src) + return self._parse_cloud(src) + + def _parse_self_hosted(self, src: Path) -> ParseResult: + if not self.base_url: + raise RuntimeError( + "MinerU self_hosted mode requires 'base_url' in parsers.mineru config." + ) + httpx = _httpx() + url = self.base_url.rstrip("/") + "/file_parse" + with httpx.Client(timeout=self.timeout) as client: + resp = client.post( + url, + files={"file": (src.name, src.read_bytes())}, + data={"return_format": "zip"}, + ) + resp.raise_for_status() + return _result_from_zip(resp.content) + + def _parse_cloud(self, src: Path) -> ParseResult: + api_key = os.environ.get("MINERU_API_KEY") + if not api_key: + raise RuntimeError( + "MinerU cloud mode requires the MINERU_API_KEY environment variable." + ) + httpx = _httpx() + headers = {"Authorization": f"Bearer {api_key}"} + with httpx.Client(timeout=self.timeout) as client: + r = client.post( + f"{_CLOUD_BASE}/file-urls/batch", + headers=headers, + json={"files": [{"name": src.name, "is_ocr": True}]}, + ) + r.raise_for_status() + data = r.json()["data"] + batch_id = data["batch_id"] + upload_url = data["file_urls"][0] + client.put(upload_url, content=src.read_bytes()).raise_for_status() + elapsed = 0 + zip_url = None + while elapsed < self.timeout: + pr = client.get( + f"{_CLOUD_BASE}/extract-results/batch/{batch_id}", headers=headers + ) + pr.raise_for_status() + results = pr.json()["data"]["extract_result"] + state = results[0].get("state") + if state == "done": + zip_url = results[0]["full_zip_url"] + break + if state == "failed": + raise RuntimeError(f"MinerU extraction failed: {results[0]}") + time.sleep(self.poll_interval) + elapsed += self.poll_interval + if zip_url is None: + raise RuntimeError("MinerU extraction timed out.") + zr = client.get(zip_url) + zr.raise_for_status() + return _result_from_zip(zr.content) diff --git a/tests/test_parsers_mineru.py b/tests/test_parsers_mineru.py new file mode 100644 index 00000000..be63e9b4 --- /dev/null +++ b/tests/test_parsers_mineru.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +import io +import sys +import types +import zipfile +from unittest.mock import MagicMock + +import pytest + +from openkb.parsers.base import ParseResult + + +def test_supports_office_and_pdf(): + from openkb.parsers.mineru import MineruParser + p = MineruParser({}) + assert p.supports(".pdf") is True + assert p.supports(".docx") is True + assert p.supports(".md") is False + + +def test_self_hosted_requires_base_url(tmp_path): + from openkb.parsers.mineru import MineruParser + p = MineruParser({"mode": "self_hosted"}) + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + with pytest.raises(RuntimeError) as exc: + p.parse(src) + assert "base_url" in str(exc.value) + + +def test_cloud_requires_api_key(monkeypatch, tmp_path): + monkeypatch.delenv("MINERU_API_KEY", raising=False) + from openkb.parsers.mineru import MineruParser + p = MineruParser({"mode": "cloud"}) + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + with pytest.raises(RuntimeError) as exc: + p.parse(src) + assert "MINERU_API_KEY" in str(exc.value) + + +def test_self_hosted_parses_zip(monkeypatch, tmp_path): + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("full.md", "# Mineru\n\n![p](images/fig.png)") + zf.writestr("images/fig.png", b"PNGBYTES") + zip_bytes = buf.getvalue() + + fake_resp = MagicMock(status_code=200, content=zip_bytes) + fake_resp.raise_for_status = MagicMock() + fake_client = MagicMock() + fake_client.__enter__ = MagicMock(return_value=fake_client) + fake_client.__exit__ = MagicMock(return_value=False) + fake_client.post.return_value = fake_resp + + httpx_mod = types.ModuleType("httpx") + httpx_mod.Client = MagicMock(return_value=fake_client) + monkeypatch.setitem(sys.modules, "httpx", httpx_mod) + + from openkb.parsers.mineru import MineruParser + p = MineruParser({"mode": "self_hosted", "base_url": "http://localhost:8000"}) + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + result = p.parse(src) + assert isinstance(result, ParseResult) + assert "Mineru" in result.markdown + assert result.images["fig.png"] == b"PNGBYTES" + # the images/ prefix should be rewritten to the bare filename for localize_images + assert "images/fig.png" not in result.markdown + assert "![p](fig.png)" in result.markdown From a6074f68341d7d1a78e6e572a9085ded1308ab38 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 11:12:03 +0800 Subject: [PATCH 14/26] test(parsers): cover MinerU cloud poll+download flow (#77) --- tests/test_parsers_mineru.py | 62 ++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/tests/test_parsers_mineru.py b/tests/test_parsers_mineru.py index be63e9b4..5e0c9b2c 100644 --- a/tests/test_parsers_mineru.py +++ b/tests/test_parsers_mineru.py @@ -66,3 +66,65 @@ def test_self_hosted_parses_zip(monkeypatch, tmp_path): # the images/ prefix should be rewritten to the bare filename for localize_images assert "images/fig.png" not in result.markdown assert "![p](fig.png)" in result.markdown + + +def test_cloud_flow_polls_then_downloads(monkeypatch, tmp_path): + monkeypatch.setenv("MINERU_API_KEY", "key") + monkeypatch.setattr("openkb.parsers.mineru.time.sleep", lambda *a, **k: None) + + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("full.md", "# Cloud\n\n![p](images/fig.png)") + zf.writestr("images/fig.png", b"ZBYTES") + zip_bytes = buf.getvalue() + + def _resp(json_data=None, content=None): + r = MagicMock() + r.raise_for_status = MagicMock() + if json_data is not None: + r.json.return_value = json_data + if content is not None: + r.content = content + return r + + client = MagicMock() + client.__enter__ = MagicMock(return_value=client) + client.__exit__ = MagicMock(return_value=False) + client.post.return_value = _resp( + json_data={"data": {"batch_id": "b1", "file_urls": ["https://upload"]}} + ) + client.put.return_value = _resp() + + poll_url = "https://mineru.net/api/v4/extract-results/batch/b1" + poll_running = _resp(json_data={"data": {"extract_result": [{"state": "running"}]}}) + poll_done = _resp( + json_data={"data": {"extract_result": [{"state": "done", "full_zip_url": "https://zip"}]}} + ) + zip_resp = _resp(content=zip_bytes) + + def _get(url, *a, **k): + if url == "https://zip": + return zip_resp + assert url == poll_url + _get.calls += 1 + return poll_running if _get.calls == 1 else poll_done + + _get.calls = 0 + client.get.side_effect = _get + + httpx_mod = types.ModuleType("httpx") + httpx_mod.Client = MagicMock(return_value=client) + monkeypatch.setitem(sys.modules, "httpx", httpx_mod) + + from openkb.parsers.mineru import MineruParser + p = MineruParser({"mode": "cloud", "poll_interval": 0}) + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + result = p.parse(src) + + assert isinstance(result, ParseResult) + assert "Cloud" in result.markdown + assert result.images["fig.png"] == b"ZBYTES" + assert "images/fig.png" not in result.markdown + assert "![p](fig.png)" in result.markdown + # drove the full poll loop: running once, then done + assert _get.calls == 2 From 82958e41c9503091d9e54f7890310770d36ad586 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 11:14:11 +0800 Subject: [PATCH 15/26] feat(cli): add --parser override and default parser config (#77) --- openkb/cli.py | 14 ++++++++------ openkb/config.py | 1 + tests/test_add_command.py | 19 ++++++++++++++++++- tests/test_config.py | 5 +++++ 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/openkb/cli.py b/openkb/cli.py index 1a2761d8..13fa89cb 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -259,7 +259,7 @@ def _clear_existing_skill_dir(kb_dir: Path, name: str) -> None: shutil.rmtree(target) -def add_single_file(file_path: Path, kb_dir: Path) -> Literal["added", "skipped", "failed"]: +def add_single_file(file_path: Path, kb_dir: Path, parser_override: str | None = None) -> Literal["added", "skipped", "failed"]: """Convert, index, and compile a single document into the knowledge base. Steps: @@ -289,7 +289,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> Literal["added", "skipped" # 2. Convert document click.echo(f"Adding: {file_path.name}") try: - result = convert_document(file_path, kb_dir) + result = convert_document(file_path, kb_dir, parser_override=parser_override) except Exception as exc: click.echo(f" [ERROR] Conversion failed: {exc}") logger.debug("Conversion traceback:", exc_info=True) @@ -575,8 +575,10 @@ def init(model, language): @cli.command() @click.argument("path") +@click.option("--parser", "parser_override", default=None, + help="Override the configured parser for this run (local|mineru|mistral|vlm).") @click.pass_context -def add(ctx, path): +def add(ctx, path, parser_override): """Add a document or directory of documents at PATH to the knowledge base. PATH may be a local file, a local directory (which is walked @@ -600,7 +602,7 @@ def add(ctx, path): fetched = fetch_url_to_raw(path, kb_dir) if fetched is None: return - outcome = add_single_file(fetched, kb_dir) + outcome = add_single_file(fetched, kb_dir, parser_override=parser_override) # Only clean up on dedup-skip. On "failed" we keep the file so # the user can retry (e.g. transient LLM error during compile) # without re-downloading — and so they don't lose data when @@ -626,7 +628,7 @@ def add(ctx, path): click.echo(f"Found {total} supported file(s) in {path}.") for i, f in enumerate(files, 1): click.echo(f"\n[{i}/{total}] ", nl=False) - add_single_file(f, kb_dir) + add_single_file(f, kb_dir, parser_override=parser_override) else: if target.suffix.lower() not in SUPPORTED_EXTENSIONS: click.echo( @@ -634,7 +636,7 @@ def add(ctx, path): f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}" ) return - add_single_file(target, kb_dir) + add_single_file(target, kb_dir, parser_override=parser_override) def _stream_to_tty() -> bool: diff --git a/openkb/config.py b/openkb/config.py index b83e1346..dea9d482 100644 --- a/openkb/config.py +++ b/openkb/config.py @@ -9,6 +9,7 @@ "model": "gpt-5.4-mini", "language": "en", "pageindex_threshold": 20, + "parser": "local", } GLOBAL_CONFIG_DIR = Path.home() / ".config" / "openkb" diff --git a/tests/test_add_command.py b/tests/test_add_command.py index 1fb4d87f..e03c5b7f 100644 --- a/tests/test_add_command.py +++ b/tests/test_add_command.py @@ -70,7 +70,7 @@ def test_add_single_file_calls_helper(self, tmp_path): with patch("openkb.cli.add_single_file") as mock_add, \ patch("openkb.cli._find_kb_dir", return_value=kb_dir): runner.invoke(cli, ["add", str(doc)]) - mock_add.assert_called_once_with(doc, kb_dir) + mock_add.assert_called_once_with(doc, kb_dir, parser_override=None) def test_add_directory_calls_helper_for_each_file(self, tmp_path): kb_dir = self._setup_kb(tmp_path) @@ -147,3 +147,20 @@ def test_add_short_doc_runs_compiler(self, tmp_path): result = runner.invoke(cli, ["add", str(doc)]) mock_arun.assert_called_once() assert "OK" in result.output + + +def test_add_single_file_threads_parser_override(tmp_path): + from unittest.mock import patch + from pathlib import Path + from openkb.cli import add_single_file + + fake_result = type("R", (), {"skipped": True, "is_long_doc": False, + "file_hash": None, "raw_path": None, + "source_path": None})() + with patch("openkb.cli.convert_document", return_value=fake_result) as cd, \ + patch("openkb.cli._setup_llm_key"), \ + patch("openkb.cli.load_config", return_value={"model": "m"}): + add_single_file(Path("x.pdf"), tmp_path, parser_override="mistral") + # parser_override must reach convert_document + assert cd.call_args.kwargs.get("parser_override") == "mistral" \ + or (len(cd.call_args.args) >= 3 and cd.call_args.args[2] == "mistral") diff --git a/tests/test_config.py b/tests/test_config.py index 35704a6b..0d9aae36 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -45,3 +45,8 @@ def test_load_overrides_defaults(tmp_path): assert loaded["pageindex_threshold"] == 100 # Non-overridden defaults still present assert loaded["language"] == "en" + + +def test_default_parser_is_local(): + from openkb.config import DEFAULT_CONFIG + assert DEFAULT_CONFIG["parser"] == "local" From 995b90c5a6884a68b4525eaa07149b89602c47f9 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 11:17:36 +0800 Subject: [PATCH 16/26] feat(cli): validate --parser against valid set via click.Choice (#77) --- openkb/cli.py | 4 +++- openkb/parsers/registry.py | 4 ++-- tests/test_add_command.py | 9 +++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/openkb/cli.py b/openkb/cli.py index 13fa89cb..030e1133 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -43,6 +43,7 @@ def filter(self, record: logging.LogRecord) -> bool: from openkb.config import DEFAULT_CONFIG, load_config, save_config, load_global_config, register_kb from openkb.converter import convert_document from openkb.log import append_log +from openkb.parsers.registry import VALID_PARSERS from openkb.schema import AGENTS_MD # Suppress warnings after all imports — markitdown overrides filters at import time @@ -576,7 +577,8 @@ def init(model, language): @cli.command() @click.argument("path") @click.option("--parser", "parser_override", default=None, - help="Override the configured parser for this run (local|mineru|mistral|vlm).") + type=click.Choice(VALID_PARSERS), + help="Override the configured parser for this run.") @click.pass_context def add(ctx, path, parser_override): """Add a document or directory of documents at PATH to the knowledge base. diff --git a/openkb/parsers/registry.py b/openkb/parsers/registry.py index c87ab111..1dba2c01 100644 --- a/openkb/parsers/registry.py +++ b/openkb/parsers/registry.py @@ -6,7 +6,7 @@ from openkb.parsers.base import Parser from openkb.parsers.local import LocalParser -_VALID = ("local", "mineru", "mistral", "vlm") +VALID_PARSERS = ("local", "mineru", "mistral", "vlm") def get_parser( @@ -34,5 +34,5 @@ def get_parser( from openkb.parsers.mineru import MineruParser return MineruParser(opts) raise ValueError( - f"Unknown parser {name!r}. Valid options: {', '.join(_VALID)}." + f"Unknown parser {name!r}. Valid options: {', '.join(VALID_PARSERS)}." ) diff --git a/tests/test_add_command.py b/tests/test_add_command.py index e03c5b7f..4bdf7be1 100644 --- a/tests/test_add_command.py +++ b/tests/test_add_command.py @@ -164,3 +164,12 @@ def test_add_single_file_threads_parser_override(tmp_path): # parser_override must reach convert_document assert cd.call_args.kwargs.get("parser_override") == "mistral" \ or (len(cd.call_args.args) >= 3 and cd.call_args.args[2] == "mistral") + + +def test_add_parser_option_rejects_invalid_choice(tmp_path): + from click.testing import CliRunner + from openkb.cli import cli + runner = CliRunner() + result = runner.invoke(cli, ["add", "--parser", "bogus", str(tmp_path / "x.pdf")]) + assert result.exit_code != 0 + assert "bogus" in result.output or "Invalid value" in result.output From 2959a8d25cbd64f7be9035b3f3d193a472fbad49 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 11:19:29 +0800 Subject: [PATCH 17/26] build: add optional parser extras (mistral, mineru, parsers) (#77) --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 026dea23..5d1e241c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,9 @@ testpaths = ["tests"] [project.optional-dependencies] dev = ["pytest", "pytest-asyncio"] +mistral = ["mistralai"] +mineru = ["httpx"] +parsers = ["mistralai", "httpx"] [tool.hatch.version] source = "vcs" From 33cee6874eea6d9a066b0e4ee727d721e24f1a35 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 11:21:55 +0800 Subject: [PATCH 18/26] docs(readme): document pluggable document parsers (#77) --- README.md | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/README.md b/README.md index cc19188f..ccefa0e3 100644 --- a/README.md +++ b/README.md @@ -266,6 +266,7 @@ Settings are initialized by `openkb init`, and stored in `.openkb/config.yaml`: model: gpt-5.4 # LLM model (any LiteLLM-supported provider) language: en # Wiki output language pageindex_threshold: 20 # PDF pages threshold for PageIndex +parser: local # Document parser: local | mineru | mistral | vlm ``` Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/providers) (OpenAI models can omit the prefix): @@ -276,6 +277,46 @@ Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/p | Anthropic | `anthropic/claude-sonnet-4-6` | | Gemini | `gemini/gemini-3.1-pro-preview` | +### Document parsers + +By default OpenKB extracts Markdown locally (pymupdf for PDFs, markitdown for +Office/HTML) — no extra dependencies, unchanged behavior. For higher accuracy on +complex documents you can route the file → Markdown step through an online or +self-hosted parser: + +```yaml +# .openkb/config.yaml +parser: mineru # local (default) | mineru | mistral | vlm +parsers: + mineru: + mode: cloud # cloud | self_hosted + base_url: http://localhost:8000 # required when mode is self_hosted + vlm: + model: gemini/gemini-2.5-pro # any LiteLLM vision model (Gemini, GPT-4o, Claude, …) +``` + +Install the optional dependency for your parser: + +```bash +pip install openkb[mistral] # Mistral OCR +pip install openkb[mineru] # MinerU (HTTP) +pip install openkb[parsers] # all online parsers +# vlm uses the existing LiteLLM dependency — no extra needed +``` + +Set the API key via environment variable: `MINERU_API_KEY` (MinerU cloud mode), +`MISTRAL_API_KEY`; the `vlm` parser reuses the existing `LLM_API_KEY`. Override +the parser for a single run with `openkb add --parser mistral file.pdf` +(`local | mineru | mistral | vlm`). + +Each parser handles a subset of formats — `mineru` covers PDF, Word, PPT, Excel, +and HTML; `mistral` and `vlm` cover PDF. `.md` and any unsupported format always +fall back to the local parser. + +> **Note:** Long PDFs (≥ `pageindex_threshold` pages, default 20) continue to be +> indexed with PageIndex and are **not** affected by the `parser` setting. The +> parser governs the file → Markdown step for shorter documents and non-PDF files. + ### PageIndex Integration Long documents are challenging for LLMs due to context limits, context rot, and summarization loss. From 526db30cd9cc57903a64f40efa76f901cd1db0d0 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 11:38:30 +0800 Subject: [PATCH 19/26] fix(parsers): harden MinerU poll loop and anchor image-link rewrite (#77) --- openkb/parsers/mineru.py | 14 +++++++-- tests/test_parsers_mineru.py | 58 ++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 2 deletions(-) diff --git a/openkb/parsers/mineru.py b/openkb/parsers/mineru.py index 243bca3d..a0e28b26 100644 --- a/openkb/parsers/mineru.py +++ b/openkb/parsers/mineru.py @@ -2,6 +2,7 @@ import io import os +import re import time import zipfile from pathlib import Path @@ -38,7 +39,11 @@ def _result_from_zip(zip_bytes: bytes) -> ParseResult: # Markdown references images as 'images/'; localize_images matches on # the bare filename, so rewrite 'images/fig.png' -> 'fig.png'. for fname in images: - markdown = markdown.replace(f"images/{fname}", fname) + # Rewrite only `![alt](images/)` links (anchored on markdown image + # syntax) to the bare filename, for localize_images to canonicalize. A + # replacement function avoids regex-escape injection from arbitrary names. + pattern = re.compile(r"(!\[[^\]]*\]\()" + re.escape("images/" + fname) + r"(\))") + markdown = pattern.sub(lambda m, f=fname: m.group(1) + f + m.group(2), markdown) return ParseResult(markdown=markdown, images=images) @@ -51,7 +56,8 @@ def __init__(self, opts: dict[str, Any] | None = None): self.opts = opts or {} self.mode = self.opts.get("mode", "cloud") self.base_url = self.opts.get("base_url") - self.poll_interval = self.opts.get("poll_interval", 3) + pi = self.opts.get("poll_interval", 3) + self.poll_interval = pi if isinstance(pi, (int, float)) and pi > 0 else 3 self.timeout = self.opts.get("timeout", 600) def supports(self, suffix: str) -> bool: @@ -105,6 +111,10 @@ def _parse_cloud(self, src: Path) -> ParseResult: ) pr.raise_for_status() results = pr.json()["data"]["extract_result"] + if not results: + time.sleep(self.poll_interval) + elapsed += self.poll_interval + continue state = results[0].get("state") if state == "done": zip_url = results[0]["full_zip_url"] diff --git a/tests/test_parsers_mineru.py b/tests/test_parsers_mineru.py index 5e0c9b2c..dc255254 100644 --- a/tests/test_parsers_mineru.py +++ b/tests/test_parsers_mineru.py @@ -128,3 +128,61 @@ def _get(url, *a, **k): assert "![p](fig.png)" in result.markdown # drove the full poll loop: running once, then done assert _get.calls == 2 + + +def test_poll_interval_zero_is_clamped_to_positive(): + from openkb.parsers.mineru import MineruParser + assert MineruParser({"poll_interval": 0}).poll_interval > 0 + assert MineruParser({"poll_interval": -5}).poll_interval > 0 + assert MineruParser({"poll_interval": 2}).poll_interval == 2 + + +def test_image_prefix_rewrite_is_anchored(tmp_path): + import io, sys, types, zipfile + from unittest.mock import MagicMock + # markdown has a real image link AND an unrelated 'images/fig.png' substring in prose + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("full.md", "See path other_images/fig.png in text.\n\n![p](images/fig.png)") + zf.writestr("images/fig.png", b"PNG") + from openkb.parsers.mineru import _result_from_zip + result = _result_from_zip(buf.getvalue()) + assert "![p](fig.png)" in result.markdown # link rewritten + assert "other_images/fig.png" in result.markdown # unrelated prose untouched + assert result.images["fig.png"] == b"PNG" + + +def test_cloud_empty_extract_result_then_done(monkeypatch, tmp_path): + import io, sys, types, zipfile + from unittest.mock import MagicMock + monkeypatch.setenv("MINERU_API_KEY", "key") + monkeypatch.setattr("openkb.parsers.mineru.time.sleep", lambda *a, **k: None) + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("full.md", "# Ok") + zip_bytes = buf.getvalue() + + def _resp(json_data=None, content=None): + r = MagicMock(); r.raise_for_status = MagicMock() + if json_data is not None: r.json.return_value = json_data + if content is not None: r.content = content + return r + client = MagicMock() + client.__enter__ = MagicMock(return_value=client); client.__exit__ = MagicMock(return_value=False) + client.post.return_value = _resp(json_data={"data": {"batch_id": "b1", "file_urls": ["https://up"]}}) + client.put.return_value = _resp() + empty = _resp(json_data={"data": {"extract_result": []}}) # queued: empty list + done = _resp(json_data={"data": {"extract_result": [{"state": "done", "full_zip_url": "https://zip"}]}}) + zipr = _resp(content=zip_bytes) + def _get(url, *a, **k): + if url == "https://zip": return zipr + _get.n += 1 + return empty if _get.n == 1 else done + _get.n = 0 + client.get.side_effect = _get + httpx_mod = types.ModuleType("httpx"); httpx_mod.Client = MagicMock(return_value=client) + monkeypatch.setitem(sys.modules, "httpx", httpx_mod) + from openkb.parsers.mineru import MineruParser + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + result = MineruParser({"mode": "cloud", "poll_interval": 1}).parse(src) + assert "Ok" in result.markdown # survived the empty-list poll without crashing From 8af174feac97ff315a2cde43e1de14469ea68995 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 11:39:47 +0800 Subject: [PATCH 20/26] fix(parsers): sanitize image filenames against path traversal; skip redundant localize for local parser (#77) --- openkb/converter.py | 6 +++++- openkb/images.py | 13 +++++++------ tests/test_converter.py | 13 +++++++++++++ tests/test_images.py | 19 +++++++++++++++++++ 4 files changed, 44 insertions(+), 7 deletions(-) diff --git a/openkb/converter.py b/openkb/converter.py index d6c37fc7..2bab3d1b 100644 --- a/openkb/converter.py +++ b/openkb/converter.py @@ -106,7 +106,11 @@ def convert_document(src: Path, kb_dir: Path, parser_override: str | None = None parser = LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=src.parent) parse_result = parser.parse(src) - markdown = localize_images(parse_result.markdown, parse_result.images, doc_name, images_dir) + if parser.name == "local": + # LocalParser already persisted images and produced canonical links. + markdown = parse_result.markdown + else: + markdown = localize_images(parse_result.markdown, parse_result.images, doc_name, images_dir) dest_md = sources_dir / f"{doc_name}.md" dest_md.write_text(markdown, encoding="utf-8") diff --git a/openkb/images.py b/openkb/images.py index 9c6a424c..39891b57 100644 --- a/openkb/images.py +++ b/openkb/images.py @@ -229,12 +229,13 @@ def localize_images( images_dir.mkdir(parents=True, exist_ok=True) result = markdown for filename, data in images.items(): - (images_dir / filename).write_bytes(data) - # Rewrite a bare ![alt](filename) reference to the canonical KB path. - # Use a replacement *function* (not a replacement string) so a filename - # containing regex-escape sequences (e.g. "\g<1>") can't corrupt the - # substitution — localize_images handles arbitrary parser-supplied names. - canonical = f"sources/images/{doc_name}/{filename}" + # Strip any directory components from parser-supplied names so a + # malicious/odd filename (e.g. "../x.png", "/abs/x.png") can never + # write outside images_dir. The markdown still references the original + # `filename`, so rewrite that ref to the sanitized canonical path. + safe_name = Path(filename).name or "image" + (images_dir / safe_name).write_bytes(data) + canonical = f"sources/images/{doc_name}/{safe_name}" pattern = re.compile(r"(!\[[^\]]*\]\()" + re.escape(filename) + r"(\))") result = pattern.sub(lambda m, c=canonical: m.group(1) + c + m.group(2), result) result = extract_base64_images(result, doc_name, images_dir) diff --git a/tests/test_converter.py b/tests/test_converter.py index 90b26bf4..8e5ce77c 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -164,3 +164,16 @@ def test_falls_back_to_local_for_unsupported_suffix(self, kb_dir): LP.return_value.parse.return_value = ParseResult(markdown="# md") convert_document(src, kb_dir) LP.assert_called_once() # fell back to LocalParser + + def test_local_parser_skips_redundant_localize(self, kb_dir): + src = kb_dir / "raw" / "notes.md" + src.write_text("# md", encoding="utf-8") + local = MagicMock() + local.name = "local" + local.supports.return_value = True + local.parse.return_value = ParseResult(markdown="# md final") + with patch("openkb.converter.get_parser", return_value=local), \ + patch("openkb.converter.localize_images") as li: + result = convert_document(src, kb_dir) + li.assert_not_called() # local path skips localize_images + assert result.source_path.read_text(encoding="utf-8") == "# md final" diff --git a/tests/test_images.py b/tests/test_images.py index 53906443..7292c5c1 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -203,3 +203,22 @@ def test_localize_images_filename_with_regex_metachars(tmp_path): out = localize_images(md, {weird: b"DATA"}, "doc", images_dir) assert f"sources/images/doc/{weird}" in out assert (images_dir / weird).read_bytes() == b"DATA" + + +def test_localize_images_strips_path_traversal_in_filename(tmp_path): + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + md = "![bad](../../evil.png)" + out = localize_images(md, {"../../evil.png": b"DATA"}, "doc", images_dir) + # bytes written INSIDE images_dir under the basename only — no escape + assert (images_dir / "evil.png").read_bytes() == b"DATA" + assert not (tmp_path / "evil.png").exists() + assert not (images_dir.parent.parent / "evil.png").exists() + # the original ref is rewritten to the sanitized canonical path + assert "sources/images/doc/evil.png" in out + + +def test_localize_images_absolute_filename_stays_inside(tmp_path): + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + out = localize_images("![x](/etc/x.png)", {"/etc/x.png": b"D"}, "doc", images_dir) + assert (images_dir / "x.png").read_bytes() == b"D" + assert "sources/images/doc/x.png" in out From e424bb444a19bb73d96f6b6a15d21010ca87bcc7 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 11:41:25 +0800 Subject: [PATCH 21/26] fix(parsers): warn on VLM global-model fallback; unify parser dispatch/VALID_PARSERS (#77) --- openkb/parsers/registry.py | 48 ++++++++++++++++++++++------------ openkb/parsers/vlm.py | 10 +++++++ tests/test_parsers_registry.py | 7 +++++ tests/test_parsers_vlm.py | 14 ++++++++++ 4 files changed, 63 insertions(+), 16 deletions(-) diff --git a/openkb/parsers/registry.py b/openkb/parsers/registry.py index 1dba2c01..9a5a55b0 100644 --- a/openkb/parsers/registry.py +++ b/openkb/parsers/registry.py @@ -6,7 +6,31 @@ from openkb.parsers.base import Parser from openkb.parsers.local import LocalParser -VALID_PARSERS = ("local", "mineru", "mistral", "vlm") + +def _make_mistral(opts, config): + from openkb.parsers.mistral import MistralParser + return MistralParser(opts) + + +def _make_vlm(opts, config): + from openkb.parsers.vlm import VLMParser + return VLMParser(opts, model=config.get("model")) + + +def _make_mineru(opts, config): + from openkb.parsers.mineru import MineruParser + return MineruParser(opts) + + +# Single source of truth: online-parser name -> lazy factory. +_ONLINE_PARSERS = { + "mineru": _make_mineru, + "mistral": _make_mistral, + "vlm": _make_vlm, +} + +# Valid parser names (drives the CLI --parser choice and error messages). +VALID_PARSERS = ("local", *_ONLINE_PARSERS) def get_parser( @@ -21,18 +45,10 @@ def get_parser( name = (override or config.get("parser") or "local").lower() if name == "local": return LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=source_dir) - - parsers_cfg = config.get("parsers", {}) or {} - opts = parsers_cfg.get(name, {}) or {} - if name == "mistral": - from openkb.parsers.mistral import MistralParser - return MistralParser(opts) - if name == "vlm": - from openkb.parsers.vlm import VLMParser - return VLMParser(opts, model=config.get("model")) - if name == "mineru": - from openkb.parsers.mineru import MineruParser - return MineruParser(opts) - raise ValueError( - f"Unknown parser {name!r}. Valid options: {', '.join(VALID_PARSERS)}." - ) + factory = _ONLINE_PARSERS.get(name) + if factory is None: + raise ValueError( + f"Unknown parser {name!r}. Valid options: {', '.join(VALID_PARSERS)}." + ) + opts = (config.get("parsers", {}) or {}).get(name, {}) or {} + return factory(opts, config) diff --git a/openkb/parsers/vlm.py b/openkb/parsers/vlm.py index 834125fc..af6f5043 100644 --- a/openkb/parsers/vlm.py +++ b/openkb/parsers/vlm.py @@ -1,11 +1,14 @@ from __future__ import annotations +import logging from pathlib import Path from typing import Any from openkb.parsers.base import ParseResult, Parser from openkb.parsers.vlm_client import transcribe_to_markdown +logger = logging.getLogger(__name__) + _SUPPORTED = {".pdf"} @@ -18,6 +21,13 @@ def __init__(self, opts: dict[str, Any] | None = None, model: str | None = None) opts = opts or {} # parsers.vlm.model overrides the global model; else use the global model. self.model = opts.get("model") or model + if not opts.get("model"): + logger.warning( + "VLM parser: 'parsers.vlm.model' is not set; using the global model " + "%r for vision parsing. If that model is not vision-capable, set " + "'parsers.vlm.model' to one (e.g. gemini/gemini-2.5-pro).", + self.model, + ) def supports(self, suffix: str) -> bool: return suffix.lower() in _SUPPORTED diff --git a/tests/test_parsers_registry.py b/tests/test_parsers_registry.py index 0a500ce9..612ed259 100644 --- a/tests/test_parsers_registry.py +++ b/tests/test_parsers_registry.py @@ -31,3 +31,10 @@ def test_unknown_name_raises_with_valid_options(): get_parser({"parser": "nope"}, **_kwargs()) assert "nope" in str(exc.value) assert "local" in str(exc.value) + + +def test_valid_parsers_matches_dispatch(): + from openkb.parsers.registry import VALID_PARSERS, _ONLINE_PARSERS + # local + every online factory key, no drift + assert set(VALID_PARSERS) == {"local", *_ONLINE_PARSERS} + assert VALID_PARSERS[0] == "local" diff --git a/tests/test_parsers_vlm.py b/tests/test_parsers_vlm.py index e1d78683..eb2ede08 100644 --- a/tests/test_parsers_vlm.py +++ b/tests/test_parsers_vlm.py @@ -31,3 +31,17 @@ def test_parse_falls_back_to_global_model(tmp_path): with patch("openkb.parsers.vlm.transcribe_to_markdown", return_value="x") as t: p.parse(src) t.assert_called_once_with(src, model="global-model") + + +def test_warns_when_falling_back_to_global_model(caplog): + import logging as _logging + with caplog.at_level(_logging.WARNING): + VLMParser({}, model="gpt-5.4-mini") + assert any("parsers.vlm.model" in r.message for r in caplog.records) + + +def test_no_warning_when_vlm_model_set(caplog): + import logging as _logging + with caplog.at_level(_logging.WARNING): + VLMParser({"model": "gemini/gemini-2.5-pro"}, model="gpt-5.4-mini") + assert not any("parsers.vlm.model" in r.message for r in caplog.records) From a981b91e6345027d3aa633be15c771836ac885e1 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 11:43:09 +0800 Subject: [PATCH 22/26] fix(cli): only propagate LLM_API_KEY to the active provider key (#77) --- openkb/cli.py | 16 +++++++++------- tests/test_cli.py | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/openkb/cli.py b/openkb/cli.py index 030e1133..b19da6a8 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -125,17 +125,19 @@ def _setup_llm_key(kb_dir: Path | None = None) -> None: else: litellm.api_key = api_key - # Dynamically set the provider-specific env var when possible if provider: + # Active provider is known — set only its key, so LLM_API_KEY is not + # sprayed into unrelated provider keys (e.g. MISTRAL_API_KEY, which the + # Mistral parser treats as a real Mistral credential). provider_env = f"{provider.upper()}_API_KEY" if not os.environ.get(provider_env): os.environ[provider_env] = api_key - - # Fallback: also set common provider keys so multi-provider - # configs (e.g. PageIndex Cloud) still work - for env_var in _KNOWN_PROVIDER_KEYS: - if not os.environ.get(env_var): - os.environ[env_var] = api_key + else: + # Provider couldn't be determined — fall back to setting the common + # provider keys so multi-provider configs still work. + for env_var in _KNOWN_PROVIDER_KEYS: + if not os.environ.get(env_var): + os.environ[env_var] = api_key # Supported document extensions for the `add` command SUPPORTED_EXTENSIONS = { diff --git a/tests/test_cli.py b/tests/test_cli.py index ab3378b1..e80e272f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -365,3 +365,17 @@ async def fake_run_query(*_args, **_kwargs): assert "rnn" in saved assert "[[concepts/multi-head-attention]]" not in saved assert "multi head attention" in saved + + +def test_setup_llm_key_does_not_spray_unrelated_provider_keys(tmp_path, monkeypatch): + import os + from openkb.cli import _setup_llm_key + # KB with an openai model (known provider) + openkb_dir = tmp_path / ".openkb"; openkb_dir.mkdir() + (openkb_dir / "config.yaml").write_text("model: openai/gpt-4o\n") + for k in ("MISTRAL_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY"): + monkeypatch.delenv(k, raising=False) + monkeypatch.setenv("LLM_API_KEY", "sk-test") + _setup_llm_key(tmp_path) + assert os.environ.get("OPENAI_API_KEY") == "sk-test" # active provider set + assert os.environ.get("MISTRAL_API_KEY") is None # unrelated provider NOT sprayed From 6287deaa7a3e67cd5b753712f7f3dcf62603835a Mon Sep 17 00:00:00 2001 From: mountain Date: Mon, 1 Jun 2026 16:38:31 +0800 Subject: [PATCH 23/26] fix(images): match image links by basename (dir-prefixed, titled) in localize_images (#77) --- openkb/images.py | 41 ++++++++++++++++++++++++++--------------- tests/test_images.py | 22 ++++++++++++++++++++++ 2 files changed, 48 insertions(+), 15 deletions(-) diff --git a/openkb/images.py b/openkb/images.py index 39891b57..9315a20e 100644 --- a/openkb/images.py +++ b/openkb/images.py @@ -17,6 +17,10 @@ # Matches: ![alt](relative/path) — excludes http(s):// and data: URIs _RELATIVE_RE = re.compile(r'!\[([^\]]*)\]\((?!https?://|data:)([^)]+)\)') +# Matches an image link, capturing: (prefix `![alt](` + ws)(target)(optional +# title + ws)(closing `)`). Used to rewrite links by their target's basename. +_IMG_LINK_RE = re.compile(r'(!\[[^\]]*\]\(\s*)([^)\s]+)(\s*(?:"[^"]*"|\'[^\']*\')?\s*)(\))') + # Minimum pixel dimension — skip icons, bullets, and tiny artifacts _MIN_IMAGE_DIM = 32 @@ -217,27 +221,34 @@ def localize_images( doc_name: str, images_dir: Path, ) -> str: - """Persist parser-supplied images and normalize all image links. + """Persist parser-supplied images and normalize image links. - 1. Write every ``images`` entry (filename -> bytes) into ``images_dir``. - 2. Rewrite bare-filename references ``![alt](filename)`` (filename present - in ``images``) to the canonical ``sources/images/{doc_name}/{filename}``. - 3. Run :func:`extract_base64_images` to localize any inline base64 images. + 1. Write every ``images`` entry to ``images_dir`` under its basename + (``Path(filename).name``), so a name with ``/`` directory components or + an absolute path can never write outside ``images_dir``. + 2. Rewrite markdown image links whose target's basename matches a written + image to the canonical ``sources/images/{doc_name}/{basename}`` path — + this handles bare names, directory-prefixed targets (e.g. + ``images/fig.png``), and links carrying a title attribute. + 3. Localize any inline base64 images via :func:`extract_base64_images`. Returns the normalized markdown. """ images_dir.mkdir(parents=True, exist_ok=True) - result = markdown + safe_names: set[str] = set() for filename, data in images.items(): - # Strip any directory components from parser-supplied names so a - # malicious/odd filename (e.g. "../x.png", "/abs/x.png") can never - # write outside images_dir. The markdown still references the original - # `filename`, so rewrite that ref to the sanitized canonical path. - safe_name = Path(filename).name or "image" - (images_dir / safe_name).write_bytes(data) - canonical = f"sources/images/{doc_name}/{safe_name}" - pattern = re.compile(r"(!\[[^\]]*\]\()" + re.escape(filename) + r"(\))") - result = pattern.sub(lambda m, c=canonical: m.group(1) + c + m.group(2), result) + safe = Path(filename).name or "image" + (images_dir / safe).write_bytes(data) + safe_names.add(safe) + + def _rewrite(m: "re.Match[str]") -> str: + pre, target, title, close = m.group(1), m.group(2), m.group(3), m.group(4) + base = Path(target).name + if base in safe_names: + return f"{pre}sources/images/{doc_name}/{base}{title}{close}" + return m.group(0) + + result = _IMG_LINK_RE.sub(_rewrite, markdown) result = extract_base64_images(result, doc_name, images_dir) return result diff --git a/tests/test_images.py b/tests/test_images.py index 7292c5c1..26b8ed1b 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -222,3 +222,25 @@ def test_localize_images_absolute_filename_stays_inside(tmp_path): out = localize_images("![x](/etc/x.png)", {"/etc/x.png": b"D"}, "doc", images_dir) assert (images_dir / "x.png").read_bytes() == b"D" assert "sources/images/doc/x.png" in out + + +def test_localize_images_rewrites_directory_prefixed_target(tmp_path): + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + md = "![p](images/fig.png)\n\n![q](./sub/images/other.png)" + out = localize_images(md, {"fig.png": b"A", "other.png": b"B"}, "doc", images_dir) + assert "![p](sources/images/doc/fig.png)" in out + assert "![q](sources/images/doc/other.png)" in out + assert (images_dir / "fig.png").read_bytes() == b"A" + assert (images_dir / "other.png").read_bytes() == b"B" + + +def test_localize_images_preserves_title_attribute(tmp_path): + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + out = localize_images('![a](fig.png "Figure 1")', {"fig.png": b"X"}, "doc", images_dir) + assert '![a](sources/images/doc/fig.png "Figure 1")' in out + + +def test_localize_images_inner_whitespace(tmp_path): + images_dir = tmp_path / "wiki" / "sources" / "images" / "doc" + out = localize_images("![a]( fig.png )", {"fig.png": b"X"}, "doc", images_dir) + assert "sources/images/doc/fig.png" in out From 6e111fbd35a18ddcf9d5147514d9f17cb364b6a2 Mon Sep 17 00:00:00 2001 From: mountain Date: Mon, 1 Jun 2026 16:42:23 +0800 Subject: [PATCH 24/26] fix(parsers): harden MinerU cloud response handling, timeout, md selection; drop redundant image rewrite (#77) --- openkb/parsers/mineru.py | 65 +++++++++++++++--------- tests/test_parsers_mineru.py | 95 +++++++++++++++++++++++++++++++----- 2 files changed, 124 insertions(+), 36 deletions(-) diff --git a/openkb/parsers/mineru.py b/openkb/parsers/mineru.py index a0e28b26..e9356e19 100644 --- a/openkb/parsers/mineru.py +++ b/openkb/parsers/mineru.py @@ -1,8 +1,8 @@ from __future__ import annotations import io +import logging import os -import re import time import zipfile from pathlib import Path @@ -10,6 +10,8 @@ from openkb.parsers.base import ParseResult, Parser +logger = logging.getLogger(__name__) + _SUPPORTED = {".pdf", ".docx", ".pptx", ".xlsx", ".xls", ".html", ".htm"} _CLOUD_BASE = "https://mineru.net/api/v4" @@ -29,24 +31,32 @@ def _result_from_zip(zip_bytes: bytes) -> ParseResult: images: dict[str, bytes] = {} markdown = "" with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf: - md_names = [n for n in zf.namelist() if n.lower().endswith(".md")] + names = zf.namelist() + md_names = sorted(n for n in names if n.lower().endswith(".md")) if md_names: - chosen = next((n for n in md_names if n.endswith("full.md")), md_names[0]) + chosen = next((n for n in md_names if Path(n).name == "full.md"), md_names[0]) markdown = zf.read(chosen).decode("utf-8", errors="replace") - for name in zf.namelist(): + for name in names: if name.lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".webp")): - images[Path(name).name] = zf.read(name) - # Markdown references images as 'images/'; localize_images matches on - # the bare filename, so rewrite 'images/fig.png' -> 'fig.png'. - for fname in images: - # Rewrite only `![alt](images/)` links (anchored on markdown image - # syntax) to the bare filename, for localize_images to canonicalize. A - # replacement function avoids regex-escape injection from arbitrary names. - pattern = re.compile(r"(!\[[^\]]*\]\()" + re.escape("images/" + fname) + r"(\))") - markdown = pattern.sub(lambda m, f=fname: m.group(1) + f + m.group(2), markdown) + base = Path(name).name + if base in images: + logger.warning( + "MinerU result has multiple images named %r in different " + "folders; keeping the last. Earlier one may be lost.", base + ) + images[base] = zf.read(name) return ParseResult(markdown=markdown, images=images) +def _mineru_body(resp): + """Return the 'data' dict from a MinerU v4 JSON response, raising on API errors.""" + body = resp.json() + code = body.get("code") + if code not in (0, None): + raise RuntimeError(f"MinerU API error (code={code}): {body.get('msg')}") + return body.get("data") or {} + + class MineruParser(Parser): """MinerU via HTTP — self-hosted server or hosted cloud API.""" @@ -58,7 +68,8 @@ def __init__(self, opts: dict[str, Any] | None = None): self.base_url = self.opts.get("base_url") pi = self.opts.get("poll_interval", 3) self.poll_interval = pi if isinstance(pi, (int, float)) and pi > 0 else 3 - self.timeout = self.opts.get("timeout", 600) + t = self.opts.get("timeout", 600) + self.timeout = t if isinstance(t, (int, float)) and t > 0 else 600 def supports(self, suffix: str) -> bool: return suffix.lower() in _SUPPORTED @@ -92,37 +103,43 @@ def _parse_cloud(self, src: Path) -> ParseResult: ) httpx = _httpx() headers = {"Authorization": f"Bearer {api_key}"} - with httpx.Client(timeout=self.timeout) as client: + with httpx.Client(timeout=min(self.timeout, 120)) as client: r = client.post( f"{_CLOUD_BASE}/file-urls/batch", headers=headers, json={"files": [{"name": src.name, "is_ocr": True}]}, ) r.raise_for_status() - data = r.json()["data"] - batch_id = data["batch_id"] - upload_url = data["file_urls"][0] + data = _mineru_body(r) + batch_id = data.get("batch_id") + file_urls = data.get("file_urls") or [] + if not batch_id or not file_urls: + raise RuntimeError(f"MinerU returned no upload URL: {data}") + upload_url = file_urls[0] client.put(upload_url, content=src.read_bytes()).raise_for_status() - elapsed = 0 + deadline = time.monotonic() + self.timeout zip_url = None - while elapsed < self.timeout: + while time.monotonic() < deadline: pr = client.get( f"{_CLOUD_BASE}/extract-results/batch/{batch_id}", headers=headers ) pr.raise_for_status() - results = pr.json()["data"]["extract_result"] + data = _mineru_body(pr) + results = data.get("extract_result") or [] if not results: time.sleep(self.poll_interval) - elapsed += self.poll_interval continue state = results[0].get("state") if state == "done": - zip_url = results[0]["full_zip_url"] + zip_url = results[0].get("full_zip_url") + if not zip_url: + raise RuntimeError( + f"MinerU reported done but no full_zip_url: {results[0]}" + ) break if state == "failed": raise RuntimeError(f"MinerU extraction failed: {results[0]}") time.sleep(self.poll_interval) - elapsed += self.poll_interval if zip_url is None: raise RuntimeError("MinerU extraction timed out.") zr = client.get(zip_url) diff --git a/tests/test_parsers_mineru.py b/tests/test_parsers_mineru.py index dc255254..f81c8033 100644 --- a/tests/test_parsers_mineru.py +++ b/tests/test_parsers_mineru.py @@ -63,9 +63,12 @@ def test_self_hosted_parses_zip(monkeypatch, tmp_path): assert isinstance(result, ParseResult) assert "Mineru" in result.markdown assert result.images["fig.png"] == b"PNGBYTES" - # the images/ prefix should be rewritten to the bare filename for localize_images - assert "images/fig.png" not in result.markdown - assert "![p](fig.png)" in result.markdown + # _result_from_zip no longer rewrites links; the raw 'images/fig.png' survives + assert "images/fig.png" in result.markdown + # localize_images (which now rewrites by basename) canonicalizes it + from openkb.images import localize_images + md2 = localize_images(result.markdown, result.images, "d", tmp_path / "imgs") + assert "sources/images/d/fig.png" in md2 def test_cloud_flow_polls_then_downloads(monkeypatch, tmp_path): @@ -74,7 +77,7 @@ def test_cloud_flow_polls_then_downloads(monkeypatch, tmp_path): buf = io.BytesIO() with zipfile.ZipFile(buf, "w") as zf: - zf.writestr("full.md", "# Cloud\n\n![p](images/fig.png)") + zf.writestr("full.md", "# Cloud") zf.writestr("images/fig.png", b"ZBYTES") zip_bytes = buf.getvalue() @@ -124,8 +127,6 @@ def _get(url, *a, **k): assert isinstance(result, ParseResult) assert "Cloud" in result.markdown assert result.images["fig.png"] == b"ZBYTES" - assert "images/fig.png" not in result.markdown - assert "![p](fig.png)" in result.markdown # drove the full poll loop: running once, then done assert _get.calls == 2 @@ -137,19 +138,19 @@ def test_poll_interval_zero_is_clamped_to_positive(): assert MineruParser({"poll_interval": 2}).poll_interval == 2 -def test_image_prefix_rewrite_is_anchored(tmp_path): - import io, sys, types, zipfile - from unittest.mock import MagicMock - # markdown has a real image link AND an unrelated 'images/fig.png' substring in prose +def test_result_from_zip_does_not_rewrite_links(tmp_path): + import io, zipfile + # The images/ -> bare rewrite moved OUT of _result_from_zip into + # localize_images; _result_from_zip must leave the markdown link text intact. buf = io.BytesIO() with zipfile.ZipFile(buf, "w") as zf: zf.writestr("full.md", "See path other_images/fig.png in text.\n\n![p](images/fig.png)") zf.writestr("images/fig.png", b"PNG") from openkb.parsers.mineru import _result_from_zip result = _result_from_zip(buf.getvalue()) - assert "![p](fig.png)" in result.markdown # link rewritten + assert "![p](images/fig.png)" in result.markdown # link text unchanged assert "other_images/fig.png" in result.markdown # unrelated prose untouched - assert result.images["fig.png"] == b"PNG" + assert result.images["fig.png"] == b"PNG" # images keyed by basename def test_cloud_empty_extract_result_then_done(monkeypatch, tmp_path): @@ -186,3 +187,73 @@ def _get(url, *a, **k): src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") result = MineruParser({"mode": "cloud", "poll_interval": 1}).parse(src) assert "Ok" in result.markdown # survived the empty-list poll without crashing + + +def test_timeout_invalid_is_clamped(): + from openkb.parsers.mineru import MineruParser + assert MineruParser({"timeout": 0}).timeout == 600 + assert MineruParser({"timeout": "x"}).timeout == 600 + assert MineruParser({"timeout": 30}).timeout == 30 + + +def test_cloud_api_error_envelope_raises(monkeypatch, tmp_path): + import sys, types + from unittest.mock import MagicMock + monkeypatch.setenv("MINERU_API_KEY", "key") + r = MagicMock(); r.raise_for_status = MagicMock() + r.json.return_value = {"code": -10001, "msg": "token expired", "data": None} + client = MagicMock() + client.__enter__ = MagicMock(return_value=client); client.__exit__ = MagicMock(return_value=False) + client.post.return_value = r + httpx_mod = types.ModuleType("httpx"); httpx_mod.Client = MagicMock(return_value=client) + monkeypatch.setitem(sys.modules, "httpx", httpx_mod) + from openkb.parsers.mineru import MineruParser + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + import pytest + with pytest.raises(RuntimeError) as exc: + MineruParser({"mode": "cloud"}).parse(src) + assert "token expired" in str(exc.value) or "-10001" in str(exc.value) + + +def test_cloud_empty_file_urls_raises(monkeypatch, tmp_path): + import sys, types + from unittest.mock import MagicMock + monkeypatch.setenv("MINERU_API_KEY", "key") + r = MagicMock(); r.raise_for_status = MagicMock() + r.json.return_value = {"code": 0, "data": {"batch_id": "b1", "file_urls": []}} + client = MagicMock() + client.__enter__ = MagicMock(return_value=client); client.__exit__ = MagicMock(return_value=False) + client.post.return_value = r + httpx_mod = types.ModuleType("httpx"); httpx_mod.Client = MagicMock(return_value=client) + monkeypatch.setitem(sys.modules, "httpx", httpx_mod) + from openkb.parsers.mineru import MineruParser + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + import pytest + with pytest.raises(RuntimeError) as exc: + MineruParser({"mode": "cloud"}).parse(src) + assert "upload URL" in str(exc.value) + + +def test_full_md_basename_preferred_over_endswith(tmp_path): + import io, zipfile + from openkb.parsers.mineru import _result_from_zip + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("careful.md", "# WRONG") # ends with 'full.md' but isn't it + zf.writestr("full.md", "# RIGHT") + result = _result_from_zip(buf.getvalue()) + assert "RIGHT" in result.markdown + assert "WRONG" not in result.markdown + + +def test_image_basename_collision_warns(tmp_path, caplog): + import io, zipfile, logging as _logging + from openkb.parsers.mineru import _result_from_zip + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("full.md", "# x") + zf.writestr("images/fig.png", b"A") + zf.writestr("sub/fig.png", b"B") + with caplog.at_level(_logging.WARNING): + result = _result_from_zip(buf.getvalue()) + assert any("fig.png" in r.message for r in caplog.records) From 02daf5220a885bc2aae0b97cce524b8cfc55aeca Mon Sep 17 00:00:00 2001 From: mountain Date: Mon, 1 Jun 2026 16:44:26 +0800 Subject: [PATCH 25/26] fix(parsers): warn that VLM is text-only and on silent parser downgrade (#77) --- README.md | 4 ++++ openkb/converter.py | 5 +++++ openkb/parsers/vlm.py | 5 +++++ tests/test_converter.py | 15 +++++++++++++++ tests/test_parsers_vlm.py | 11 +++++++++++ 5 files changed, 40 insertions(+) diff --git a/README.md b/README.md index ccefa0e3..c1ad36c5 100644 --- a/README.md +++ b/README.md @@ -313,6 +313,10 @@ Each parser handles a subset of formats — `mineru` covers PDF, Word, PPT, Exce and HTML; `mistral` and `vlm` cover PDF. `.md` and any unsupported format always fall back to the local parser. +The `vlm` parser is **text-only**: it transcribes a document's text via a vision +LLM but does **not** extract embedded figures/images. Use `mineru`, `mistral`, or +`local` if you need image extraction. + > **Note:** Long PDFs (≥ `pageindex_threshold` pages, default 20) continue to be > indexed with PageIndex and are **not** affected by the `parser` setting. The > parser governs the file → Markdown step for shorter documents and non-PDF files. diff --git a/openkb/converter.py b/openkb/converter.py index 2bab3d1b..2ac6abb1 100644 --- a/openkb/converter.py +++ b/openkb/converter.py @@ -103,6 +103,11 @@ def convert_document(src: Path, kb_dir: Path, parser_override: str | None = None source_dir=src.parent, ) if not parser.supports(src.suffix): + if parser.name != "local": + logger.warning( + "Parser %r does not support %r; falling back to the local parser for %s.", + parser.name, src.suffix, src.name, + ) parser = LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=src.parent) parse_result = parser.parse(src) diff --git a/openkb/parsers/vlm.py b/openkb/parsers/vlm.py index af6f5043..6467483f 100644 --- a/openkb/parsers/vlm.py +++ b/openkb/parsers/vlm.py @@ -34,4 +34,9 @@ def supports(self, suffix: str) -> bool: def parse(self, src: Path) -> ParseResult: markdown = transcribe_to_markdown(src, model=self.model) + logger.warning( + "VLM parser transcribes %s to text only; embedded figures/images are " + "not extracted. Use a parser like 'mineru' if you need figure extraction.", + src.name, + ) return ParseResult(markdown=markdown) diff --git a/tests/test_converter.py b/tests/test_converter.py index 8e5ce77c..6b5f2e41 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -177,3 +177,18 @@ def test_local_parser_skips_redundant_localize(self, kb_dir): result = convert_document(src, kb_dir) li.assert_not_called() # local path skips localize_images assert result.source_path.read_text(encoding="utf-8") == "# md final" + + def test_warns_on_silent_downgrade(self, kb_dir, caplog): + import logging as _logging + src = kb_dir / "raw" / "notes.md" + src.write_text("# md", encoding="utf-8") + online = MagicMock() + online.name = "mistral" + online.supports.return_value = False + with patch("openkb.converter.get_parser", return_value=online), \ + patch("openkb.converter.LocalParser") as LP: + LP.return_value.name = "local" + LP.return_value.parse.return_value = ParseResult(markdown="# md") + with caplog.at_level(_logging.WARNING): + convert_document(src, kb_dir) + assert any("falling back to the local parser" in r.message for r in caplog.records) diff --git a/tests/test_parsers_vlm.py b/tests/test_parsers_vlm.py index eb2ede08..b5a99400 100644 --- a/tests/test_parsers_vlm.py +++ b/tests/test_parsers_vlm.py @@ -45,3 +45,14 @@ def test_no_warning_when_vlm_model_set(caplog): with caplog.at_level(_logging.WARNING): VLMParser({"model": "gemini/gemini-2.5-pro"}, model="gpt-5.4-mini") assert not any("parsers.vlm.model" in r.message for r in caplog.records) + + +def test_parse_warns_text_only(tmp_path, caplog): + import logging as _logging + from unittest.mock import patch + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + p = VLMParser({"model": "gemini/gemini-2.5-pro"}) + with patch("openkb.parsers.vlm.transcribe_to_markdown", return_value="# md"): + with caplog.at_level(_logging.WARNING): + p.parse(src) + assert any("text only" in r.message for r in caplog.records) From b2435059e95869cf9553fe1052f88729a87c1085 Mon Sep 17 00:00:00 2001 From: mountain Date: Mon, 1 Jun 2026 16:46:01 +0800 Subject: [PATCH 26/26] fix(parsers): delete uploaded Mistral OCR files; fix patch.stopall test hygiene (#77) --- openkb/parsers/mistral.py | 51 +++++++++++++++++++++-------------- tests/test_parsers_local.py | 17 +++++------- tests/test_parsers_mistral.py | 34 +++++++++++++++++++++++ 3 files changed, 72 insertions(+), 30 deletions(-) diff --git a/openkb/parsers/mistral.py b/openkb/parsers/mistral.py index a494f149..9f5d0706 100644 --- a/openkb/parsers/mistral.py +++ b/openkb/parsers/mistral.py @@ -42,26 +42,37 @@ def parse(self, src: Path) -> ParseResult: ) from exc client = Mistral(api_key=api_key) - uploaded = client.files.upload( - file={"file_name": src.name, "content": src.read_bytes()}, purpose="ocr" - ) - signed = client.files.get_signed_url(file_id=uploaded.id) - resp = client.ocr.process( - model=self.model, - document={"type": "document_url", "document_url": signed.url}, - include_image_base64=True, - ) + uploaded = None + try: + uploaded = client.files.upload( + file={"file_name": src.name, "content": src.read_bytes()}, purpose="ocr" + ) + signed = client.files.get_signed_url(file_id=uploaded.id) + resp = client.ocr.process( + model=self.model, + document={"type": "document_url", "document_url": signed.url}, + include_image_base64=True, + ) - parts: list[str] = [] - images: dict[str, bytes] = {} - for page in resp.pages: - parts.append(page.markdown or "") - for img in getattr(page, "images", None) or []: - raw = img.image_base64 or "" - raw = _DATA_URI_RE.sub("", raw) + parts: list[str] = [] + images: dict[str, bytes] = {} + for page in resp.pages: + parts.append(page.markdown or "") + for img in getattr(page, "images", None) or []: + raw = img.image_base64 or "" + raw = _DATA_URI_RE.sub("", raw) + try: + images[img.id] = base64.b64decode(raw, validate=True) + except Exception: + logger.warning("Skipping undecodable Mistral image: %s", getattr(img, "id", "?")) + continue + return ParseResult(markdown="\n\n".join(parts), images=images) + finally: + if uploaded is not None: try: - images[img.id] = base64.b64decode(raw, validate=True) + client.files.delete(file_id=uploaded.id) except Exception: - logger.warning("Skipping undecodable Mistral image: %s", getattr(img, "id", "?")) - continue - return ParseResult(markdown="\n\n".join(parts), images=images) + logger.warning( + "Failed to delete uploaded Mistral OCR file %s", + getattr(uploaded, "id", "?"), + ) diff --git a/tests/test_parsers_local.py b/tests/test_parsers_local.py index e682c42e..af17ed38 100644 --- a/tests/test_parsers_local.py +++ b/tests/test_parsers_local.py @@ -39,13 +39,10 @@ def test_parse_other_uses_markitdown_and_extracts_base64(tmp_path): src = tmp_path / "deck.pptx" src.write_bytes(b"PK fake") images_dir = tmp_path / "img" / "deck" - fake_mid = patch("openkb.parsers.local.MarkItDown").start() - fake_mid.return_value.convert.return_value.text_content = "MARKITDOWN MD" - try: - with patch("openkb.parsers.local.extract_base64_images", return_value="CLEANED") as ex: - p = LocalParser(doc_name="deck", images_dir=images_dir, source_dir=tmp_path) - result = p.parse(src) - ex.assert_called_once_with("MARKITDOWN MD", "deck", images_dir) - assert result.markdown == "CLEANED" - finally: - patch.stopall() + with patch("openkb.parsers.local.MarkItDown") as fake_mid, \ + patch("openkb.parsers.local.extract_base64_images", return_value="CLEANED") as ex: + fake_mid.return_value.convert.return_value.text_content = "MARKITDOWN MD" + p = LocalParser(doc_name="deck", images_dir=images_dir, source_dir=tmp_path) + result = p.parse(src) + ex.assert_called_once_with("MARKITDOWN MD", "deck", images_dir) + assert result.markdown == "CLEANED" diff --git a/tests/test_parsers_mistral.py b/tests/test_parsers_mistral.py index e95c1858..8283102d 100644 --- a/tests/test_parsers_mistral.py +++ b/tests/test_parsers_mistral.py @@ -86,3 +86,37 @@ def test_undecodable_image_logged_and_skipped(monkeypatch, tmp_path, caplog): result = MistralParser({}).parse(src) assert "bad.png" not in result.images assert any("bad.png" in r.message for r in caplog.records) + + +def test_uploaded_file_is_deleted(monkeypatch, tmp_path): + import sys, types + from unittest.mock import MagicMock + monkeypatch.setenv("MISTRAL_API_KEY", "k") + client = MagicMock() + client.files.upload.return_value = MagicMock(id="file-1") + client.files.get_signed_url.return_value = MagicMock(url="https://signed") + client.ocr.process.return_value = MagicMock(pages=[]) + mod = types.ModuleType("mistralai"); mod.Mistral = MagicMock(return_value=client) + monkeypatch.setitem(sys.modules, "mistralai", mod) + from openkb.parsers.mistral import MistralParser + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + MistralParser({}).parse(src) + client.files.delete.assert_called_once_with(file_id="file-1") + + +def test_uploaded_file_deleted_even_on_ocr_error(monkeypatch, tmp_path): + import sys, types + from unittest.mock import MagicMock + import pytest + monkeypatch.setenv("MISTRAL_API_KEY", "k") + client = MagicMock() + client.files.upload.return_value = MagicMock(id="file-2") + client.files.get_signed_url.return_value = MagicMock(url="https://signed") + client.ocr.process.side_effect = RuntimeError("ocr boom") + mod = types.ModuleType("mistralai"); mod.Mistral = MagicMock(return_value=client) + monkeypatch.setitem(sys.modules, "mistralai", mod) + from openkb.parsers.mistral import MistralParser + src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF") + with pytest.raises(RuntimeError): + MistralParser({}).parse(src) + client.files.delete.assert_called_once_with(file_id="file-2")