diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 219277cc..84926e6f 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -8,6 +8,7 @@ from agents import ToolOutputImage, ToolOutputText from openkb.agent.tools import ( get_wiki_page_content, + grep_wiki_files, read_wiki_file, read_wiki_image, write_kb_file, @@ -38,7 +39,21 @@ ranges to help you target. Never fetch the whole document. 6. Source content may reference images (e.g. ![image](sources/images/doc/file.png)). Use the get_image tool to view them when needed. -7. Synthesize a clear, concise, well-cited answer grounded in wiki content. +7. DRILL FOR DETAIL with grep_wiki (after reading the curated pages above): + summaries are lossy, so when the question needs specifics they do not + fully contain — numbers, names, exact claims, edge cases — use grep_wiki + to LOCATE which pages hold them. grep is lexical, so try a few term + variants: acronym and expansion, singular/plural, close synonyms. Treat + the results as a reading list: each line is `path:line:text` — for every + relevant page you have NOT already read in full, read_file that path + (everything before the first colon) and extract the detail. Do NOT answer + from the grep line alone; open the page. If a page contradicts what you + already have, note both claims with their citations rather than silently + choosing one. Repeat locate-then-read until the pages that actually + contain the needed detail have been read (at most 3 grep rounds; stop once + a round surfaces no new relevant page). grep_wiki complements index.md and + summaries (your starting point) — it does not replace them. +8. Synthesize a clear, concise, well-cited answer grounded in wiki content. Answer based only on wiki content. Be concise. Before each tool call, output one short sentence explaining the reason. @@ -87,12 +102,39 @@ def get_image(image_path: str) -> ToolOutputImage | ToolOutputText: return ToolOutputImage(image_url=result["image_url"]) return ToolOutputText(text=result["text"]) + @function_tool + def grep_wiki(pattern: str, ignore_case: bool = True, fixed_string: bool = False) -> str: + """Locate wiki pages that contain specific detail, by lexical grep. + + Use this to FIND which pages hold specifics the summaries lack — + numbers, names, exact claims, edge cases — then read_file those pages + to extract the detail. It searches every wiki .md file (including + short-doc sources/); it does NOT search long-document page content + (use get_page_content for that). + + Returns up to 50 matches, one per line as 'path.md:LINE:text'. Each + result is a page to OPEN, not an answer: take the path (everything + before the FIRST colon) and read_file it — do not answer from the grep + line alone. Pattern is an extended regex (ERE): alternation 'a|b', '?', + '+', '()' work; set fixed_string=True for a literal search. Try a few + term variants (acronym/expansion, singular/plural, synonyms) — this is + lexical, not semantic. + + Args: + pattern: Search pattern (extended regex by default). + ignore_case: Case-insensitive (default True). + fixed_string: Treat pattern as a literal string, not a regex. + """ + return grep_wiki_files( + pattern, wiki_root, ignore_case=ignore_case, fixed_string=fixed_string, + ) + from agents.model_settings import ModelSettings return Agent( name="wiki-query", instructions=instructions, - tools=[read_file, get_page_content, get_image], + tools=[read_file, get_page_content, get_image, grep_wiki], model=f"litellm/{model}", model_settings=ModelSettings(parallel_tool_calls=False), ) diff --git a/openkb/agent/tools.py b/openkb/agent/tools.py index f954623f..0af9a63b 100644 --- a/openkb/agent/tools.py +++ b/openkb/agent/tools.py @@ -7,9 +7,19 @@ from __future__ import annotations import contextlib +import functools import json as _json +import os +import shutil +import subprocess from pathlib import Path +from openkb.schema import EXCLUDED_WIKI_FILES + +# grep_wiki_files tuning +_GREP_MAX_LINES = 50 +_GREP_TIMEOUT_S = 10 + def list_wiki_files(directory: str, wiki_root: str) -> str: """List all Markdown files in a wiki subdirectory. @@ -54,6 +64,108 @@ def read_wiki_file(path: str, wiki_root: str) -> str: return full_path.read_text(encoding="utf-8") +@functools.cache +def _grep_binary() -> str | None: + """Locate the system grep once per process (PATH does not change at runtime).""" + return shutil.which("grep") + + +def grep_wiki_files( + pattern: str, + wiki_root: str, + *, + ignore_case: bool = True, + fixed_string: bool = False, +) -> str: + """Lexically search the wiki's markdown layer for ``pattern`` using grep. + + A completeness sweep over every ``*.md`` file under *wiki_root* — + summaries, concepts, entities, explorations, ``index.md``, and short-doc + ``sources/*.md``. Long-doc per-page ``*.json`` (PageIndex's domain) is + excluded (only ``*.md`` is searched), as are the wiki's bookkeeping / + scaffolding files (``log.md``, ``AGENTS.md``, ``SCHEMA.md`` — see + :data:`openkb.schema.EXCLUDED_WIKI_FILES`). + + Shells out to the system ``grep`` (POSIX, ubiquitous on macOS/Linux) with + ``shell=False``, so a hostile *pattern* cannot inject commands. ``pattern`` + is an **extended** regular expression (ERE) by default — alternation + ``a|b``, ``?``, ``+``, ``()`` all work — or a literal string when + *fixed_string* is True. + + Args: + pattern: Search pattern. ERE by default; literal when *fixed_string*. + wiki_root: Absolute path to the wiki root directory. + ignore_case: Case-insensitive match (default True). + fixed_string: Treat *pattern* as a literal string, not a regex. + + Returns: + Up to :data:`_GREP_MAX_LINES` matches, each line ``relative/path.md:LINE:text`` + (the path is everything before the first colon), plus a truncation + notice if capped. On empty pattern / no match / missing grep / timeout / + error-with-no-results, returns an explicit message string. Never raises. + """ + if not pattern or not pattern.strip(): + return "Provide a non-empty search pattern." + + root = Path(wiki_root).resolve() + if not root.exists(): + return f"Wiki root not found: {wiki_root}" + + grep = _grep_binary() + if not grep: + return "grep unavailable on this system." + + cmd = [grep, "-rn", "--include=*.md", "--exclude-dir=images", "--exclude-dir=.git"] + for name in sorted(EXCLUDED_WIKI_FILES): + cmd.append(f"--exclude={name}") + if ignore_case: + cmd.append("-i") + cmd.append("-F" if fixed_string else "-E") + cmd += ["-e", pattern, str(root)] + + try: + proc = subprocess.run( + cmd, capture_output=True, text=True, errors="replace", + timeout=_GREP_TIMEOUT_S, check=False, + ) + except subprocess.TimeoutExpired: + return "grep timed out; narrow the pattern." + + prefix = str(root) + os.sep + results: list[str] = [] + for line in proc.stdout.splitlines(): + if not line: + continue + if not line.startswith(prefix): + continue # defensive: only surface paths under wiki_root + rel = line[len(prefix):] + path_part = rel.split(":", 1)[0] + # Defense in depth: --exclude already drops these basenames; this also + # catches a same-named file in a subdirectory. + if Path(path_part).name in EXCLUDED_WIKI_FILES: + continue + results.append(rel) + if len(results) > _GREP_MAX_LINES: + break # only need 51 to detect truncation; stop processing + + if not results: + # grep exit codes: 0 = match, 1 = no match, >=2 = error. grep can exit + # >=2 (e.g. one unreadable file) while still printing valid matches — + # those were collected above. Only report an error when nothing usable + # came back. + if proc.returncode >= 2: + stderr_lines = (proc.stderr or "").strip().splitlines() + first = stderr_lines[0] if stderr_lines else "unknown error" + return f"grep error: {first}." + return f"No matches for {pattern}." + + truncated = len(results) > _GREP_MAX_LINES + out = "\n".join(results[:_GREP_MAX_LINES]) + if truncated: + out += "\n… more matches; narrow the pattern." + return out + + def parse_pages(pages: str) -> list[int]: """Parse a page specification string into a sorted, deduplicated list of page numbers. diff --git a/openkb/lint.py b/openkb/lint.py index 2ac6af1d..5e8d0af1 100644 --- a/openkb/lint.py +++ b/openkb/lint.py @@ -15,13 +15,13 @@ import yaml -from openkb.schema import PAGE_CONTENT_DIRS +from openkb.schema import EXCLUDED_WIKI_FILES, PAGE_CONTENT_DIRS # Matches [[wikilink]] or [[subdir/link]] _WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]") # Files to exclude from lint scanning (schema, logs, etc.) -_EXCLUDED_FILES = {"AGENTS.md", "SCHEMA.md", "log.md"} +_EXCLUDED_FILES = EXCLUDED_WIKI_FILES def _normalize_target(target: str) -> str: diff --git a/openkb/schema.py b/openkb/schema.py index 8a6322e6..ced01ffb 100644 --- a/openkb/schema.py +++ b/openkb/schema.py @@ -6,6 +6,11 @@ # for surfaces that enumerate page content (list, lint, status, skill gate). PAGE_CONTENT_DIRS = ("summaries", "concepts", "entities") +# Bookkeeping / scaffolding files that live under wiki/ but are NOT content. +# Single source of truth shared by the structural linter and the grep search +# tool so their exclusion policy can never drift. +EXCLUDED_WIKI_FILES: frozenset[str] = frozenset({"AGENTS.md", "SCHEMA.md", "log.md"}) + # Canonical empty index.md seed. Used by `openkb init` and the compiler's # lazy-create path so they never drift. INDEX_SEED = "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Entities\n\n## Explorations\n" diff --git a/tests/test_grep.py b/tests/test_grep.py new file mode 100644 index 00000000..65fa19e6 --- /dev/null +++ b/tests/test_grep.py @@ -0,0 +1,331 @@ +"""Tests for openkb.agent.tools.grep_wiki_files — grep-based wiki search.""" +from __future__ import annotations + +import shutil + +import pytest + +import openkb.agent.tools as tools_mod +from openkb.agent.tools import grep_wiki_files + +_HAS_GREP = shutil.which("grep") is not None +requires_grep = pytest.mark.skipif(not _HAS_GREP, reason="system grep not available") + + +def _wiki(tmp_path): + """Build a minimal wiki/ tree and return its root as a string.""" + root = tmp_path / "wiki" + (root / "summaries").mkdir(parents=True) + (root / "concepts").mkdir(parents=True) + (root / "entities").mkdir(parents=True) + (root / "sources" / "images").mkdir(parents=True) + (root / "summaries" / "paper.md").write_text( + "# Paper\nThe transformer architecture uses self-attention.\n", + encoding="utf-8", + ) + (root / "concepts" / "attention.md").write_text( + "# Attention\nScaled dot-product Attention is central.\n", + encoding="utf-8", + ) + (root / "entities" / "vaswani.md").write_text( + "# Vaswani\nAshish Vaswani is a lead author.\n", encoding="utf-8", + ) + (root / "sources" / "note.md").write_text( + "Short note: the lottery ticket hypothesis appears here only.\n" + "It also discusses a large language model in passing.\n", + encoding="utf-8", + ) + # Long-doc per-page JSON — never grepped (only *.md is searched). + (root / "sources" / "book.json").write_text( + '[{"page": 1, "text": "transformer secret in json"}]\n', + encoding="utf-8", + ) + # Bookkeeping / scaffolding — never grepped. + (root / "log.md").write_text( + "# Operations Log\n## [2026-01-01] ingest | transformer\n", + encoding="utf-8", + ) + (root / "AGENTS.md").write_text( + "# Schema\nThis schema describes synthesis and transformer concepts.\n", + encoding="utf-8", + ) + (root / "SCHEMA.md").write_text( + "# Schema alias\nMentions transformer too.\n", encoding="utf-8", + ) + return str(root) + + +# --- scope: what gets matched ------------------------------------------------- + +@requires_grep +def test_finds_match_in_summaries(tmp_path): + out = grep_wiki_files("self-attention", _wiki(tmp_path)) + assert "summaries/paper.md:" in out + assert "self-attention" in out + + +@requires_grep +def test_finds_match_in_concepts(tmp_path): + out = grep_wiki_files("Scaled dot-product", _wiki(tmp_path)) + assert "concepts/attention.md:" in out + + +@requires_grep +def test_finds_match_in_entities(tmp_path): + out = grep_wiki_files("Ashish Vaswani", _wiki(tmp_path)) + assert "entities/vaswani.md:" in out + + +@requires_grep +def test_finds_match_in_short_source_md(tmp_path): + out = grep_wiki_files("lottery ticket", _wiki(tmp_path)) + assert "sources/note.md:" in out + + +# --- scope: what gets excluded ------------------------------------------------ + +@requires_grep +def test_excludes_long_doc_json(tmp_path): + out = grep_wiki_files("transformer", _wiki(tmp_path)) + assert "book.json" not in out + + +@requires_grep +def test_excludes_log_md(tmp_path): + out = grep_wiki_files("transformer", _wiki(tmp_path)) + assert "log.md" not in out + + +@requires_grep +def test_excludes_agents_md(tmp_path): + # AGENTS.md contains 'synthesis' and 'transformer' but is scaffolding. + out = grep_wiki_files("synthesis", _wiki(tmp_path)) + assert "AGENTS.md" not in out + assert out == "No matches for synthesis." + + +@requires_grep +def test_excludes_schema_md(tmp_path): + out = grep_wiki_files("transformer", _wiki(tmp_path)) + assert "SCHEMA.md" not in out + + +@requires_grep +def test_excludes_images_dir(tmp_path): + wiki = _wiki(tmp_path) + (tmp_path / "wiki" / "sources" / "images" / "caption.md").write_text( + "transformer figure caption\n", encoding="utf-8", + ) + out = grep_wiki_files("transformer", wiki) + assert "images/" not in out + + +# --- regex dialect ------------------------------------------------------------ + +@requires_grep +def test_ere_alternation_matches(tmp_path): + # ERE alternation must work (regression for the BRE-vs-Rust-regex bug). + out = grep_wiki_files("LLM|large language model", _wiki(tmp_path)) + assert "sources/note.md:" in out + + +@requires_grep +def test_fixed_string_treats_pipe_literally(tmp_path): + # As a literal, 'LLM|large language model' does not appear anywhere. + out = grep_wiki_files("LLM|large language model", _wiki(tmp_path), fixed_string=True) + assert out == "No matches for LLM|large language model." + + +@requires_grep +def test_fixed_string_vs_regex_dot(tmp_path): + wiki = _wiki(tmp_path) + # literal 'self.attention' does not appear (text has 'self-attention') + assert grep_wiki_files("self.attention", wiki, fixed_string=True) == \ + "No matches for self.attention." + # as a regex, '.' matches the hyphen + assert "summaries/paper.md:" in grep_wiki_files("self.attention", wiki, fixed_string=False) + + +# --- case sensitivity --------------------------------------------------------- + +@requires_grep +def test_case_insensitive_by_default(tmp_path): + out = grep_wiki_files("TRANSFORMER", _wiki(tmp_path)) + assert "summaries/paper.md:" in out + + +@requires_grep +def test_case_sensitive_when_disabled(tmp_path): + out = grep_wiki_files("TRANSFORMER", _wiki(tmp_path), ignore_case=False) + assert out == "No matches for TRANSFORMER." + + +# --- guards / messages -------------------------------------------------------- + +@requires_grep +def test_no_match_returns_message(tmp_path): + out = grep_wiki_files("nonexistentterm12345", _wiki(tmp_path)) + assert out == "No matches for nonexistentterm12345." + + +def test_empty_pattern_guarded(tmp_path): + out = grep_wiki_files("", _wiki(tmp_path)) + assert out == "Provide a non-empty search pattern." + + +def test_whitespace_pattern_guarded(tmp_path): + out = grep_wiki_files(" ", _wiki(tmp_path)) + assert out == "Provide a non-empty search pattern." + + +def test_grep_unavailable_returns_message(tmp_path, monkeypatch): + monkeypatch.setattr(tools_mod, "_grep_binary", lambda: None) + out = grep_wiki_files("transformer", _wiki(tmp_path)) + assert out == "grep unavailable on this system." + + +# --- paths -------------------------------------------------------------------- + +@requires_grep +def test_paths_are_relative_to_wiki_root(tmp_path): + wiki = _wiki(tmp_path) + out = grep_wiki_files("self-attention", wiki) + assert wiki not in out # no absolute-path leak + # order-independent: the summaries hit is present as a relative path + assert any(ln.startswith("summaries/paper.md:") for ln in out.splitlines()) + + +@requires_grep +def test_result_cap_and_truncation_notice(tmp_path): + wiki = _wiki(tmp_path) + big = "\n".join(f"line {i} needle" for i in range(60)) + (tmp_path / "wiki" / "summaries" / "big.md").write_text(big + "\n", encoding="utf-8") + out = grep_wiki_files("needle", wiki) + lines = out.splitlines() + assert lines[-1] == "… more matches; narrow the pattern." + assert len(lines) == 51 + + +# --- safety ------------------------------------------------------------------- + +@requires_grep +def test_shell_metacharacters_do_not_execute(tmp_path): + wiki = _wiki(tmp_path) + sentinel = tmp_path / "pwned" + grep_wiki_files("; touch " + str(sentinel), wiki) + assert not sentinel.exists() + + +@requires_grep +def test_non_utf8_bytes_do_not_raise(tmp_path): + wiki = _wiki(tmp_path) + # A matched line with a non-UTF-8 byte must not raise (errors='replace'). + (tmp_path / "wiki" / "summaries" / "latin.md").write_bytes( + b"caf\xe9 transformer here\n" + ) + out = grep_wiki_files("transformer", wiki) # must return a string, not raise + assert isinstance(out, str) + assert "summaries/" in out + + +# --- command construction (binary-agnostic, no real grep needed) -------------- + +def test_grep_command_built_with_ere_and_excludes(tmp_path, monkeypatch): + wiki = _wiki(tmp_path) + monkeypatch.setattr(tools_mod, "_grep_binary", lambda: "/usr/bin/grep") + captured = {} + + class _FakeProc: + returncode = 1 + stdout = "" + stderr = "" + + def _fake_run(cmd, *args, **kwargs): + captured["cmd"] = cmd + captured["shell"] = kwargs.get("shell", False) + return _FakeProc() + + monkeypatch.setattr(tools_mod.subprocess, "run", _fake_run) + grep_wiki_files("needle", wiki, ignore_case=True, fixed_string=False) + + cmd = captured["cmd"] + assert cmd[0] == "/usr/bin/grep" + assert captured["shell"] is False + assert "-rn" in cmd + assert "--include=*.md" in cmd + assert "-i" in cmd + assert "-E" in cmd and "-F" not in cmd + for name in ("AGENTS.md", "SCHEMA.md", "log.md"): + assert f"--exclude={name}" in cmd + assert cmd[-3] == "-e" + assert cmd[-2] == "needle" + from pathlib import Path as _P + assert cmd[-1] == str(_P(wiki).resolve()) + assert "--exclude-dir=images" in cmd + assert "--exclude-dir=.git" in cmd + + +def test_grep_command_uses_F_when_fixed_and_omits_i(tmp_path, monkeypatch): + wiki = _wiki(tmp_path) + monkeypatch.setattr(tools_mod, "_grep_binary", lambda: "/usr/bin/grep") + captured = {} + + class _FakeProc: + returncode = 1 + stdout = "" + stderr = "" + + monkeypatch.setattr( + tools_mod.subprocess, "run", + lambda cmd, *a, **k: (captured.__setitem__("cmd", cmd) or _FakeProc()), + ) + grep_wiki_files("a|b", wiki, ignore_case=False, fixed_string=True) + cmd = captured["cmd"] + assert "-F" in cmd and "-E" not in cmd + assert "-i" not in cmd + + +# --- returncode handling ------------------------------------------------------ + +def test_partial_error_preserves_matches(tmp_path, monkeypatch): + """grep exit >=2 (e.g. one unreadable file) must NOT discard valid matches.""" + wiki = _wiki(tmp_path) + root_str = str((tmp_path / "wiki").resolve()) + monkeypatch.setattr(tools_mod, "_grep_binary", lambda: "/usr/bin/grep") + + class _FakeProc: + returncode = 2 + stdout = f"{root_str}/summaries/paper.md:2:self-attention here\n" + stderr = "grep: /x/locked: Permission denied" + + monkeypatch.setattr(tools_mod.subprocess, "run", lambda *a, **k: _FakeProc()) + out = grep_wiki_files("self-attention", wiki) + assert "summaries/paper.md:" in out + assert "grep error" not in out + + +def test_error_with_no_results_returns_error(tmp_path, monkeypatch): + wiki = _wiki(tmp_path) + monkeypatch.setattr(tools_mod, "_grep_binary", lambda: "/usr/bin/grep") + + class _FakeProc: + returncode = 2 + stdout = "" + stderr = "grep: something broke\nsecond line" + + monkeypatch.setattr(tools_mod.subprocess, "run", lambda *a, **k: _FakeProc()) + out = grep_wiki_files("whatever", wiki) + assert out == "grep error: grep: something broke." + + +def test_timeout_returns_message(tmp_path, monkeypatch): + import subprocess as _sp + wiki = _wiki(tmp_path) + monkeypatch.setattr(tools_mod, "_grep_binary", lambda: "/usr/bin/grep") + + def _raise_timeout(*a, **k): + raise _sp.TimeoutExpired(cmd="grep", timeout=10) + + monkeypatch.setattr(tools_mod.subprocess, "run", _raise_timeout) + out = grep_wiki_files("transformer", wiki) + assert out == "grep timed out; narrow the pattern." diff --git a/tests/test_query.py b/tests/test_query.py index e9585d32..cb32de9e 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -16,9 +16,9 @@ def test_agent_name(self, tmp_path): agent = build_query_agent(str(tmp_path), "gpt-4o-mini") assert agent.name == "wiki-query" - def test_agent_has_three_tools(self, tmp_path): + def test_agent_has_four_tools(self, tmp_path): agent = build_query_agent(str(tmp_path), "gpt-4o-mini") - assert len(agent.tools) == 3 + assert len(agent.tools) == 4 def test_agent_tool_names(self, tmp_path): agent = build_query_agent(str(tmp_path), "gpt-4o-mini") @@ -26,6 +26,7 @@ def test_agent_tool_names(self, tmp_path): assert "read_file" in names assert "get_page_content" in names assert "get_image" in names + assert "grep_wiki" in names def test_instructions_mention_get_page_content(self, tmp_path): agent = build_query_agent(str(tmp_path), "gpt-4o-mini")