From 1c31a7fedb057eb3d7503e8d8607a0c9da608866 Mon Sep 17 00:00:00 2001 From: mountain Date: Tue, 2 Jun 2026 12:34:55 +0800 Subject: [PATCH 1/8] feat(agent): add grep_wiki_files lexical wiki search helper --- openkb/agent/tools.py | 100 ++++++++++++++++++++++++++++++++++++ tests/test_grep.py | 114 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 214 insertions(+) create mode 100644 tests/test_grep.py diff --git a/openkb/agent/tools.py b/openkb/agent/tools.py index f954623f..8cc06704 100644 --- a/openkb/agent/tools.py +++ b/openkb/agent/tools.py @@ -8,8 +8,14 @@ import contextlib import json as _json +import shutil +import subprocess from pathlib import Path +# grep_wiki_files tuning +_GREP_MAX_LINES = 50 +_GREP_TIMEOUT_S = 10 + def list_wiki_files(directory: str, wiki_root: str) -> str: """List all Markdown files in a wiki subdirectory. @@ -54,6 +60,100 @@ def read_wiki_file(path: str, wiki_root: str) -> str: return full_path.read_text(encoding="utf-8") +def grep_wiki_files( + pattern: str, + wiki_root: str, + *, + ignore_case: bool = True, + fixed_string: bool = False, +) -> str: + """Lexically search the wiki's markdown layer for ``pattern``. + + A completeness sweep: shells out to ripgrep (preferred) or grep + (fallback) over every ``*.md`` file under *wiki_root* — summaries, + concepts, entities, explorations, ``index.md``, and short-doc + ``sources/*.md``. Long-doc per-page ``*.json`` (PageIndex's domain) and + ``log.md`` bookkeeping are excluded. + + Args: + pattern: Search pattern. Regex by default; literal when + *fixed_string* is True. + wiki_root: Absolute path to the wiki root directory. + ignore_case: Case-insensitive match (default True). + fixed_string: Treat *pattern* as a literal string, not a regex. + + Returns: + Up to :data:`_GREP_MAX_LINES` matches as ``relative/path.md:LINE: text`` + lines, plus a truncation notice if capped. On no match / missing + binary / timeout / error, returns an explicit message string. Never + raises and never invokes a shell (``shell=False``), so a hostile + *pattern* cannot inject commands. + """ + root = Path(wiki_root).resolve() + if not root.exists(): + return f"Wiki root not found: {wiki_root}" + + rg = shutil.which("rg") + grep = shutil.which("grep") + + if rg: + # --no-ignore: the wiki dir is often gitignored; without this rg + # silently returns zero matches inside a real OpenKB checkout. + cmd = [ + rg, "--line-number", "--no-heading", "--color", "never", + "--no-ignore", "-g", "*.md", "-g", "!log.md", + ] + if ignore_case: + cmd.append("-i") + if fixed_string: + cmd.append("-F") + cmd += ["-e", pattern, str(root)] + elif grep: + cmd = [grep, "-rn", "--include=*.md", "--exclude-dir=images"] + if ignore_case: + cmd.append("-i") + if fixed_string: + cmd.append("-F") + cmd += ["-e", pattern, str(root)] + else: + return "grep unavailable on this system." + + try: + proc = subprocess.run( + cmd, capture_output=True, text=True, + timeout=_GREP_TIMEOUT_S, check=False, + ) + except subprocess.TimeoutExpired: + return "grep timed out; narrow the pattern." + + # rg/grep convention: 0 = matches, 1 = no matches, >=2 = real error. + if proc.returncode >= 2: + stderr_lines = (proc.stderr or "").strip().splitlines() + first = stderr_lines[0] if stderr_lines else "unknown error" + return f"grep error: {first}." + + prefix = str(root) + "/" + results: list[str] = [] + for line in proc.stdout.splitlines(): + if not line.strip(): + continue + rel = line[len(prefix):] if line.startswith(prefix) else line + path_part = rel.split(":", 1)[0] + # Defensive: grep --include=*.md still matches log.md; drop it. + if path_part == "log.md" or path_part.endswith("/log.md"): + continue + results.append(rel) + + if not results: + return f"No matches for {pattern}." + + truncated = len(results) > _GREP_MAX_LINES + out = "\n".join(results[:_GREP_MAX_LINES]) + if truncated: + out += "\n… more matches; narrow the pattern." + return out + + def parse_pages(pages: str) -> list[int]: """Parse a page specification string into a sorted, deduplicated list of page numbers. diff --git a/tests/test_grep.py b/tests/test_grep.py new file mode 100644 index 00000000..322af7f2 --- /dev/null +++ b/tests/test_grep.py @@ -0,0 +1,114 @@ +"""Tests for openkb.agent.tools.grep_wiki_files — lexical wiki search.""" +from __future__ import annotations + +from openkb.agent.tools import grep_wiki_files + + +def _wiki(tmp_path): + """Build a minimal wiki/ tree and return its root as a string.""" + root = tmp_path / "wiki" + (root / "summaries").mkdir(parents=True) + (root / "concepts").mkdir(parents=True) + (root / "entities").mkdir(parents=True) + (root / "sources" / "images").mkdir(parents=True) + (root / "summaries" / "paper.md").write_text( + "# Paper\nThe transformer architecture uses self-attention.\n", + encoding="utf-8", + ) + (root / "concepts" / "attention.md").write_text( + "# Attention\nScaled dot-product Attention is central.\n", + encoding="utf-8", + ) + (root / "sources" / "note.md").write_text( + "Short note: the lottery ticket hypothesis appears here only.\n", + encoding="utf-8", + ) + # Long-doc per-page JSON — must NEVER be grepped. + (root / "sources" / "book.json").write_text( + '[{"page": 1, "text": "transformer secret in json"}]\n', + encoding="utf-8", + ) + # Bookkeeping — must NEVER be grepped. + (root / "log.md").write_text( + "# Operations Log\n## [2026-01-01] ingest | transformer\n", + encoding="utf-8", + ) + return str(root) + + +def test_finds_match_in_summaries(tmp_path): + wiki = _wiki(tmp_path) + out = grep_wiki_files("self-attention", wiki) + assert "summaries/paper.md:" in out + assert "self-attention" in out + + +def test_finds_match_in_short_source_md(tmp_path): + wiki = _wiki(tmp_path) + out = grep_wiki_files("lottery ticket", wiki) + assert "sources/note.md:" in out + + +def test_excludes_long_doc_json(tmp_path): + wiki = _wiki(tmp_path) + out = grep_wiki_files("transformer", wiki) + assert "book.json" not in out + + +def test_excludes_log_md(tmp_path): + wiki = _wiki(tmp_path) + out = grep_wiki_files("transformer", wiki) + assert "log.md" not in out + + +def test_case_insensitive_by_default(tmp_path): + wiki = _wiki(tmp_path) + out = grep_wiki_files("TRANSFORMER", wiki) + assert "summaries/paper.md:" in out + + +def test_case_sensitive_when_disabled(tmp_path): + wiki = _wiki(tmp_path) + out = grep_wiki_files("TRANSFORMER", wiki, ignore_case=False) + assert out == "No matches for TRANSFORMER." + + +def test_fixed_string_treats_regex_literally(tmp_path): + wiki = _wiki(tmp_path) + out = grep_wiki_files("self.attention", wiki, fixed_string=True) + # literal "self.attention" does not appear (text has "self-attention") + assert out == "No matches for self.attention." + # but as a regex, "." matches the hyphen + out2 = grep_wiki_files("self.attention", wiki, fixed_string=False) + assert "summaries/paper.md:" in out2 + + +def test_no_match_returns_message(tmp_path): + wiki = _wiki(tmp_path) + out = grep_wiki_files("nonexistentterm12345", wiki) + assert out == "No matches for nonexistentterm12345." + + +def test_paths_are_relative_to_wiki_root(tmp_path): + wiki = _wiki(tmp_path) + out = grep_wiki_files("self-attention", wiki) + assert wiki not in out + assert out.splitlines()[0].startswith("summaries/") + + +def test_result_cap_and_truncation_notice(tmp_path): + wiki = _wiki(tmp_path) + root = tmp_path / "wiki" / "summaries" + big = "\n".join(f"line {i} needle" for i in range(60)) + (root / "big.md").write_text(big + "\n", encoding="utf-8") + out = grep_wiki_files("needle", wiki) + lines = out.splitlines() + assert lines[-1] == "… more matches; narrow the pattern." + assert len(lines) == 51 + + +def test_shell_metacharacters_do_not_execute(tmp_path): + wiki = _wiki(tmp_path) + sentinel = tmp_path / "pwned" + grep_wiki_files("; touch " + str(sentinel), wiki) + assert not sentinel.exists() From e9de960453922fbd9217f3e1fe9df4201512520e Mon Sep 17 00:00:00 2001 From: mountain Date: Tue, 2 Jun 2026 12:40:14 +0800 Subject: [PATCH 2/8] test(agent): cover ripgrep branch + entities scope in grep_wiki_files --- tests/test_grep.py | 72 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tests/test_grep.py b/tests/test_grep.py index 322af7f2..31c65e47 100644 --- a/tests/test_grep.py +++ b/tests/test_grep.py @@ -112,3 +112,75 @@ def test_shell_metacharacters_do_not_execute(tmp_path): sentinel = tmp_path / "pwned" grep_wiki_files("; touch " + str(sentinel), wiki) assert not sentinel.exists() + + +def test_rg_branch_builds_expected_command(tmp_path, monkeypatch): + import subprocess as _sp + import openkb.agent.tools as tools_mod + + wiki = _wiki(tmp_path) + monkeypatch.setattr(tools_mod.shutil, "which", lambda name: "/usr/bin/rg" if name == "rg" else None) + + captured = {} + + class _FakeProc: + returncode = 0 + stdout = "" + stderr = "" + + def _fake_run(cmd, *args, **kwargs): + captured["cmd"] = cmd + captured["shell"] = kwargs.get("shell", False) + return _FakeProc() + + monkeypatch.setattr(tools_mod.subprocess, "run", _fake_run) + + tools_mod.grep_wiki_files("needle", wiki, ignore_case=True, fixed_string=True) + + cmd = captured["cmd"] + # rg binary chosen, shell never used + assert cmd[0] == "/usr/bin/rg" + assert captured["shell"] is False + # load-bearing flags present + assert "--no-ignore" in cmd + assert "--line-number" in cmd + assert "--no-heading" in cmd + assert "-i" in cmd # ignore_case + assert "-F" in cmd # fixed_string + # md include + log.md exclude globs + assert cmd[cmd.index("-g") :].count("-g") == 2 or cmd.count("-g") == 2 + assert "*.md" in cmd + assert "!log.md" in cmd + # pattern passed via -e as a separate argv (injection-safe), root last + assert cmd[-3] == "-e" + assert cmd[-2] == "needle" + assert cmd[-1] == str(__import__("pathlib").Path(wiki).resolve()) + + +def test_rg_branch_omits_flags_when_disabled(tmp_path, monkeypatch): + import openkb.agent.tools as tools_mod + + wiki = _wiki(tmp_path) + monkeypatch.setattr(tools_mod.shutil, "which", lambda name: "/usr/bin/rg" if name == "rg" else None) + captured = {} + + class _FakeProc: + returncode = 1 + stdout = "" + stderr = "" + + monkeypatch.setattr(tools_mod.subprocess, "run", lambda cmd, *a, **k: (captured.__setitem__("cmd", cmd) or _FakeProc())) + + tools_mod.grep_wiki_files("needle", wiki, ignore_case=False, fixed_string=False) + cmd = captured["cmd"] + assert "-i" not in cmd + assert "-F" not in cmd + + +def test_finds_match_in_entities(tmp_path): + wiki = _wiki(tmp_path) + (tmp_path / "wiki" / "entities" / "vaswani.md").write_text( + "# Vaswani\nAshish Vaswani is a lead author.\n", encoding="utf-8", + ) + out = grep_wiki_files("Ashish Vaswani", wiki) + assert "entities/vaswani.md:" in out From ea90b001309fb9dd315e6ab37117608524385536 Mon Sep 17 00:00:00 2001 From: mountain Date: Tue, 2 Jun 2026 12:41:39 +0800 Subject: [PATCH 3/8] feat(agent): expose grep_wiki tool on the query agent --- openkb/agent/query.py | 27 ++++++++++++++++++++++++++- tests/test_query.py | 5 +++-- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 219277cc..53ec4b53 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -8,6 +8,7 @@ from agents import ToolOutputImage, ToolOutputText from openkb.agent.tools import ( get_wiki_page_content, + grep_wiki_files, read_wiki_file, read_wiki_image, write_kb_file, @@ -87,12 +88,36 @@ def get_image(image_path: str) -> ToolOutputImage | ToolOutputText: return ToolOutputImage(image_url=result["image_url"]) return ToolOutputText(text=result["text"]) + @function_tool + def grep_wiki(pattern: str, ignore_case: bool = True, fixed_string: bool = False) -> str: + """Lexically grep the wiki's markdown for a pattern. + + Use this as a FINAL completeness check, after you have drafted an + answer from index.md / summaries / concepts / entities. It searches + every wiki .md file (including short-doc sources/) for the literal + terms of the question — catching details the summaries compressed + away, pages you never opened, or contradicting mentions. It does NOT + search long-document page content (use get_page_content for that). + + Returns up to 50 matches as 'relative/path.md:LINE: text'. Feed any + new path into read_file. Try a few term variants (acronym/expansion, + singular/plural, synonyms) — this is lexical, not semantic. + + Args: + pattern: Search pattern (regex by default). + ignore_case: Case-insensitive (default True). + fixed_string: Treat pattern as a literal string, not a regex. + """ + return grep_wiki_files( + pattern, wiki_root, ignore_case=ignore_case, fixed_string=fixed_string, + ) + from agents.model_settings import ModelSettings return Agent( name="wiki-query", instructions=instructions, - tools=[read_file, get_page_content, get_image], + tools=[read_file, get_page_content, get_image, grep_wiki], model=f"litellm/{model}", model_settings=ModelSettings(parallel_tool_calls=False), ) diff --git a/tests/test_query.py b/tests/test_query.py index e9585d32..cb32de9e 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -16,9 +16,9 @@ def test_agent_name(self, tmp_path): agent = build_query_agent(str(tmp_path), "gpt-4o-mini") assert agent.name == "wiki-query" - def test_agent_has_three_tools(self, tmp_path): + def test_agent_has_four_tools(self, tmp_path): agent = build_query_agent(str(tmp_path), "gpt-4o-mini") - assert len(agent.tools) == 3 + assert len(agent.tools) == 4 def test_agent_tool_names(self, tmp_path): agent = build_query_agent(str(tmp_path), "gpt-4o-mini") @@ -26,6 +26,7 @@ def test_agent_tool_names(self, tmp_path): assert "read_file" in names assert "get_page_content" in names assert "get_image" in names + assert "grep_wiki" in names def test_instructions_mention_get_page_content(self, tmp_path): agent = build_query_agent(str(tmp_path), "gpt-4o-mini") From 8abd631a51b1e5f367ecb568aee1e54340a993f1 Mon Sep 17 00:00:00 2001 From: mountain Date: Tue, 2 Jun 2026 12:44:31 +0800 Subject: [PATCH 4/8] feat(agent): instruct query agent to grep-sweep for completeness --- openkb/agent/query.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 53ec4b53..3b6733aa 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -39,7 +39,17 @@ ranges to help you target. Never fetch the whole document. 6. Source content may reference images (e.g. ![image](sources/images/doc/file.png)). Use the get_image tool to view them when needed. -7. Synthesize a clear, concise, well-cited answer grounded in wiki content. +7. COMPLETENESS SWEEP (do this before finalizing): the summary layer is + lossy, so before you commit to an answer, call grep_wiki for the salient + terms of the question and your draft — proper nouns, technical terms, + numbers, key entities. Because grep is lexical (not semantic), try a few + term variants: acronym and expansion, singular/plural, close synonyms. + For any matching page you have NOT already read, read_file it and fold in + relevant content. If grep surfaces a claim that contradicts your draft, + surface the conflict rather than silently choosing one. Do at most 3 grep + rounds; stop once a round adds nothing new. grep_wiki is a check, not the + primary search — index.md and summaries still come first. +8. Synthesize a clear, concise, well-cited answer grounded in wiki content. Answer based only on wiki content. Be concise. Before each tool call, output one short sentence explaining the reason. From 8da312acfae7863824edc337245567fca4a783d5 Mon Sep 17 00:00:00 2001 From: mountain Date: Tue, 2 Jun 2026 12:48:24 +0800 Subject: [PATCH 5/8] refine(agent): sharpen grep-sweep instruction (scope terms, define round) --- openkb/agent/query.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 3b6733aa..d87c5fb9 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -40,15 +40,17 @@ 6. Source content may reference images (e.g. ![image](sources/images/doc/file.png)). Use the get_image tool to view them when needed. 7. COMPLETENESS SWEEP (do this before finalizing): the summary layer is - lossy, so before you commit to an answer, call grep_wiki for the salient - terms of the question and your draft — proper nouns, technical terms, - numbers, key entities. Because grep is lexical (not semantic), try a few - term variants: acronym and expansion, singular/plural, close synonyms. - For any matching page you have NOT already read, read_file it and fold in - relevant content. If grep surfaces a claim that contradicts your draft, - surface the conflict rather than silently choosing one. Do at most 3 grep - rounds; stop once a round adds nothing new. grep_wiki is a check, not the - primary search — index.md and summaries still come first. + lossy, so before you commit to an answer, call grep_wiki for the + question's salient terms — proper nouns, technical terms, numbers, key + entities — plus any claim you asserted in your draft that you have not + yet seen on a wiki page. Because grep is lexical (not semantic), try a + few term variants: acronym and expansion, singular/plural, close + synonyms. For any matching page you have NOT already read, read_file it + and fold in relevant content. If grep surfaces a claim that contradicts + your draft, note both claims with their citations rather than silently + choosing one. Do at most 3 grep rounds (a round = one concept and its + variants); stop once a round surfaces no new page. grep_wiki is a check, + not the primary search — index.md and summaries still come first. 8. Synthesize a clear, concise, well-cited answer grounded in wiki content. Answer based only on wiki content. Be concise. From 098c7b450daf6afb5940fa61dd94dccde827ea71 Mon Sep 17 00:00:00 2001 From: mountain Date: Tue, 2 Jun 2026 17:24:31 +0800 Subject: [PATCH 6/8] =?UTF-8?q?fix(agent):=20grep-only=20wiki=20search=20?= =?UTF-8?q?=E2=80=94=20fix=20dialect,=20encoding,=20partial-error,=20exclu?= =?UTF-8?q?sions,=20guards=20(addresses=20code=20review)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- openkb/agent/query.py | 12 +- openkb/agent/tools.py | 108 +++++++++-------- openkb/lint.py | 4 +- openkb/schema.py | 5 + tests/test_grep.py | 266 +++++++++++++++++++++++++++++++----------- 5 files changed, 274 insertions(+), 121 deletions(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index d87c5fb9..3eafa53f 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -46,7 +46,8 @@ yet seen on a wiki page. Because grep is lexical (not semantic), try a few term variants: acronym and expansion, singular/plural, close synonyms. For any matching page you have NOT already read, read_file it - and fold in relevant content. If grep surfaces a claim that contradicts + (grep_wiki lines are `path:line:text`; pass only the path, before the + first colon) and fold in relevant content. If grep surfaces a claim that contradicts your draft, note both claims with their citations rather than silently choosing one. Do at most 3 grep rounds (a round = one concept and its variants); stop once a round surfaces no new page. grep_wiki is a check, @@ -111,12 +112,15 @@ def grep_wiki(pattern: str, ignore_case: bool = True, fixed_string: bool = False away, pages you never opened, or contradicting mentions. It does NOT search long-document page content (use get_page_content for that). - Returns up to 50 matches as 'relative/path.md:LINE: text'. Feed any - new path into read_file. Try a few term variants (acronym/expansion, + Returns up to 50 matches, one per line as 'path.md:LINE:text'. The + path is everything before the FIRST colon — pass only that path to + read_file (not the whole line). Pattern is an extended regex (ERE): + alternation 'a|b', '?', '+', '()' work; set fixed_string=True for a + literal search. Try a few term variants (acronym/expansion, singular/plural, synonyms) — this is lexical, not semantic. Args: - pattern: Search pattern (regex by default). + pattern: Search pattern (extended regex by default). ignore_case: Case-insensitive (default True). fixed_string: Treat pattern as a literal string, not a regex. """ diff --git a/openkb/agent/tools.py b/openkb/agent/tools.py index 8cc06704..8b88cf29 100644 --- a/openkb/agent/tools.py +++ b/openkb/agent/tools.py @@ -7,11 +7,15 @@ from __future__ import annotations import contextlib +import functools import json as _json +import os import shutil import subprocess from pathlib import Path +from openkb.schema import EXCLUDED_WIKI_FILES + # grep_wiki_files tuning _GREP_MAX_LINES = 50 _GREP_TIMEOUT_S = 10 @@ -60,6 +64,12 @@ def read_wiki_file(path: str, wiki_root: str) -> str: return full_path.read_text(encoding="utf-8") +@functools.cache +def _grep_binary() -> str | None: + """Locate the system grep once per process (PATH does not change at runtime).""" + return shutil.which("grep") + + def grep_wiki_files( pattern: str, wiki_root: str, @@ -67,84 +77,86 @@ def grep_wiki_files( ignore_case: bool = True, fixed_string: bool = False, ) -> str: - """Lexically search the wiki's markdown layer for ``pattern``. + """Lexically search the wiki's markdown layer for ``pattern`` using grep. + + A completeness sweep over every ``*.md`` file under *wiki_root* — + summaries, concepts, entities, explorations, ``index.md``, and short-doc + ``sources/*.md``. Long-doc per-page ``*.json`` (PageIndex's domain) is + excluded (only ``*.md`` is searched), as are the wiki's bookkeeping / + scaffolding files (``log.md``, ``AGENTS.md``, ``SCHEMA.md`` — see + :data:`openkb.schema.EXCLUDED_WIKI_FILES`). - A completeness sweep: shells out to ripgrep (preferred) or grep - (fallback) over every ``*.md`` file under *wiki_root* — summaries, - concepts, entities, explorations, ``index.md``, and short-doc - ``sources/*.md``. Long-doc per-page ``*.json`` (PageIndex's domain) and - ``log.md`` bookkeeping are excluded. + Shells out to the system ``grep`` (POSIX, ubiquitous on macOS/Linux) with + ``shell=False``, so a hostile *pattern* cannot inject commands. ``pattern`` + is an **extended** regular expression (ERE) by default — alternation + ``a|b``, ``?``, ``+``, ``()`` all work — or a literal string when + *fixed_string* is True. Args: - pattern: Search pattern. Regex by default; literal when - *fixed_string* is True. + pattern: Search pattern. ERE by default; literal when *fixed_string*. wiki_root: Absolute path to the wiki root directory. ignore_case: Case-insensitive match (default True). fixed_string: Treat *pattern* as a literal string, not a regex. Returns: - Up to :data:`_GREP_MAX_LINES` matches as ``relative/path.md:LINE: text`` - lines, plus a truncation notice if capped. On no match / missing - binary / timeout / error, returns an explicit message string. Never - raises and never invokes a shell (``shell=False``), so a hostile - *pattern* cannot inject commands. + Up to :data:`_GREP_MAX_LINES` matches, each line ``relative/path.md:LINE:text`` + (the path is everything before the first colon), plus a truncation + notice if capped. On empty pattern / no match / missing grep / timeout / + error-with-no-results, returns an explicit message string. Never raises. """ + if not pattern or not pattern.strip(): + return "Provide a non-empty search pattern." + root = Path(wiki_root).resolve() if not root.exists(): return f"Wiki root not found: {wiki_root}" - rg = shutil.which("rg") - grep = shutil.which("grep") - - if rg: - # --no-ignore: the wiki dir is often gitignored; without this rg - # silently returns zero matches inside a real OpenKB checkout. - cmd = [ - rg, "--line-number", "--no-heading", "--color", "never", - "--no-ignore", "-g", "*.md", "-g", "!log.md", - ] - if ignore_case: - cmd.append("-i") - if fixed_string: - cmd.append("-F") - cmd += ["-e", pattern, str(root)] - elif grep: - cmd = [grep, "-rn", "--include=*.md", "--exclude-dir=images"] - if ignore_case: - cmd.append("-i") - if fixed_string: - cmd.append("-F") - cmd += ["-e", pattern, str(root)] - else: + grep = _grep_binary() + if not grep: return "grep unavailable on this system." + cmd = [grep, "-rn", "--include=*.md"] + for name in sorted(EXCLUDED_WIKI_FILES): + cmd.append(f"--exclude={name}") + if ignore_case: + cmd.append("-i") + cmd.append("-F" if fixed_string else "-E") + cmd += ["-e", pattern, str(root)] + try: proc = subprocess.run( - cmd, capture_output=True, text=True, + cmd, capture_output=True, text=True, errors="replace", timeout=_GREP_TIMEOUT_S, check=False, ) except subprocess.TimeoutExpired: return "grep timed out; narrow the pattern." - # rg/grep convention: 0 = matches, 1 = no matches, >=2 = real error. - if proc.returncode >= 2: - stderr_lines = (proc.stderr or "").strip().splitlines() - first = stderr_lines[0] if stderr_lines else "unknown error" - return f"grep error: {first}." - - prefix = str(root) + "/" + prefix = str(root) + os.sep results: list[str] = [] for line in proc.stdout.splitlines(): - if not line.strip(): + if not line: continue - rel = line[len(prefix):] if line.startswith(prefix) else line + if not line.startswith(prefix): + continue # defensive: only surface paths under wiki_root + rel = line[len(prefix):] path_part = rel.split(":", 1)[0] - # Defensive: grep --include=*.md still matches log.md; drop it. - if path_part == "log.md" or path_part.endswith("/log.md"): + # Defense in depth: --exclude already drops these basenames; this also + # catches a same-named file in a subdirectory. + if Path(path_part).name in EXCLUDED_WIKI_FILES: continue results.append(rel) + if len(results) > _GREP_MAX_LINES: + break # only need 51 to detect truncation; stop processing if not results: + # grep exit codes: 0 = match, 1 = no match, >=2 = error. grep can exit + # >=2 (e.g. one unreadable file) while still printing valid matches — + # those were collected above. Only report an error when nothing usable + # came back. + if proc.returncode >= 2: + stderr_lines = (proc.stderr or "").strip().splitlines() + first = stderr_lines[0] if stderr_lines else "unknown error" + return f"grep error: {first}." return f"No matches for {pattern}." truncated = len(results) > _GREP_MAX_LINES diff --git a/openkb/lint.py b/openkb/lint.py index 2ac6af1d..5e8d0af1 100644 --- a/openkb/lint.py +++ b/openkb/lint.py @@ -15,13 +15,13 @@ import yaml -from openkb.schema import PAGE_CONTENT_DIRS +from openkb.schema import EXCLUDED_WIKI_FILES, PAGE_CONTENT_DIRS # Matches [[wikilink]] or [[subdir/link]] _WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]") # Files to exclude from lint scanning (schema, logs, etc.) -_EXCLUDED_FILES = {"AGENTS.md", "SCHEMA.md", "log.md"} +_EXCLUDED_FILES = EXCLUDED_WIKI_FILES def _normalize_target(target: str) -> str: diff --git a/openkb/schema.py b/openkb/schema.py index 8a6322e6..ced01ffb 100644 --- a/openkb/schema.py +++ b/openkb/schema.py @@ -6,6 +6,11 @@ # for surfaces that enumerate page content (list, lint, status, skill gate). PAGE_CONTENT_DIRS = ("summaries", "concepts", "entities") +# Bookkeeping / scaffolding files that live under wiki/ but are NOT content. +# Single source of truth shared by the structural linter and the grep search +# tool so their exclusion policy can never drift. +EXCLUDED_WIKI_FILES: frozenset[str] = frozenset({"AGENTS.md", "SCHEMA.md", "log.md"}) + # Canonical empty index.md seed. Used by `openkb init` and the compiler's # lazy-create path so they never drift. INDEX_SEED = "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Entities\n\n## Explorations\n" diff --git a/tests/test_grep.py b/tests/test_grep.py index 31c65e47..4a9b6473 100644 --- a/tests/test_grep.py +++ b/tests/test_grep.py @@ -1,8 +1,16 @@ -"""Tests for openkb.agent.tools.grep_wiki_files — lexical wiki search.""" +"""Tests for openkb.agent.tools.grep_wiki_files — grep-based wiki search.""" from __future__ import annotations +import shutil + +import pytest + +import openkb.agent.tools as tools_mod from openkb.agent.tools import grep_wiki_files +_HAS_GREP = shutil.which("grep") is not None +requires_grep = pytest.mark.skipif(not _HAS_GREP, reason="system grep not available") + def _wiki(tmp_path): """Build a minimal wiki/ tree and return its root as a string.""" @@ -19,94 +27,178 @@ def _wiki(tmp_path): "# Attention\nScaled dot-product Attention is central.\n", encoding="utf-8", ) + (root / "entities" / "vaswani.md").write_text( + "# Vaswani\nAshish Vaswani is a lead author.\n", encoding="utf-8", + ) (root / "sources" / "note.md").write_text( - "Short note: the lottery ticket hypothesis appears here only.\n", + "Short note: the lottery ticket hypothesis appears here only.\n" + "It also discusses a large language model in passing.\n", encoding="utf-8", ) - # Long-doc per-page JSON — must NEVER be grepped. + # Long-doc per-page JSON — never grepped (only *.md is searched). (root / "sources" / "book.json").write_text( '[{"page": 1, "text": "transformer secret in json"}]\n', encoding="utf-8", ) - # Bookkeeping — must NEVER be grepped. + # Bookkeeping / scaffolding — never grepped. (root / "log.md").write_text( "# Operations Log\n## [2026-01-01] ingest | transformer\n", encoding="utf-8", ) + (root / "AGENTS.md").write_text( + "# Schema\nThis schema describes synthesis and transformer concepts.\n", + encoding="utf-8", + ) + (root / "SCHEMA.md").write_text( + "# Schema alias\nMentions transformer too.\n", encoding="utf-8", + ) return str(root) +# --- scope: what gets matched ------------------------------------------------- + +@requires_grep def test_finds_match_in_summaries(tmp_path): - wiki = _wiki(tmp_path) - out = grep_wiki_files("self-attention", wiki) + out = grep_wiki_files("self-attention", _wiki(tmp_path)) assert "summaries/paper.md:" in out assert "self-attention" in out +@requires_grep +def test_finds_match_in_concepts(tmp_path): + out = grep_wiki_files("Scaled dot-product", _wiki(tmp_path)) + assert "concepts/attention.md:" in out + + +@requires_grep +def test_finds_match_in_entities(tmp_path): + out = grep_wiki_files("Ashish Vaswani", _wiki(tmp_path)) + assert "entities/vaswani.md:" in out + + +@requires_grep def test_finds_match_in_short_source_md(tmp_path): - wiki = _wiki(tmp_path) - out = grep_wiki_files("lottery ticket", wiki) + out = grep_wiki_files("lottery ticket", _wiki(tmp_path)) assert "sources/note.md:" in out +# --- scope: what gets excluded ------------------------------------------------ + +@requires_grep def test_excludes_long_doc_json(tmp_path): - wiki = _wiki(tmp_path) - out = grep_wiki_files("transformer", wiki) + out = grep_wiki_files("transformer", _wiki(tmp_path)) assert "book.json" not in out +@requires_grep def test_excludes_log_md(tmp_path): - wiki = _wiki(tmp_path) - out = grep_wiki_files("transformer", wiki) + out = grep_wiki_files("transformer", _wiki(tmp_path)) assert "log.md" not in out -def test_case_insensitive_by_default(tmp_path): +@requires_grep +def test_excludes_agents_md(tmp_path): + # AGENTS.md contains 'synthesis' and 'transformer' but is scaffolding. + out = grep_wiki_files("synthesis", _wiki(tmp_path)) + assert "AGENTS.md" not in out + assert out == "No matches for synthesis." + + +@requires_grep +def test_excludes_schema_md(tmp_path): + out = grep_wiki_files("transformer", _wiki(tmp_path)) + assert "SCHEMA.md" not in out + + +# --- regex dialect ------------------------------------------------------------ + +@requires_grep +def test_ere_alternation_matches(tmp_path): + # ERE alternation must work (regression for the BRE-vs-Rust-regex bug). + out = grep_wiki_files("LLM|large language model", _wiki(tmp_path)) + assert "sources/note.md:" in out + + +@requires_grep +def test_fixed_string_treats_pipe_literally(tmp_path): + # As a literal, 'LLM|large language model' does not appear anywhere. + out = grep_wiki_files("LLM|large language model", _wiki(tmp_path), fixed_string=True) + assert out == "No matches for LLM|large language model." + + +@requires_grep +def test_fixed_string_vs_regex_dot(tmp_path): wiki = _wiki(tmp_path) - out = grep_wiki_files("TRANSFORMER", wiki) + # literal 'self.attention' does not appear (text has 'self-attention') + assert grep_wiki_files("self.attention", wiki, fixed_string=True) == \ + "No matches for self.attention." + # as a regex, '.' matches the hyphen + assert "summaries/paper.md:" in grep_wiki_files("self.attention", wiki, fixed_string=False) + + +# --- case sensitivity --------------------------------------------------------- + +@requires_grep +def test_case_insensitive_by_default(tmp_path): + out = grep_wiki_files("TRANSFORMER", _wiki(tmp_path)) assert "summaries/paper.md:" in out +@requires_grep def test_case_sensitive_when_disabled(tmp_path): - wiki = _wiki(tmp_path) - out = grep_wiki_files("TRANSFORMER", wiki, ignore_case=False) + out = grep_wiki_files("TRANSFORMER", _wiki(tmp_path), ignore_case=False) assert out == "No matches for TRANSFORMER." -def test_fixed_string_treats_regex_literally(tmp_path): - wiki = _wiki(tmp_path) - out = grep_wiki_files("self.attention", wiki, fixed_string=True) - # literal "self.attention" does not appear (text has "self-attention") - assert out == "No matches for self.attention." - # but as a regex, "." matches the hyphen - out2 = grep_wiki_files("self.attention", wiki, fixed_string=False) - assert "summaries/paper.md:" in out2 - +# --- guards / messages -------------------------------------------------------- +@requires_grep def test_no_match_returns_message(tmp_path): - wiki = _wiki(tmp_path) - out = grep_wiki_files("nonexistentterm12345", wiki) + out = grep_wiki_files("nonexistentterm12345", _wiki(tmp_path)) assert out == "No matches for nonexistentterm12345." +def test_empty_pattern_guarded(tmp_path): + out = grep_wiki_files("", _wiki(tmp_path)) + assert out == "Provide a non-empty search pattern." + + +def test_whitespace_pattern_guarded(tmp_path): + out = grep_wiki_files(" ", _wiki(tmp_path)) + assert out == "Provide a non-empty search pattern." + + +def test_grep_unavailable_returns_message(tmp_path, monkeypatch): + monkeypatch.setattr(tools_mod, "_grep_binary", lambda: None) + out = grep_wiki_files("transformer", _wiki(tmp_path)) + assert out == "grep unavailable on this system." + + +# --- paths -------------------------------------------------------------------- + +@requires_grep def test_paths_are_relative_to_wiki_root(tmp_path): wiki = _wiki(tmp_path) out = grep_wiki_files("self-attention", wiki) - assert wiki not in out - assert out.splitlines()[0].startswith("summaries/") + assert wiki not in out # no absolute-path leak + # order-independent: the summaries hit is present as a relative path + assert any(ln.startswith("summaries/paper.md:") for ln in out.splitlines()) +@requires_grep def test_result_cap_and_truncation_notice(tmp_path): wiki = _wiki(tmp_path) - root = tmp_path / "wiki" / "summaries" big = "\n".join(f"line {i} needle" for i in range(60)) - (root / "big.md").write_text(big + "\n", encoding="utf-8") + (tmp_path / "wiki" / "summaries" / "big.md").write_text(big + "\n", encoding="utf-8") out = grep_wiki_files("needle", wiki) lines = out.splitlines() assert lines[-1] == "… more matches; narrow the pattern." assert len(lines) == 51 +# --- safety ------------------------------------------------------------------- + +@requires_grep def test_shell_metacharacters_do_not_execute(tmp_path): wiki = _wiki(tmp_path) sentinel = tmp_path / "pwned" @@ -114,17 +206,27 @@ def test_shell_metacharacters_do_not_execute(tmp_path): assert not sentinel.exists() -def test_rg_branch_builds_expected_command(tmp_path, monkeypatch): - import subprocess as _sp - import openkb.agent.tools as tools_mod - +@requires_grep +def test_non_utf8_bytes_do_not_raise(tmp_path): wiki = _wiki(tmp_path) - monkeypatch.setattr(tools_mod.shutil, "which", lambda name: "/usr/bin/rg" if name == "rg" else None) + # A matched line with a non-UTF-8 byte must not raise (errors='replace'). + (tmp_path / "wiki" / "summaries" / "latin.md").write_bytes( + b"caf\xe9 transformer here\n" + ) + out = grep_wiki_files("transformer", wiki) # must return a string, not raise + assert isinstance(out, str) + assert "summaries/" in out + +# --- command construction (binary-agnostic, no real grep needed) -------------- + +def test_grep_command_built_with_ere_and_excludes(tmp_path, monkeypatch): + wiki = _wiki(tmp_path) + monkeypatch.setattr(tools_mod, "_grep_binary", lambda: "/usr/bin/grep") captured = {} class _FakeProc: - returncode = 0 + returncode = 1 stdout = "" stderr = "" @@ -134,34 +236,25 @@ def _fake_run(cmd, *args, **kwargs): return _FakeProc() monkeypatch.setattr(tools_mod.subprocess, "run", _fake_run) - - tools_mod.grep_wiki_files("needle", wiki, ignore_case=True, fixed_string=True) + grep_wiki_files("needle", wiki, ignore_case=True, fixed_string=False) cmd = captured["cmd"] - # rg binary chosen, shell never used - assert cmd[0] == "/usr/bin/rg" + assert cmd[0] == "/usr/bin/grep" assert captured["shell"] is False - # load-bearing flags present - assert "--no-ignore" in cmd - assert "--line-number" in cmd - assert "--no-heading" in cmd - assert "-i" in cmd # ignore_case - assert "-F" in cmd # fixed_string - # md include + log.md exclude globs - assert cmd[cmd.index("-g") :].count("-g") == 2 or cmd.count("-g") == 2 - assert "*.md" in cmd - assert "!log.md" in cmd - # pattern passed via -e as a separate argv (injection-safe), root last + assert "-rn" in cmd + assert "--include=*.md" in cmd + assert "-i" in cmd + assert "-E" in cmd and "-F" not in cmd + for name in ("AGENTS.md", "SCHEMA.md", "log.md"): + assert f"--exclude={name}" in cmd assert cmd[-3] == "-e" assert cmd[-2] == "needle" - assert cmd[-1] == str(__import__("pathlib").Path(wiki).resolve()) - + assert cmd[-1].endswith("wiki") -def test_rg_branch_omits_flags_when_disabled(tmp_path, monkeypatch): - import openkb.agent.tools as tools_mod +def test_grep_command_uses_F_when_fixed_and_omits_i(tmp_path, monkeypatch): wiki = _wiki(tmp_path) - monkeypatch.setattr(tools_mod.shutil, "which", lambda name: "/usr/bin/rg" if name == "rg" else None) + monkeypatch.setattr(tools_mod, "_grep_binary", lambda: "/usr/bin/grep") captured = {} class _FakeProc: @@ -169,18 +262,57 @@ class _FakeProc: stdout = "" stderr = "" - monkeypatch.setattr(tools_mod.subprocess, "run", lambda cmd, *a, **k: (captured.__setitem__("cmd", cmd) or _FakeProc())) - - tools_mod.grep_wiki_files("needle", wiki, ignore_case=False, fixed_string=False) + monkeypatch.setattr( + tools_mod.subprocess, "run", + lambda cmd, *a, **k: (captured.__setitem__("cmd", cmd) or _FakeProc()), + ) + grep_wiki_files("a|b", wiki, ignore_case=False, fixed_string=True) cmd = captured["cmd"] + assert "-F" in cmd and "-E" not in cmd assert "-i" not in cmd - assert "-F" not in cmd -def test_finds_match_in_entities(tmp_path): +# --- returncode handling ------------------------------------------------------ + +def test_partial_error_preserves_matches(tmp_path, monkeypatch): + """grep exit >=2 (e.g. one unreadable file) must NOT discard valid matches.""" wiki = _wiki(tmp_path) - (tmp_path / "wiki" / "entities" / "vaswani.md").write_text( - "# Vaswani\nAshish Vaswani is a lead author.\n", encoding="utf-8", - ) - out = grep_wiki_files("Ashish Vaswani", wiki) - assert "entities/vaswani.md:" in out + root_str = str((tmp_path / "wiki").resolve()) + monkeypatch.setattr(tools_mod, "_grep_binary", lambda: "/usr/bin/grep") + + class _FakeProc: + returncode = 2 + stdout = f"{root_str}/summaries/paper.md:2:self-attention here\n" + stderr = "grep: /x/locked: Permission denied" + + monkeypatch.setattr(tools_mod.subprocess, "run", lambda *a, **k: _FakeProc()) + out = grep_wiki_files("self-attention", wiki) + assert "summaries/paper.md:" in out + assert "grep error" not in out + + +def test_error_with_no_results_returns_error(tmp_path, monkeypatch): + wiki = _wiki(tmp_path) + monkeypatch.setattr(tools_mod, "_grep_binary", lambda: "/usr/bin/grep") + + class _FakeProc: + returncode = 2 + stdout = "" + stderr = "grep: something broke\nsecond line" + + monkeypatch.setattr(tools_mod.subprocess, "run", lambda *a, **k: _FakeProc()) + out = grep_wiki_files("whatever", wiki) + assert out == "grep error: grep: something broke." + + +def test_timeout_returns_message(tmp_path, monkeypatch): + import subprocess as _sp + wiki = _wiki(tmp_path) + monkeypatch.setattr(tools_mod, "_grep_binary", lambda: "/usr/bin/grep") + + def _raise_timeout(*a, **k): + raise _sp.TimeoutExpired(cmd="grep", timeout=10) + + monkeypatch.setattr(tools_mod.subprocess, "run", _raise_timeout) + out = grep_wiki_files("transformer", wiki) + assert out == "grep timed out; narrow the pattern." From 98d26c5e61f8a8e07f0130a1ba7d747d6b314f55 Mon Sep 17 00:00:00 2001 From: mountain Date: Tue, 2 Jun 2026 17:28:39 +0800 Subject: [PATCH 7/8] fix(agent): exclude images/ and .git dirs from grep search; tighten cmd test --- openkb/agent/tools.py | 2 +- tests/test_grep.py | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/openkb/agent/tools.py b/openkb/agent/tools.py index 8b88cf29..0af9a63b 100644 --- a/openkb/agent/tools.py +++ b/openkb/agent/tools.py @@ -115,7 +115,7 @@ def grep_wiki_files( if not grep: return "grep unavailable on this system." - cmd = [grep, "-rn", "--include=*.md"] + cmd = [grep, "-rn", "--include=*.md", "--exclude-dir=images", "--exclude-dir=.git"] for name in sorted(EXCLUDED_WIKI_FILES): cmd.append(f"--exclude={name}") if ignore_case: diff --git a/tests/test_grep.py b/tests/test_grep.py index 4a9b6473..65fa19e6 100644 --- a/tests/test_grep.py +++ b/tests/test_grep.py @@ -110,6 +110,16 @@ def test_excludes_schema_md(tmp_path): assert "SCHEMA.md" not in out +@requires_grep +def test_excludes_images_dir(tmp_path): + wiki = _wiki(tmp_path) + (tmp_path / "wiki" / "sources" / "images" / "caption.md").write_text( + "transformer figure caption\n", encoding="utf-8", + ) + out = grep_wiki_files("transformer", wiki) + assert "images/" not in out + + # --- regex dialect ------------------------------------------------------------ @requires_grep @@ -249,7 +259,10 @@ def _fake_run(cmd, *args, **kwargs): assert f"--exclude={name}" in cmd assert cmd[-3] == "-e" assert cmd[-2] == "needle" - assert cmd[-1].endswith("wiki") + from pathlib import Path as _P + assert cmd[-1] == str(_P(wiki).resolve()) + assert "--exclude-dir=images" in cmd + assert "--exclude-dir=.git" in cmd def test_grep_command_uses_F_when_fixed_and_omits_i(tmp_path, monkeypatch): From e9bfea8f4a0e30309b076e8c331fd27d60e1045b Mon Sep 17 00:00:00 2001 From: mountain Date: Tue, 2 Jun 2026 21:18:09 +0800 Subject: [PATCH 8/8] refine(agent): reframe grep step as locate-then-read (drill for detail), not a terminal sweep --- openkb/agent/query.py | 57 ++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 3eafa53f..84926e6f 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -39,19 +39,20 @@ ranges to help you target. Never fetch the whole document. 6. Source content may reference images (e.g. ![image](sources/images/doc/file.png)). Use the get_image tool to view them when needed. -7. COMPLETENESS SWEEP (do this before finalizing): the summary layer is - lossy, so before you commit to an answer, call grep_wiki for the - question's salient terms — proper nouns, technical terms, numbers, key - entities — plus any claim you asserted in your draft that you have not - yet seen on a wiki page. Because grep is lexical (not semantic), try a - few term variants: acronym and expansion, singular/plural, close - synonyms. For any matching page you have NOT already read, read_file it - (grep_wiki lines are `path:line:text`; pass only the path, before the - first colon) and fold in relevant content. If grep surfaces a claim that contradicts - your draft, note both claims with their citations rather than silently - choosing one. Do at most 3 grep rounds (a round = one concept and its - variants); stop once a round surfaces no new page. grep_wiki is a check, - not the primary search — index.md and summaries still come first. +7. DRILL FOR DETAIL with grep_wiki (after reading the curated pages above): + summaries are lossy, so when the question needs specifics they do not + fully contain — numbers, names, exact claims, edge cases — use grep_wiki + to LOCATE which pages hold them. grep is lexical, so try a few term + variants: acronym and expansion, singular/plural, close synonyms. Treat + the results as a reading list: each line is `path:line:text` — for every + relevant page you have NOT already read in full, read_file that path + (everything before the first colon) and extract the detail. Do NOT answer + from the grep line alone; open the page. If a page contradicts what you + already have, note both claims with their citations rather than silently + choosing one. Repeat locate-then-read until the pages that actually + contain the needed detail have been read (at most 3 grep rounds; stop once + a round surfaces no new relevant page). grep_wiki complements index.md and + summaries (your starting point) — it does not replace them. 8. Synthesize a clear, concise, well-cited answer grounded in wiki content. Answer based only on wiki content. Be concise. @@ -103,21 +104,21 @@ def get_image(image_path: str) -> ToolOutputImage | ToolOutputText: @function_tool def grep_wiki(pattern: str, ignore_case: bool = True, fixed_string: bool = False) -> str: - """Lexically grep the wiki's markdown for a pattern. - - Use this as a FINAL completeness check, after you have drafted an - answer from index.md / summaries / concepts / entities. It searches - every wiki .md file (including short-doc sources/) for the literal - terms of the question — catching details the summaries compressed - away, pages you never opened, or contradicting mentions. It does NOT - search long-document page content (use get_page_content for that). - - Returns up to 50 matches, one per line as 'path.md:LINE:text'. The - path is everything before the FIRST colon — pass only that path to - read_file (not the whole line). Pattern is an extended regex (ERE): - alternation 'a|b', '?', '+', '()' work; set fixed_string=True for a - literal search. Try a few term variants (acronym/expansion, - singular/plural, synonyms) — this is lexical, not semantic. + """Locate wiki pages that contain specific detail, by lexical grep. + + Use this to FIND which pages hold specifics the summaries lack — + numbers, names, exact claims, edge cases — then read_file those pages + to extract the detail. It searches every wiki .md file (including + short-doc sources/); it does NOT search long-document page content + (use get_page_content for that). + + Returns up to 50 matches, one per line as 'path.md:LINE:text'. Each + result is a page to OPEN, not an answer: take the path (everything + before the FIRST colon) and read_file it — do not answer from the grep + line alone. Pattern is an extended regex (ERE): alternation 'a|b', '?', + '+', '()' work; set fixed_string=True for a literal search. Try a few + term variants (acronym/expansion, singular/plural, synonyms) — this is + lexical, not semantic. Args: pattern: Search pattern (extended regex by default).