Skip to content
Open
46 changes: 44 additions & 2 deletions openkb/agent/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from agents import ToolOutputImage, ToolOutputText
from openkb.agent.tools import (
get_wiki_page_content,
grep_wiki_files,
read_wiki_file,
read_wiki_image,
write_kb_file,
Expand Down Expand Up @@ -38,7 +39,21 @@
ranges to help you target. Never fetch the whole document.
6. Source content may reference images (e.g. ![image](sources/images/doc/file.png)).
Use the get_image tool to view them when needed.
7. Synthesize a clear, concise, well-cited answer grounded in wiki content.
7. DRILL FOR DETAIL with grep_wiki (after reading the curated pages above):
summaries are lossy, so when the question needs specifics they do not
fully contain — numbers, names, exact claims, edge cases — use grep_wiki
to LOCATE which pages hold them. grep is lexical, so try a few term
variants: acronym and expansion, singular/plural, close synonyms. Treat
the results as a reading list: each line is `path:line:text` — for every
relevant page you have NOT already read in full, read_file that path
(everything before the first colon) and extract the detail. Do NOT answer
from the grep line alone; open the page. If a page contradicts what you
already have, note both claims with their citations rather than silently
choosing one. Repeat locate-then-read until the pages that actually
contain the needed detail have been read (at most 3 grep rounds; stop once
a round surfaces no new relevant page). grep_wiki complements index.md and
summaries (your starting point) — it does not replace them.
8. Synthesize a clear, concise, well-cited answer grounded in wiki content.

Answer based only on wiki content. Be concise.
Before each tool call, output one short sentence explaining the reason.
Expand Down Expand Up @@ -87,12 +102,39 @@ def get_image(image_path: str) -> ToolOutputImage | ToolOutputText:
return ToolOutputImage(image_url=result["image_url"])
return ToolOutputText(text=result["text"])

# NOTE(review): ``wiki_root`` is not a parameter of this tool, so it must be
# supplied by the enclosing scope (not visible in this hunk) — confirm there.
# NOTE(review): the docstring below is phrased as instructions to the model and
# is presumably surfaced by ``function_tool`` as the tool description; treat
# its wording as runtime behaviour and confirm before editing it.
@function_tool
def grep_wiki(pattern: str, ignore_case: bool = True, fixed_string: bool = False) -> str:
    """Locate wiki pages that contain specific detail, by lexical grep.

    Use this to FIND which pages hold specifics the summaries lack —
    numbers, names, exact claims, edge cases — then read_file those pages
    to extract the detail. It searches every wiki .md file (including
    short-doc sources/); it does NOT search long-document page content
    (use get_page_content for that).

    Returns up to 50 matches, one per line as 'path.md:LINE:text'. Each
    result is a page to OPEN, not an answer: take the path (everything
    before the FIRST colon) and read_file it — do not answer from the grep
    line alone. Pattern is an extended regex (ERE): alternation 'a|b', '?',
    '+', '()' work; set fixed_string=True for a literal search. Try a few
    term variants (acronym/expansion, singular/plural, synonyms) — this is
    lexical, not semantic.

    Args:
        pattern: Search pattern (extended regex by default).
        ignore_case: Case-insensitive (default True).
        fixed_string: Treat pattern as a literal string, not a regex.
    """
    # Thin adapter: the search itself is delegated unchanged to
    # grep_wiki_files, with both flags forwarded as keyword arguments.
    return grep_wiki_files(
        pattern, wiki_root, ignore_case=ignore_case, fixed_string=fixed_string,
    )

from agents.model_settings import ModelSettings

return Agent(
name="wiki-query",
instructions=instructions,
tools=[read_file, get_page_content, get_image],
tools=[read_file, get_page_content, get_image, grep_wiki],
model=f"litellm/{model}",
model_settings=ModelSettings(parallel_tool_calls=False),
)
Expand Down
112 changes: 112 additions & 0 deletions openkb/agent/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,19 @@
from __future__ import annotations

import contextlib
import functools
import json as _json
import os
import shutil
import subprocess
from pathlib import Path

from openkb.schema import EXCLUDED_WIKI_FILES

# grep_wiki_files tuning
# Cap on the number of match lines returned to the caller; anything beyond it
# is replaced by a single truncation notice.
_GREP_MAX_LINES = 50
# Timeout, in seconds, passed to subprocess.run for the grep invocation.
_GREP_TIMEOUT_S = 10


def list_wiki_files(directory: str, wiki_root: str) -> str:
"""List all Markdown files in a wiki subdirectory.
Expand Down Expand Up @@ -54,6 +64,108 @@ def read_wiki_file(path: str, wiki_root: str) -> str:
return full_path.read_text(encoding="utf-8")


@functools.lru_cache(maxsize=None)
def _grep_binary() -> str | None:
    """Return the path of the system ``grep`` executable, or ``None`` if absent.

    The ``PATH`` lookup is memoized, so it is performed at most once per
    process; later calls return the remembered answer.
    """
    located = shutil.which("grep")
    return located


def grep_wiki_files(
    pattern: str,
    wiki_root: str,
    *,
    ignore_case: bool = True,
    fixed_string: bool = False,
) -> str:
    """Lexically search the wiki's markdown layer for ``pattern`` using grep.

    A completeness sweep over every ``*.md`` file under *wiki_root* —
    summaries, concepts, entities, explorations, ``index.md``, and short-doc
    ``sources/*.md``. Long-doc per-page ``*.json`` (PageIndex's domain) is
    excluded (only ``*.md`` is searched), as are the wiki's bookkeeping /
    scaffolding files (``log.md``, ``AGENTS.md``, ``SCHEMA.md`` — see
    :data:`openkb.schema.EXCLUDED_WIKI_FILES`).

    Shells out to the system ``grep`` (POSIX, ubiquitous on macOS/Linux) with
    an argument list rather than a shell string, so a hostile *pattern* cannot
    inject commands. ``pattern`` is an **extended** regular expression (ERE)
    by default — alternation ``a|b``, ``?``, ``+``, ``()`` all work — or a
    literal string when *fixed_string* is True.

    Args:
        pattern: Search pattern. ERE by default; literal when *fixed_string*.
        wiki_root: Absolute path to the wiki root directory.
        ignore_case: Case-insensitive match (default True).
        fixed_string: Treat *pattern* as a literal string, not a regex.

    Returns:
        Up to :data:`_GREP_MAX_LINES` matches, each line
        ``relative/path.md:LINE:text`` (the path is everything before the
        first colon), plus a truncation notice if capped. On empty pattern /
        missing or non-directory root / missing grep / timeout / failure to
        launch grep / no match / error-with-no-results, returns an explicit
        message string instead. Does not raise for any of those conditions.
    """
    if not pattern or not pattern.strip():
        return "Provide a non-empty search pattern."

    root = Path(wiki_root).resolve()
    # Require a directory: with a plain file as root, no output line could
    # ever carry the "<root>/" prefix checked below, so the caller would get
    # a misleading "No matches" instead of a configuration error.
    if not root.is_dir():
        return f"Wiki root not found: {wiki_root}"

    grep = _grep_binary()
    if not grep:
        return "grep unavailable on this system."

    cmd = [grep, "-rn", "--include=*.md", "--exclude-dir=images", "--exclude-dir=.git"]
    # sorted() keeps the argv deterministic regardless of set iteration order.
    cmd.extend(f"--exclude={name}" for name in sorted(EXCLUDED_WIKI_FILES))
    if ignore_case:
        cmd.append("-i")
    cmd.append("-F" if fixed_string else "-E")
    # "-e" makes grep treat the next argument as the pattern even when it
    # starts with "-", so the pattern can never be parsed as an option.
    cmd += ["-e", pattern, str(root)]

    try:
        proc = subprocess.run(
            cmd, capture_output=True, text=True,
            # Wiki files are read as UTF-8 elsewhere in this module; decode
            # grep's output the same way instead of relying on the locale.
            encoding="utf-8", errors="replace",
            timeout=_GREP_TIMEOUT_S, check=False,
        )
    except subprocess.TimeoutExpired:
        return "grep timed out; narrow the pattern."
    except (OSError, ValueError) as exc:
        # OSError: the binary disappeared or is not executable.
        # ValueError: an argument (e.g. the pattern) contains a NUL byte.
        return f"grep error: {exc}."

    prefix = str(root) + os.sep
    results: list[str] = []
    for line in proc.stdout.splitlines():
        if not line:
            continue
        if not line.startswith(prefix):
            continue  # defensive: only surface paths under wiki_root
        rel = line[len(prefix):]
        path_part = rel.split(":", 1)[0]
        # Defense in depth: --exclude already drops these basenames; re-check
        # the basename here so an excluded file can never be surfaced.
        if Path(path_part).name in EXCLUDED_WIKI_FILES:
            continue
        results.append(rel)
        if len(results) > _GREP_MAX_LINES:
            break  # one extra line is enough to detect truncation

    if not results:
        # grep exit codes: 0 = match, 1 = no match, >=2 = error. grep can exit
        # >=2 (e.g. one unreadable file) while still printing valid matches —
        # those were collected above. Only report an error when nothing usable
        # came back.
        if proc.returncode >= 2:
            stderr_lines = (proc.stderr or "").strip().splitlines()
            first = stderr_lines[0] if stderr_lines else "unknown error"
            return f"grep error: {first}."
        return f"No matches for {pattern}."

    truncated = len(results) > _GREP_MAX_LINES
    out = "\n".join(results[:_GREP_MAX_LINES])
    if truncated:
        out += "\n… more matches; narrow the pattern."
    return out


def parse_pages(pages: str) -> list[int]:
"""Parse a page specification string into a sorted, deduplicated list of page numbers.

Expand Down
4 changes: 2 additions & 2 deletions openkb/lint.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@

import yaml

from openkb.schema import PAGE_CONTENT_DIRS
from openkb.schema import EXCLUDED_WIKI_FILES, PAGE_CONTENT_DIRS

# Matches [[wikilink]] or [[subdir/link]]
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")

# Files to exclude from lint scanning (schema, logs, etc.)
_EXCLUDED_FILES = {"AGENTS.md", "SCHEMA.md", "log.md"}
_EXCLUDED_FILES = EXCLUDED_WIKI_FILES


def _normalize_target(target: str) -> str:
Expand Down
5 changes: 5 additions & 0 deletions openkb/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
# for surfaces that enumerate page content (list, lint, status, skill gate).
PAGE_CONTENT_DIRS = ("summaries", "concepts", "entities")

# Bookkeeping / scaffolding files that live under wiki/ but are NOT content.
# Single source of truth shared by the structural linter and the grep search
# tool so their exclusion policy can never drift.
EXCLUDED_WIKI_FILES: frozenset[str] = frozenset({"AGENTS.md", "SCHEMA.md", "log.md"})

# Canonical empty index.md seed. Used by `openkb init` and the compiler's
# lazy-create path so they never drift.
INDEX_SEED = "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Entities\n\n## Explorations\n"
Expand Down
Loading