From 70b47cb8a5a3242e891886f06c25a86a81d02686 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 8 Jun 2026 16:04:18 +0800 Subject: [PATCH 1/4] Fix value-leading YAML document markers + restore start() as line-start (#23) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A value-leading `---` / `...` (`note: --- x`, `x: ... bar`, `- --- x`) was scoped as a document marker in the flat YAML highlighter. The parser already constrains the markers to stream position structurally (DocStart / DocEnd appear only in the Stream grammar), but gen-tm emits the token pattern into every context, dropping that constraint — the flat-highlighter analogue of a position the parser gets free. Fix: anchor the markers to line start (YAML §9.1.1 makes a marker column-0-only), which carries the same constraint into the derived grammar. This exposed a latent lexer bug: `start()` compiled to a bare `^` under the sticky `y` matcher, which matches only at index 0 (file start), so a marker at the start of a LATER line (`# c\n---\n…`) stopped lexing — parser-alignment dropped 100% → 95%. `start()` means line start everywhere else (it serializes to `^`, stripped in monarch / tree-sitter), so the lexer is corrected to compile start-anchored token patterns with the `m` flag — `^` then matches at every line start, restoring 100%. Also adds test/yaml-depth-witnesses.ts: a raw-scope regression gate for the flat highlighter's depth/position sites. The scope-gap metric reported monogramWrong=0 here because it is corpus-bound (these inputs aren't in yaml-test-suite) AND excludes lexical-floor roles (a `-` mis-painted as string is invisible). The gate constructs one witness per scanner state field and asserts the raw inner scope, so neither blind spot can hide a regression. #24 (nested compact sequence sibling vs plain-scalar fold) is tracked there as a known bug pending indent-region derivation. 
Parser CST + the other six grammars byte-identical; src-coverage-yaml 100%, scope-gap-yaml monogramWrong=0, tree-sitter-yaml 97.8%, js shebang unregressed. --- .github/workflows/ci.yml | 1 + src/gen-lexer.ts | 12 +++- test/yaml-depth-witnesses.ts | 120 +++++++++++++++++++++++++++++++++++ yaml.tmLanguage.json | 4 +- yaml.ts | 17 +++-- 5 files changed, 142 insertions(+), 12 deletions(-) create mode 100644 test/yaml-depth-witnesses.ts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c4ba0d5..4eaafd0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -56,6 +56,7 @@ jobs: node test/vue-embed-boundary.ts node test/vue-interp-expr.ts node test/yaml-issue12-regressions.ts + node test/yaml-depth-witnesses.ts # The derived tree-sitter highlighter is the strongest thesis proof (a real GLR # parser from the same grammar, beating the official hand-written one). Build its diff --git a/src/gen-lexer.ts b/src/gen-lexer.ts index b773558..bf44172 100644 --- a/src/gen-lexer.ts +++ b/src/gen-lexer.ts @@ -1,6 +1,6 @@ import type { CstGrammar } from './types.ts'; import { collectLiterals, isKeywordLiteral } from './grammar-utils.ts'; -import { tokenBlockPatternFirstCharSet, tokenBlockPatternSource, tokenEscapeValidPatternSource, tokenPatternFirstCharSet, tokenPatternSource } from './token-pattern.ts'; +import { tokenBlockPatternFirstCharSet, tokenBlockPatternSource, tokenEscapeValidPatternSource, tokenPatternFirstCharSet, tokenPatternHasStartAnchor, tokenPatternSource } from './token-pattern.ts'; // A lexer token: a declared token (type = its name) or a punctuation literal (type = ''). 
// `$templateHead/$templateMiddle/$templateTail` are synthetic types the lexer emits for @@ -45,10 +45,16 @@ export function createLexer(grammar: CstGrammar) { const tokenMatchers = grammar.tokens.map(t => { const pattern = tokenPatternSource(t); const blockPattern = tokenBlockPatternSource(t); + // A token whose pattern carries a line-START anchor (`start()` → `^`, e.g. YAML's `---`/`...` + // document markers, a shebang) needs the `m` flag: under the sticky `y` matcher a bare `^` + // matches only at index 0 (file start), so a marker at the start of a LATER line (`# c\n---\n…`, + // `%TAG …\n---\n…`) would fail to lex. With `m`, `^` matches at every line start, so a sticky + // match at `lastIndex = pos` succeeds iff `pos` is a line start — exactly `start()`'s meaning. + const flags = tokenPatternHasStartAnchor(t) ? 'ym' : 'y'; return { name: t.name, - regex: new RegExp(`(?:${pattern})`, 'y'), - blockRegex: blockPattern ? new RegExp(`(?:${blockPattern})`, 'y') : null, + regex: new RegExp(`(?:${pattern})`, flags), + blockRegex: blockPattern ? new RegExp(`(?:${blockPattern})`, flags) : null, skip: t.flags.includes('skip'), isRegex: t.flags.includes('regex'), isString: !!t.string, diff --git a/test/yaml-depth-witnesses.ts b/test/yaml-depth-witnesses.ts new file mode 100644 index 0000000..39d0122 --- /dev/null +++ b/test/yaml-depth-witnesses.ts @@ -0,0 +1,120 @@ +// yaml-depth-witnesses.ts — a RAW-SCOPE regression gate for the flat YAML TextMate highlighter's +// depth/position sites. It exists because the scope-gap metric reported `monogramWrong=0` while real +// bugs (monogram#23/#24) sat in plain sight: that metric is corpus-bound (the witnesses aren't in +// yaml-test-suite) AND excludes lexical-floor roles (a `-` mis-painted as string is invisible because +// `punctuation` is floor-excluded and the `b` beside it grades correct). So a "0 wrong" headline never +// meant "no bug" — only "no bug my metric can see". 
+// +// THEOREM behind the cases: where a construct's correct scope depends on cross-line STATE the parser +// keeps in a stack (depth), and the derived TextMate grammar is flat (no stack), the set of inputs +// where they disagree is provably NON-EMPTY. So we don't wait for a corpus to surface these — we +// CONSTRUCT one witness per state field of the derived YAML scanner (indent stack, flow depth, +// block-scalar region, document-marker position, node-property lead) and assert the RAW inner scope at +// the position the depth decides. This is oracle-independent (a fixed expected scope) and floor-blind +// (it checks the punctuation/string class directly), so neither blind spot can hide a regression. +// +// Run (bare node): node test/yaml-depth-witnesses.ts +import { readFileSync } from 'node:fs'; +import { createRequire } from 'node:module'; +import vsctm from 'vscode-textmate'; +import onig from 'vscode-oniguruma'; + +const { INITIAL, Registry, parseRawGrammar } = vsctm; +const { loadWASM, OnigScanner, OnigString } = onig; +const require = createRequire(import.meta.url); +const bin = readFileSync(require.resolve('vscode-oniguruma/release/onig.wasm')); +await loadWASM(bin.buffer.slice(bin.byteOffset, bin.byteOffset + bin.byteLength)); +const reg = new Registry({ + onigLib: Promise.resolve({ createOnigScanner: (p: string[]) => new OnigScanner(p), createOnigString: (s: string) => new OnigString(s) }), + loadGrammar: async (sn: string) => sn === 'source.yaml' ? 
parseRawGrammar(readFileSync('yaml.tmLanguage.json', 'utf8'), 'y.json') : null, +}); +const grammar = (await reg.loadGrammar('source.yaml'))!; + +interface Tok { start: number; end: number; scopes: string[] } +function tokenize(text: string): Tok[] { + const toks: Tok[] = []; let rs = INITIAL, off = 0; + for (const line of text.split('\n')) { const r = grammar.tokenizeLine(line, rs); for (const t of r.tokens) toks.push({ start: off + t.startIndex, end: off + t.endIndex, scopes: t.scopes }); rs = r.ruleStack; off += line.length + 1; } + return toks; +} +function scopeAt(toks: Tok[], pos: number): string { + let lo = 0, hi = toks.length - 1, ans = -1; + while (lo <= hi) { const mid = (lo + hi) >> 1; if (toks[mid].start <= pos) { ans = mid; lo = mid + 1; } else hi = mid - 1; } + const s = ans >= 0 && toks[ans].end > pos ? toks[ans].scopes : []; + return s.length ? s[s.length - 1] : '(none)'; +} +// Locate the byte offset of `find` in `input` (optionally the n-th occurrence, 0-based). +function locate(input: string, find: string, nth = 0): number { + let i = -1; for (let k = 0; k <= nth; k++) { i = input.indexOf(find, i + 1); if (i < 0) throw new Error(`witness focus not found: ${JSON.stringify(find)}#${nth}`); } + return i; +} + +interface Case { + state: string; // the scanner state field this witness probes + input: string; + find: string; nth?: number; off?: number; // focus = nth occurrence of `find`, plus `off` chars + want?: string; // inner scope MUST start with this + notWant?: string; // inner scope MUST NOT start with this + note: string; + knownBug?: boolean; // a depth site not yet fixed in the flat derivation — tracked, not asserted +} + +const cases: Case[] = [ + // ── document-marker POSITION (monogram#23): a marker is column-0-only; a value-leading `---`/`...` + // is string content. Fixed by anchoring DocStart/DocEnd with start() (yaml.ts) + the lexer m-flag. 
+ { state: 'doc-marker position', input: 'note: --- not a marker\n', find: '---', want: 'string', + notWant: 'entity.other.document', note: 'value-leading `---` is string, not document.begin' }, + { state: 'doc-marker position', input: 'x: ... bar\n', find: '...', want: 'string', + notWant: 'entity.other.document', note: 'value-leading `...` is string, not document.end' }, + { state: 'doc-marker position', input: '- --- x\n', find: '---', want: 'string', + notWant: 'entity.other.document', note: 'sequence-item value-leading `---` is string' }, + // a LEGITIMATE column-0 marker must still scope as document structure (the fix must not over-correct) + { state: 'doc-marker position', input: '---\nkey: value\n', find: '---', want: 'entity.other.document', + note: 'a real column-0 `---` is still a document marker' }, + + // ── block-scalar REGION: inside `|`/`>` the body is literal text — `#`/`-` are NOT comment/indicator. + // Handled by the block-scalar begin/end region (a depth mechanism the flat grammar DOES carry). + { state: 'block-scalar region', input: 'a: |\n # literal\n x\n', find: '# literal', want: 'string', + notWant: 'comment', note: 'inside a block scalar `#` is text, not a comment' }, + { state: 'block-scalar region', input: 'a: |\n - literal\n x\n', find: '- literal', want: 'string', + notWant: 'punctuation', note: 'inside a block scalar `-` is text, not a sequence indicator' }, + + // ── flow DEPTH: outside flow, `,` and an inner `:` are plain-scalar content (block `{k:"a,b"}`). + { state: 'flow depth', input: 'k: a,b\n', find: ',b', want: 'string', + notWant: 'punctuation.separator', note: 'block plain scalar — `,` is content, not a flow separator' }, + + // ── indent STACK (monogram#24): a nested compact sequence sibling vs a plain-scalar fold. 
The `-` on + // the indented line is a sequence indicator when a sequence is established at that column, but + // folds into the preceding plain scalar otherwise — same surface, opposite answer, decided only by + // the indent stack a flat grammar lacks. KNOWN BUG until gen-tm derives indent-tracking regions. + { state: 'indent stack (sibling vs fold)', input: '- - a\n - b\n- c\n', find: '- b', off: 0, want: 'punctuation', + notWant: 'string', note: 'inner-sequence sibling `-` is punctuation, not folded into a plain scalar', knownBug: true }, + // the counter-proof — SAME indented `- b` line, but here it MUST fold (no sequence at column 2). This + // is asserted (not a known bug): the eventual #24 fix must keep this one folding. + { state: 'indent stack (counter-proof)', input: 'x: hello\n - b\n', find: '- b', want: 'string', + note: 'plain-scalar continuation — `- b` folds (no sequence established at column 2)' }, +]; + +let pass = 0, knownBugs = 0, regressions = 0; +for (const c of cases) { + const toks = tokenize(c.input); + const pos = locate(c.input, c.find, c.nth) + (c.off ?? 0); + const got = scopeAt(toks, pos).replace(/\.yaml$/, ''); + const okWant = c.want ? got.startsWith(c.want) : true; + const okNot = c.notWant ? !got.startsWith(c.notWant) : true; + const ok = okWant && okNot; + const expectStr = [c.want && `want ${c.want}*`, c.notWant && `not ${c.notWant}*`].filter(Boolean).join(', '); + if (c.knownBug) { + knownBugs++; + console.log(` ${ok ? 
'✓ FIXED' : '· known'} [${c.state}] ${JSON.stringify(c.input)} @«${c.find}» → «${got}» (${expectStr})`); + if (ok) console.log(` ↑ this known bug now PASSES — flip knownBug:false to lock it in.`); + } else if (ok) { + pass++; + console.log(` ✓ ok [${c.state}] @«${c.find}» → «${got}»`); + } else { + regressions++; + console.log(` ✗ FAIL [${c.state}] ${JSON.stringify(c.input)} @«${c.find}» → «${got}» — expected ${expectStr}`); + console.log(` ${c.note}`); + } +} +console.log(`\n ${pass} pass · ${knownBugs} known-bug (depth sites not yet derived) · ${regressions} regression`); +if (regressions > 0) { console.error('\nDEPTH WITNESS REGRESSION — a flat-highlighter depth/position site broke.'); process.exit(1); } diff --git a/yaml.tmLanguage.json b/yaml.tmLanguage.json index 6f1b41b..50a63d3 100644 --- a/yaml.tmLanguage.json +++ b/yaml.tmLanguage.json @@ -321,11 +321,11 @@ "repository": { "docstart": { "name": "entity.other.document.begin.yaml", - "match": "---(?=[\\t ]|\\r|\\n|$)" + "match": "^---(?=[\\t ]|\\r|\\n|$)" }, "docend": { "name": "entity.other.document.end.yaml", - "match": "\\.\\.\\.(?=[\\t ]|\\r|\\n|$)" + "match": "^\\.\\.\\.(?=[\\t ]|\\r|\\n|$)" }, "yamldirective": { "name": "keyword.other.directive.yaml", diff --git a/yaml.ts b/yaml.ts index 69c2587..847d281 100644 --- a/yaml.ts +++ b/yaml.ts @@ -10,7 +10,7 @@ import { token, rule, defineGrammar, alt, many, many1, opt, not, noCommentBefore, noMultilineFlowBefore, altPattern, optPattern, seq, oneOf, noneOf, range, star, plus, repeat, followedBy, notFollowedBy, - precededBy, notPrecededBy, never, end, + precededBy, notPrecededBy, never, start, end, } from './src/api.ts'; import type { IndentConfig } from './src/types.ts'; @@ -31,13 +31,16 @@ const whitespace = oneOf('\t', '\n', '\f', '\r', ' '); const nonWhitespace = noneOf(whitespace); const hashAfterNonSpace = seq('#', precededBy(seq(nonWhitespace, '#'))); // Document markers: `---` (directives end / document begin) and `...` (document end). 
Both must be -// followed by whitespace or EOL — `---foo` / `...bar` are plain scalars, not markers — so the -// lookahead keeps the marker from stealing a plain scalar's leading dashes/dots. Scoped -// `entity.other.document.*` (the maintained-grammar convention) so the highlighter paints them as -// document structure, not as a string. +// at the START of a line (YAML §9.1.1 — a marker is column 0) AND followed by whitespace or EOL — +// `---foo` / `...bar` are plain scalars, and a `---` / `...` that OPENS A VALUE (`note: --- x`, +// `x: ... bar`) is string content, not a marker. The parser already constrains the markers to stream +// position structurally (DocStart / DocEnd are referenced only in the Stream grammar), so the CST is +// unchanged; the line-start `start()` anchor carries that same column-0 constraint into the FLAT +// derived highlighter, which otherwise retries the marker pattern at every token boundary and would +// scope a value-leading `---` as a document marker (monogram#23). Scoped `entity.other.document.*`. const docMarkerEnd = followedBy(altPattern(oneOf('\t', ' '), '\r', '\n', end())); -const DocStart = token(seq('---', docMarkerEnd), { scope: 'entity.other.document.begin' }); -const DocEnd = token(seq('...', docMarkerEnd), { scope: 'entity.other.document.end' }); +const DocStart = token(seq(start(), '---', docMarkerEnd), { scope: 'entity.other.document.begin' }); +const DocEnd = token(seq(start(), '...', docMarkerEnd), { scope: 'entity.other.document.end' }); // A `#` is a comment indicator only at line start or AFTER whitespace (YAML §6.6); a `#` glued to a // non-space char is content, not a comment (`a#b` is a plain scalar, `%YAML 1.1#…` keeps the `#…` as // directive content — monogram#12 #8). 
The `notPrecededBy(nonWhitespace)` guard (a fixed-width, portable From f0d4ed249dba777122f27473a6141739bd5e330a Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 8 Jun 2026 17:04:32 +0800 Subject: [PATCH 2/4] Fix nested compact sequence sibling swallowed by plain fold (monogram#24) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the flat derived YAML TextMate highlighter, a nested compact sequence's sibling item (`- - a\n - b\n- c`) was wrongly swallowed by the preceding plain scalar's multi-line fold: the inner `- b` lost its `punctuation` (sequence indicator) scope and read as one `string.unquoted` token. The §2a' fold region is LINE-relative (its `\1` is the line's leading whitespace), but a YAML continuation is NODE-relative (more indented than the enclosing dash/key). For a single sequence or a mapping the two coincide, so those folds are correct; they diverge only for a COMPACT nested sequence, whose inner dash sits at column 2 after the outer `- ` prefix (not whitespace) — a sibling `- b` at column 2 reads to a `\1=""` fold as "indented past column 0" and is folded. Derive (from the grammar's block-sequence rule + indent config, gated on `grammar.indent`) a column-anchored COMPACT block-sequence region (gen-tm §2c): a `\G`-anchored begin/while — re-anchored each line by the meta.stream wrapper — that reclaims the inner sibling `- ` at the inner indicator's column before the plain fold can swallow it. It mirrors the maintained RedCMD YAML grammar's block-sequence but uses only a FIXED-width compact re-anchor lookbehind (`(?<=[-?:])`), portable under Onigmo (RedCMD's variable-length `(? [r.name, r] as const)); + const headSinglePunct = (e: RuleExpr): string | null => + e.type === 'literal' && e.value.length === 1 && !/[\w\s]/.test(e.value) ? e.value : null; + // The item rule's indicator: unwrap a ref to a rule whose body's first seq element is a 1-char punct. 
+ const itemIndicator = (e: RuleExpr): string | null => { + let body = e.type === 'ref' ? ruleByName.get(e.name)?.body : e; + if (!body) return null; + // a rule body written `[[...]]` is a single-alt seq; unwrap a lone-arm alt + if (body.type === 'alt' && body.items.length === 1) body = body.items[0]; + return body.type === 'seq' ? headSinglePunct(body.items[0]) : null; + }; + let indicator: string | null = null; + const visit = (e: RuleExpr): void => { + if (e.type === 'seq') { + // `[item, (Newline item)*]`: first element + a `*`/`+` over a `[Newline, item]` seq + if (e.items.length >= 2) { + const head = e.items[0]; + const q = e.items[1]; + if (q.type === 'quantifier' && (q.kind === '*' || q.kind === '+') && q.body.type === 'seq' + && q.body.items.length >= 2 && q.body.items[0].type === 'ref' && q.body.items[0].name === newlineToken) { + const ind = itemIndicator(head); + // the repeated element's item must share the head's indicator (a homogeneous sequence) + if (ind && itemIndicator(q.body.items[1]) === ind) indicator = ind; + } + } + e.items.forEach(visit); + } else if (e.type === 'alt') e.items.forEach(visit); + else if (e.type === 'quantifier' || e.type === 'group' || e.type === 'not') visit(e.body); + else if (e.type === 'sep') visit(e.element); + }; + for (const r of grammar.rules) visit(r.body); + return indicator ? { indicator } : null; +} + // ── Flow-collection detection (YAML `{ … }` mapping / `[ … ]` sequence) ── // // A flat per-token grammar cannot scope a flow MAPPING's keys: in `{ a: 1 }` the `a` is a key @@ -5088,6 +5141,70 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra }); topPatterns.push({ include: '#plain-bare-fold' }); } + + // ── 2c. 
COMPACT block-sequence — a column-anchored region for nested `- - …` (monogram#24) ── + // The §2a′/§2a″ plain folds are LINE-relative (their `\1` is the line's leading whitespace), but + // a YAML continuation is NODE-relative (more indented than the ENCLOSING dash/key). For a single + // sequence (`- a`) or a mapping (`x: y`) the two coincide — the dash/key sits at column 0 = the + // line indent — so those folds are correct. They DIVERGE only for a COMPACT nested sequence + // (`- - a`): the inner sequence's dash is at column 2 (after the outer `- ` prefix, which is NOT + // whitespace), so a sibling `- b` at column 2 reads — to a line-relative `\1=""` fold — as "indented + // past column 0", and is wrongly folded into the plain scalar `a`. The decider is the inner + // indicator's column, which no `\1`-relative backref can express (the prefix is `- `, not spaces) and + // no possessive `[ \t]++` can split from the deeper-fold case `x: y\n - b` (same line, must fold). + // + // The fix mirrors the maintained RedCMD YAML grammar's block-sequence: a `\G`-anchored region whose + // rule stack carries the indent depth. The meta.stream wrapper re-anchors `\G` at every line, so the + // region's `( *+)`-captured column (`\1\2`) lets its `while` reclaim a same-column sibling `- ` while + // a DEEPER line stays folded into the item value. We emit it ONLY for the COMPACT case (a dash + // followed by ANOTHER dash on the same line) — `begin … (?=[\t ]+${dash}[\t ])` — so a single `- a`, + // a `- key: v` mapping item, a `- {…}`/`- "…"`/`- |` value, etc. are UNTOUCHED (still handled by the + // top-level token includes + the §2a′ fold), confining this region to exactly the bug's shape. 
The + // compact re-anchor `(?=((?<=${reanchor}) )?+)` (a FIXED-width lookbehind — portable, unlike RedCMD's + // variable-length `(?[\\t ]++|\\G)#)', name: plainContent, patterns: [ + { match: '\\G[\\t ]++', name: plainContent }, { match: '[\\t ]++$', name: plainContent }] }, + { begin: '(?!\\G)', while: '\\G', patterns: commentIncludeKeys.map(k => ({ include: `#${k}` })) }, + ], + }; + // The region SHELL (begin/while/captures); its body `patterns` is filled at the END (after the + // top-level dispatch is built + ordered), since the item content reuses that full dispatch. + repository['block-sequence'] = { + begin: `(?=((?<=${reanchor}) )?+)\\G( *+)(${dash})(?=[\\t ]+${dash}[\\t ])`, + beginCaptures: { '3': { name: `punctuation.${langName}` } }, + while: `\\G(?>(\\1\\2)(?=[\\t ]*${dash}[\\t ]|[\\t ]*${dash}$)|(?!\\1\\2)([\\t ]+)(?=[^\\t\\r\\n#])|[\\t ]*$)`, + patterns: [], + }; + topPatterns.push({ include: '#block-sequence' }); + } } } @@ -7691,6 +7808,13 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra // continuation as the KEY (entity.name.tag), so it must win for the `?` case; #plain-continuation // still handles `key:`/`- ` folds (its lookahead, unlike this one, is not pinned to the `?`). if (key === 'explicit-key-continuation') return 0.68; + // The COMPACT block-sequence region (§2c) must out-rank #plain-continuation (0.7): both open on a + // `- `-led header (the `-` is in compactCls, so the fold's lookahead matches a compact line too), but + // for `- - a` the sequence region bounds the inner sibling at the inner indicator's column while the + // fold would swallow it line-relative. Ranked above the fold so the compact case is claimed first; + // its begin requires a SECOND dash (`(?=[\t ]+-[\t ])`), so a non-compact `- a`/`- key:`/`x: y` line + // never matches it and still falls through to #plain-continuation. (monogram#24.) 
+ if (key === 'block-sequence') return 0.69; if (key === 'plain-continuation') return 0.7; // The BARE plain-scalar same-column fold (§2a″) likewise begins AT LINE START and must out-rank the // scalar tokens (#key/#num/#boolnull/#plain ≥ 0.8) so it opens on a bare value scalar and claims its @@ -7768,6 +7892,21 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra .sort((a, b) => scopeOrder(a) - scopeOrder(b)) .map(include => ({ include })); + // Fill the COMPACT block-sequence region's body (§2c). Its item content reuses the FULL top-level + // dispatch (so a `- - {…}` flow value, a `- - "x"` quoted value, a `- - key: v` nested mapping, a + // `- - |` block scalar are all scoped correctly) — the same ordered includes meta.stream wraps — with + // two changes: the two LINE-relative plain folds (§2a′/§2a″) are REMOVED (the sequence's own + // column-anchored `while` + the bounded #block-plain-item handle the item-value fold node-relatively; + // leaving them in would re-introduce the line-relative swallow this region exists to prevent), and the + // bounded #block-plain-item is appended for a bare plain item value. The self-include (#block-sequence, + // already in the ordered list at rank 0.69) gives deeper compact nesting (`- - - x`). + if (repository['block-sequence']) { + repository['block-sequence'].patterns = [ + ...orderedPatterns.filter(p => p.include !== '#plain-continuation' && p.include !== '#plain-bare-fold'), + { include: '#block-plain-item' }, + ]; + } + // Additive: a `#expression` sub-grammar for expression-only embeds (Vue `{{ }}`). The // top-level `patterns` (orderedPatterns / $self) are left untouched, so standalone // tokenization is unchanged — `#expression` is inert unless something includes it. 
diff --git a/test/yaml-depth-witnesses.ts b/test/yaml-depth-witnesses.ts index 39d0122..dae4bbe 100644 --- a/test/yaml-depth-witnesses.ts +++ b/test/yaml-depth-witnesses.ts @@ -85,9 +85,11 @@ const cases: Case[] = [ // ── indent STACK (monogram#24): a nested compact sequence sibling vs a plain-scalar fold. The `-` on // the indented line is a sequence indicator when a sequence is established at that column, but // folds into the preceding plain scalar otherwise — same surface, opposite answer, decided only by - // the indent stack a flat grammar lacks. KNOWN BUG until gen-tm derives indent-tracking regions. + // the indent stack a flat grammar lacks. FIXED by gen-tm §2c: a column-anchored COMPACT + // block-sequence region whose `\G`-anchored `while` (re-anchored each line by meta.stream) reclaims + // the inner sibling `- ` at the inner indicator's column before the §2a′ fold can swallow it. { state: 'indent stack (sibling vs fold)', input: '- - a\n - b\n- c\n', find: '- b', off: 0, want: 'punctuation', - notWant: 'string', note: 'inner-sequence sibling `-` is punctuation, not folded into a plain scalar', knownBug: true }, + notWant: 'string', note: 'inner-sequence sibling `-` is punctuation, not folded into a plain scalar' }, // the counter-proof — SAME indented `- b` line, but here it MUST fold (no sequence at column 2). This // is asserted (not a known bug): the eventual #24 fix must keep this one folding. 
{ state: 'indent stack (counter-proof)', input: 'x: hello\n - b\n', find: '- b', want: 'string', diff --git a/yaml.tmLanguage.json b/yaml.tmLanguage.json index 50a63d3..ad81512 100644 --- a/yaml.tmLanguage.json +++ b/yaml.tmLanguage.json @@ -80,6 +80,9 @@ { "include": "#explicit-key-continuation" }, + { + "include": "#block-sequence" + }, { "include": "#plain-continuation" }, @@ -237,6 +240,9 @@ { "include": "#explicit-key-continuation" }, + { + "include": "#block-sequence" + }, { "include": "#plain-continuation" }, @@ -1893,6 +1899,197 @@ } ] }, + "block-plain-item": { + "begin": "(?=(?:[^\\t\\n\\f\\r \\-?:,\\[\\]{}#&*!|>'\"%@`]|[\\-?:](?=[^\\t\\n\\f\\r ,\\[\\]{}]))(?:[^:#\\n,\\[\\]{}]|:(?=[^\\t\\n\\f\\r ,\\]}])|#(?<=[^\\t\\n\\f\\r ]#))*)(?!(?:#|-[\\t ]|\\?[\\t ]|[^\\n\\[{\\]}]*?:(?:[\\t ]|$)))", + "while": "\\G", + "patterns": [ + { + "begin": "\\G", + "end": "(?=(?>[\\t ]++|\\G)#)", + "name": "string.unquoted.yaml", + "patterns": [ + { + "match": "\\G[\\t ]++", + "name": "string.unquoted.yaml" + }, + { + "match": "[\\t ]++$", + "name": "string.unquoted.yaml" + } + ] + }, + { + "begin": "(?!\\G)", + "while": "\\G", + "patterns": [ + { + "include": "#comment" + } + ] + } + ] + }, + "block-sequence": { + "begin": "(?=((?<=[\\-?:]) )?+)\\G( *+)(-)(?=[\\t ]+-[\\t ])", + "beginCaptures": { + "3": { + "name": "punctuation.yaml" + } + }, + "while": "\\G(?>(\\1\\2)(?=[\\t ]*-[\\t ]|[\\t ]*-$)|(?!\\1\\2)([\\t ]+)(?=[^\\t\\r\\n#])|[\\t ]*$)", + "patterns": [ + { + "include": "#comment" + }, + { + "include": "#blockscalar-explicit-seq-1" + }, + { + "include": "#blockscalar-explicit-seq-2" + }, + { + "include": "#blockscalar-explicit-seq-3" + }, + { + "include": "#blockscalar-explicit-seq-4" + }, + { + "include": "#blockscalar-explicit-seq-5" + }, + { + "include": "#blockscalar-explicit-seq-6" + }, + { + "include": "#blockscalar-explicit-seq-7" + }, + { + "include": "#blockscalar-explicit-seq-8" + }, + { + "include": "#blockscalar-explicit-seq-9" + }, + { + "include": 
"#blockscalar-seq" + }, + { + "include": "#blockscalar-key" + }, + { + "include": "#blockscalar-explicit-1" + }, + { + "include": "#blockscalar-explicit-2" + }, + { + "include": "#blockscalar-explicit-3" + }, + { + "include": "#blockscalar-explicit-4" + }, + { + "include": "#blockscalar-explicit-5" + }, + { + "include": "#blockscalar-explicit-6" + }, + { + "include": "#blockscalar-explicit-7" + }, + { + "include": "#blockscalar-explicit-8" + }, + { + "include": "#blockscalar-explicit-9" + }, + { + "include": "#blockscalar-doc" + }, + { + "include": "#blockscalar" + }, + { + "include": "#explicit-key-continuation" + }, + { + "include": "#block-sequence" + }, + { + "include": "#explicit-key" + }, + { + "include": "#explicit-key-indicator" + }, + { + "include": "#flow-sequence" + }, + { + "include": "#flow-mapping" + }, + { + "include": "#dquotekey" + }, + { + "include": "#squotekey" + }, + { + "include": "#key" + }, + { + "include": "#docstart" + }, + { + "include": "#docend" + }, + { + "include": "#dquote" + }, + { + "include": "#squote" + }, + { + "include": "#yamldirective" + }, + { + "include": "#directive" + }, + { + "include": "#tag" + }, + { + "include": "#directive-malformed" + }, + { + "include": "#num" + }, + { + "include": "#boolnull" + }, + { + "include": "#plain" + }, + { + "include": "#punctuation" + }, + { + "include": "#alias" + }, + { + "include": "#indent" + }, + { + "include": "#dedent" + }, + { + "include": "#newline" + }, + { + "include": "#anchor" + }, + { + "include": "#block-plain-item" + } + ] + }, "explicit-key": { "match": "(\\?)([\\t ]+)(?:(?:(&[^\\t\\n\\f\\r \\[\\]{},]+)|(!(?:<[^>]*>|[^\\t\\n\\f\\r \\[\\]{},]*)))[\\t ]+)*((?:[^\\t\\n\\f\\r \\-?:,\\[\\]{}#&*!|>'\"%@`]|[\\-?:](?=[^\\t\\n\\f\\r ,\\[\\]{}]))(?:[^:#\\n,\\[\\]{}]|:(?=[^\\t\\n\\f\\r ,\\]}])|#(?<=[^\\t\\n\\f\\r ]#))*)", "captures": { From 77e6f18d4a4c736d999f68d5337cbf871039e26d Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 8 Jun 2026 17:07:34 +0800 Subject: [PATCH 3/4] 
Track the #24 fix's deeper-irregular-indent residual as a known-bug witness --- test/yaml-depth-witnesses.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/yaml-depth-witnesses.ts b/test/yaml-depth-witnesses.ts index dae4bbe..c42bf8a 100644 --- a/test/yaml-depth-witnesses.ts +++ b/test/yaml-depth-witnesses.ts @@ -94,6 +94,14 @@ const cases: Case[] = [ // is asserted (not a known bug): the eventual #24 fix must keep this one folding. { state: 'indent stack (counter-proof)', input: 'x: hello\n - b\n', find: '- b', want: 'string', note: 'plain-scalar continuation — `- b` folds (no sequence established at column 2)' }, + // honest residual of the #24 fix: a `-`-led continuation indented STRICTLY DEEPER than the inner + // indicator (`- - a\n - b` = `[["a - b"]]` — the deeper `- b` folds into the scalar `a`) keeps its + // `-` as punctuation instead of folding it. The compact-only region reclaims a SAME-column sibling; + // distinguishing a deeper `-` (fold) from a same-column `-` (sibling) needs the full node-relative + // rule-stack RedCMD threads through every level — a flat-grammar-wide rework, not the #24 report (this + // irregular-indent shape is absent from yaml-test-suite). Tracked so the fix is not over-claimed. 
+ { state: 'indent stack (deeper-irregular fold)', input: '- - a\n - b\n', find: '- b', want: 'string', + notWant: 'punctuation', note: 'deeper-than-inner `- b` should fold into the plain scalar', knownBug: true }, ]; let pass = 0, knownBugs = 0, regressions = 0; From b9e0cf61ee47c49de9cabe1e7f589bcc8caeab4c Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 8 Jun 2026 18:03:42 +0800 Subject: [PATCH 4/4] Fold deeper-than-inner compact-sequence continuations (monogram#24 residual) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The #24 fix's COMPACT block-sequence region (gen-tm §2c) reclaimed a dash at ANY depth as a sibling (its while arm 1 used `[\t ]*${dash}`), so a `-`-led continuation indented STRICTLY DEEPER than the inner indicator kept its `-` scoped `punctuation` instead of folding into the plain scalar — `- - a\n - b` is `[["a - b"]]` (the deeper `- b` is plain content, not an item), the one residual the witness tracked as a known-bug. Pin the inner indicator's column PORTABLY instead of matching any depth. The begin captures the indicator run between the outer and inner dash as group 4, so the while reconstructs the inner column as `\1\2 \4` (outer indent + the dash's own column + the captured run — a multi-space compact `- - x` pins correctly too): arm 1 reclaims a dash AT EXACTLY that column (a sibling -> punctuation), arm 2 is a zero-width lookahead that keeps the region alive on a strictly-deeper line so a nested deeper #block-sequence (re-opened per compact level) gets first claim on its own sibling, and a deeper line that opens no nested sequence is folded by a new body rule #block-fold (`^([\t ]+)…(plain run)`, anchored at line start so it never fires on the header line's inline inner item; excludes a comment or a deeper `key:` so a mapping item value's deeper entry keeps its structure). 
No variable-length lookbehind, so it stays portable under Onigmo / GitHub-Linguist (RedCMD achieves the same semantics only with a rejected variable-length lookbehind). The depth-witnesses deeper-irregular case is now an asserted pass (the column-aligned sibling and counter-proof still hold; a deeper-NESTED sibling `- - - a\n - b` stays punctuation). Highlighter-only and gated on `grammar.indent`: scope-gap stays 100% / 0 monogram-wrong, parser src-coverage 100%, the six other grammars + tree-sitter regenerate byte-identical, and portability / issue#12 / sanity / agnostic are unchanged. --- src/gen-tm.ts | 90 ++++++++++++++++++++++++------------ test/yaml-depth-witnesses.ts | 14 +++--- yaml.tmLanguage.json | 15 +++++- 3 files changed, 81 insertions(+), 38 deletions(-) diff --git a/src/gen-tm.ts b/src/gen-tm.ts index 800ecf5..29ceebd 100644 --- a/src/gen-tm.ts +++ b/src/gen-tm.ts @@ -4775,6 +4775,7 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra // the Anchor/Tag tokens, not the indent config) and stays inline. const ind = grammar.indent!; const cmtLit = escapeRegex(ind.comment ?? '#'); + const cmtCc = escapeForCharClass(ind.comment ?? '#'); // the comment introducer, char-class-escaped const compactAlt = (ind.compactIndicators ?? []).map((c) => `${escapeRegex(c)}[\\t ]`).join('|'); const compactCls = `[${(ind.compactIndicators ?? []).map(escapeForCharClass).join('')}]`; const docAlt = (blockScalar.documentMarkers ?? []).map(escapeRegex).join('|'); @@ -5154,28 +5155,33 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra // no possessive `[ \t]++` can split from the deeper-fold case `x: y\n - b` (same line, must fold). // // The fix mirrors the maintained RedCMD YAML grammar's block-sequence: a `\G`-anchored region whose - // rule stack carries the indent depth. 
The meta.stream wrapper re-anchors `\G` at every line, so the - // region's `( *+)`-captured column (`\1\2`) lets its `while` reclaim a same-column sibling `- ` while - // a DEEPER line stays folded into the item value. We emit it ONLY for the COMPACT case (a dash - // followed by ANOTHER dash on the same line) — `begin … (?=[\t ]+${dash}[\t ])` — so a single `- a`, - // a `- key: v` mapping item, a `- {…}`/`- "…"`/`- |` value, etc. are UNTOUCHED (still handled by the - // top-level token includes + the §2a′ fold), confining this region to exactly the bug's shape. The - // compact re-anchor `(?=((?<=${reanchor}) )?+)` (a FIXED-width lookbehind — portable, unlike RedCMD's - // variable-length `(?(\\1\\2)(?=[\\t ]*${dash}[\\t ]|[\\t ]*${dash}$)|(?!\\1\\2)([\\t ]+)(?=[^\\t\\r\\n#])|[\\t ]*$)`, + while: `\\G(?>(\\1\\2 \\4)(?=${dash}[\\t ]|${dash}$)|(?=\\1\\2 \\4[\\t ])|[\\t ]*$)`, patterns: [], }; + // A deeper line (kept alive by the `while`'s arm 2) that is NOT a nested sibling folds into the + // current item's scalar. Anchored at LINE START (`^`), so it NEVER fires on the header line's inline + // inner item (which sits past column 0, after the outer `- `): only a continuation line begins at + // column 0. A leading `#` (a whitespace-preceded comment) is excluded so it falls to #comment, and + // `foldExclude` excludes a deeper KEY line (`: `) so a mapping ITEM VALUE's deeper entry + // (`- - a: 1\n b: 2`) keeps its #key structure instead of folding — the exclusion DROPS the + // compact indicators from `structAhead`, since a deeper `- b` (no sequence at its column) IS a fold + // (`- - a\n - b` = `[["a - b"]]`), the whole point of this rule. The body is one opaque plain run + // stopping before an inline ` #` (same idiom as the §2a′ continuation / §2a″ bareCont). Listed in + // the region body right AFTER the self-include so a deeper COMPACT line opens a nested + // #block-sequence instead of folding. (monogram#24 deeper residual.) 
+ const foldExclude = `(?:${cmtLit}|${flowEx}*?${kvSep}(?:[\\t ]|$))`; + repository['block-fold'] = { + match: `^([\\t ]+)(?=[^\\t\\r\\n${cmtCc}])(?!${foldExclude})((?:[^${cmtCc}\\n]|${cmtLit}(?<=[^\\t\\n\\f\\r ]${cmtLit}))*)`, + captures: { '2': { name: plainContent } }, + }; topPatterns.push({ include: '#block-sequence' }); } } @@ -7899,12 +7926,17 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra // column-anchored `while` + the bounded #block-plain-item handle the item-value fold node-relatively; // leaving them in would re-introduce the line-relative swallow this region exists to prevent), and the // bounded #block-plain-item is appended for a bare plain item value. The self-include (#block-sequence, - // already in the ordered list at rank 0.69) gives deeper compact nesting (`- - - x`). + // already in the ordered list at rank 0.69) gives deeper compact nesting (`- - - x`); #block-fold is + // spliced in right AFTER it (region-body-only — never a top-level include) so a deeper line that opens no + // nested sequence folds into the item's plain scalar (monogram#24 deeper residual), while a deeper + // COMPACT line still re-opens #block-sequence first. if (repository['block-sequence']) { - repository['block-sequence'].patterns = [ - ...orderedPatterns.filter(p => p.include !== '#plain-continuation' && p.include !== '#plain-bare-fold'), - { include: '#block-plain-item' }, - ]; + const body = orderedPatterns.filter(p => p.include !== '#plain-continuation' && p.include !== '#plain-bare-fold'); + if (repository['block-fold']) { + const selfAt = body.findIndex(p => p.include === '#block-sequence'); + if (selfAt >= 0) body.splice(selfAt + 1, 0, { include: '#block-fold' }); + } + repository['block-sequence'].patterns = [...body, { include: '#block-plain-item' }]; } // Additive: a `#expression` sub-grammar for expression-only embeds (Vue `{{ }}`). 
The diff --git a/test/yaml-depth-witnesses.ts b/test/yaml-depth-witnesses.ts index c42bf8a..5eb7922 100644 --- a/test/yaml-depth-witnesses.ts +++ b/test/yaml-depth-witnesses.ts @@ -94,14 +94,14 @@ const cases: Case[] = [ // is asserted (not a known bug): the eventual #24 fix must keep this one folding. { state: 'indent stack (counter-proof)', input: 'x: hello\n - b\n', find: '- b', want: 'string', note: 'plain-scalar continuation — `- b` folds (no sequence established at column 2)' }, - // honest residual of the #24 fix: a `-`-led continuation indented STRICTLY DEEPER than the inner - // indicator (`- - a\n - b` = `[["a - b"]]` — the deeper `- b` folds into the scalar `a`) keeps its - // `-` as punctuation instead of folding it. The compact-only region reclaims a SAME-column sibling; - // distinguishing a deeper `-` (fold) from a same-column `-` (sibling) needs the full node-relative - // rule-stack RedCMD threads through every level — a flat-grammar-wide rework, not the #24 report (this - // irregular-indent shape is absent from yaml-test-suite). Tracked so the fix is not over-claimed. + // a `-`-led continuation indented STRICTLY DEEPER than the inner indicator (`- - a\n - b` = + // `[["a - b"]]` — the deeper `- b` folds into the scalar `a`) folds its `-` as plain content. Resolved by + // §2c pinning the inner column portably (`\1\2 \4`: outer indent + the dash's own column + the captured + // indicator run) so the `while` reclaims ONLY a same-column sibling, with a deeper line folded by the + // body's #block-fold rule. A deeper-NESTED sibling (`- - - a\n - b`) still scopes `punctuation` (its + // own level's region reclaims it) — distinguished by the rule-stack, not a variable-length lookbehind. 
{ state: 'indent stack (deeper-irregular fold)', input: '- - a\n - b\n', find: '- b', want: 'string', - notWant: 'punctuation', note: 'deeper-than-inner `- b` should fold into the plain scalar', knownBug: true }, + notWant: 'punctuation', note: 'deeper-than-inner `- b` should fold into the plain scalar' }, ]; let pass = 0, knownBugs = 0, regressions = 0; diff --git a/yaml.tmLanguage.json b/yaml.tmLanguage.json index ad81512..54c05f9 100644 --- a/yaml.tmLanguage.json +++ b/yaml.tmLanguage.json @@ -1930,13 +1930,13 @@ ] }, "block-sequence": { - "begin": "(?=((?<=[\\-?:]) )?+)\\G( *+)(-)(?=[\\t ]+-[\\t ])", + "begin": "(?=((?<=[\\-?:]) )?+)\\G( *+)(-)(?=([\\t ]+)-[\\t ])", "beginCaptures": { "3": { "name": "punctuation.yaml" } }, - "while": "\\G(?>(\\1\\2)(?=[\\t ]*-[\\t ]|[\\t ]*-$)|(?!\\1\\2)([\\t ]+)(?=[^\\t\\r\\n#])|[\\t ]*$)", + "while": "\\G(?>(\\1\\2 \\4)(?=-[\\t ]|-$)|(?=\\1\\2 \\4[\\t ])|[\\t ]*$)", "patterns": [ { "include": "#comment" @@ -2013,6 +2013,9 @@ { "include": "#block-sequence" }, + { + "include": "#block-fold" + }, { "include": "#explicit-key" }, @@ -2090,6 +2093,14 @@ } ] }, + "block-fold": { + "match": "^([\\t ]+)(?=[^\\t\\r\\n#])(?!(?:#|[^\\n\\[{\\]}]*?:(?:[\\t ]|$)))((?:[^#\\n]|#(?<=[^\\t\\n\\f\\r ]#))*)", + "captures": { + "2": { + "name": "string.unquoted.yaml" + } + } + }, "explicit-key": { "match": "(\\?)([\\t ]+)(?:(?:(&[^\\t\\n\\f\\r \\[\\]{},]+)|(!(?:<[^>]*>|[^\\t\\n\\f\\r \\[\\]{},]*)))[\\t ]+)*((?:[^\\t\\n\\f\\r \\-?:,\\[\\]{}#&*!|>'\"%@`]|[\\-?:](?=[^\\t\\n\\f\\r ,\\[\\]{}]))(?:[^:#\\n,\\[\\]{}]|:(?=[^\\t\\n\\f\\r ,\\]}])|#(?<=[^\\t\\n\\f\\r ]#))*)", "captures": {