diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c4ba0d5..4eaafd0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -56,6 +56,7 @@ jobs: node test/vue-embed-boundary.ts node test/vue-interp-expr.ts node test/yaml-issue12-regressions.ts + node test/yaml-depth-witnesses.ts # The derived tree-sitter highlighter is the strongest thesis proof (a real GLR # parser from the same grammar, beating the official hand-written one). Build its diff --git a/src/gen-lexer.ts b/src/gen-lexer.ts index b773558..bf44172 100644 --- a/src/gen-lexer.ts +++ b/src/gen-lexer.ts @@ -1,6 +1,6 @@ import type { CstGrammar } from './types.ts'; import { collectLiterals, isKeywordLiteral } from './grammar-utils.ts'; -import { tokenBlockPatternFirstCharSet, tokenBlockPatternSource, tokenEscapeValidPatternSource, tokenPatternFirstCharSet, tokenPatternSource } from './token-pattern.ts'; +import { tokenBlockPatternFirstCharSet, tokenBlockPatternSource, tokenEscapeValidPatternSource, tokenPatternFirstCharSet, tokenPatternHasStartAnchor, tokenPatternSource } from './token-pattern.ts'; // A lexer token: a declared token (type = its name) or a punctuation literal (type = ''). // `$templateHead/$templateMiddle/$templateTail` are synthetic types the lexer emits for @@ -45,10 +45,16 @@ export function createLexer(grammar: CstGrammar) { const tokenMatchers = grammar.tokens.map(t => { const pattern = tokenPatternSource(t); const blockPattern = tokenBlockPatternSource(t); + // A token whose pattern carries a line-START anchor (`start()` → `^`, e.g. YAML's `---`/`...` + // document markers, a shebang) needs the `m` flag: under the sticky `y` matcher a bare `^` + // matches only at index 0 (file start), so a marker at the start of a LATER line (`# c\n---\n…`, + // `%TAG …\n---\n…`) would fail to lex. With `m`, `^` matches at every line start, so a sticky + // match at `lastIndex = pos` succeeds iff `pos` is a line start — exactly `start()`'s meaning. 
+ const flags = tokenPatternHasStartAnchor(t) ? 'ym' : 'y'; return { name: t.name, - regex: new RegExp(`(?:${pattern})`, 'y'), - blockRegex: blockPattern ? new RegExp(`(?:${blockPattern})`, 'y') : null, + regex: new RegExp(`(?:${pattern})`, flags), + blockRegex: blockPattern ? new RegExp(`(?:${blockPattern})`, flags) : null, skip: t.flags.includes('skip'), isRegex: t.flags.includes('regex'), isString: !!t.string, diff --git a/src/gen-tm.ts b/src/gen-tm.ts index 18de676..29ceebd 100644 --- a/src/gen-tm.ts +++ b/src/gen-tm.ts @@ -3409,6 +3409,59 @@ function detectExplicitKey(grammar: CstGrammar): { indicator: string; keyScope: return { indicator, keyScope, keyBody, prefixGroups }; } +// ── Block-sequence detection (YAML `- item` block sequence) ── +// +// A flat per-line grammar cannot tell a COMPACT nested sequence's sibling from a plain-scalar fold: +// `- - a\n - b` (the ` - b` is the INNER sequence's 2nd item) and `x: y\n - b` (the ` - b` folds +// into the plain scalar `y`) share the same continuation line but get OPPOSITE answers — the decider +// is the ENCLOSING node column (an inner sequence at column 2 vs a mapping at column 0), which the +// line-relative `\1` of the §2a′ fold region cannot see (both lines start at column 0). monogram#24. +// The fix is a column-anchored region (§2c below) for the COMPACT block sequence, whose `\G`-anchored +// `while` (re-anchored each line by the meta.stream wrapper) reclaims a sibling `- ` at the inner +// indicator's column BEFORE the plain-scalar fold can swallow it. This detects the block-sequence rule +// + its indicator literal from the grammar so the region is DERIVED, not hardcoded. +// +// Signal (all from the grammar): a rule whose body is `[item, (Newline item)*]` — an item then a +// same-column-NEWLINE-separated run of further items — where `item` is a `ref` to a rule whose body is +// a `seq` headed by a SINGLE-char punctuation literal (the sequence indicator `-`). 
Returns the +// indicator literal, or null when the family has no such block sequence (every non-YAML grammar). +function detectBlockSequence(grammar: CstGrammar): { indicator: string } | null { + if (!grammar.indent) return null; + const { newlineToken } = grammar.indent; + const ruleByName = new Map(grammar.rules.map(r => [r.name, r] as const)); + const headSinglePunct = (e: RuleExpr): string | null => + e.type === 'literal' && e.value.length === 1 && !/[\w\s]/.test(e.value) ? e.value : null; + // The item rule's indicator: unwrap a ref to a rule whose body's first seq element is a 1-char punct. + const itemIndicator = (e: RuleExpr): string | null => { + let body = e.type === 'ref' ? ruleByName.get(e.name)?.body : e; + if (!body) return null; + // a rule body written `[[...]]` is a single-alt seq; unwrap a lone-arm alt + if (body.type === 'alt' && body.items.length === 1) body = body.items[0]; + return body.type === 'seq' ? headSinglePunct(body.items[0]) : null; + }; + let indicator: string | null = null; + const visit = (e: RuleExpr): void => { + if (e.type === 'seq') { + // `[item, (Newline item)*]`: first element + a `*`/`+` over a `[Newline, item]` seq + if (e.items.length >= 2) { + const head = e.items[0]; + const q = e.items[1]; + if (q.type === 'quantifier' && (q.kind === '*' || q.kind === '+') && q.body.type === 'seq' + && q.body.items.length >= 2 && q.body.items[0].type === 'ref' && q.body.items[0].name === newlineToken) { + const ind = itemIndicator(head); + // the repeated element's item must share the head's indicator (a homogeneous sequence) + if (ind && itemIndicator(q.body.items[1]) === ind) indicator = ind; + } + } + e.items.forEach(visit); + } else if (e.type === 'alt') e.items.forEach(visit); + else if (e.type === 'quantifier' || e.type === 'group' || e.type === 'not') visit(e.body); + else if (e.type === 'sep') visit(e.element); + }; + for (const r of grammar.rules) visit(r.body); + return indicator ? 
{ indicator } : null; +} + // ── Flow-collection detection (YAML `{ … }` mapping / `[ … ]` sequence) ── // // A flat per-token grammar cannot scope a flow MAPPING's keys: in `{ a: 1 }` the `a` is a key @@ -4722,6 +4775,7 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra // the Anchor/Tag tokens, not the indent config) and stays inline. const ind = grammar.indent!; const cmtLit = escapeRegex(ind.comment ?? '#'); + const cmtCc = escapeForCharClass(ind.comment ?? '#'); // the comment introducer, char-class-escaped const compactAlt = (ind.compactIndicators ?? []).map((c) => `${escapeRegex(c)}[\\t ]`).join('|'); const compactCls = `[${(ind.compactIndicators ?? []).map(escapeForCharClass).join('')}]`; const docAlt = (blockScalar.documentMarkers ?? []).map(escapeRegex).join('|'); @@ -5088,6 +5142,96 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra }); topPatterns.push({ include: '#plain-bare-fold' }); } + + // ── 2c. COMPACT block-sequence — a column-anchored region for nested `- - …` (monogram#24) ── + // The §2a′/§2a″ plain folds are LINE-relative (their `\1` is the line's leading whitespace), but + // a YAML continuation is NODE-relative (more indented than the ENCLOSING dash/key). For a single + // sequence (`- a`) or a mapping (`x: y`) the two coincide — the dash/key sits at column 0 = the + // line indent — so those folds are correct. They DIVERGE only for a COMPACT nested sequence + // (`- - a`): the inner sequence's dash is at column 2 (after the outer `- ` prefix, which is NOT + // whitespace), so a sibling `- b` at column 2 reads — to a line-relative `\1=""` fold — as "indented + // past column 0", and is wrongly folded into the plain scalar `a`. The decider is the inner + // indicator's column, which no `\1`-relative backref can express (the prefix is `- `, not spaces) and + // no possessive `[ \t]++` can split from the deeper-fold case `x: y\n - b` (same line, must fold). 
+ // + // The fix mirrors the maintained RedCMD YAML grammar's block-sequence: a `\G`-anchored region whose + // rule stack carries the indent depth, RE-OPENED per compact level (its body self-includes + // #block-sequence, so `- - - a` nests three deep). The meta.stream wrapper re-anchors `\G` at every + // line, so each level's captures pin ITS OWN inner indicator column and reclaim only siblings AT that + // column. We emit it ONLY for the COMPACT case (a dash followed by ANOTHER dash on the same line) — + // `begin … (?=([\t ]+)${dash}[\t ])` — so a single `- a`, a `- key: v` mapping item, a `- {…}`/`- "…"`/ + // `- |` value, etc. are UNTOUCHED (still handled by the top-level token includes + the §2a′ fold), + // confining this region to exactly the bug's shape. The compact re-anchor `(?=((?<=${reanchor}) )?+)` + // (a FIXED-width lookbehind — portable, unlike RedCMD's variable-length `(?[\\t ]++|\\G)#)', name: plainContent, patterns: [ + { match: '\\G[\\t ]++', name: plainContent }, { match: '[\\t ]++$', name: plainContent }] }, + { begin: '(?!\\G)', while: '\\G', patterns: commentIncludeKeys.map(k => ({ include: `#${k}` })) }, + ], + }; + // The region SHELL (begin/while/captures); its body `patterns` is filled at the END (after the + // top-level dispatch is built + ordered), since the item content reuses that full dispatch. Group 4 + // (`([\t ]+)`, in the begin's lookahead) captures the indicator run between the outer and inner + // dashes, so the `while` can reconstruct the inner column as `\1\2 \4` (outer indent + the dash's + // own column + the run). Arm 1 reclaims a same-column sibling (`punctuation`); arm 2 is a zero-width + // lookahead that keeps the region alive on a strictly-deeper line (deferring to a nested level's + // sibling-reclaim, then to the body's #block-fold rule); arm 3 is a blank line. 
+ repository['block-sequence'] = { + begin: `(?=((?<=${reanchor}) )?+)\\G( *+)(${dash})(?=([\\t ]+)${dash}[\\t ])`, + beginCaptures: { '3': { name: `punctuation.${langName}` } }, + while: `\\G(?>(\\1\\2 \\4)(?=${dash}[\\t ]|${dash}$)|(?=\\1\\2 \\4[\\t ])|[\\t ]*$)`, + patterns: [], + }; + // A deeper line (kept alive by the `while`'s arm 2) that is NOT a nested sibling folds into the + // current item's scalar. Anchored at LINE START (`^`), so it NEVER fires on the header line's inline + // inner item (which sits past column 0, after the outer `- `): only a continuation line begins at + // column 0. The indent run is POSSESSIVE (`[\t ]++`): a plain `[\t ]+` can backtrack one space, and the next-char lookahead (which rejects tab/CR/LF/`#` but NOT a space) then sees that space instead of the `#`, so a comment-only line would match and its last indent space would be painted as plain content. With the run possessive, a leading `#` (a whitespace-preceded comment) is reliably excluded so it falls to #comment, and + // `foldExclude` excludes a deeper KEY line (`: `) so a mapping ITEM VALUE's deeper entry + // (`- - a: 1\n    b: 2`) keeps its #key structure instead of folding — the exclusion DROPS the + // compact indicators from `structAhead`, since a deeper `- b` (no sequence at its column) IS a fold + // (`- - a\n   - b` = `[["a - b"]]`), the whole point of this rule. The body is one opaque plain run + // stopping before an inline ` #` (same idiom as the §2a′ continuation / §2a″ bareCont). Listed in + // the region body right AFTER the self-include so a deeper COMPACT line opens a nested + // #block-sequence instead of folding. (monogram#24 deeper residual.) + const foldExclude = `(?:${cmtLit}|${flowEx}*?${kvSep}(?:[\\t ]|$))`; + repository['block-fold'] = { + match: `^([\\t ]++)(?=[^\\t\\r\\n${cmtCc}])(?!${foldExclude})((?:[^${cmtCc}\\n]|${cmtLit}(?<=[^\\t\\n\\f\\r ]${cmtLit}))*)`, + captures: { '2': { name: plainContent } }, + }; + topPatterns.push({ include: '#block-sequence' }); + } } } @@ -7691,6 +7835,13 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra // continuation as the KEY (entity.name.tag), so it must win for the `?` case; #plain-continuation // still handles `key:`/`- ` folds (its lookahead, unlike this one, is not pinned to the `?`). 
if (key === 'explicit-key-continuation') return 0.68; + // The COMPACT block-sequence region (§2c) must out-rank #plain-continuation (0.7): both open on a + // `- `-led header (the `-` is in compactCls, so the fold's lookahead matches a compact line too), but + // for `- - a` the sequence region bounds the inner sibling at the inner indicator's column while the + // fold would swallow it line-relative. Ranked above the fold so the compact case is claimed first; + // its begin requires a SECOND dash (`(?=[\t ]+-[\t ])`), so a non-compact `- a`/`- key:`/`x: y` line + // never matches it and still falls through to #plain-continuation. (monogram#24.) + if (key === 'block-sequence') return 0.69; if (key === 'plain-continuation') return 0.7; // The BARE plain-scalar same-column fold (§2a″) likewise begins AT LINE START and must out-rank the // scalar tokens (#key/#num/#boolnull/#plain ≥ 0.8) so it opens on a bare value scalar and claims its @@ -7768,6 +7919,26 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra .sort((a, b) => scopeOrder(a) - scopeOrder(b)) .map(include => ({ include })); + // Fill the COMPACT block-sequence region's body (§2c). Its item content reuses the FULL top-level + // dispatch (so a `- - {…}` flow value, a `- - "x"` quoted value, a `- - key: v` nested mapping, a + // `- - |` block scalar are all scoped correctly) — the same ordered includes meta.stream wraps — with + // two changes: the two LINE-relative plain folds (§2a′/§2a″) are REMOVED (the sequence's own + // column-anchored `while` + the bounded #block-plain-item handle the item-value fold node-relatively; + // leaving them in would re-introduce the line-relative swallow this region exists to prevent), and the + // bounded #block-plain-item is appended for a bare plain item value. 
The self-include (#block-sequence, + // already in the ordered list at rank 0.69) gives deeper compact nesting (`- - - x`); #block-fold is + // spliced in right AFTER it (region-body-only — never a top-level include) so a deeper line that opens no + // nested sequence folds into the item's plain scalar (monogram#24 deeper residual), while a deeper + // COMPACT line still re-opens #block-sequence first. + if (repository['block-sequence']) { + const body = orderedPatterns.filter(p => p.include !== '#plain-continuation' && p.include !== '#plain-bare-fold'); + if (repository['block-fold']) { + const selfAt = body.findIndex(p => p.include === '#block-sequence'); + if (selfAt >= 0) body.splice(selfAt + 1, 0, { include: '#block-fold' }); + } + repository['block-sequence'].patterns = [...body, { include: '#block-plain-item' }]; + } + // Additive: a `#expression` sub-grammar for expression-only embeds (Vue `{{ }}`). The // top-level `patterns` (orderedPatterns / $self) are left untouched, so standalone // tokenization is unchanged — `#expression` is inert unless something includes it. diff --git a/test/yaml-depth-witnesses.ts b/test/yaml-depth-witnesses.ts new file mode 100644 index 0000000..5eb7922 --- /dev/null +++ b/test/yaml-depth-witnesses.ts @@ -0,0 +1,130 @@ +// yaml-depth-witnesses.ts — a RAW-SCOPE regression gate for the flat YAML TextMate highlighter's +// depth/position sites. It exists because the scope-gap metric reported `monogramWrong=0` while real +// bugs (monogram#23/#24) sat in plain sight: that metric is corpus-bound (the witnesses aren't in +// yaml-test-suite) AND excludes lexical-floor roles (a `-` mis-painted as string is invisible because +// `punctuation` is floor-excluded and the `b` beside it grades correct). So a "0 wrong" headline never +// meant "no bug" — only "no bug my metric can see". 
+// +// THEOREM behind the cases: where a construct's correct scope depends on cross-line STATE the parser +// keeps in a stack (depth), and the derived TextMate grammar is flat (no stack), the set of inputs +// where they disagree is provably NON-EMPTY. So we don't wait for a corpus to surface these — we +// CONSTRUCT one witness per state field of the derived YAML scanner (indent stack, flow depth, +// block-scalar region, document-marker position, node-property lead) and assert the RAW inner scope at +// the position the depth decides. This is oracle-independent (a fixed expected scope) and floor-blind +// (it checks the punctuation/string class directly), so neither blind spot can hide a regression. +// +// Run (bare node): node test/yaml-depth-witnesses.ts +import { readFileSync } from 'node:fs'; +import { createRequire } from 'node:module'; +import vsctm from 'vscode-textmate'; +import onig from 'vscode-oniguruma'; + +const { INITIAL, Registry, parseRawGrammar } = vsctm; +const { loadWASM, OnigScanner, OnigString } = onig; +const require = createRequire(import.meta.url); +const bin = readFileSync(require.resolve('vscode-oniguruma/release/onig.wasm')); +await loadWASM(bin.buffer.slice(bin.byteOffset, bin.byteOffset + bin.byteLength)); +const reg = new Registry({ + onigLib: Promise.resolve({ createOnigScanner: (p: string[]) => new OnigScanner(p), createOnigString: (s: string) => new OnigString(s) }), + loadGrammar: async (sn: string) => sn === 'source.yaml' ? 
parseRawGrammar(readFileSync('yaml.tmLanguage.json', 'utf8'), 'y.json') : null, +}); +const grammar = (await reg.loadGrammar('source.yaml'))!; + +interface Tok { start: number; end: number; scopes: string[] } +function tokenize(text: string): Tok[] { + const toks: Tok[] = []; let rs = INITIAL, off = 0; + for (const line of text.split('\n')) { const r = grammar.tokenizeLine(line, rs); for (const t of r.tokens) toks.push({ start: off + t.startIndex, end: off + t.endIndex, scopes: t.scopes }); rs = r.ruleStack; off += line.length + 1; } + return toks; +} +function scopeAt(toks: Tok[], pos: number): string { + let lo = 0, hi = toks.length - 1, ans = -1; + while (lo <= hi) { const mid = (lo + hi) >> 1; if (toks[mid].start <= pos) { ans = mid; lo = mid + 1; } else hi = mid - 1; } + const s = ans >= 0 && toks[ans].end > pos ? toks[ans].scopes : []; + return s.length ? s[s.length - 1] : '(none)'; +} +// Locate the byte offset of `find` in `input` (optionally the n-th occurrence, 0-based). +function locate(input: string, find: string, nth = 0): number { + let i = -1; for (let k = 0; k <= nth; k++) { i = input.indexOf(find, i + 1); if (i < 0) throw new Error(`witness focus not found: ${JSON.stringify(find)}#${nth}`); } + return i; +} + +interface Case { + state: string; // the scanner state field this witness probes + input: string; + find: string; nth?: number; off?: number; // focus = nth occurrence of `find`, plus `off` chars + want?: string; // inner scope MUST start with this + notWant?: string; // inner scope MUST NOT start with this + note: string; + knownBug?: boolean; // a depth site not yet fixed in the flat derivation — tracked, not asserted +} + +const cases: Case[] = [ + // ── document-marker POSITION (monogram#23): a marker is column-0-only; a value-leading `---`/`...` + // is string content. Fixed by anchoring DocStart/DocEnd with start() (yaml.ts) + the lexer m-flag. 
+ { state: 'doc-marker position', input: 'note: --- not a marker\n', find: '---', want: 'string', + notWant: 'entity.other.document', note: 'value-leading `---` is string, not document.begin' }, + { state: 'doc-marker position', input: 'x: ... bar\n', find: '...', want: 'string', + notWant: 'entity.other.document', note: 'value-leading `...` is string, not document.end' }, + { state: 'doc-marker position', input: '- --- x\n', find: '---', want: 'string', + notWant: 'entity.other.document', note: 'sequence-item value-leading `---` is string' }, + // a LEGITIMATE column-0 marker must still scope as document structure (the fix must not over-correct) + { state: 'doc-marker position', input: '---\nkey: value\n', find: '---', want: 'entity.other.document', + note: 'a real column-0 `---` is still a document marker' }, + + // ── block-scalar REGION: inside `|`/`>` the body is literal text — `#`/`-` are NOT comment/indicator. + // Handled by the block-scalar begin/end region (a depth mechanism the flat grammar DOES carry). + { state: 'block-scalar region', input: 'a: |\n # literal\n x\n', find: '# literal', want: 'string', + notWant: 'comment', note: 'inside a block scalar `#` is text, not a comment' }, + { state: 'block-scalar region', input: 'a: |\n - literal\n x\n', find: '- literal', want: 'string', + notWant: 'punctuation', note: 'inside a block scalar `-` is text, not a sequence indicator' }, + + // ── flow DEPTH: outside flow, `,` and an inner `:` are plain-scalar content (block `{k:"a,b"}`). + { state: 'flow depth', input: 'k: a,b\n', find: ',b', want: 'string', + notWant: 'punctuation.separator', note: 'block plain scalar — `,` is content, not a flow separator' }, + + // ── indent STACK (monogram#24): a nested compact sequence sibling vs a plain-scalar fold. 
The `-` on + // the indented line is a sequence indicator when a sequence is established at that column, but + // folds into the preceding plain scalar otherwise — same surface, opposite answer, decided only by + // the indent stack a flat grammar lacks. FIXED by gen-tm §2c: a column-anchored COMPACT + // block-sequence region whose `\G`-anchored `while` (re-anchored each line by meta.stream) reclaims + // the inner sibling `- ` at the inner indicator's column before the §2a′ fold can swallow it. + { state: 'indent stack (sibling vs fold)', input: '- - a\n - b\n- c\n', find: '- b', off: 0, want: 'punctuation', + notWant: 'string', note: 'inner-sequence sibling `-` is punctuation, not folded into a plain scalar' }, + // the counter-proof — SAME indented `- b` line, but here it MUST fold (no sequence at column 2). This + // is asserted (not a known bug): the eventual #24 fix must keep this one folding. + { state: 'indent stack (counter-proof)', input: 'x: hello\n - b\n', find: '- b', want: 'string', + note: 'plain-scalar continuation — `- b` folds (no sequence established at column 2)' }, + // a `-`-led continuation indented STRICTLY DEEPER than the inner indicator (`- - a\n - b` = + // `[["a - b"]]` — the deeper `- b` folds into the scalar `a`) folds its `-` as plain content. Resolved by + // §2c pinning the inner column portably (`\1\2 \4`: outer indent + the dash's own column + the captured + // indicator run) so the `while` reclaims ONLY a same-column sibling, with a deeper line folded by the + // body's #block-fold rule. A deeper-NESTED sibling (`- - - a\n - b`) still scopes `punctuation` (its + // own level's region reclaims it) — distinguished by the rule-stack, not a variable-length lookbehind. 
+ { state: 'indent stack (deeper-irregular fold)', input: '- - a\n - b\n', find: '- b', want: 'string', + notWant: 'punctuation', note: 'deeper-than-inner `- b` should fold into the plain scalar' }, +]; + +let pass = 0, knownBugs = 0, regressions = 0; +for (const c of cases) { + const toks = tokenize(c.input); + const pos = locate(c.input, c.find, c.nth) + (c.off ?? 0); + const got = scopeAt(toks, pos).replace(/\.yaml$/, ''); + const okWant = c.want ? got.startsWith(c.want) : true; + const okNot = c.notWant ? !got.startsWith(c.notWant) : true; + const ok = okWant && okNot; + const expectStr = [c.want && `want ${c.want}*`, c.notWant && `not ${c.notWant}*`].filter(Boolean).join(', '); + if (c.knownBug) { + knownBugs++; + console.log(` ${ok ? '✓ FIXED' : '· known'} [${c.state}] ${JSON.stringify(c.input)} @«${c.find}» → «${got}» (${expectStr})`); + if (ok) console.log(` ↑ this known bug now PASSES — flip knownBug:false to lock it in.`); + } else if (ok) { + pass++; + console.log(` ✓ ok [${c.state}] @«${c.find}» → «${got}»`); + } else { + regressions++; + console.log(` ✗ FAIL [${c.state}] ${JSON.stringify(c.input)} @«${c.find}» → «${got}» — expected ${expectStr}`); + console.log(` ${c.note}`); + } +} +console.log(`\n ${pass} pass · ${knownBugs} known-bug (depth sites not yet derived) · ${regressions} regression`); +if (regressions > 0) { console.error('\nDEPTH WITNESS REGRESSION — a flat-highlighter depth/position site broke.'); process.exit(1); } diff --git a/yaml.tmLanguage.json b/yaml.tmLanguage.json index 6f1b41b..54c05f9 100644 --- a/yaml.tmLanguage.json +++ b/yaml.tmLanguage.json @@ -80,6 +80,9 @@ { "include": "#explicit-key-continuation" }, + { + "include": "#block-sequence" + }, { "include": "#plain-continuation" }, @@ -237,6 +240,9 @@ { "include": "#explicit-key-continuation" }, + { + "include": "#block-sequence" + }, { "include": "#plain-continuation" }, @@ -321,11 +327,11 @@ "repository": { "docstart": { "name": "entity.other.document.begin.yaml", - "match": 
"---(?=[\\t ]|\\r|\\n|$)" + "match": "^---(?=[\\t ]|\\r|\\n|$)" }, "docend": { "name": "entity.other.document.end.yaml", - "match": "\\.\\.\\.(?=[\\t ]|\\r|\\n|$)" + "match": "^\\.\\.\\.(?=[\\t ]|\\r|\\n|$)" }, "yamldirective": { "name": "keyword.other.directive.yaml", @@ -1893,6 +1899,208 @@ } ] }, + "block-plain-item": { + "begin": "(?=(?:[^\\t\\n\\f\\r \\-?:,\\[\\]{}#&*!|>'\"%@`]|[\\-?:](?=[^\\t\\n\\f\\r ,\\[\\]{}]))(?:[^:#\\n,\\[\\]{}]|:(?=[^\\t\\n\\f\\r ,\\]}])|#(?<=[^\\t\\n\\f\\r ]#))*)(?!(?:#|-[\\t ]|\\?[\\t ]|[^\\n\\[{\\]}]*?:(?:[\\t ]|$)))", + "while": "\\G", + "patterns": [ + { + "begin": "\\G", + "end": "(?=(?>[\\t ]++|\\G)#)", + "name": "string.unquoted.yaml", + "patterns": [ + { + "match": "\\G[\\t ]++", + "name": "string.unquoted.yaml" + }, + { + "match": "[\\t ]++$", + "name": "string.unquoted.yaml" + } + ] + }, + { + "begin": "(?!\\G)", + "while": "\\G", + "patterns": [ + { + "include": "#comment" + } + ] + } + ] + }, + "block-sequence": { + "begin": "(?=((?<=[\\-?:]) )?+)\\G( *+)(-)(?=([\\t ]+)-[\\t ])", + "beginCaptures": { + "3": { + "name": "punctuation.yaml" + } + }, + "while": "\\G(?>(\\1\\2 \\4)(?=-[\\t ]|-$)|(?=\\1\\2 \\4[\\t ])|[\\t ]*$)", + "patterns": [ + { + "include": "#comment" + }, + { + "include": "#blockscalar-explicit-seq-1" + }, + { + "include": "#blockscalar-explicit-seq-2" + }, + { + "include": "#blockscalar-explicit-seq-3" + }, + { + "include": "#blockscalar-explicit-seq-4" + }, + { + "include": "#blockscalar-explicit-seq-5" + }, + { + "include": "#blockscalar-explicit-seq-6" + }, + { + "include": "#blockscalar-explicit-seq-7" + }, + { + "include": "#blockscalar-explicit-seq-8" + }, + { + "include": "#blockscalar-explicit-seq-9" + }, + { + "include": "#blockscalar-seq" + }, + { + "include": "#blockscalar-key" + }, + { + "include": "#blockscalar-explicit-1" + }, + { + "include": "#blockscalar-explicit-2" + }, + { + "include": "#blockscalar-explicit-3" + }, + { + "include": "#blockscalar-explicit-4" + }, + { + "include": 
"#blockscalar-explicit-5" + }, + { + "include": "#blockscalar-explicit-6" + }, + { + "include": "#blockscalar-explicit-7" + }, + { + "include": "#blockscalar-explicit-8" + }, + { + "include": "#blockscalar-explicit-9" + }, + { + "include": "#blockscalar-doc" + }, + { + "include": "#blockscalar" + }, + { + "include": "#explicit-key-continuation" + }, + { + "include": "#block-sequence" + }, + { + "include": "#block-fold" + }, + { + "include": "#explicit-key" + }, + { + "include": "#explicit-key-indicator" + }, + { + "include": "#flow-sequence" + }, + { + "include": "#flow-mapping" + }, + { + "include": "#dquotekey" + }, + { + "include": "#squotekey" + }, + { + "include": "#key" + }, + { + "include": "#docstart" + }, + { + "include": "#docend" + }, + { + "include": "#dquote" + }, + { + "include": "#squote" + }, + { + "include": "#yamldirective" + }, + { + "include": "#directive" + }, + { + "include": "#tag" + }, + { + "include": "#directive-malformed" + }, + { + "include": "#num" + }, + { + "include": "#boolnull" + }, + { + "include": "#plain" + }, + { + "include": "#punctuation" + }, + { + "include": "#alias" + }, + { + "include": "#indent" + }, + { + "include": "#dedent" + }, + { + "include": "#newline" + }, + { + "include": "#anchor" + }, + { + "include": "#block-plain-item" + } + ] + }, + "block-fold": { + "match": "^([\\t ]++)(?=[^\\t\\r\\n#])(?!(?:#|[^\\n\\[{\\]}]*?:(?:[\\t ]|$)))((?:[^#\\n]|#(?<=[^\\t\\n\\f\\r ]#))*)", + "captures": { + "2": { + "name": "string.unquoted.yaml" + } + } + }, "explicit-key": { "match": "(\\?)([\\t ]+)(?:(?:(&[^\\t\\n\\f\\r \\[\\]{},]+)|(!(?:<[^>]*>|[^\\t\\n\\f\\r \\[\\]{},]*)))[\\t ]+)*((?:[^\\t\\n\\f\\r \\-?:,\\[\\]{}#&*!|>'\"%@`]|[\\-?:](?=[^\\t\\n\\f\\r ,\\[\\]{}]))(?:[^:#\\n,\\[\\]{}]|:(?=[^\\t\\n\\f\\r ,\\]}])|#(?<=[^\\t\\n\\f\\r ]#))*)", "captures": { diff --git a/yaml.ts b/yaml.ts index 69c2587..847d281 100644 --- a/yaml.ts +++ b/yaml.ts @@ -10,7 +10,7 @@ import { token, rule, defineGrammar, alt, many, many1, opt, not, 
noCommentBefore, noMultilineFlowBefore, altPattern, optPattern, seq, oneOf, noneOf, range, star, plus, repeat, followedBy, notFollowedBy, - precededBy, notPrecededBy, never, end, + precededBy, notPrecededBy, never, start, end, } from './src/api.ts'; import type { IndentConfig } from './src/types.ts'; @@ -31,13 +31,16 @@ const whitespace = oneOf('\t', '\n', '\f', '\r', ' '); const nonWhitespace = noneOf(whitespace); const hashAfterNonSpace = seq('#', precededBy(seq(nonWhitespace, '#'))); // Document markers: `---` (directives end / document begin) and `...` (document end). Both must be -// followed by whitespace or EOL — `---foo` / `...bar` are plain scalars, not markers — so the -// lookahead keeps the marker from stealing a plain scalar's leading dashes/dots. Scoped -// `entity.other.document.*` (the maintained-grammar convention) so the highlighter paints them as -// document structure, not as a string. +// at the START of a line (YAML §9.1.1 — a marker is column 0) AND followed by whitespace or EOL — +// `---foo` / `...bar` are plain scalars, and a `---` / `...` that OPENS A VALUE (`note: --- x`, +// `x: ... bar`) is string content, not a marker. The parser already constrains the markers to stream +// position structurally (DocStart / DocEnd are referenced only in the Stream grammar), so the CST is +// unchanged; the line-start `start()` anchor carries that same column-0 constraint into the FLAT +// derived highlighter, which otherwise retries the marker pattern at every token boundary and would +// scope a value-leading `---` as a document marker (monogram#23). Scoped `entity.other.document.*`. 
const docMarkerEnd = followedBy(altPattern(oneOf('\t', ' '), '\r', '\n', end())); -const DocStart = token(seq('---', docMarkerEnd), { scope: 'entity.other.document.begin' }); -const DocEnd = token(seq('...', docMarkerEnd), { scope: 'entity.other.document.end' }); +const DocStart = token(seq(start(), '---', docMarkerEnd), { scope: 'entity.other.document.begin' }); +const DocEnd = token(seq(start(), '...', docMarkerEnd), { scope: 'entity.other.document.end' }); // A `#` is a comment indicator only at line start or AFTER whitespace (YAML §6.6); a `#` glued to a // non-space char is content, not a comment (`a#b` is a plain scalar, `%YAML 1.1#…` keeps the `#…` as // directive content — monogram#12 #8). The `notPrecededBy(nonWhitespace)` guard (a fixed-width, portable