Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ jobs:
node test/vue-embed-boundary.ts
node test/vue-interp-expr.ts
node test/yaml-issue12-regressions.ts
node test/yaml-depth-witnesses.ts

# The derived tree-sitter highlighter is the strongest thesis proof (a real GLR
# parser from the same grammar, beating the official hand-written one). Build its
Expand Down
12 changes: 9 additions & 3 deletions src/gen-lexer.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import type { CstGrammar } from './types.ts';
import { collectLiterals, isKeywordLiteral } from './grammar-utils.ts';
import { tokenBlockPatternFirstCharSet, tokenBlockPatternSource, tokenEscapeValidPatternSource, tokenPatternFirstCharSet, tokenPatternSource } from './token-pattern.ts';
import { tokenBlockPatternFirstCharSet, tokenBlockPatternSource, tokenEscapeValidPatternSource, tokenPatternFirstCharSet, tokenPatternHasStartAnchor, tokenPatternSource } from './token-pattern.ts';

// A lexer token: a declared token (type = its name) or a punctuation literal (type = '').
// `$templateHead/$templateMiddle/$templateTail` are synthetic types the lexer emits for
Expand Down Expand Up @@ -45,10 +45,16 @@ export function createLexer(grammar: CstGrammar) {
const tokenMatchers = grammar.tokens.map(t => {
const pattern = tokenPatternSource(t);
const blockPattern = tokenBlockPatternSource(t);
// A token whose pattern carries a line-START anchor (`start()` → `^`, e.g. YAML's `---`/`...`
// document markers, a shebang) needs the `m` flag: under the sticky `y` matcher a bare `^`
// matches only at index 0 (file start), so a marker at the start of a LATER line (`# c\n---\n…`,
// `%TAG …\n---\n…`) would fail to lex. With `m`, `^` matches at every line start, so a sticky
// match at `lastIndex = pos` succeeds iff `pos` is a line start — exactly `start()`'s meaning.
const flags = tokenPatternHasStartAnchor(t) ? 'ym' : 'y';
return {
name: t.name,
regex: new RegExp(`(?:${pattern})`, 'y'),
blockRegex: blockPattern ? new RegExp(`(?:${blockPattern})`, 'y') : null,
regex: new RegExp(`(?:${pattern})`, flags),
blockRegex: blockPattern ? new RegExp(`(?:${blockPattern})`, flags) : null,
skip: t.flags.includes('skip'),
isRegex: t.flags.includes('regex'),
isString: !!t.string,
Expand Down
171 changes: 171 additions & 0 deletions src/gen-tm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3409,6 +3409,59 @@ function detectExplicitKey(grammar: CstGrammar): { indicator: string; keyScope:
return { indicator, keyScope, keyBody, prefixGroups };
}

// ── Block-sequence detection (YAML `- item` block sequence) ──
//
// A flat per-line grammar cannot tell a COMPACT nested sequence's sibling from a plain-scalar fold:
// `- - a\n - b` (the ` - b` is the INNER sequence's 2nd item) and `x: y\n - b` (the ` - b` folds
// into the plain scalar `y`) share the same continuation line but get OPPOSITE answers — the decider
// is the ENCLOSING node column (an inner sequence at column 2 vs a mapping at column 0), which the
// line-relative `\1` of the §2a′ fold region cannot see (both lines start at column 0). monogram#24.
// The fix is a column-anchored region (§2c below) for the COMPACT block sequence, whose `\G`-anchored
// `while` (re-anchored each line by the meta.stream wrapper) reclaims a sibling `- ` at the inner
// indicator's column BEFORE the plain-scalar fold can swallow it. This detects the block-sequence rule
// + its indicator literal from the grammar so the region is DERIVED, not hardcoded.
//
// Signal (all from the grammar): a rule whose body is `[item, (Newline item)*]` — an item then a
// same-column-NEWLINE-separated run of further items — where `item` is a `ref` to a rule whose body is
// a `seq` headed by a SINGLE-char punctuation literal (the sequence indicator `-`). Returns the
// indicator literal, or null when the family has no such block sequence (every non-YAML grammar).
function detectBlockSequence(grammar: CstGrammar): { indicator: string } | null {
if (!grammar.indent) return null;
const { newlineToken } = grammar.indent;
const ruleByName = new Map(grammar.rules.map(r => [r.name, r] as const));
const headSinglePunct = (e: RuleExpr): string | null =>
e.type === 'literal' && e.value.length === 1 && !/[\w\s]/.test(e.value) ? e.value : null;
// The item rule's indicator: unwrap a ref to a rule whose body's first seq element is a 1-char punct.
const itemIndicator = (e: RuleExpr): string | null => {
let body = e.type === 'ref' ? ruleByName.get(e.name)?.body : e;
if (!body) return null;
// a rule body written `[[...]]` is a single-alt seq; unwrap a lone-arm alt
if (body.type === 'alt' && body.items.length === 1) body = body.items[0];
return body.type === 'seq' ? headSinglePunct(body.items[0]) : null;
};
let indicator: string | null = null;
const visit = (e: RuleExpr): void => {
if (e.type === 'seq') {
// `[item, (Newline item)*]`: first element + a `*`/`+` over a `[Newline, item]` seq
if (e.items.length >= 2) {
const head = e.items[0];
const q = e.items[1];
if (q.type === 'quantifier' && (q.kind === '*' || q.kind === '+') && q.body.type === 'seq'
&& q.body.items.length >= 2 && q.body.items[0].type === 'ref' && q.body.items[0].name === newlineToken) {
const ind = itemIndicator(head);
// the repeated element's item must share the head's indicator (a homogeneous sequence)
if (ind && itemIndicator(q.body.items[1]) === ind) indicator = ind;
}
}
e.items.forEach(visit);
} else if (e.type === 'alt') e.items.forEach(visit);
else if (e.type === 'quantifier' || e.type === 'group' || e.type === 'not') visit(e.body);
else if (e.type === 'sep') visit(e.element);
};
for (const r of grammar.rules) visit(r.body);
return indicator ? { indicator } : null;
}

// ── Flow-collection detection (YAML `{ … }` mapping / `[ … ]` sequence) ──
//
// A flat per-token grammar cannot scope a flow MAPPING's keys: in `{ a: 1 }` the `a` is a key
Expand Down Expand Up @@ -4722,6 +4775,7 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra
// the Anchor/Tag tokens, not the indent config) and stays inline.
const ind = grammar.indent!;
const cmtLit = escapeRegex(ind.comment ?? '#');
const cmtCc = escapeForCharClass(ind.comment ?? '#'); // the comment introducer, char-class-escaped
const compactAlt = (ind.compactIndicators ?? []).map((c) => `${escapeRegex(c)}[\\t ]`).join('|');
const compactCls = `[${(ind.compactIndicators ?? []).map(escapeForCharClass).join('')}]`;
const docAlt = (blockScalar.documentMarkers ?? []).map(escapeRegex).join('|');
Expand Down Expand Up @@ -5088,6 +5142,96 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra
});
topPatterns.push({ include: '#plain-bare-fold' });
}

// ── 2c. COMPACT block-sequence — a column-anchored region for nested `- - …` (monogram#24) ──
// The §2a′/§2a″ plain folds are LINE-relative (their `\1` is the line's leading whitespace), but
// a YAML continuation is NODE-relative (more indented than the ENCLOSING dash/key). For a single
// sequence (`- a`) or a mapping (`x: y`) the two coincide — the dash/key sits at column 0 = the
// line indent — so those folds are correct. They DIVERGE only for a COMPACT nested sequence
// (`- - a`): the inner sequence's dash is at column 2 (after the outer `- ` prefix, which is NOT
// whitespace), so a sibling `- b` at column 2 reads — to a line-relative `\1=""` fold — as "indented
// past column 0", and is wrongly folded into the plain scalar `a`. The decider is the inner
// indicator's column, which no `\1`-relative backref can express (the prefix is `- `, not spaces) and
// no possessive `[ \t]++` can split from the deeper-fold case `x: y\n - b` (same line, must fold).
//
// The fix mirrors the maintained RedCMD YAML grammar's block-sequence: a `\G`-anchored region whose
// rule stack carries the indent depth, RE-OPENED per compact level (its body self-includes
// #block-sequence, so `- - - a` nests three deep). The meta.stream wrapper re-anchors `\G` at every
// line, so each level's captures pin ITS OWN inner indicator column and reclaim only siblings AT that
// column. We emit it ONLY for the COMPACT case (a dash followed by ANOTHER dash on the same line) —
// `begin … (?=([\t ]+)${dash}[\t ])` — so a single `- a`, a `- key: v` mapping item, a `- {…}`/`- "…"`/
// `- |` value, etc. are UNTOUCHED (still handled by the top-level token includes + the §2a′ fold),
// confining this region to exactly the bug's shape. The compact re-anchor `(?=((?<=${reanchor}) )?+)`
// (a FIXED-width lookbehind — portable, unlike RedCMD's variable-length `(?<![^\t ][\t ]*+:|---)`,
// which is rejected by Onigmo / GitHub-Linguist) lets the inner dash open right after the outer `- `.
//
// The inner indicator's column is reconstructed PORTABLY as `\1\2 \4`: the outer indent (`\1\2`) +
// one literal space for the dash's own column + the captured indicator run `\4` (the run of spaces
// between the outer dash and the inner one — group 4 in the begin's lookahead, so a multi-space
// compact `- - x` pins correctly too). The `while` then STAYS on: (arm 1) a dash AT EXACTLY that
// column `(\1\2 \4)(?=${dash}[\t ]|${dash}$)` — a sibling item, reclaimed as `punctuation`; (arm 2) a
// line STRICTLY DEEPER than the column `(?=\1\2 \4[\t ])` — a zero-width lookahead that keeps the
// region alive WITHOUT consuming (so a nested deeper #block-sequence, if its own begin matched on the
// header line, gets first claim on its own sibling before the fold), with the deeper line scoped by
// the body's #block-fold rule; or (arm 3) a blank line. It RELEASES on a dedent OR a non-dash line at
// the column (a sibling mapping/scalar), so a column-0 `key:` after the sequence is NOT swallowed.
//
// A deeper line that is NOT a nested sibling folds into the item's plain scalar: the body's
// #block-fold rule (`^([\t ]+)(?=[^\t\r\n#])(plain run)`, anchored at LINE START so it never fires on
// the header line's inline inner item, which sits past column 0) scopes it `string.unquoted`. So
// `- - a\n - b` (`[["a - b"]]` — the deeper `- b` folds) is `string`, while `- - a\n - b` and
// `- - - a\n - b` (a same-column sibling at the inner OR a deeper-nested level) stay `punctuation`.
// This resolves monogram#24's deeper-irregular residual without a variable-length lookbehind.
const blockSeq = detectBlockSequence(grammar);
if (blockSeq) {
const dash = escapeRegex(blockSeq.indicator);
// The compact re-anchor lookbehind class: the compact indicators (`-`/`?`) plus the key/value
// separator (`:`) — exactly the single chars a nested node may sit immediately after on one line.
const reanchor = `[${[...(ind.compactIndicators ?? []), ind.keyValueSeparator ?? ':'].map(escapeForCharClass).join('')}]`;
// The item's plain VALUE folds across DEEPER lines (a multi-line plain scalar) but releases at a
// same-column sibling (reclaimed by the sequence `while`). Opens only on a BARE plain head (not a
// key/indicator/comment — those are scoped by the dispatch includes), then an inner string region
// scopes content until an inline ` #` comment (which falls to #comment). Mirrors RedCMD #block-plain-out.
repository['block-plain-item'] = {
begin: `(?=${plainSrc})(?!${structAhead})`,
while: '\\G',
patterns: [
{ begin: '\\G', end: '(?=(?>[\\t ]++|\\G)#)', name: plainContent, patterns: [
{ match: '\\G[\\t ]++', name: plainContent }, { match: '[\\t ]++$', name: plainContent }] },
{ begin: '(?!\\G)', while: '\\G', patterns: commentIncludeKeys.map(k => ({ include: `#${k}` })) },
],
};
// The region SHELL (begin/while/captures); its body `patterns` is filled at the END (after the
// top-level dispatch is built + ordered), since the item content reuses that full dispatch. Group 4
// (`([\t ]+)`, in the begin's lookahead) captures the indicator run between the outer and inner
// dashes, so the `while` can reconstruct the inner column as `\1\2 \4` (outer indent + the dash's
// own column + the run). Arm 1 reclaims a same-column sibling (`punctuation`); arm 2 is a zero-width
// lookahead that keeps the region alive on a strictly-deeper line (deferring to a nested level's
// sibling-reclaim, then to the body's #block-fold rule); arm 3 is a blank line.
repository['block-sequence'] = {
begin: `(?=((?<=${reanchor}) )?+)\\G( *+)(${dash})(?=([\\t ]+)${dash}[\\t ])`,
beginCaptures: { '3': { name: `punctuation.${langName}` } },
while: `\\G(?>(\\1\\2 \\4)(?=${dash}[\\t ]|${dash}$)|(?=\\1\\2 \\4[\\t ])|[\\t ]*$)`,
patterns: [],
};
// A deeper line (kept alive by the `while`'s arm 2) that is NOT a nested sibling folds into the
// current item's scalar. Anchored at LINE START (`^`), so it NEVER fires on the header line's inline
// inner item (which sits past column 0, after the outer `- `): only a continuation line begins at
// column 0. A leading `#` (a whitespace-preceded comment) is excluded so it falls to #comment, and
// `foldExclude` excludes a deeper KEY line (`<scan>: `) so a mapping ITEM VALUE's deeper entry
// (`- - a: 1\n b: 2`) keeps its #key structure instead of folding — the exclusion DROPS the
// compact indicators from `structAhead`, since a deeper `- b` (no sequence at its column) IS a fold
// (`- - a\n - b` = `[["a - b"]]`), the whole point of this rule. The body is one opaque plain run
// stopping before an inline ` #` (same idiom as the §2a′ continuation / §2a″ bareCont). Listed in
// the region body right AFTER the self-include so a deeper COMPACT line opens a nested
// #block-sequence instead of folding. (monogram#24 deeper residual.)
const foldExclude = `(?:${cmtLit}|${flowEx}*?${kvSep}(?:[\\t ]|$))`;
repository['block-fold'] = {
match: `^([\\t ]+)(?=[^\\t\\r\\n${cmtCc}])(?!${foldExclude})((?:[^${cmtCc}\\n]|${cmtLit}(?<=[^\\t\\n\\f\\r ]${cmtLit}))*)`,
captures: { '2': { name: plainContent } },
};
topPatterns.push({ include: '#block-sequence' });
}
}
}

Expand Down Expand Up @@ -7691,6 +7835,13 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra
// continuation as the KEY (entity.name.tag), so it must win for the `?` case; #plain-continuation
// still handles `key:`/`- ` folds (its lookahead, unlike this one, is not pinned to the `?`).
if (key === 'explicit-key-continuation') return 0.68;
// The COMPACT block-sequence region (§2c) must out-rank #plain-continuation (0.7): both open on a
// `- `-led header (the `-` is in compactCls, so the fold's lookahead matches a compact line too), but
// for `- - a` the sequence region bounds the inner sibling at the inner indicator's column while the
// fold would swallow it line-relative. Ranked above the fold so the compact case is claimed first;
// its begin requires a SECOND dash (`(?=[\t ]+-[\t ])`), so a non-compact `- a`/`- key:`/`x: y` line
// never matches it and still falls through to #plain-continuation. (monogram#24.)
if (key === 'block-sequence') return 0.69;
if (key === 'plain-continuation') return 0.7;
// The BARE plain-scalar same-column fold (§2a″) likewise begins AT LINE START and must out-rank the
// scalar tokens (#key/#num/#boolnull/#plain ≥ 0.8) so it opens on a bare value scalar and claims its
Expand Down Expand Up @@ -7768,6 +7919,26 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra
.sort((a, b) => scopeOrder(a) - scopeOrder(b))
.map(include => ({ include }));

// Fill the COMPACT block-sequence region's body (§2c). Its item content reuses the FULL top-level
// dispatch (so a `- - {…}` flow value, a `- - "x"` quoted value, a `- - key: v` nested mapping, a
// `- - |` block scalar are all scoped correctly) — the same ordered includes meta.stream wraps — with
// two changes: the two LINE-relative plain folds (§2a′/§2a″) are REMOVED (the sequence's own
// column-anchored `while` + the bounded #block-plain-item handle the item-value fold node-relatively;
// leaving them in would re-introduce the line-relative swallow this region exists to prevent), and the
// bounded #block-plain-item is appended for a bare plain item value. The self-include (#block-sequence,
// already in the ordered list at rank 0.69) gives deeper compact nesting (`- - - x`); #block-fold is
// spliced in right AFTER it (region-body-only — never a top-level include) so a deeper line that opens no
// nested sequence folds into the item's plain scalar (monogram#24 deeper residual), while a deeper
// COMPACT line still re-opens #block-sequence first.
if (repository['block-sequence']) {
const body = orderedPatterns.filter(p => p.include !== '#plain-continuation' && p.include !== '#plain-bare-fold');
if (repository['block-fold']) {
const selfAt = body.findIndex(p => p.include === '#block-sequence');
if (selfAt >= 0) body.splice(selfAt + 1, 0, { include: '#block-fold' });
}
repository['block-sequence'].patterns = [...body, { include: '#block-plain-item' }];
}

// Additive: a `#expression` sub-grammar for expression-only embeds (Vue `{{ }}`). The
// top-level `patterns` (orderedPatterns / $self) are left untouched, so standalone
// tokenization is unchanged — `#expression` is inert unless something includes it.
Expand Down
Loading
Loading