From 5ff6093bf8c55185d601f155f139ea114b04d139 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 8 Jun 2026 03:00:10 +0800 Subject: [PATCH] Drive YAML highlighter scope-gap to 100% (the 0.3% residual: 3 fixes) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the monogram#12 items, the YAML scope-gap metric (full-span, vs the neutral `yaml`-CST oracle) had 7 Monogram-wrong tokens out of 2034 (99.66%, already ahead of the official 99.51%). Three independent root causes, all highlighter-only (parser untouched — src-coverage-yaml stays 100% aligned; the other six grammars regenerate byte-identical; Onigmo diagnostics clean): A. Line-start quoted scalar with an internal colon (`"foo: bar"`, `"foo: bar\": baz"`). The #plain-continuation key-scan `[^\n\[{\]}]*?:` treated the colon INSIDE the quotes as a key separator, so the line was mis-read as `key: value` and the opening `"` left unscoped. The scan now consumes a leading/embedded quoted scalar as one whole escape-aware token (derived from the grammar's quoted-scalar tokens), so the `:` separator is guaranteed outside quotes — a quoted scalar no longer opens a fold (it falls to #dquote/#squote), while a real quoted KEY (`"foo": bar`, colon outside) still folds its value. (2 tokens) B. Multi-line explicit KEY (`? a\n true` = the one key "a true"). #plain-continuation opened on the `? ` header but scoped the folded continuation as the VALUE plain scope; a new #explicit-key-continuation region (same shape, pinned to the explicit-key indicator, ranked above #plain-continuation) scopes it as the KEY (entity.name.tag), consistent with its first line. A `- ` seq / `key:` value fold stays on #plain-continuation. (4 tokens) C. Glued `#` after a directive (`%YAML 1.1#...`). YamlDirective's trailing lookahead accepted a glued `#` (no preceding space) as a comment-ahead; per §6.6 a comment indicator needs a preceding space, so the lookahead now requires it.
A glued-`#` line then matches no directive token and is scoped by #directive-malformed — which (with #4) now scopes a malformed directive line as a directive (keyword.other.directive), highlighting it normally rather than splashing invalid.illegal, matching the #12 #3 stance and the CST oracle. (1 token) scope-gap-yaml: Monogram correct 100.0% / monogramWrong 0 (official 99.51%). yaml-issue12 regressions 10/10; src-coverage-yaml 100%; six grammars byte-identical; agnostic 9/9; sanity 15/15; tm-diagnostics clean; tsc clean. --- src/gen-tm.ts | 57 ++++++++++++++++++++++++++++++++----- yaml.monarch.json | 2 +- yaml.tmLanguage.json | 67 ++++++++++++++++++++++++++++++++++++++++++-- yaml.ts | 7 ++++- 4 files changed, 121 insertions(+), 12 deletions(-) diff --git a/src/gen-tm.ts b/src/gen-tm.ts index 08264b1..18de676 100644 --- a/src/gen-tm.ts +++ b/src/gen-tm.ts @@ -4975,7 +4975,21 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra // a flow collection (`{ a: b,\n c }` / `a: { b: c,\n d }`) is a multi-line begin/end region of // its own — a `{`/`[` before the `:` means the `:` is a FLOW separator, not a block one, so the // region must NOT open and steal those lines from #flow-mapping/#flow-sequence. - const plainVp = `(?:(?:${docAlt})[\\t ]+)?(?:(?:${compactCls}[\\t ]+)+(?:${flowEx}*?${kvSep}[\\t ]+)?|${flowEx}*?${kvSep}[\\t ]+)(?=${plainSrc})`; + // The key-scan up to the `:` separator. A leading / embedded QUOTED scalar is consumed as one + // WHOLE escape-aware token (`fc.dq`/`fc.sq`) so its INTERNAL `:` is never mistaken for the key + // separator: a line-start double/single-quoted scalar with an inner colon (`"a: b"`) is ONE + // scalar, not a `key: value`, and must NOT open a fold (it falls to #dquote/#squote). 
The bare + // run excludes the quote chars so a quote can ONLY match via the token branch — otherwise the + // engine skips the (optional) token and the bare class re-swallows the opening quote, re-mis- + // reading the inner colon. Derived from the grammar's quoted-scalar tokens; a grammar with no + // quoted scalar keeps the plain `flowEx` scan (byte-identical). + const fcQuote = detectFlowCollections(grammar); + const quotedScalarToks = [fcQuote?.dq, fcQuote?.sq].filter((s): s is string => !!s); + const quoteCharCls = quotedScalarToks.map(t => escapeForCharClass(t[0] === '\\' ? t.slice(0, 2) : t[0])).join(''); + const keyToSep = quotedScalarToks.length + ? `(?:${quotedScalarToks.join('|')}|${flowEx.slice(0, -1)}${quoteCharCls}])*?` + : `${flowEx}*?`; + const plainVp = `(?:(?:${docAlt})[\\t ]+)?(?:(?:${compactCls}[\\t ]+)+(?:${keyToSep}${kvSep}[\\t ]+)?|${keyToSep}${kvSep}[\\t ]+)(?=${plainSrc})`; // Header-line token includes: the same shape any plain `key: value` line gets, so the header is // scoped identically to the top level (only the CONTINUATION changes). Includes the typed-value // tokens (`#num`/`#boolnull`) so a SINGLE-line `a: 1` keeps `constant.numeric`, and the full @@ -5015,6 +5029,27 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra topPatterns.push({ include: '#plain-continuation' }); } + // ── 2a‴. Multi-line EXPLICIT-KEY continuation (`? a\n true`) ── + // An explicit key (`? a`) may FOLD across deeper continuation lines exactly like a plain value — + // `? a\n true` is the ONE key "a true" (CST: a single key scalar). #plain-continuation already + // opens on `? a` (its `?` is in `compactCls`), but it scopes the continuation as the VALUE plain + // scope (`string.unquoted`); a KEY continuation must instead take the KEY scope (entity.name.tag), + // so the folded key reads consistently with its first line (which #explicit-key scopes as the key). 
+ // Same structure as #plain-continuation, but pinned to the explicit-key INDICATOR (so a `- ` seq / + // `key:` value fold stays on #plain-continuation) and the continuation takes the key scope. Ranked + // ABOVE #plain-continuation (scopeOrder) so a `? `-led header takes the key-scoped continuation. + const ekFold = detectExplicitKey(grammar); + if (ekFold && fold.hasDeeper) { + const ekContRule = { match: '\\G[\\t ]+(?:[^#\\n]|#(?<=[^\\t\\n\\f\\r ]#))*', name: `${ekFold.keyScope}.${langName}` }; + repository['explicit-key-continuation'] = emitIndentRegion({ + lookahead: `(?=${escapeRegex(ekFold.indicator)}[\\t ]+(?:${keyToSep}${kvSep}[\\t ]+)?(?=${plainSrc}))`, + cont: `\\1[ \\t]+(?!${structAhead})`, + blankFirst: true, + patterns: [ekContRule, ...plainHeaderIncs], + }); + topPatterns.push({ include: '#explicit-key-continuation' }); + } + // ── 2a″. BARE plain-scalar SAME-COLUMN fold (monogram#12 §9) ── // A plain scalar that is itself a NODE (a document value, or the leading value of an indented // block) — NOT a `key:`/`-`/`?` — folds across SAME-COLUMN as well as deeper continuation lines: @@ -5126,11 +5161,14 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra // scalar tokens, which paint it as a stray `string.unquoted`. But a `%` can never BEGIN a plain // scalar (YAML §7.3.3 — `%` is a c-indicator, excluded from ns-plain-first), so a `%`-led line the // clean directive tokens did NOT claim is always a malformed directive, never real scalar content. - // Re-scope the whole line as an invalid directive. The indicator (`%`) is read from the directive - // tokens' leading literal (never hardcoded); ranked just BELOW the clean directives and ABOVE the - // plain scalars (scopeOrder 6.5) so it only catches what they left and beats the stray-scalar mis- - // scope. Highlight-only — the parser still rejects the line. The `^` anchor pins it to a line-start - // `%` (an indented `%` mid-line — e.g. 
a `key: %v` value — is left to the scalar tokens). + // Re-scope the whole line AS A DIRECTIVE (keyword.other.directive) — the malformed trailing token is + // directive content (#4 `%YAML 1.2 foo`, #8 glued `%YAML 1.1#…`), and Monogram highlights questionable- + // but-renderable content NORMALLY rather than splashing `invalid.illegal` (the #12 #3 stance; this also + // matches the neutral `yaml`-CST oracle, which recovers such a line as a directive). The indicator + // (`%`) is read from the directive tokens' leading literal (never hardcoded); ranked just BELOW the + // clean directives and ABOVE the plain scalars (scopeOrder 6.5) so it only catches what they left and + // beats the stray-scalar mis-scope. Highlight-only — the parser still rejects the line. The `^` anchor + // pins it to a line-start `%` (an indented `%` mid-line — e.g. a `key: %v` value — stays a scalar). const directiveToks = grammar.tokens.filter(t => /(^|\.)keyword\.other\.directive(\.|$)/.test(t.scope ?? '')); if (directiveToks.length) { const lead = directiveToks.map(t => tokenPatternLeadingSource(t)).find((s): s is string => !!s); @@ -5138,7 +5176,7 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra if (indicator) { repository['directive-malformed'] = { match: `^[ \\t]*(${escapeRegex(indicator)}[^\\n]*?)[\\t ]*$`, - captures: { '1': { name: `invalid.illegal.keyword.other.directive.${langName}` } }, + captures: { '1': { name: `keyword.other.directive.${langName}` } }, }; topPatterns.push({ include: '#directive-malformed' }); } @@ -7648,6 +7686,11 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra // deeper `!`/digit/`%` line falls through to #tag/#num/#directive). It ranks BELOW the // block-scalar regions (≤ 0.6) so a `key: |` keeps its block-scalar region — its lookahead requires // a real plain VALUE head (never `|`/`>`), so the two never collide on the same line anyway. + // The explicit-key continuation (`? 
a\n true`) must out-rank #plain-continuation (0.7): both open + // on a `? `-led header (the `?` is in compactCls), but the explicit-key variant scopes the folded + // continuation as the KEY (entity.name.tag), so it must win for the `?` case; #plain-continuation + // still handles `key:`/`- ` folds (its lookahead, unlike this one, is not pinned to the `?`). + if (key === 'explicit-key-continuation') return 0.68; if (key === 'plain-continuation') return 0.7; // The BARE plain-scalar same-column fold (§2a″) likewise begins AT LINE START and must out-rank the // scalar tokens (#key/#num/#boolnull/#plain ≥ 0.8) so it opens on a bare value scalar and claims its diff --git a/yaml.monarch.json b/yaml.monarch.json index f05727e..5c92a5f 100644 --- a/yaml.monarch.json +++ b/yaml.monarch.json @@ -725,7 +725,7 @@ } ], [ - "%YAML[ \\t]+[0-9]+\\.[0-9]+(?=[ \\t]*(?:#|\\r|\\n|$))", + "%YAML[ \\t]+[0-9]+\\.[0-9]+(?=[ \\t]*(?:\\r|\\n|$)|[ \\t]+#)", { "token": "keyword", "switchTo": "@value" diff --git a/yaml.tmLanguage.json b/yaml.tmLanguage.json index 82366f8..6f1b41b 100644 --- a/yaml.tmLanguage.json +++ b/yaml.tmLanguage.json @@ -77,6 +77,9 @@ { "include": "#blockscalar" }, + { + "include": "#explicit-key-continuation" + }, { "include": "#plain-continuation" }, @@ -231,6 +234,9 @@ { "include": "#blockscalar" }, + { + "include": "#explicit-key-continuation" + }, { "include": "#plain-continuation" }, @@ -323,7 +329,7 @@ }, "yamldirective": { "name": "keyword.other.directive.yaml", - "match": "%YAML[ \\t]+[0-9]+\\.[0-9]+(?=[ \\t]*(?:#|\\r|\\n|$))" + "match": "%YAML[ \\t]+[0-9]+\\.[0-9]+(?=[ \\t]*(?:\\r|\\n|$)|[ \\t]+#)" }, "directive": { "name": "keyword.other.directive.yaml", @@ -1723,7 +1729,7 @@ ] }, "plain-continuation": { - "begin": "^([ \\t]*)(?=(?:(?:---|\\.\\.\\.)[\\t ]+)?(?:(?:[\\-?][\\t ]+)+(?:[^\\n\\[{\\]}]*?:[\\t ]+)?|[^\\n\\[{\\]}]*?:[\\t ]+)(?=(?:[^\\t\\n\\f\\r \\-?:,\\[\\]{}#&*!|>'\"%@`]|[\\-?:](?=[^\\t\\n\\f\\r ,\\[\\]{}]))(?:[^:#\\n,\\[\\]{}]|:(?=[^\\t\\n\\f\\r 
,\\]}])|#(?<=[^\\t\\n\\f\\r ]#))*))", + "begin": "^([ \\t]*)(?=(?:(?:---|\\.\\.\\.)[\\t ]+)?(?:(?:[\\-?][\\t ]+)+(?:(?:\"(?:\\\\(?:[0abtnvfre\"/\\\\N_LP \\t]|x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}|\\r?\\n)|[^\"\\\\])*\"|'(?:''|[^'])*'|[^\\n\\[{\\]}\"'])*?:[\\t ]+)?|(?:\"(?:\\\\(?:[0abtnvfre\"/\\\\N_LP \\t]|x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}|\\r?\\n)|[^\"\\\\])*\"|'(?:''|[^'])*'|[^\\n\\[{\\]}\"'])*?:[\\t ]+)(?=(?:[^\\t\\n\\f\\r \\-?:,\\[\\]{}#&*!|>'\"%@`]|[\\-?:](?=[^\\t\\n\\f\\r ,\\[\\]{}]))(?:[^:#\\n,\\[\\]{}]|:(?=[^\\t\\n\\f\\r ,\\]}])|#(?<=[^\\t\\n\\f\\r ]#))*))", "while": "\\G(?=[ \\t]*$|\\1[ \\t]+(?!(?:#|-[\\t ]|\\?[\\t ]|[^\\n\\[{\\]}]*?:(?:[\\t ]|$))))", "patterns": [ { @@ -1777,6 +1783,61 @@ } ] }, + "explicit-key-continuation": { + "begin": "^([ \\t]*)(?=\\?[\\t ]+(?:(?:\"(?:\\\\(?:[0abtnvfre\"/\\\\N_LP \\t]|x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}|\\r?\\n)|[^\"\\\\])*\"|'(?:''|[^'])*'|[^\\n\\[{\\]}\"'])*?:[\\t ]+)?(?=(?:[^\\t\\n\\f\\r \\-?:,\\[\\]{}#&*!|>'\"%@`]|[\\-?:](?=[^\\t\\n\\f\\r ,\\[\\]{}]))(?:[^:#\\n,\\[\\]{}]|:(?=[^\\t\\n\\f\\r ,\\]}])|#(?<=[^\\t\\n\\f\\r ]#))*))", + "while": "\\G(?=[ \\t]*$|\\1[ \\t]+(?!(?:#|-[\\t ]|\\?[\\t ]|[^\\n\\[{\\]}]*?:(?:[\\t ]|$))))", + "patterns": [ + { + "match": "\\G[\\t ]+(?:[^#\\n]|#(?<=[^\\t\\n\\f\\r ]#))*", + "name": "entity.name.tag.yaml" + }, + { + "include": "#docstart" + }, + { + "include": "#docend" + }, + { + "include": "#explicit-key" + }, + { + "include": "#explicit-key-indicator" + }, + { + "include": "#dquotekey" + }, + { + "include": "#squotekey" + }, + { + "include": "#key" + }, + { + "include": "#anchor" + }, + { + "include": "#alias" + }, + { + "include": "#tag" + }, + { + "include": "#num" + }, + { + "include": "#boolnull" + }, + { + "include": "#plain" + }, + { + "include": "#comment" + }, + { + "include": "#punctuation" + } + ] + }, "plain-bare-fold": { "begin": "^([ \\t]*)(?=(?:[^\\t\\n\\f\\r \\-?:,\\[\\]{}#&*!|>'\"%@`]|[\\-?:](?=[^\\t\\n\\f\\r 
,\\[\\]{}]))(?:[^:#\\n,\\[\\]{}]|:(?=[^\\t\\n\\f\\r ,\\]}])|#(?<=[^\\t\\n\\f\\r ]#))*)(?!(?:#|-[\\t ]|\\?[\\t ]|(?:---|\\.\\.\\.)(?:[\\t ]|$)|[^\\n\\[{\\]}]*?:(?:[\\t ]|$)))", "while": "\\G(?=[ \\t]*$|\\1(?=[ \\t]*\\S)(?![ \\t]*(?:#|-[\\t ]|\\?[\\t ]|(?:---|\\.\\.\\.)(?:[\\t ]|$)|[^\\n\\[{\\]}]*?:(?:[\\t ]|$))))", @@ -1951,7 +2012,7 @@ "match": "^[ \\t]*(%[^\\n]*?)[\\t ]*$", "captures": { "1": { - "name": "invalid.illegal.keyword.other.directive.yaml" + "name": "keyword.other.directive.yaml" } } }, diff --git a/yaml.ts b/yaml.ts index f0055e5..69c2587 100644 --- a/yaml.ts +++ b/yaml.ts @@ -74,7 +74,12 @@ const Tag = token(seq('!', altPattern(seq('<', star(noneOf('>')), '>'), star(non // token and the stray `%` then fails to lex → reject (H7TQ / ZYU8). The trailing comment is left // OUTSIDE the token (only looked at) so a ` # comment` is tokenised/scoped as a Comment, not folded // into the directive — keeps the highlighter's comment scope intact. -const YamlDirective = token(seq('%YAML', plus(hspace), plus(digit), '.', plus(digit), followedBy(seq(star(hspace), altPattern('#', '\r', '\n', end())))), { scope: 'keyword.other.directive', blockOnly: true }); +// The trailing context is EOL (with optional trailing spaces) OR a real ` #` comment — a comment +// indicator needs a PRECEDING space (§6.6, the same rule the Comment token's `notPrecededBy` guard +// applies), so a GLUED `#` (`%YAML 1.1#…`) is NOT a comment and makes the lookahead FAIL: the line +// then matches no directive token and the highlighter scopes the whole malformed line as a directive +// (gen-tm `#directive-malformed`) instead of leaving the glued `#…` stray (monogram#12 #8). 
+const YamlDirective = token(seq('%YAML', plus(hspace), plus(digit), '.', plus(digit), followedBy(altPattern(seq(star(hspace), altPattern('\r', '\n', end())), seq(plus(hspace), '#')))), { scope: 'keyword.other.directive', blockOnly: true }); // Directive (`%TAG …`, unknown `%FOO …`): runs to EOL but stops before a ` #` trailing comment — a // `#` is a comment indicator only after whitespace, so a glued `#` (`%YAML 1.1#x`) stays part of // the directive while a spaced ` # comment` falls to the Comment token (same rule as plain scalars).