//! highlight.rs — best-effort syntax highlighting for `next_code_frame`.
1use std::{num::NonZeroUsize, ops::Range, sync::LazyLock};
2
3use phf::phf_set;
4use regex::Regex;
5use regex_automata::{Input, PatternID, meta::Regex as MetaRegex};
6use serde::Deserialize;
7
/// A styled byte range within a line (non-overlapping, sorted by start)
///
/// NOTE(review): `Scanner::add_span` pushes absolute source offsets into
/// these fields; presumably they are rebased to line-relative offsets
/// downstream (`group_spans_by_line`) — confirm, since the field docs below
/// describe line-relative offsets.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct StyleSpan {
    /// Start byte offset relative to line start (0-indexed, inclusive)
    pub start: usize,
    /// End byte offset relative to line start (0-indexed, exclusive)
    pub end: usize,
    /// The token type being styled
    pub token_type: TokenType,
}
18
/// Token types for syntax highlighting
///
/// Each variant maps to one `ColorScheme` field via `color_for_token`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum TokenType {
    /// Reserved word per the active `Language` keyword set.
    Keyword,
    /// Identifier singled out for highlighting.
    Identifier,
    /// String or template-literal content.
    String,
    /// Numeric literal (hex, octal, binary, decimal, or float).
    Number,
    /// Regex literal, including trailing flags.
    Regex,
    /// Line or block comment.
    Comment,
}
29
/// Language hint for keyword highlighting.
///
/// Determines which set of keywords are recognized as `TokenType::Keyword`.
/// Non-keyword tokens (strings, comments, numbers, etc.) are language-agnostic.
///
/// Variant names deserialize in camelCase form per the serde attribute below.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum Language {
    /// JavaScript/TypeScript keywords
    #[default]
    JavaScript,
    /// CSS keywords (currently empty — CSS has no keyword highlighting)
    Css,
}
43
44impl Language {
45    /// Returns true if the given identifier is a keyword in this language.
46    pub fn is_keyword(self, ident: &str) -> bool {
47        match self {
48            Language::JavaScript => JS_KEYWORDS.contains(ident),
49            Language::Css => false,
50        }
51    }
52}
53
/// JavaScript/TypeScript keywords (compile-time perfect hash set)
///
/// The set includes TypeScript-only reserved words (`interface`, `type`,
/// `implements`, …) and literal values (`true`, `false`, `null`,
/// `undefined`) so they all receive keyword styling.
static JS_KEYWORDS: phf::Set<&'static str> = phf_set! {
    "as",
    "async",
    "await",
    "break",
    "case",
    "catch",
    "class",
    "const",
    "continue",
    "debugger",
    "default",
    "delete",
    "do",
    "else",
    "enum",
    "export",
    "extends",
    "false",
    "finally",
    "for",
    "from",
    "function",
    "if",
    "implements",
    "import",
    "in",
    "instanceof",
    "interface",
    "let",
    "new",
    "null",
    "of",
    "package",
    "private",
    "protected",
    "public",
    "return",
    "static",
    "super",
    "switch",
    "this",
    "throw",
    "true",
    "try",
    "type",
    "typeof",
    "undefined",
    "var",
    "void",
    "while",
    "with",
    "yield",
};
109
/// ANSI color codes for token types
///
/// Each field holds the escape sequence emitted before the corresponding
/// text. All fields are empty strings in the `plain()` scheme, which makes
/// coloring a no-op.
#[derive(Debug, Clone, Copy)]
pub struct ColorScheme {
    /// Sequence that restores default terminal styling.
    pub reset: &'static str,
    /// Color for language keywords.
    pub keyword: &'static str,
    /// Color for highlighted identifiers.
    pub identifier: &'static str,
    /// Color for string/template literals.
    pub string: &'static str,
    /// Color for numeric literals.
    pub number: &'static str,
    /// Color for regex literals.
    pub regex: &'static str,
    /// Color for comments.
    pub comment: &'static str,
    /// Color for the line-number gutter.
    pub gutter: &'static str,
    /// Color for the error marker (caret/underline).
    pub marker: &'static str,
    /// Color for the attached error message.
    pub message: &'static str,
}
124
125impl ColorScheme {
126    /// Get a color scheme with ANSI colors (matching babel-code-frame)
127    pub const fn colored() -> Self {
128        Self {
129            reset: "\x1b[0m",
130            keyword: "\x1b[36m",        // cyan
131            identifier: "\x1b[33m",     // yellow
132            string: "\x1b[32m",         // green
133            number: "\x1b[35m",         // magenta
134            regex: "\x1b[35m",          // magenta
135            comment: "\x1b[90m",        // gray
136            gutter: "\x1b[90m",         // gray
137            marker: "\x1b[31m\x1b[1m",  // red + bold
138            message: "\x1b[31m\x1b[1m", // red + bold (same as marker for now)
139        }
140    }
141
142    /// Get a plain color scheme with no ANSI codes (all empty strings)
143    pub const fn plain() -> Self {
144        Self {
145            reset: "",
146            keyword: "",
147            identifier: "",
148            string: "",
149            number: "",
150            regex: "",
151            comment: "",
152            gutter: "",
153            marker: "",
154            message: "",
155        }
156    }
157
158    /// Get the color for a token type
159    pub fn color_for_token(&self, token_type: TokenType) -> &'static str {
160        match token_type {
161            TokenType::Keyword => self.keyword,
162            TokenType::Identifier => self.identifier,
163            TokenType::String => self.string,
164            TokenType::Number => self.number,
165            TokenType::Regex => self.regex,
166            TokenType::Comment => self.comment,
167        }
168    }
169}
170
171// ---------------------------------------------------------------------------
172// Shared line-boundary helpers
173// ---------------------------------------------------------------------------
174
/// Precomputed line index over a source string.
///
/// Scans for line terminators once on construction, then provides O(1)
/// access to line content and byte ranges without allocating a `Vec<&str>`.
///
/// Recognized line terminators (per ECMA-262 §12.3):
/// - LF (`\n`), CRLF (`\r\n`), standalone CR (`\r`)
/// - U+2028 LINE SEPARATOR, U+2029 PARAGRAPH SEPARATOR
pub(crate) struct Lines<'a> {
    /// The full source text that the offsets below index into.
    source: &'a str,
    /// Byte offset of the start of each line. `line_starts[0]` corresponds
    /// to the line at absolute index `first_line`.
    line_starts: Vec<usize>,
    /// The 0-indexed absolute line number of `line_starts[0]`.
    first_line: usize,
    /// Total number of lines in the source (always ≥ 1).
    /// NOTE(review): when `windowed` stops scanning early this is a lower
    /// bound, not the exact total — see the early return in `windowed`.
    total_lines: usize,
}
193
impl<'a> Lines<'a> {
    /// Build the full line index by scanning for all line terminators.
    ///
    /// Test-only convenience; equivalent to a window covering every line.
    #[cfg(test)]
    pub fn new(source: &'a str) -> Self {
        Self::windowed(source, 0, usize::MAX)
    }

    /// Build a windowed line index. Only stores line-start offsets for
    /// approximately `window_start..window_end` (0-indexed), plus a margin
    /// for the skip-scan heuristic. Stops scanning once the window is
    /// covered — never reads past the end of the window.
    ///
    /// This is much faster than `new()` for large files because it avoids
    /// allocating a Vec entry for every line in the file.
    pub fn windowed(source: &'a str, window_start: usize, window_end: usize) -> Self {
        let bytes = source.as_bytes();

        // Add margin before the window for the skip-scan backscan
        // heuristic (which walks up to MAX_BACKSCAN_LINES backwards).
        let store_start = window_start.saturating_sub(MAX_BACKSCAN_LINES);
        // +1 so byte_bounds works for the last visible line.
        let store_end = window_end.saturating_add(1);

        let mut line_starts = Vec::new();
        // Number of lines whose start we have passed so far.
        let mut line_num: usize = 0;
        // Line 0 always starts at byte 0.
        if store_start == 0 {
            line_starts.push(0);
        }
        line_num += 1;

        // Single SIMD-accelerated pass over the bytes; 0xE2 is the lead
        // byte of the U+2028/U+2029 encodings (checked below).
        for found in memchr::Memchr3::new(b'\n', b'\r', b'\xE2', bytes) {
            let b = bytes[found];
            let line_start = if b == b'\n' {
                found + 1
            } else if b == b'\r' {
                // CRLF: skip the \r and let the \n branch handle it.
                if found + 1 < bytes.len() && bytes[found + 1] == b'\n' {
                    continue;
                }
                // Standalone \r (classic Mac line ending).
                found + 1
            } else {
                // 0xE2 is the leading byte of the 3-byte UTF-8 encoding of
                // U+2028 LINE SEPARATOR (E2 80 A8) and U+2029 PARAGRAPH
                // SEPARATOR (E2 80 A9). UTF-8 forbids overlong encodings,
                // so this exact sequence is the only way these codepoints
                // appear.
                if found + 2 < bytes.len()
                    && bytes[found + 1] == 0x80
                    && (bytes[found + 2] == 0xA8 || bytes[found + 2] == 0xA9)
                {
                    found + 3
                } else {
                    // Not a line separator — just a 0xE2 byte in some
                    // other multi-byte character. Skip it.
                    continue;
                }
            };

            if line_num >= store_end {
                // Past the window — we have enough data.
                // NOTE(review): scanning stopped early, so `total_lines`
                // here is only a lower bound on the true line count.
                return Self {
                    source,
                    line_starts,
                    first_line: store_start,
                    total_lines: line_num + 1,
                };
            }
            if line_num >= store_start {
                line_starts.push(line_start);
            }
            line_num += 1;
        }

        // File ended before or within the window — total is exact.
        // Clamp first_line so it never exceeds the last real line when the
        // requested window started beyond the end of the file.
        Self {
            source,
            line_starts,
            first_line: store_start.min(line_num.saturating_sub(1)),
            total_lines: line_num,
        }
    }

    /// Number of lines (always at least 1).
    pub fn len(&self) -> NonZeroUsize {
        // SAFETY: total_lines is always at least 1.
        NonZeroUsize::new(self.total_lines).unwrap()
    }

    /// The full source string.
    pub fn source(&self) -> &'a str {
        self.source
    }

    /// The raw line-start offsets (for passing to highlight internals).
    /// Index 0 corresponds to absolute line `first_line()`.
    pub fn starts(&self) -> &[usize] {
        &self.line_starts
    }

    /// The absolute 0-indexed line number of `starts()[0]`.
    pub fn first_line(&self) -> usize {
        self.first_line
    }

    /// Get the content of line `idx` (0-indexed absolute), stripping the
    /// trailing line terminator (LF, CRLF, CR, U+2028, or U+2029).
    ///
    /// # Panics
    ///
    /// Panics if `idx` is outside the stored window.
    pub fn content(&self, idx: usize) -> &'a str {
        let (start, end) = self.byte_bounds(idx);
        let line = &self.source[start..end];
        // Try terminators longest-first so "\r\n" is removed as a unit.
        line.strip_suffix("\r\n")
            .or_else(|| line.strip_suffix('\n'))
            .or_else(|| line.strip_suffix('\r'))
            .or_else(|| line.strip_suffix('\u{2028}'))
            .or_else(|| line.strip_suffix('\u{2029}'))
            .unwrap_or(line)
    }

    /// Byte range `[start, end)` for line `idx` (0-indexed absolute,
    /// including the newline terminator).
    ///
    /// Indices past the stored offsets resolve to `source.len()`, yielding
    /// an empty range at end-of-file.
    pub fn byte_bounds(&self, idx: usize) -> (usize, usize) {
        let local = idx - self.first_line;
        let start = self
            .line_starts
            .get(local)
            .copied()
            .unwrap_or(self.source.len());
        let end = self
            .line_starts
            .get(local + 1)
            .copied()
            .unwrap_or(self.source.len());
        (start, end)
    }
}
334
/// Look up which line (0-indexed) a byte offset falls on via binary search.
///
/// An exact hit on a line start returns that line; any other offset belongs
/// to the line beginning at the nearest preceding entry. Offsets before the
/// first entry (or an empty slice) resolve to line 0.
fn lookup_line(line_starts: &[usize], byte_offset: usize) -> usize {
    line_starts
        .binary_search(&byte_offset)
        .unwrap_or_else(|insert_at| insert_at.saturating_sub(1))
}
342
/// Get the byte range [start, end) for a given line index (0-indexed).
///
/// Indices beyond the stored offsets resolve to `source_len`, so the final
/// line (and any out-of-range index) ends at the end of the source.
fn line_bounds(line_starts: &[usize], source_len: usize, line_idx: usize) -> (usize, usize) {
    let start_of = |idx: usize| line_starts.get(idx).copied().unwrap_or(source_len);
    (start_of(line_idx), start_of(line_idx + 1))
}
349
/// Tokenizer state that scans source code and collects syntax-highlight spans.
///
/// The scanner always tokenizes from a given `start_pos` to `scan_end` within
/// the full `source`, but only *emits* spans that overlap with `output_ranges`.
/// This lets callers scan from byte 0 (to maintain correct tokenizer state
/// across multiline comments/strings) while only producing output for the
/// visible window of lines.
struct Scanner<'a> {
    /// Collected spans, in left-to-right scan order.
    markers: Vec<StyleSpan>,
    /// Line-start offsets, used to split multi-line tokens per line.
    line_starts: &'a [usize],
    /// The full source being tokenized.
    source: &'a str,
    /// Sorted, non-overlapping byte ranges we're producing highlights for.
    /// Spans outside these ranges are skipped.
    output_ranges: Vec<(usize, usize)>,
    /// Keyword set used when classifying identifiers.
    language: Language,
}
366
367impl<'a> Scanner<'a> {
368    fn new(
369        line_starts: &'a [usize],
370        source: &'a str,
371        output_ranges: Vec<(usize, usize)>,
372        language: Language,
373    ) -> Self {
374        Self {
375            markers: Vec::new(),
376            line_starts,
377            source,
378            output_ranges,
379            language,
380        }
381    }
382
383    /// Returns the end of the last output range, or 0 if empty.
384    fn output_end(&self) -> usize {
385        self.output_ranges.last().map_or(0, |r| r.1)
386    }
387
388    /// Check whether a byte range `[start, end)` overlaps any output range.
389    #[inline]
390    fn overlaps_output(&self, start: usize, end: usize) -> bool {
391        // Ranges are sorted and there are typically ≤6, so linear scan
392        // is faster than binary search for the common case.
393        for &(rs, re) in &self.output_ranges {
394            if rs >= end {
395                return false;
396            }
397            if re > start {
398                return true;
399            }
400        }
401        false
402    }
403
404    /// Push a style span for a byte range.
405    ///
406    /// When a token spans multiple lines, it is split into one span per line
407    /// so that each line's spans are self-contained. Spans outside
408    /// `output_ranges` are skipped.
409    fn add_span(&mut self, start: usize, end: usize, token_type: TokenType) {
410        if start >= end {
411            return;
412        }
413
414        if !self.overlaps_output(start, end) {
415            return;
416        }
417
418        let source_len = self.source.len();
419        let start_line = lookup_line(self.line_starts, start);
420        let end_line = lookup_line(self.line_starts, end.saturating_sub(1));
421
422        if start_line != end_line {
423            // If the token spans lines, split it so each line's spans are self-contained.
424            for line_idx in start_line..=end_line {
425                let (line_start, line_end) = line_bounds(self.line_starts, source_len, line_idx);
426                let span_start = start.max(line_start);
427                let span_end = end.min(line_end);
428                if span_start < span_end && self.overlaps_output(span_start, span_end) {
429                    self.markers.push(StyleSpan {
430                        start: span_start,
431                        end: span_end,
432                        token_type,
433                    });
434                }
435            }
436            return;
437        }
438
439        self.markers.push(StyleSpan {
440            start,
441            end,
442            token_type,
443        });
444    }
445}
446
447// ---------------------------------------------------------------------------
448// Scan-start heuristic
449// ---------------------------------------------------------------------------
450
/// Maximum number of lines to walk back looking for a safe restart point
/// (a blank line) before tokenizing.
/// If we don't find one within this limit, fall back to byte 0.
const MAX_BACKSCAN_LINES: usize = 200;
454
/// Find a safe byte offset to start the tokenizer scan from, close to
/// `target_line` (0-indexed) and ideally near `visible_start` (the
/// absolute byte offset where the visible window begins). This avoids
/// scanning the entire file from byte 0 when the visible window is in
/// the middle of a large file.
///
/// Two-phase heuristic:
/// 1. **Line-level**: Walk backwards from `target_line` looking for a blank line — a reliable
///    restart point outside strings/comments.
/// 2. **Byte-level**: If `visible_start` is far (>200 bytes) from the line-level result (common for
///    minified files with one huge line), scan backwards from `visible_start` for a `;` statement
///    boundary. This can technically land inside a string containing `;`, but in practice minified
///    code has frequent semicolons between statements and the consequence is at most slightly wrong
///    highlighting.
///
/// Phase 1 is always safe. Phase 2 trades perfect accuracy for
/// dramatically better performance on minified files (~100x).
fn find_scan_start(lines: &Lines<'_>, target_line: usize, visible_start: usize) -> usize {
    let mut result = 0;

    // Phase 1: line-level backscan for a blank line
    if target_line > 0 {
        let first = lines.first_line();
        // Clamp to both the backscan limit and the first stored line, so
        // `content()` below never reads outside the stored window.
        let search_start = target_line.saturating_sub(MAX_BACKSCAN_LINES).max(first);

        result = 'line: {
            for line_idx in (search_start..target_line).rev() {
                if lines.content(line_idx).trim().is_empty() {
                    // Blank line found — restart from its first byte.
                    let (start, _) = lines.byte_bounds(line_idx);
                    break 'line start;
                }
            }
            if search_start > first {
                // The backscan limit (not the window edge) stopped us:
                // no blank line within MAX_BACKSCAN_LINES, so give up and
                // start from byte 0.
                0
            } else {
                // We searched all the way to the window's first stored
                // line — start from the beginning of that line.
                let (start, _) = lines.byte_bounds(search_start);
                start
            }
        };
    }

    // Phase 2: if the visible window starts far into the line, scan
    // backwards for a `;` which typically marks a statement boundary
    // in minified code.
    const MIN_SKIP_DISTANCE: usize = 200;
    if visible_start > result + MIN_SKIP_DISTANCE {
        let search_from = result;
        let window = &lines.source().as_bytes()[search_from..visible_start];
        if let Some(pos) = window.iter().rposition(|&b| b == b';') {
            // Resume just past the last `;` before the visible window.
            result = search_from + pos + 1;
        }
    }

    result
}
510
511// ---------------------------------------------------------------------------
512// Public entry point
513// ---------------------------------------------------------------------------
514
515/// Extract syntax highlighting markers for source code.
516///
517/// Uses a language-agnostic byte-scanning tokenizer inspired by the `js-tokens`
518/// regex approach. It never fails and produces best-effort highlighting for any
519/// input — recognizing quoted strings, comments, numbers, regex literals, and
520/// capitalized identifiers.
521///
522/// # Parameters
523/// - `source`: The source code to highlight
524/// - `line_range`: Range of line indices (0-indexed, start inclusive, end exclusive). Style markers
525///   are only produced for lines within this range. Pass `0..usize::MAX` to produce markers for all
526///   lines.
527/// - `visible_window`: Optional `(truncation_offset, available_width)` hint. When provided, the
528///   scanner's output range is narrowed to only the visible byte window within each line, avoiding
529///   tokenization of content that will be truncated away. This dramatically improves performance on
530///   minified files with very long lines.
531pub fn extract_highlights(
532    lines: &Lines<'_>,
533    line_range: Range<usize>,
534    language: Language,
535    visible_window: Option<(usize, usize)>,
536) -> Vec<Vec<StyleSpan>> {
537    let line_starts = lines.starts();
538    let first_line = lines.first_line();
539    let source = lines.source();
540    let local_count = line_starts.len();
541
542    let local_start = line_range.start - first_line;
543    let local_end = line_range.end - first_line;
544
545    // Build per-line visible byte ranges. When a visible_window is
546    // provided, each range covers only the truncated portion of the
547    // line; otherwise it covers the full line.
548    let output_ranges: Vec<(usize, usize)> = (local_start..local_end.min(local_count))
549        .filter_map(|local_idx| {
550            let ls = line_starts[local_idx];
551            let line_end = line_starts
552                .get(local_idx + 1)
553                .copied()
554                .unwrap_or(source.len());
555            let (rs, re) = if let Some((trunc_offset, avail_width)) = visible_window {
556                (
557                    (ls + trunc_offset).min(line_end),
558                    (ls + trunc_offset + avail_width).min(line_end),
559                )
560            } else {
561                (ls, line_end)
562            };
563            if rs < re { Some((rs, re)) } else { None }
564        })
565        .collect();
566
567    // Find a safe byte offset to start the tokenizer scan from, close to
568    // the visible window. Uses line-level and byte-level heuristics.
569    let visible_start = output_ranges.first().map_or(0, |r| r.0);
570    let scan_start = find_scan_start(lines, line_range.start, visible_start);
571
572    let scan_end = output_ranges.last().map_or(source.len(), |r| r.1);
573    let mut scanner = Scanner::new(line_starts, source, output_ranges, language);
574    scanner.scan(scan_start, scan_end, None);
575    let all_spans = scanner.markers;
576
577    debug_assert!(
578        all_spans.windows(2).all(|w| w[0].start <= w[1].start),
579        "spans should already be sorted by the left-to-right scan"
580    );
581    debug_assert!(
582        all_spans.windows(2).all(|w| w[0].end <= w[1].start),
583        "spans should be non-overlapping"
584    );
585    group_spans_by_line(&all_spans, line_starts, first_line, source, line_range)
586}
587
588// ---------------------------------------------------------------------------
589// Tokenizer (language-agnostic, js-tokens style)
590// ---------------------------------------------------------------------------
591
/// Token kinds recognized by the scanner, used for match dispatch.
///
/// Variants mirror the entries of `TOKEN_RULES` one-to-one; the scanner
/// converts a match's `PatternID` back into one of these.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TokenKind {
    /// Single- or double-quoted string literal.
    String,
    /// Opening backtick of a template literal.
    Template,
    /// `//` comment to end of line.
    LineComment,
    /// `/* ... */` comment (may span lines).
    BlockComment,
    /// Numeric literal.
    Number,
    /// Identifier or keyword candidate.
    Ident,
    /// Closing `)` or `]` — a following `/` is division, not a regex.
    Close,
    /// `(`, `[`, `{`, or `}` — braces are tracked for `${...}` depth.
    Brace,
    /// `++` or `--`.
    Postfix,
    /// A lone `/` — division or the start of a regex literal.
    Slash,
    /// Any other operator/punctuation byte (tracked for `last_token`).
    Op,
}
607
/// Each entry pairs a `TokenKind` with its regex pattern. Order matters:
/// earlier patterns take priority when multiple can match at the same
/// position (e.g. `//` before `/`). The `PatternID` returned by the
/// multi-pattern regex indexes directly into this array.
const TOKEN_RULES: &[(TokenKind, &str)] = &[
    (
        TokenKind::String,
        // Double- or single-quoted string with backslash escapes; the
        // closing quote is optional so unterminated strings still highlight.
        r#""(?:[^"\\]|\\.)*"?|'(?:[^'\\]|\\.)*'?"#,
    ),
    // Match only the opening backtick of a template literal. The rest
    // of the template (quasis, expressions, closing backtick) is handled
    // by `scan_template` which manually walks the content, recursing into
    // `scan()` for `${...}` expressions. This avoids the regex trying to
    // match across expression boundaries where backticks in nested
    // templates, comments, or strings would confuse it.
    (TokenKind::Template, r"`"),
    (TokenKind::LineComment, r"//[^\n]*"),
    // `(?s)` lets `.` match newlines so block comments may span lines.
    (TokenKind::BlockComment, r"(?s)/\*.*?\*/"),
    (
        TokenKind::Number,
        // Hex, octal, binary, then decimal/float with optional exponent.
        r"0[xX][\da-fA-F]+|0[oO][0-7]+|0[bB][01]+|(?:\d*\.\d+|\d+\.?)(?:[eE][+-]?\d+)?",
    ),
    // NOTE(review): with Unicode classes, `\x80-\xff` covers codepoints
    // U+0080–U+00FF only, so identifiers starting beyond Latin-1 won't
    // match this rule — presumably acceptable for best-effort output;
    // confirm.
    (TokenKind::Ident, r"[A-Za-z_$\x80-\xff][\w$\x80-\xff]*"),
    (TokenKind::Close, r"[)\]]"),
    (TokenKind::Brace, r"[(\[{}]"),
    // `++`/`--` must be tried before the single-char operator rule.
    (TokenKind::Postfix, r"\+\+|--"),
    (TokenKind::Slash, r"/"),
    // Operators / punctuation catch-all for `last_token` tracking
    (TokenKind::Op, r"[=+\-*%<>&|^!~?:;,.]"),
];
638
639impl TokenKind {
640    fn from_pattern_id(id: PatternID) -> Self {
641        TOKEN_RULES[id.as_usize()].0
642    }
643}
644
/// A multi-pattern regex where each pattern corresponds to a `TokenKind`.
/// `regex_automata::meta::Regex::new_many()` returns the `PatternID` directly
/// from a match, avoiding capture-group overhead and linear scanning.
/// Pattern ordering determines match priority (leftmost-first semantics).
///
/// Compiled lazily on first use and shared for the process lifetime.
static TOKEN_RE: LazyLock<MetaRegex> = LazyLock::new(|| {
    let patterns: Vec<&str> = TOKEN_RULES.iter().map(|(_, p)| *p).collect();
    MetaRegex::new_many(&patterns).expect("token patterns must compile")
});
653
/// Regex that matches a regex literal starting at the opening `/`.
/// Handles character classes `[...]` (where `/` is literal), escape sequences,
/// and flags. Does not match across newlines (regex literals are single-line).
///
/// Structure: `/` then body then `/` then optional flags:
/// - `[^\\/\[\n\r]` — normal chars (not `\`, `/`, `[`, newline)
/// - `\\.`          — escape sequences
/// - `\[(?:[^\]\\\n\r]|\\.)*\]` — character classes with their own escapes
///
/// Used with `find_at` from the `/` position; the caller checks the match
/// actually begins at that position (the pattern itself is unanchored).
static REGEX_LITERAL_RE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r#"/(?:[^\\/\[\n\r]|\\.|\[(?:[^\]\\\n\r]|\\.)*\])+/[A-Za-z]*"#)
        .expect("regex literal regex must compile")
});
666
667impl Scanner<'_> {
    /// Scan a template literal starting at the opening backtick.
    ///
    /// Walks the source byte-by-byte from `tpl_start` (the `` ` ``), emitting
    /// `String` spans for quasi segments and recursively calling `scan()` for
    /// `${...}` expression holes. This correctly handles backticks that appear
    /// inside expressions (in nested templates, strings, or comments) because
    /// the recursive `scan()` call tokenizes the expression content — including
    /// any inner template literals — before we resume scanning the outer
    /// template.
    ///
    /// Returns the byte position just past the closing backtick (or `scan_end`
    /// if the template is unterminated).
    fn scan_template(&mut self, tpl_start: usize, scan_end: usize) -> usize {
        let bytes = self.source.as_bytes();
        let search_start = tpl_start + 1;

        // Track start of current string segment (includes the backtick or
        // closing `}` of the previous expression)
        let mut seg_start = tpl_start;

        // Current position — may jump forward past `${...}` expressions.
        let mut i = search_start;

        // Use a persistent Memchr2 iterator for `` ` `` and `$` over the full
        // template range. This avoids reinitializing the SIMD searcher on each
        // call. When `i` jumps forward (after a `${...}` expression), we skip
        // any stale positions the iterator yields before `i`.
        //
        // Escapes (`\`) are handled by advancing `i` past the escaped byte
        // when a match at `pos` is preceded by an odd number of backslashes.
        let iter = memchr::Memchr2::new(b'`', b'$', &bytes[search_start..scan_end]);
        for found in iter {
            let pos = search_start + found;
            // Skip positions we've already moved past (after expression scan)
            if pos < i {
                continue;
            }

            // Count consecutive preceding backslashes to detect escapes.
            // An odd count means this byte is escaped. The count never walks
            // back before `search_start`, so a backslash ahead of the
            // template cannot spuriously escape its first byte.
            let mut backslashes = 0;
            while pos > search_start + backslashes && bytes[pos - 1 - backslashes] == b'\\' {
                backslashes += 1;
            }
            if backslashes % 2 != 0 {
                i = pos + 1;
                continue;
            }

            let b = bytes[pos];
            if b == b'`' {
                // Closing backtick — emit the final quasi (including the backtick)
                self.add_span(seg_start, pos + 1, TokenType::String);
                return pos + 1;
            }
            // b == b'$' (Memchr2 only yields the two needles)
            debug_assert_eq!(b, b'$');
            if pos + 1 < scan_end && bytes[pos + 1] == b'{' {
                // End the current quasi segment just before the `${`
                if pos > seg_start {
                    self.add_span(seg_start, pos, TokenType::String);
                }

                // Tokenize the expression with brace_depth=1. The recursive
                // scan handles all tokens inside the expression — including
                // nested template literals, strings with backticks, comments
                // with backticks, etc. It returns the byte position just past
                // the matching `}`.
                let expr_start = pos + 2;
                let expr_end = self.scan(expr_start, scan_end, Some(1));

                // The next quasi segment starts at the closing `}`
                if expr_end > expr_start && bytes.get(expr_end - 1) == Some(&b'}') {
                    seg_start = expr_end - 1;
                } else {
                    // Unclosed expression — no more quasi segments
                    seg_start = expr_end;
                }
                i = expr_end;
                continue;
            }
            // Lone `$` not followed by `{` — skip it
            i = pos + 1;
        }

        // Unterminated template — emit whatever quasi content we have
        if scan_end > seg_start {
            self.add_span(seg_start, scan_end, TokenType::String);
        }
        scan_end
    }
759
760    /// Core tokenizer loop. Scans `source[start_pos..scan_end]` and appends
761    /// style markers.
762    ///
763    /// When `brace_depth` is `Some(n)` we are inside a template expression
764    /// `${...}`. The scanner tracks `{` / `}` tokens and returns as soon as
765    /// the matching `}` brings the depth back to 0, returning the byte
766    /// position just past the `}`. Pass `None` for top-level scanning.
    fn scan(&mut self, start_pos: usize, scan_end: usize, mut brace_depth: Option<u32>) -> usize {
        // Byte cursor into `self.source`; each loop iteration resumes the
        // regex search from here.
        let mut pos = start_pos;

        // Track the last non-whitespace, non-comment token kind for regex
        // disambiguation. A `/` following a value or close bracket is division;
        // following an operator or at start of input it's a regex.
        let mut last_token = LastToken::None;

        while let Some(m) = TOKEN_RE.search(&Input::new(self.source).range(pos..scan_end)) {
            let start = m.start();
            let raw_end = m.end();

            // Once we're past the last output range, no future tokens can be visible.
            if start >= self.output_end() {
                break;
            }

            // Clamp the match end to scan_end
            let end = raw_end.min(scan_end);

            // Dispatch on which alternation of TOKEN_RE matched.
            match TokenKind::from_pattern_id(m.pattern()) {
                TokenKind::String => {
                    self.add_span(start, end, TokenType::String);
                    last_token = LastToken::Value;
                }
                TokenKind::Template => {
                    // The regex only matched the opening backtick. Walk the
                    // full template literal (quasis + expression holes)
                    // manually, recursing into scan() for each ${...}.
                    let tpl_end = self.scan_template(start, scan_end);
                    last_token = LastToken::Value;
                    pos = tpl_end;
                    // we already updated pos so just continue
                    continue;
                }
                TokenKind::LineComment | TokenKind::BlockComment => {
                    self.add_span(start, end, TokenType::Comment);
                    // Comments don't update last_token
                }
                TokenKind::Postfix => {
                    last_token = LastToken::PostfixOp;
                }
                TokenKind::Slash => {
                    // Re-lex from the slash with the dedicated regex-literal
                    // pattern. `find_at` may match later in the string, so we
                    // also require the match to begin exactly at `start`;
                    // otherwise the slash is treated as a plain operator.
                    if last_token.slash_means_regex()
                        && let Some(re_match) = REGEX_LITERAL_RE.find_at(self.source, start)
                        && re_match.start() == start
                    {
                        let re_end = re_match.end().min(scan_end);
                        self.add_span(start, re_end, TokenType::Regex);
                        last_token = LastToken::Value;
                        // Skip past the whole literal so its body isn't re-tokenized.
                        pos = re_end;
                        continue;
                    }
                    last_token = LastToken::Operator;
                }
                TokenKind::Close => {
                    last_token = LastToken::CloseBracket;
                }
                TokenKind::Brace => {
                    // Brace bookkeeping only matters when `brace_depth` is
                    // Some, i.e. when scanning inside a template `${...}` hole.
                    let ch = self.source.as_bytes()[start];
                    if ch == b'{' {
                        if let Some(ref mut depth) = brace_depth {
                            *depth += 1;
                        }
                    } else if ch == b'}'
                        && let Some(ref mut depth) = brace_depth
                    {
                        // test first to avoid underflow
                        if *depth <= 1 {
                            // Depth 1 (or a stray `}` at depth 0) closes the
                            // expression hole: return the byte just past it.
                            return end;
                        }
                        *depth -= 1;
                    }
                    last_token = LastToken::Operator;
                }
                TokenKind::Op => {
                    last_token = LastToken::Operator;
                }
                TokenKind::Number => {
                    self.add_span(start, end, TokenType::Number);
                    last_token = LastToken::Value;
                }
                TokenKind::Ident => {
                    let ident = &self.source[start..end];
                    let token_type = if self.language.is_keyword(ident) {
                        Some(TokenType::Keyword)
                    } else if ident.as_bytes()[0].is_ascii_uppercase() {
                        // Highlight capitalized identifiers (matching Babel behavior)
                        Some(TokenType::Identifier)
                    } else {
                        None
                    };
                    if let Some(tt) = token_type {
                        self.add_span(start, end, tt);
                    }
                    last_token = LastToken::Value;
                }
            }

            // Guard against an infinite loop: every match must advance the
            // cursor, so TOKEN_RE's patterns must never match empty input.
            assert!(
                raw_end > pos,
                "TOKEN_RE produced a zero-width match at byte {pos}"
            );
            pos = raw_end;
        }

        // Ran out of tokens before scan_end (and, when inside a hole, without
        // seeing the closing `}`): report the scan window's end.
        scan_end
    }
875}
876
/// Kind of the most recent non-whitespace, non-comment token, used to decide
/// whether a following `/` starts a regex literal or is a division operator.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LastToken {
    /// Nothing seen yet (start of input)
    None,
    /// An expression-ending token: identifier, number, string, or regex
    Value,
    /// A closing `)` or `]`, which may terminate an expression
    CloseBracket,
    /// A postfix `++` or `--`, which terminates an expression
    PostfixOp,
    /// Any operator or punctuator (open brackets, commas, semicolons, `{`, `}`)
    /// — a regex literal may follow
    Operator,
}

impl LastToken {
    /// Returns true if a `/` at this position should be treated as starting a regex literal.
    fn slash_means_regex(self) -> bool {
        match self {
            Self::None | Self::Operator => true,
            Self::Value | Self::CloseBracket | Self::PostfixOp => false,
        }
    }
}
902
903// ---------------------------------------------------------------------------
904// Span → per-line grouping
905// ---------------------------------------------------------------------------
906
907/// Group spans by line. O(spans) single pass.
908fn group_spans_by_line(
909    spans: &[StyleSpan],
910    line_starts: &[usize],
911    first_line: usize,
912    source: &str,
913    line_range: Range<usize>,
914) -> Vec<Vec<StyleSpan>> {
915    if source.is_empty() {
916        return Vec::new();
917    }
918
919    let line_count = first_line + line_starts.len();
920
921    let start_line_idx = line_range.start.min(line_count);
922    let end_line_idx = line_range.end.min(line_count);
923
924    let output_line_count = end_line_idx.saturating_sub(start_line_idx);
925    let mut line_highlights = Vec::with_capacity(output_line_count);
926
927    let mut span_idx = 0;
928
929    for line_idx in start_line_idx..end_line_idx {
930        let local_idx = line_idx - first_line;
931        let (line_start, line_end) = line_bounds(line_starts, source.len(), local_idx);
932
933        let mut line_spans = Vec::new();
934
935        while span_idx < spans.len() {
936            let span = &spans[span_idx];
937
938            if span.start >= line_end {
939                break;
940            }
941            debug_assert!(
942                span.start >= line_start,
943                "span at {} precedes line start {line_start}",
944                span.start
945            );
946
947            line_spans.push(StyleSpan {
948                start: span.start - line_start,
949                end: span.end - line_start,
950                token_type: span.token_type,
951            });
952
953            span_idx += 1;
954        }
955
956        line_highlights.push(line_spans);
957    }
958
959    line_highlights
960}
961
962// ---------------------------------------------------------------------------
963// Line rendering with truncation-aware highlighting
964// ---------------------------------------------------------------------------
965
966/// Apply syntax highlighting to a (possibly truncated) line of text.
967///
968/// Iterates the line's `StyleSpan`s, converting from line-relative offsets to
969/// display offsets accounting for truncation, and inserts ANSI color codes.
970///
971/// - `truncation_offset`: byte offset in the original line where visible source content starts
972/// - `prefix_len`: byte length of any prefix prepended before source content (e.g., `"..."` = 3)
973pub fn apply_line_highlights(
974    visible_content: &str,
975    spans: &[StyleSpan],
976    color_scheme: &ColorScheme,
977    truncation_offset: usize,
978    prefix_len: usize,
979) -> String {
980    if spans.is_empty() {
981        return visible_content.to_string();
982    }
983
984    // The visible source region in original-line coordinates
985    let visible_end = truncation_offset + visible_content.len().saturating_sub(prefix_len);
986
987    let mut result = String::with_capacity(visible_content.len() + spans.len() * 10);
988    let mut last_offset = 0;
989
990    // Skip spans that end before the visible window
991    let start_idx = spans.partition_point(|s| s.end <= truncation_offset);
992
993    for span in &spans[start_idx..] {
994        if span.start >= visible_end {
995            break;
996        }
997
998        // Clamp span to the visible window and convert to display coordinates
999        let display_start = (span.start.max(truncation_offset) - truncation_offset + prefix_len)
1000            .min(visible_content.len());
1001        let display_end =
1002            (span.end.min(visible_end) - truncation_offset + prefix_len).min(visible_content.len());
1003
1004        if display_start < display_end {
1005            // Emit unstyled text before this span
1006            if display_start > last_offset {
1007                result.push_str(&visible_content[last_offset..display_start]);
1008            }
1009            // Emit styled span content
1010            result.push_str(color_scheme.color_for_token(span.token_type));
1011            result.push_str(&visible_content[display_start..display_end]);
1012            result.push_str(color_scheme.reset);
1013            last_offset = display_end;
1014        }
1015    }
1016
1017    // Emit any remaining unstyled text
1018    if last_offset < visible_content.len() {
1019        result.push_str(&visible_content[last_offset..]);
1020    }
1021
1022    result
1023}
1024
#[cfg(test)]
pub mod tests {
    use super::*;

    /// Default language for tests
    const JS: Language = Language::JavaScript;

    /// Strip ANSI escape codes from a string.
    ///
    /// Only CSI sequences (`ESC [` … terminated by a letter) are recognized,
    /// which covers the SGR color codes emitted by `apply_line_highlights`.
    pub fn strip_ansi_codes(s: &str) -> String {
        let mut result = String::with_capacity(s.len());
        let mut chars = s.chars();

        while let Some(ch) = chars.next() {
            if ch == '\x1b' {
                match chars.next() {
                    // CSI sequence: skip everything up to and including the
                    // terminating letter (e.g. the `m` of `\x1b[36m`).
                    Some('[') => {
                        for ch in chars.by_ref() {
                            if ch.is_alphabetic() {
                                break;
                            }
                        }
                    }
                    // A bare ESC not followed by `[` is not a CSI sequence;
                    // keep the following character instead of silently
                    // swallowing it.
                    Some(other) => result.push(other),
                    None => {}
                }
            } else {
                result.push(ch);
            }
        }

        result
    }

    // -----------------------------------------------------------------------
    // Basic highlighting tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_apply_line_highlights_basic() {
        let source = "const Foo = 123";
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);
        let color_scheme = ColorScheme::colored();

        let result = apply_line_highlights(source, &highlights[0], &color_scheme, 0, 0);

        assert!(result.contains("\x1b["), "Result should contain ANSI codes");
        assert!(result.contains("const"), "Result should contain 'const'");
        assert!(result.contains("Foo"), "Result should contain 'Foo'");
        assert!(result.contains("123"), "Result should contain '123'");
    }

    #[test]
    fn test_apply_line_highlights_plain() {
        let source = "const foo = 123";
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);
        let color_scheme = ColorScheme::plain();

        let result = apply_line_highlights(source, &highlights[0], &color_scheme, 0, 0);
        assert_eq!(result, source);
    }

    #[test]
    fn test_only_capitalized_identifiers_highlighted() {
        let source = "const foo = Bar";
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);

        let has_identifier = highlights[0]
            .iter()
            .any(|s| s.token_type == TokenType::Identifier);
        assert!(has_identifier, "Capitalized 'Bar' should be highlighted");

        let ident_starts: Vec<usize> = highlights[0]
            .iter()
            .filter(|s| s.token_type == TokenType::Identifier)
            .map(|s| s.start)
            .collect();
        assert_eq!(
            ident_starts,
            vec![12],
            "Only 'Bar' at offset 12 should be highlighted"
        );
    }

    #[test]
    fn test_strip_ansi_codes() {
        let input = "\x1b[36mconst\x1b[0m foo = \x1b[35m123\x1b[0m";
        let result = strip_ansi_codes(input);
        assert_eq!(result, "const foo = 123");
    }

    #[test]
    fn test_apply_line_highlights_with_truncation() {
        let source = "const Foo = 123";
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);
        let color_scheme = ColorScheme::colored();

        // Truncate to show "Foo = 123" (offset 6, length 9, no prefix)
        let visible = &source[6..];
        let result = apply_line_highlights(visible, &highlights[0], &color_scheme, 6, 0);

        let stripped = strip_ansi_codes(&result);
        assert_eq!(stripped, "Foo = 123");
        assert!(
            result.contains("\x1b["),
            "Should contain ANSI codes for Foo/123"
        );
    }

    #[test]
    fn test_apply_line_highlights_overlapping_truncation() {
        // "hello world" is a string starting at offset 10
        // Truncating at offset 15 lands inside the string ("o world";)
        let source = r#"const x = "hello world";"#;
        let truncation_offset = 15;
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);
        let color_scheme = ColorScheme::colored();

        let visible = &source[truncation_offset..];
        let result =
            apply_line_highlights(visible, &highlights[0], &color_scheme, truncation_offset, 0);

        let stripped = strip_ansi_codes(&result);
        assert_eq!(stripped, visible);
        // The visible portion starts inside the string, so it should
        // begin with an ANSI code for the overlapping string style
        assert!(
            result.starts_with("\x1b["),
            "Should start with ANSI code for the overlapping string: {result:?}"
        );
    }

    #[test]
    fn test_comments_and_numbers() {
        let source = "const x = 42; // comment\nobj.foo = 10;";
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);

        assert_eq!(highlights.len(), 2);

        let line1_has_comment = highlights[0]
            .iter()
            .any(|m| m.token_type == TokenType::Comment);
        assert!(line1_has_comment, "First line should have comment markers");

        let line1_has_number = highlights[0]
            .iter()
            .any(|m| m.token_type == TokenType::Number);
        let line2_has_number = highlights[1]
            .iter()
            .any(|m| m.token_type == TokenType::Number);
        assert!(line1_has_number);
        assert!(line2_has_number);
    }

    #[test]
    fn test_multiline_comment() {
        let source = "const x = 1;\n/* multi\n   line */\nconst y = 2;";
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);

        assert_eq!(highlights.len(), 4);

        let line2_has_comment = highlights[1]
            .iter()
            .any(|m| m.token_type == TokenType::Comment);
        let line3_has_comment = highlights[2]
            .iter()
            .any(|m| m.token_type == TokenType::Comment);

        assert!(line2_has_comment, "Line 2 should have comment marker");
        assert!(line3_has_comment, "Line 3 should have comment marker");
    }

    #[test]
    fn test_multiline_template_literal() {
        let source = "const x = `line1\nline2\nline3`;";
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);

        assert_eq!(highlights.len(), 3);

        for (i, highlight) in highlights.iter().enumerate() {
            let has_string = highlight.iter().any(|m| m.token_type == TokenType::String);
            assert!(
                has_string,
                "Line {} should have string markers for the template literal",
                i + 1
            );
        }
    }

    #[test]
    fn test_template_literal_with_expression() {
        // `hello ${name}!` should mark `hello ` and `!` as string,
        // but NOT mark `name` as string.
        let source = "const x = `hello ${name}!`;";
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);

        let string_spans: Vec<(usize, usize)> = highlights[0]
            .iter()
            .filter(|s| s.token_type == TokenType::String)
            .map(|s| (s.start, s.end))
            .collect();

        // Should have two string segments: `hello ${ and }!`
        // The `name` between ${ and } should NOT be in any string range
        assert!(
            string_spans.len() >= 2,
            "Should have at least 2 string segments: got {:?}",
            string_spans
        );

        // Verify "name" is NOT inside any string span
        let name_offset = source.find("name").unwrap();
        let name_in_string = highlights[0].iter().any(|s| {
            s.token_type == TokenType::String && s.start <= name_offset && s.end > name_offset
        });
        assert!(
            !name_in_string,
            "'name' should not be marked as part of a string"
        );
    }

    #[test]
    fn test_template_literal_nested() {
        // Nested template literal: `a ${`b ${c}`} d`
        let source = r#"const x = `a ${`b ${c}`} d`;"#;
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);

        // Should not panic and should produce some markers
        assert!(!highlights.is_empty());
        let has_string = highlights[0]
            .iter()
            .any(|m| m.token_type == TokenType::String);
        assert!(has_string, "Should have string markers");
    }

    // -----------------------------------------------------------------------
    // Unbalanced template literal tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_template_unclosed_expression() {
        // `hello ${name` — the `${` is never closed with `}`
        // Should not panic; the string part before `${` should still be marked.
        let source = "const x = `hello ${name";
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);
        assert!(!highlights.is_empty(), "Should produce highlights");

        // Should have at least one string marker for the "`hello " part
        let has_string = highlights[0]
            .iter()
            .any(|m| m.token_type == TokenType::String);
        assert!(has_string, "Should still mark the string part before ${{");

        // "name" should NOT be marked as string since it's inside an expression hole
        let name_offset = source.find("name").unwrap();
        let name_in_string = highlights[0].iter().any(|s| {
            s.token_type == TokenType::String && s.start <= name_offset && s.end > name_offset
        });
        assert!(
            !name_in_string,
            "'name' inside unclosed expression should not be a string"
        );
    }

    #[test]
    fn test_template_brace_in_string_inside_expression() {
        // `${ "}" }` — the `}` inside the string should not close the expression
        let source = r#"const x = `${  "}" } end`;"#;
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);
        assert!(!highlights.is_empty());

        // The " end" part after the real closing } should be marked as string
        let end_offset = source.find(" end").unwrap();
        let has_end_string = highlights[0].iter().any(|s| {
            s.token_type == TokenType::String && s.start <= end_offset && s.end > end_offset
        });
        assert!(
            has_end_string,
            "String part after expression should be marked"
        );
    }

    #[test]
    fn test_template_empty_expression() {
        // `hello ${}world` — empty expression hole
        let source = "const x = `hello ${}world`;";
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);
        assert!(!highlights.is_empty());

        // Both "hello " and "world" parts should be string-marked
        let string_spans: Vec<usize> = highlights[0]
            .iter()
            .filter(|s| s.token_type == TokenType::String)
            .map(|s| s.start)
            .collect();
        assert!(
            string_spans.len() >= 2,
            "Empty expression should still split into two string segments, got {:?}",
            string_spans
        );
    }

    #[test]
    fn test_template_nested_backtick_in_expression() {
        // `some${`template`}literal` — nested template inside expression
        let source = r#"const x = `some${`template`}literal`;"#;
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);
        assert!(!highlights.is_empty());

        // "literal" should be part of a string span (the outer template quasi)
        let literal_offset = source.rfind("literal").unwrap();
        let literal_is_string = highlights[0].iter().any(|s| {
            s.token_type == TokenType::String && s.start <= literal_offset && s.end > literal_offset
        });
        assert!(
            literal_is_string,
            "'literal' should be marked as string (outer template quasi), spans: {:?}",
            highlights[0]
        );

        // "template" should also be string (inner template literal)
        let template_offset = source.find("template").unwrap();
        let template_is_string = highlights[0].iter().any(|s| {
            s.token_type == TokenType::String
                && s.start <= template_offset
                && s.end > template_offset
        });
        assert!(
            template_is_string,
            "'template' should be marked as string (inner template), spans: {:?}",
            highlights[0]
        );
    }

    #[test]
    fn test_template_block_comment_with_backtick_in_expression() {
        // `some${ /* ` */ ""}literal` — block comment containing backtick inside expression
        let source = r#"const x = `some${ /* ` */ ""}literal`;"#;
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);
        assert!(!highlights.is_empty());

        // The /* ` */ should be a comment, not end the template
        let comment_offset = source.find("/* ` */").unwrap();
        let comment_is_comment = highlights[0].iter().any(|s| {
            s.token_type == TokenType::Comment
                && s.start <= comment_offset
                && s.end > comment_offset
        });
        assert!(
            comment_is_comment,
            "'/* ` */' should be marked as comment, spans: {:?}",
            highlights[0]
        );

        // "literal" should be string (outer template quasi after expression closes)
        let literal_offset = source.rfind("literal").unwrap();
        let literal_is_string = highlights[0].iter().any(|s| {
            s.token_type == TokenType::String && s.start <= literal_offset && s.end > literal_offset
        });
        assert!(
            literal_is_string,
            "'literal' should be marked as string, spans: {:?}",
            highlights[0]
        );
    }

    #[test]
    fn test_template_line_comment_with_backtick_in_expression() {
        // `some${ // `
        // }literal`
        // Line comment containing backtick inside expression
        let source = "const x = `some${ // `\n}literal`;";
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);
        assert!(highlights.len() >= 2, "Should have at least 2 lines");

        // The // ` should be a comment on line 1
        let line1 = "const x = `some${ // `";
        let comment_offset = line1.find("// `").unwrap();
        let comment_is_comment = highlights[0].iter().any(|s| {
            s.token_type == TokenType::Comment
                && s.start <= comment_offset
                && s.end > comment_offset
        });
        assert!(
            comment_is_comment,
            "'// `' should be marked as comment, spans: {:?}",
            highlights[0]
        );

        // "literal" on line 2 should be string (outer template quasi)
        // Line 2 is "}literal`;" — "literal" starts at byte 1 (line-relative)
        let line2 = "}literal`;";
        let literal_offset = line2.find("literal").unwrap();
        let literal_is_string = highlights[1].iter().any(|s| {
            s.token_type == TokenType::String && s.start <= literal_offset && s.end > literal_offset
        });
        assert!(
            literal_is_string,
            "'literal' should be marked as string, spans: {:?}",
            highlights[1]
        );
    }

    #[test]
    fn test_template_string_with_backtick_in_expression() {
        // `some${"`"}literal` — string containing backtick inside expression
        let source = r#"const x = `some${"`"}literal`;"#;
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);
        assert!(!highlights.is_empty());

        // The "`" should be a string span
        let inner_str_offset = source.find(r#""`""#).unwrap();
        let inner_is_string = highlights[0].iter().any(|s| {
            s.token_type == TokenType::String
                && s.start <= inner_str_offset
                && s.end > inner_str_offset
        });
        assert!(
            inner_is_string,
            r#"'"`"' should be marked as string, spans: {:?}"#,
            highlights[0]
        );

        // "literal" should be string (outer template quasi)
        let literal_offset = source.rfind("literal").unwrap();
        let literal_is_string = highlights[0].iter().any(|s| {
            s.token_type == TokenType::String && s.start <= literal_offset && s.end > literal_offset
        });
        assert!(
            literal_is_string,
            "'literal' should be marked as string, spans: {:?}",
            highlights[0]
        );
    }

    #[test]
    fn test_line_range_filtering() {
        let source = "const a = 1;\nconst b = 2;\nconst c = 3;\nconst d = 4;\nconst e = 5;";

        let highlights = extract_highlights(&Lines::new(source), 1..4, JS, None);

        assert_eq!(highlights.len(), 3);
        assert!(highlights.iter().all(|h| !h.is_empty()));
    }

    // -----------------------------------------------------------------------
    // Regex literal tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_regex_after_equals() {
        let source = "const re = /foo/gi;";
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);

        let has_regex = highlights[0]
            .iter()
            .any(|m| m.token_type == TokenType::Regex);
        assert!(has_regex, "/foo/gi should be highlighted as regex");
    }

    #[test]
    fn test_division_not_regex() {
        // After an identifier, `/` is division not regex
        let source = "const x = a / b / c;";
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);

        let has_regex = highlights[0]
            .iter()
            .any(|m| m.token_type == TokenType::Regex);
        assert!(!has_regex, "a / b / c should not have regex markers");
    }

    // -----------------------------------------------------------------------
    // Keyword highlighting tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_js_keywords_highlighted() {
        let source = "const foo = function() { return true; }";
        let highlights = extract_highlights(&Lines::new(source), 0..usize::MAX, JS, None);

        let keyword_starts: Vec<usize> = highlights[0]
            .iter()
            .filter(|s| s.token_type == TokenType::Keyword)
            .map(|s| s.start)
            .collect();

        // "const" at 0..5, "function" at 12..20, "return" at 25..31, "true" at 32..36
        assert!(
            keyword_starts.contains(&0),
            "'const' should start at offset 0"
        );
        assert!(
            keyword_starts.contains(&12),
            "'function' should start at offset 12"
        );
        assert!(
            keyword_starts.contains(&25),
            "'return' should start at offset 25"
        );
        assert!(
            keyword_starts.contains(&32),
            "'true' should start at offset 32"
        );
    }

    #[test]
    fn test_css_no_keywords() {
        let source = "const foo = function() { return true; }";
        let highlights =
            extract_highlights(&Lines::new(source), 0..usize::MAX, Language::Css, None);

        let has_keyword = highlights[0]
            .iter()
            .any(|m| m.token_type == TokenType::Keyword);
        assert!(
            !has_keyword,
            "CSS language should not produce keyword markers"
        );
    }

    // -----------------------------------------------------------------------
    // Scan-start heuristic tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_block_comment_with_blank_line_known_limitation() {
        // Known limitation: when a block comment contains a blank line, the
        // skip-scan heuristic restarts scanning from that blank line, losing
        // track of the opening `/*`. The `*/` closer loses its comment
        // highlighting because the scanner never saw the opener.
        //
        // This is a deliberate tradeoff: blank lines inside block comments
        // that span the visible window boundary are vanishingly rare in
        // practice, and the only consequence is slightly wrong colors —
        // never a crash or missing output.
        let mut source = String::new();
        // Push enough lines so the blank line inside the comment is chosen
        // as the scan start rather than scanning from byte 0.
        for i in 0..20 {
            source.push_str(&format!("const x{i} = {i};\n"));
        }
        source.push_str("/** sneaky\n");
        source.push('\n'); // blank line inside block comment
        source.push_str("*/\n");
        source.push_str("const after = 1;\n");

        let lines = Lines::new(&source);
        // Target the `*/` line — should be Comment but won't be.
        let closer_line_idx = lines.len().get() - 3;

        let highlights = extract_highlights(&lines, closer_line_idx..closer_line_idx + 1, JS, None);
        assert_eq!(highlights.len(), 1);

        // With correct full-file scanning, `*/` would be highlighted as a
        // comment. But the skip-scan heuristic restarts at the blank line
        // inside the comment, so the scanner sees `*/` as stray punctuation.
        let has_comment = highlights[0]
            .iter()
            .any(|m| m.token_type == TokenType::Comment);
        assert!(
            !has_comment,
            "Known limitation: `*/` loses comment highlighting when the skip-scan heuristic \
             starts after the `/*` opener"
        );
    }
}