// turbopack_core/source_pos.rs

1use serde::{Deserialize, Serialize};
2use turbo_tasks::{NonLocalValue, TaskInput, trace::TraceRawVcs};
3use turbo_tasks_hash::DeterministicHash;
4
/// LINE FEED (LF, `0x0A`), one of the basic JS line terminators.
const U8_LF: u8 = b'\n';
/// CARRIAGE RETURN (CR, `0x0D`), one of the basic JS line terminators.
const U8_CR: u8 = b'\r';
9
10#[derive(
11    Default,
12    Debug,
13    PartialEq,
14    Eq,
15    Copy,
16    Clone,
17    Hash,
18    PartialOrd,
19    Ord,
20    TaskInput,
21    TraceRawVcs,
22    Serialize,
23    Deserialize,
24    DeterministicHash,
25    NonLocalValue,
26)]
27pub struct SourcePos {
28    /// The line, 0-indexed.
29    pub line: u32,
30    /// The byte index of the column, 0-indexed.
31    pub column: u32,
32}
33
34impl SourcePos {
35    pub fn new(start_line: u32) -> Self {
36        Self {
37            line: start_line,
38            column: 0,
39        }
40    }
41
42    pub fn max() -> Self {
43        Self {
44            line: u32::MAX,
45            column: u32::MAX,
46        }
47    }
48
49    /// Increments the line/column position to account for new source code.
50    /// Line terminators are the classic "\n", "\r", "\r\n" (which counts as
51    /// a single terminator), and JSON LINE/PARAGRAPH SEPARATORs.
52    ///
53    /// See https://tc39.es/ecma262/multipage/ecmascript-language-lexical-grammar.html#sec-line-terminators
54    pub fn update(&mut self, code: &[u8]) {
55        // JS source text is interpreted as UCS-2, which is basically UTF-16 with less
56        // restrictions. We cannot iterate UTF-8 bytes here, 2-byte UTF-8 octets
57        // should count as a 1 char and not 2.
58        let &mut SourcePos {
59            mut line,
60            mut column,
61        } = self;
62
63        let mut i = 0;
64        while i < code.len() {
65            // This is not a UTF-8 validator, but it's likely close enough. It's assumed
66            // that the input is valid (and if it isn't than what are you doing trying to
67            // embed it into source code anyways?). The important part is that we process in
68            // order, and use the first octet's bit pattern to decode the octet length of
69            // the char.
70            match code[i] {
71                U8_LF => {
72                    i += 1;
73                    line += 1;
74                    column = 0;
75                }
76                U8_CR => {
77                    // Count "\r\n" as a single terminator.
78                    if code.get(i + 1) == Some(&U8_LF) {
79                        i += 2;
80                    } else {
81                        i += 1;
82                    }
83                    line += 1;
84                    column = 0;
85                }
86
87                // 1 octet chars do not have the high bit set. If it's not a LF or CR, then it's
88                // just a regular ASCII.
89                b if b & 0b10000000 == 0 => {
90                    i += 1;
91                    column += 1;
92                }
93
94                // 2 octet chars have a leading `110` bit pattern. None are considered line
95                // terminators.
96                b if b & 0b11100000 == 0b11000000 => {
97                    // eat this byte and the next.
98                    i += 2;
99                    column += 1;
100                }
101
102                // 3 octet chars have a leading `1110` bit pattern. Both the LINE/PARAGRAPH
103                // SEPARATOR exist in 3 octets.
104                b if b & 0b11110000 == 0b11100000 => {
105                    // The LINE and PARAGRAPH have the bits `11100010 10000000 1010100X`, with the X
106                    // denoting either line or paragraph.
107                    let mut separator = false;
108                    if b == 0b11100010 && code.get(i + 1) == Some(&0b10000000) {
109                        let last = code.get(i + 2).cloned().unwrap_or_default();
110                        separator = (last & 0b11111110) == 0b10101000
111                    }
112
113                    // eat this byte and the next 2.
114                    i += 3;
115                    if separator {
116                        line += 1;
117                        column = 0;
118                    } else {
119                        column += 1;
120                    }
121                }
122
123                // 4 octet chars have a leading `11110` pattern, but we don't need to check because
124                // none of the other patterns matched.
125                _ => {
126                    // eat this byte and the next 3.
127                    i += 4;
128                    column += 1;
129                }
130            }
131        }
132        self.line = line;
133        self.column = column;
134    }
135}
136
137impl std::cmp::PartialEq<(u32, u32)> for SourcePos {
138    fn eq(&self, other: &(u32, u32)) -> bool {
139        &(self.line, self.column) == other
140    }
141}