turbopack_core/
source_pos.rs

1use serde::{Deserialize, Serialize};
2use turbo_tasks::{NonLocalValue, TaskInput, trace::TraceRawVcs};
3use turbo_tasks_hash::DeterministicHash;
4
/// Byte value of LINE FEED (`\n`), one of the basic JS line terminators.
const U8_LF: u8 = b'\n';
/// Byte value of CARRIAGE RETURN (`\r`), one of the basic JS line terminators.
const U8_CR: u8 = b'\r';
9
/// A line/column position within source text.
///
/// Both fields are 0-indexed. The derived `Default` is line 0, column 0, and
/// the derived ordering compares `line` first, then `column` (field
/// declaration order).
#[derive(
    Default,
    Debug,
    PartialEq,
    Eq,
    Copy,
    Clone,
    Hash,
    PartialOrd,
    Ord,
    TaskInput,
    TraceRawVcs,
    Serialize,
    Deserialize,
    DeterministicHash,
    NonLocalValue,
)]
pub struct SourcePos {
    /// The line, 0-indexed.
    pub line: u32,
    /// The byte index of the column, 0-indexed.
    ///
    /// NOTE(review): `SourcePos::update` advances this once per decoded
    /// character (a multi-byte UTF-8 sequence counts as one), not once per
    /// byte -- confirm which unit consumers of this field expect.
    pub column: u32,
}
33
34impl SourcePos {
35    pub fn new() -> Self {
36        Default::default()
37    }
38
39    pub fn max() -> Self {
40        Self {
41            line: u32::MAX,
42            column: u32::MAX,
43        }
44    }
45
46    /// Increments the line/column position to account for new source code.
47    /// Line terminators are the classic "\n", "\r", "\r\n" (which counts as
48    /// a single terminator), and JSON LINE/PARAGRAPH SEPARATORs.
49    ///
50    /// See https://tc39.es/ecma262/multipage/ecmascript-language-lexical-grammar.html#sec-line-terminators
51    pub fn update(&mut self, code: &[u8]) {
52        // JS source text is interpreted as UCS-2, which is basically UTF-16 with less
53        // restrictions. We cannot iterate UTF-8 bytes here, 2-byte UTF-8 octets
54        // should count as a 1 char and not 2.
55        let &mut SourcePos {
56            mut line,
57            mut column,
58        } = self;
59
60        let mut i = 0;
61        while i < code.len() {
62            // This is not a UTF-8 validator, but it's likely close enough. It's assumed
63            // that the input is valid (and if it isn't than what are you doing trying to
64            // embed it into source code anyways?). The important part is that we process in
65            // order, and use the first octet's bit pattern to decode the octet length of
66            // the char.
67            match code[i] {
68                U8_LF => {
69                    i += 1;
70                    line += 1;
71                    column = 0;
72                }
73                U8_CR => {
74                    // Count "\r\n" as a single terminator.
75                    if code.get(i + 1) == Some(&U8_LF) {
76                        i += 2;
77                    } else {
78                        i += 1;
79                    }
80                    line += 1;
81                    column = 0;
82                }
83
84                // 1 octet chars do not have the high bit set. If it's not a LF or CR, then it's
85                // just a regular ASCII.
86                b if b & 0b10000000 == 0 => {
87                    i += 1;
88                    column += 1;
89                }
90
91                // 2 octet chars have a leading `110` bit pattern. None are considered line
92                // terminators.
93                b if b & 0b11100000 == 0b11000000 => {
94                    // eat this byte and the next.
95                    i += 2;
96                    column += 1;
97                }
98
99                // 3 octet chars have a leading `1110` bit pattern. Both the LINE/PARAGRAPH
100                // SEPARATOR exist in 3 octets.
101                b if b & 0b11110000 == 0b11100000 => {
102                    // The LINE and PARAGRAPH have the bits `11100010 10000000 1010100X`, with the X
103                    // denoting either line or paragraph.
104                    let mut separator = false;
105                    if b == 0b11100010 && code.get(i + 1) == Some(&0b10000000) {
106                        let last = code.get(i + 2).cloned().unwrap_or_default();
107                        separator = (last & 0b11111110) == 0b10101000
108                    }
109
110                    // eat this byte and the next 2.
111                    i += 3;
112                    if separator {
113                        line += 1;
114                        column = 0;
115                    } else {
116                        column += 1;
117                    }
118                }
119
120                // 4 octet chars have a leading `11110` pattern, but we don't need to check because
121                // none of the other patterns matched.
122                _ => {
123                    // eat this byte and the next 3.
124                    i += 4;
125                    column += 1;
126                }
127            }
128        }
129        self.line = line;
130        self.column = column;
131    }
132}
133
134impl std::cmp::PartialEq<(u32, u32)> for SourcePos {
135    fn eq(&self, other: &(u32, u32)) -> bool {
136        &(self.line, self.column) == other
137    }
138}