turbopack_core/
source_pos.rs

1use bincode::{Decode, Encode};
2use serde::Serialize;
3use turbo_tasks::{NonLocalValue, TaskInput, trace::TraceRawVcs};
4use turbo_tasks_hash::DeterministicHash;
5
/// Byte value of LINE FEED (LF, `\n`), one of the basic JS line terminators.
const U8_LF: u8 = b'\n';
/// Byte value of CARRIAGE RETURN (CR, `\r`), one of the basic JS line terminators.
const U8_CR: u8 = b'\r';
10
11#[derive(
12    Default,
13    Debug,
14    PartialEq,
15    Eq,
16    Copy,
17    Clone,
18    Hash,
19    PartialOrd,
20    Ord,
21    TaskInput,
22    TraceRawVcs,
23    Serialize,
24    DeterministicHash,
25    NonLocalValue,
26    Encode,
27    Decode,
28)]
29pub struct SourcePos {
30    /// The line, 0-indexed.
31    pub line: u32,
32    /// The byte index of the column, 0-indexed.
33    pub column: u32,
34}
35
36impl SourcePos {
37    pub fn new(start_line: u32) -> Self {
38        Self {
39            line: start_line,
40            column: 0,
41        }
42    }
43
44    pub fn max() -> Self {
45        Self {
46            line: u32::MAX,
47            column: u32::MAX,
48        }
49    }
50
51    /// Increments the line/column position to account for new source code.
52    /// Line terminators are the classic "\n", "\r", "\r\n" (which counts as
53    /// a single terminator), and JSON LINE/PARAGRAPH SEPARATORs.
54    ///
55    /// See https://tc39.es/ecma262/multipage/ecmascript-language-lexical-grammar.html#sec-line-terminators
56    pub fn update(&mut self, code: &[u8]) {
57        // JS source text is interpreted as UCS-2, which is basically UTF-16 with less
58        // restrictions. We cannot iterate UTF-8 bytes here, 2-byte UTF-8 octets
59        // should count as a 1 char and not 2.
60        let &mut SourcePos {
61            mut line,
62            mut column,
63        } = self;
64
65        let mut i = 0;
66        while i < code.len() {
67            // This is not a UTF-8 validator, but it's likely close enough. It's assumed
68            // that the input is valid (and if it isn't than what are you doing trying to
69            // embed it into source code anyways?). The important part is that we process in
70            // order, and use the first octet's bit pattern to decode the octet length of
71            // the char.
72            match code[i] {
73                U8_LF => {
74                    i += 1;
75                    line += 1;
76                    column = 0;
77                }
78                U8_CR => {
79                    // Count "\r\n" as a single terminator.
80                    if code.get(i + 1) == Some(&U8_LF) {
81                        i += 2;
82                    } else {
83                        i += 1;
84                    }
85                    line += 1;
86                    column = 0;
87                }
88
89                // 1 octet chars do not have the high bit set. If it's not a LF or CR, then it's
90                // just a regular ASCII.
91                b if b & 0b10000000 == 0 => {
92                    i += 1;
93                    column += 1;
94                }
95
96                // 2 octet chars have a leading `110` bit pattern. None are considered line
97                // terminators.
98                b if b & 0b11100000 == 0b11000000 => {
99                    // eat this byte and the next.
100                    i += 2;
101                    column += 1;
102                }
103
104                // 3 octet chars have a leading `1110` bit pattern. Both the LINE/PARAGRAPH
105                // SEPARATOR exist in 3 octets.
106                b if b & 0b11110000 == 0b11100000 => {
107                    // The LINE and PARAGRAPH have the bits `11100010 10000000 1010100X`, with the X
108                    // denoting either line or paragraph.
109                    let mut separator = false;
110                    if b == 0b11100010 && code.get(i + 1) == Some(&0b10000000) {
111                        let last = code.get(i + 2).cloned().unwrap_or_default();
112                        separator = (last & 0b11111110) == 0b10101000
113                    }
114
115                    // eat this byte and the next 2.
116                    i += 3;
117                    if separator {
118                        line += 1;
119                        column = 0;
120                    } else {
121                        column += 1;
122                    }
123                }
124
125                // 4 octet chars have a leading `11110` pattern, but we don't need to check because
126                // none of the other patterns matched.
127                _ => {
128                    // eat this byte and the next 3.
129                    i += 4;
130                    column += 1;
131                }
132            }
133        }
134        self.line = line;
135        self.column = column;
136    }
137}
138
139impl std::cmp::PartialEq<(u32, u32)> for SourcePos {
140    fn eq(&self, other: &(u32, u32)) -> bool {
141        &(self.line, self.column) == other
142    }
143}