Skip to main content

turbopack_core/
source_pos.rs

1use bincode::{Decode, Encode};
2use serde::Serialize;
3use turbo_tasks::trace::TraceRawVcs;
4use turbo_tasks_hash::DeterministicHash;
5
/// Byte value of LINE FEED (`\n`), one of the basic JS line terminators.
const U8_LF: u8 = b'\n';
/// Byte value of CARRIAGE RETURN (`\r`), one of the basic JS line terminators.
const U8_CR: u8 = b'\r';
10
#[turbo_tasks::task_input]
#[derive(
    Default,
    Debug,
    PartialEq,
    Eq,
    Copy,
    Clone,
    Hash,
    PartialOrd,
    Ord,
    TraceRawVcs,
    Serialize,
    DeterministicHash,
    Encode,
    Decode,
)]
/// A line/column position within source text.
///
/// The derived ordering compares `line` first and `column` second (field
/// declaration order), and the derived `Default` is line 0, column 0.
pub struct SourcePos {
    /// The line, 0-indexed.
    pub line: u32,
    /// The byte index of the column, 0-indexed.
    ///
    /// NOTE(review): `SourcePos::update` advances this by one per decoded
    /// UTF-8 character (regardless of how many bytes the character occupies),
    /// not by one per byte — confirm which unit is intended.
    pub column: u32,
}
34
35impl SourcePos {
36    pub fn new(start_line: u32) -> Self {
37        Self {
38            line: start_line,
39            column: 0,
40        }
41    }
42
43    pub fn max() -> Self {
44        Self {
45            line: u32::MAX,
46            column: u32::MAX,
47        }
48    }
49
50    /// Increments the line/column position to account for new source code.
51    /// Line terminators are the classic "\n", "\r", "\r\n" (which counts as
52    /// a single terminator), and JSON LINE/PARAGRAPH SEPARATORs.
53    ///
54    /// See <https://tc39.es/ecma262/multipage/ecmascript-language-lexical-grammar.html#sec-line-terminators>
55    pub fn update(&mut self, code: &[u8]) {
56        // JS source text is interpreted as UCS-2, which is basically UTF-16 with less
57        // restrictions. We cannot iterate UTF-8 bytes here, 2-byte UTF-8 octets
58        // should count as a 1 char and not 2.
59        let &mut SourcePos {
60            mut line,
61            mut column,
62        } = self;
63
64        let mut i = 0;
65        while i < code.len() {
66            // This is not a UTF-8 validator, but it's likely close enough. It's assumed
67            // that the input is valid (and if it isn't than what are you doing trying to
68            // embed it into source code anyways?). The important part is that we process in
69            // order, and use the first octet's bit pattern to decode the octet length of
70            // the char.
71            match code[i] {
72                U8_LF => {
73                    i += 1;
74                    line += 1;
75                    column = 0;
76                }
77                U8_CR => {
78                    // Count "\r\n" as a single terminator.
79                    if code.get(i + 1) == Some(&U8_LF) {
80                        i += 2;
81                    } else {
82                        i += 1;
83                    }
84                    line += 1;
85                    column = 0;
86                }
87
88                // 1 octet chars do not have the high bit set. If it's not a LF or CR, then it's
89                // just a regular ASCII.
90                b if b & 0b10000000 == 0 => {
91                    i += 1;
92                    column += 1;
93                }
94
95                // 2 octet chars have a leading `110` bit pattern. None are considered line
96                // terminators.
97                b if b & 0b11100000 == 0b11000000 => {
98                    // eat this byte and the next.
99                    i += 2;
100                    column += 1;
101                }
102
103                // 3 octet chars have a leading `1110` bit pattern. Both the LINE/PARAGRAPH
104                // SEPARATOR exist in 3 octets.
105                b if b & 0b11110000 == 0b11100000 => {
106                    // The LINE and PARAGRAPH have the bits `11100010 10000000 1010100X`, with the X
107                    // denoting either line or paragraph.
108                    let mut separator = false;
109                    if b == 0b11100010 && code.get(i + 1) == Some(&0b10000000) {
110                        let last = code.get(i + 2).cloned().unwrap_or_default();
111                        separator = (last & 0b11111110) == 0b10101000
112                    }
113
114                    // eat this byte and the next 2.
115                    i += 3;
116                    if separator {
117                        line += 1;
118                        column = 0;
119                    } else {
120                        column += 1;
121                    }
122                }
123
124                // 4 octet chars have a leading `11110` pattern, but we don't need to check because
125                // none of the other patterns matched.
126                _ => {
127                    // eat this byte and the next 3.
128                    i += 4;
129                    column += 1;
130                }
131            }
132        }
133        self.line = line;
134        self.column = column;
135    }
136}
137
138impl std::cmp::PartialEq<(u32, u32)> for SourcePos {
139    fn eq(&self, other: &(u32, u32)) -> bool {
140        &(self.line, self.column) == other
141    }
142}