turbopack_core/source_pos.rs
1use bincode::{Decode, Encode};
2use serde::Serialize;
3use turbo_tasks::{NonLocalValue, TaskInput, trace::TraceRawVcs};
4use turbo_tasks_hash::DeterministicHash;
5
/// Byte value of LINE FEED (`\n`, 0x0A), one of the basic JS line terminators.
const U8_LF: u8 = b'\n';
/// Byte value of CARRIAGE RETURN (`\r`, 0x0D), one of the basic JS line
/// terminators.
const U8_CR: u8 = b'\r';
10
11#[derive(
12 Default,
13 Debug,
14 PartialEq,
15 Eq,
16 Copy,
17 Clone,
18 Hash,
19 PartialOrd,
20 Ord,
21 TaskInput,
22 TraceRawVcs,
23 Serialize,
24 DeterministicHash,
25 NonLocalValue,
26 Encode,
27 Decode,
28)]
29pub struct SourcePos {
30 /// The line, 0-indexed.
31 pub line: u32,
32 /// The byte index of the column, 0-indexed.
33 pub column: u32,
34}
35
36impl SourcePos {
37 pub fn new(start_line: u32) -> Self {
38 Self {
39 line: start_line,
40 column: 0,
41 }
42 }
43
44 pub fn max() -> Self {
45 Self {
46 line: u32::MAX,
47 column: u32::MAX,
48 }
49 }
50
51 /// Increments the line/column position to account for new source code.
52 /// Line terminators are the classic "\n", "\r", "\r\n" (which counts as
53 /// a single terminator), and JSON LINE/PARAGRAPH SEPARATORs.
54 ///
55 /// See https://tc39.es/ecma262/multipage/ecmascript-language-lexical-grammar.html#sec-line-terminators
56 pub fn update(&mut self, code: &[u8]) {
57 // JS source text is interpreted as UCS-2, which is basically UTF-16 with less
58 // restrictions. We cannot iterate UTF-8 bytes here, 2-byte UTF-8 octets
59 // should count as a 1 char and not 2.
60 let &mut SourcePos {
61 mut line,
62 mut column,
63 } = self;
64
65 let mut i = 0;
66 while i < code.len() {
67 // This is not a UTF-8 validator, but it's likely close enough. It's assumed
68 // that the input is valid (and if it isn't than what are you doing trying to
69 // embed it into source code anyways?). The important part is that we process in
70 // order, and use the first octet's bit pattern to decode the octet length of
71 // the char.
72 match code[i] {
73 U8_LF => {
74 i += 1;
75 line += 1;
76 column = 0;
77 }
78 U8_CR => {
79 // Count "\r\n" as a single terminator.
80 if code.get(i + 1) == Some(&U8_LF) {
81 i += 2;
82 } else {
83 i += 1;
84 }
85 line += 1;
86 column = 0;
87 }
88
89 // 1 octet chars do not have the high bit set. If it's not a LF or CR, then it's
90 // just a regular ASCII.
91 b if b & 0b10000000 == 0 => {
92 i += 1;
93 column += 1;
94 }
95
96 // 2 octet chars have a leading `110` bit pattern. None are considered line
97 // terminators.
98 b if b & 0b11100000 == 0b11000000 => {
99 // eat this byte and the next.
100 i += 2;
101 column += 1;
102 }
103
104 // 3 octet chars have a leading `1110` bit pattern. Both the LINE/PARAGRAPH
105 // SEPARATOR exist in 3 octets.
106 b if b & 0b11110000 == 0b11100000 => {
107 // The LINE and PARAGRAPH have the bits `11100010 10000000 1010100X`, with the X
108 // denoting either line or paragraph.
109 let mut separator = false;
110 if b == 0b11100010 && code.get(i + 1) == Some(&0b10000000) {
111 let last = code.get(i + 2).cloned().unwrap_or_default();
112 separator = (last & 0b11111110) == 0b10101000
113 }
114
115 // eat this byte and the next 2.
116 i += 3;
117 if separator {
118 line += 1;
119 column = 0;
120 } else {
121 column += 1;
122 }
123 }
124
125 // 4 octet chars have a leading `11110` pattern, but we don't need to check because
126 // none of the other patterns matched.
127 _ => {
128 // eat this byte and the next 3.
129 i += 4;
130 column += 1;
131 }
132 }
133 }
134 self.line = line;
135 self.column = column;
136 }
137}
138
139impl std::cmp::PartialEq<(u32, u32)> for SourcePos {
140 fn eq(&self, other: &(u32, u32)) -> bool {
141 &(self.line, self.column) == other
142 }
143}