turbopack_core/source_pos.rs
1use bincode::{Decode, Encode};
2use serde::Serialize;
3use turbo_tasks::trace::TraceRawVcs;
4use turbo_tasks_hash::DeterministicHash;
5
/// Byte value of LINE FEED (`\n`), one of the basic JS line terminators.
const U8_LF: u8 = b'\n';
/// Byte value of CARRIAGE RETURN (`\r`), one of the basic JS line terminators.
const U8_CR: u8 = b'\r';
10
11#[turbo_tasks::task_input]
12#[derive(
13 Default,
14 Debug,
15 PartialEq,
16 Eq,
17 Copy,
18 Clone,
19 Hash,
20 PartialOrd,
21 Ord,
22 TraceRawVcs,
23 Serialize,
24 DeterministicHash,
25 Encode,
26 Decode,
27)]
28pub struct SourcePos {
29 /// The line, 0-indexed.
30 pub line: u32,
31 /// The byte index of the column, 0-indexed.
32 pub column: u32,
33}
34
35impl SourcePos {
36 pub fn new(start_line: u32) -> Self {
37 Self {
38 line: start_line,
39 column: 0,
40 }
41 }
42
43 pub fn max() -> Self {
44 Self {
45 line: u32::MAX,
46 column: u32::MAX,
47 }
48 }
49
50 /// Increments the line/column position to account for new source code.
51 /// Line terminators are the classic "\n", "\r", "\r\n" (which counts as
52 /// a single terminator), and JSON LINE/PARAGRAPH SEPARATORs.
53 ///
54 /// See <https://tc39.es/ecma262/multipage/ecmascript-language-lexical-grammar.html#sec-line-terminators>
55 pub fn update(&mut self, code: &[u8]) {
56 // JS source text is interpreted as UCS-2, which is basically UTF-16 with less
57 // restrictions. We cannot iterate UTF-8 bytes here, 2-byte UTF-8 octets
58 // should count as a 1 char and not 2.
59 let &mut SourcePos {
60 mut line,
61 mut column,
62 } = self;
63
64 let mut i = 0;
65 while i < code.len() {
66 // This is not a UTF-8 validator, but it's likely close enough. It's assumed
67 // that the input is valid (and if it isn't than what are you doing trying to
68 // embed it into source code anyways?). The important part is that we process in
69 // order, and use the first octet's bit pattern to decode the octet length of
70 // the char.
71 match code[i] {
72 U8_LF => {
73 i += 1;
74 line += 1;
75 column = 0;
76 }
77 U8_CR => {
78 // Count "\r\n" as a single terminator.
79 if code.get(i + 1) == Some(&U8_LF) {
80 i += 2;
81 } else {
82 i += 1;
83 }
84 line += 1;
85 column = 0;
86 }
87
88 // 1 octet chars do not have the high bit set. If it's not a LF or CR, then it's
89 // just a regular ASCII.
90 b if b & 0b10000000 == 0 => {
91 i += 1;
92 column += 1;
93 }
94
95 // 2 octet chars have a leading `110` bit pattern. None are considered line
96 // terminators.
97 b if b & 0b11100000 == 0b11000000 => {
98 // eat this byte and the next.
99 i += 2;
100 column += 1;
101 }
102
103 // 3 octet chars have a leading `1110` bit pattern. Both the LINE/PARAGRAPH
104 // SEPARATOR exist in 3 octets.
105 b if b & 0b11110000 == 0b11100000 => {
106 // The LINE and PARAGRAPH have the bits `11100010 10000000 1010100X`, with the X
107 // denoting either line or paragraph.
108 let mut separator = false;
109 if b == 0b11100010 && code.get(i + 1) == Some(&0b10000000) {
110 let last = code.get(i + 2).cloned().unwrap_or_default();
111 separator = (last & 0b11111110) == 0b10101000
112 }
113
114 // eat this byte and the next 2.
115 i += 3;
116 if separator {
117 line += 1;
118 column = 0;
119 } else {
120 column += 1;
121 }
122 }
123
124 // 4 octet chars have a leading `11110` pattern, but we don't need to check because
125 // none of the other patterns matched.
126 _ => {
127 // eat this byte and the next 3.
128 i += 4;
129 column += 1;
130 }
131 }
132 }
133 self.line = line;
134 self.column = column;
135 }
136}
137
138impl std::cmp::PartialEq<(u32, u32)> for SourcePos {
139 fn eq(&self, other: &(u32, u32)) -> bool {
140 &(self.line, self.column) == other
141 }
142}