turbopack_core/source_pos.rs
1use serde::{Deserialize, Serialize};
2use turbo_tasks::{NonLocalValue, TaskInput, trace::TraceRawVcs};
3use turbo_tasks_hash::DeterministicHash;
4
/// Byte value of LINE FEED (LF, `\n`), one of the basic JS line terminators.
const U8_LF: u8 = b'\n';
/// Byte value of CARRIAGE RETURN (CR, `\r`), one of the basic JS line
/// terminators.
const U8_CR: u8 = b'\r';
9
/// A position within source text, expressed as a line and a column.
///
/// The derived `PartialOrd`/`Ord` compare `line` first and `column` second
/// (field declaration order), and the derived `Default` is line 0, column 0.
#[derive(
    Default,
    Debug,
    PartialEq,
    Eq,
    Copy,
    Clone,
    Hash,
    PartialOrd,
    Ord,
    TaskInput,
    TraceRawVcs,
    Serialize,
    Deserialize,
    DeterministicHash,
    NonLocalValue,
)]
pub struct SourcePos {
    /// The line, 0-indexed.
    pub line: u32,
    /// The byte index of the column, 0-indexed.
    ///
    /// NOTE(review): `SourcePos::update` advances this once per UTF-8 encoded
    /// character rather than once per byte, so "byte index" only holds for
    /// ASCII text. Confirm the intended unit.
    pub column: u32,
}
33
34impl SourcePos {
35 pub fn new(start_line: u32) -> Self {
36 Self {
37 line: start_line,
38 column: 0,
39 }
40 }
41
42 pub fn max() -> Self {
43 Self {
44 line: u32::MAX,
45 column: u32::MAX,
46 }
47 }
48
49 /// Increments the line/column position to account for new source code.
50 /// Line terminators are the classic "\n", "\r", "\r\n" (which counts as
51 /// a single terminator), and JSON LINE/PARAGRAPH SEPARATORs.
52 ///
53 /// See https://tc39.es/ecma262/multipage/ecmascript-language-lexical-grammar.html#sec-line-terminators
54 pub fn update(&mut self, code: &[u8]) {
55 // JS source text is interpreted as UCS-2, which is basically UTF-16 with less
56 // restrictions. We cannot iterate UTF-8 bytes here, 2-byte UTF-8 octets
57 // should count as a 1 char and not 2.
58 let &mut SourcePos {
59 mut line,
60 mut column,
61 } = self;
62
63 let mut i = 0;
64 while i < code.len() {
65 // This is not a UTF-8 validator, but it's likely close enough. It's assumed
66 // that the input is valid (and if it isn't than what are you doing trying to
67 // embed it into source code anyways?). The important part is that we process in
68 // order, and use the first octet's bit pattern to decode the octet length of
69 // the char.
70 match code[i] {
71 U8_LF => {
72 i += 1;
73 line += 1;
74 column = 0;
75 }
76 U8_CR => {
77 // Count "\r\n" as a single terminator.
78 if code.get(i + 1) == Some(&U8_LF) {
79 i += 2;
80 } else {
81 i += 1;
82 }
83 line += 1;
84 column = 0;
85 }
86
87 // 1 octet chars do not have the high bit set. If it's not a LF or CR, then it's
88 // just a regular ASCII.
89 b if b & 0b10000000 == 0 => {
90 i += 1;
91 column += 1;
92 }
93
94 // 2 octet chars have a leading `110` bit pattern. None are considered line
95 // terminators.
96 b if b & 0b11100000 == 0b11000000 => {
97 // eat this byte and the next.
98 i += 2;
99 column += 1;
100 }
101
102 // 3 octet chars have a leading `1110` bit pattern. Both the LINE/PARAGRAPH
103 // SEPARATOR exist in 3 octets.
104 b if b & 0b11110000 == 0b11100000 => {
105 // The LINE and PARAGRAPH have the bits `11100010 10000000 1010100X`, with the X
106 // denoting either line or paragraph.
107 let mut separator = false;
108 if b == 0b11100010 && code.get(i + 1) == Some(&0b10000000) {
109 let last = code.get(i + 2).cloned().unwrap_or_default();
110 separator = (last & 0b11111110) == 0b10101000
111 }
112
113 // eat this byte and the next 2.
114 i += 3;
115 if separator {
116 line += 1;
117 column = 0;
118 } else {
119 column += 1;
120 }
121 }
122
123 // 4 octet chars have a leading `11110` pattern, but we don't need to check because
124 // none of the other patterns matched.
125 _ => {
126 // eat this byte and the next 3.
127 i += 4;
128 column += 1;
129 }
130 }
131 }
132 self.line = line;
133 self.column = column;
134 }
135}
136
137impl std::cmp::PartialEq<(u32, u32)> for SourcePos {
138 fn eq(&self, other: &(u32, u32)) -> bool {
139 &(self.line, self.column) == other
140 }
141}