turbopack_core/source_pos.rs
1use serde::{Deserialize, Serialize};
2use turbo_tasks::{NonLocalValue, TaskInput, trace::TraceRawVcs};
3use turbo_tasks_hash::DeterministicHash;
4
/// Byte value of LINE FEED (LF, `\n`), one of the basic JS line terminators.
const U8_LF: u8 = b'\n';
/// Byte value of CARRIAGE RETURN (CR, `\r`), one of the basic JS line terminators.
const U8_CR: u8 = b'\r';
9
10#[derive(
11 Default,
12 Debug,
13 PartialEq,
14 Eq,
15 Copy,
16 Clone,
17 Hash,
18 PartialOrd,
19 Ord,
20 TaskInput,
21 TraceRawVcs,
22 Serialize,
23 Deserialize,
24 DeterministicHash,
25 NonLocalValue,
26)]
27pub struct SourcePos {
28 /// The line, 0-indexed.
29 pub line: u32,
30 /// The byte index of the column, 0-indexed.
31 pub column: u32,
32}
33
34impl SourcePos {
35 pub fn new() -> Self {
36 Default::default()
37 }
38
39 pub fn max() -> Self {
40 Self {
41 line: u32::MAX,
42 column: u32::MAX,
43 }
44 }
45
46 /// Increments the line/column position to account for new source code.
47 /// Line terminators are the classic "\n", "\r", "\r\n" (which counts as
48 /// a single terminator), and JSON LINE/PARAGRAPH SEPARATORs.
49 ///
50 /// See https://tc39.es/ecma262/multipage/ecmascript-language-lexical-grammar.html#sec-line-terminators
51 pub fn update(&mut self, code: &[u8]) {
52 // JS source text is interpreted as UCS-2, which is basically UTF-16 with less
53 // restrictions. We cannot iterate UTF-8 bytes here, 2-byte UTF-8 octets
54 // should count as a 1 char and not 2.
55 let &mut SourcePos {
56 mut line,
57 mut column,
58 } = self;
59
60 let mut i = 0;
61 while i < code.len() {
62 // This is not a UTF-8 validator, but it's likely close enough. It's assumed
63 // that the input is valid (and if it isn't than what are you doing trying to
64 // embed it into source code anyways?). The important part is that we process in
65 // order, and use the first octet's bit pattern to decode the octet length of
66 // the char.
67 match code[i] {
68 U8_LF => {
69 i += 1;
70 line += 1;
71 column = 0;
72 }
73 U8_CR => {
74 // Count "\r\n" as a single terminator.
75 if code.get(i + 1) == Some(&U8_LF) {
76 i += 2;
77 } else {
78 i += 1;
79 }
80 line += 1;
81 column = 0;
82 }
83
84 // 1 octet chars do not have the high bit set. If it's not a LF or CR, then it's
85 // just a regular ASCII.
86 b if b & 0b10000000 == 0 => {
87 i += 1;
88 column += 1;
89 }
90
91 // 2 octet chars have a leading `110` bit pattern. None are considered line
92 // terminators.
93 b if b & 0b11100000 == 0b11000000 => {
94 // eat this byte and the next.
95 i += 2;
96 column += 1;
97 }
98
99 // 3 octet chars have a leading `1110` bit pattern. Both the LINE/PARAGRAPH
100 // SEPARATOR exist in 3 octets.
101 b if b & 0b11110000 == 0b11100000 => {
102 // The LINE and PARAGRAPH have the bits `11100010 10000000 1010100X`, with the X
103 // denoting either line or paragraph.
104 let mut separator = false;
105 if b == 0b11100010 && code.get(i + 1) == Some(&0b10000000) {
106 let last = code.get(i + 2).cloned().unwrap_or_default();
107 separator = (last & 0b11111110) == 0b10101000
108 }
109
110 // eat this byte and the next 2.
111 i += 3;
112 if separator {
113 line += 1;
114 column = 0;
115 } else {
116 column += 1;
117 }
118 }
119
120 // 4 octet chars have a leading `11110` pattern, but we don't need to check because
121 // none of the other patterns matched.
122 _ => {
123 // eat this byte and the next 3.
124 i += 4;
125 column += 1;
126 }
127 }
128 }
129 self.line = line;
130 self.column = column;
131 }
132}
133
134impl std::cmp::PartialEq<(u32, u32)> for SourcePos {
135 fn eq(&self, other: &(u32, u32)) -> bool {
136 &(self.line, self.column) == other
137 }
138}