turbo_esregex/
lib.rs

1#![feature(arbitrary_self_types_pointers)]
2
3use std::vec;
4
5use anyhow::{Result, bail};
6use bincode::{
7    Decode, Encode,
8    de::Decoder,
9    enc::Encoder,
10    error::{DecodeError, EncodeError},
11    impl_borrow_decode,
12};
13
/// A simple regular expression implementation following ecmascript semantics
///
/// Delegates to the `regex` crate when possible and `regress` otherwise.
///
/// Equality and serialization are hand-written (`eq = "manual"`, `serialization = "custom"`)
/// and are based solely on `pattern` and `flags`; the compiled `delegate` takes no part in
/// either.
#[derive(Debug, Clone)]
#[turbo_tasks::value(eq = "manual", shared, serialization = "custom")]
pub struct EsRegex {
    // The compiled regex, backed by whichever engine accepted the pattern.
    #[turbo_tasks(trace_ignore)]
    delegate: EsRegexImpl,
    // The inputs used to construct this regex are kept around
    // to support equality and serialization.
    /// The pattern as it was handed to the regex engines. `EsRegex::new` rewrites escaped
    /// slashes (`\/`) to `/` before storing it, so this can differ from the string the caller
    /// passed in.
    pub pattern: String,
    /// The flags string exactly as it was passed to `EsRegex::new`.
    pub flags: String,
}
27
/// The compiled regex held by an `EsRegex`, tagged by the engine that produced it.
#[derive(Debug, Clone)]
enum EsRegexImpl {
    /// Compiled by the `regex` crate; `EsRegex::new` tries this engine first.
    Regex(regex::Regex),
    /// Compiled by `regress`; used when the `regex` crate rejects the pattern.
    Regress(regress::Regex),
}
33
34/// Equality uses the source inputs since our delegate regex impls don't support
35/// equality natively.
36/// NOTE: there are multiple 'equivalent' ways to write a regex and this
37/// approach does _not_ attempt to equate them.
38impl PartialEq for EsRegex {
39    fn eq(&self, other: &Self) -> bool {
40        self.pattern == other.pattern && self.flags == other.flags
41    }
42}
43impl Eq for EsRegex {}
44
45impl Encode for EsRegex {
46    fn encode<E: Encoder>(&self, encoder: &mut E) -> Result<(), EncodeError> {
47        self.pattern.encode(encoder)?;
48        self.flags.encode(encoder)?;
49        Ok(())
50    }
51}
52
53impl<Context> Decode<Context> for EsRegex {
54    fn decode<D: Decoder<Context = Context>>(decoder: &mut D) -> Result<Self, DecodeError> {
55        let pattern: String = Decode::decode(decoder)?;
56        let flags: String = Decode::decode(decoder)?;
57        // TODO: perf: there's cloning happening here, we should be able to just move the `String`
58        EsRegex::new(&pattern, &flags).map_err(|err| DecodeError::OtherString(err.to_string()))
59    }
60}
61
62impl_borrow_decode!(EsRegex);
63
64impl EsRegex {
65    /// Support ecmascript style regular expressions by selecting the `regex` crate when possible
66    /// and using regress when not.
67    pub fn new(pattern: &str, flags: &str) -> Result<Self> {
68        // rust regex doesn't allow escaped slashes, but they are necessary in js
69        let pattern = pattern.replace("\\/", "/");
70
71        let mut applied_flags = String::new();
72        for flag in flags.chars() {
73            match flag {
74                // indices for substring matches: not relevant for the regex itself
75                'd' => {}
76                // global: default in rust, ignore
77                'g' => {}
78                // case-insensitive: letters match both upper and lower case
79                'i' => applied_flags.push('i'),
80                // multi-line mode: ^ and $ match begin/end of line
81                'm' => applied_flags.push('m'),
82                // allow . to match \n
83                's' => applied_flags.push('s'),
84                // Unicode support (enabled by default)
85                'u' => applied_flags.push('u'),
86                // sticky search: not relevant for the regex itself
87                'y' => {}
88                _ => bail!("unsupported flag `{flag}` in regex: `{pattern}` with flags: `{flags}`"),
89            }
90        }
91
92        let regex = if !applied_flags.is_empty() {
93            regex::Regex::new(&format!("(?{applied_flags}){pattern}"))
94        } else {
95            regex::Regex::new(&pattern)
96        };
97
98        let delegate = match regex {
99            Ok(reg) => Ok(EsRegexImpl::Regex(reg)),
100            Err(_e) => {
101                // We failed to parse as an regex:Regex, try using regress. Regress uses the es
102                // flags format so we can pass the original flags value.
103                match regress::Regex::with_flags(&pattern, regress::Flags::from(flags)) {
104                    Ok(reg) => Ok(EsRegexImpl::Regress(reg)),
105                    // Propagate the error as is, regress has useful error messages.
106                    Err(e) => Err(e),
107                }
108            }
109        }?;
110        Ok(Self {
111            delegate,
112            pattern,
113            flags: flags.to_string(),
114        })
115    }
116
117    /// Returns true if there is any match for this regex in the `haystack`.
118    pub fn is_match(&self, haystack: &str) -> bool {
119        match &self.delegate {
120            EsRegexImpl::Regex(r) => r.is_match(haystack),
121            EsRegexImpl::Regress(r) => r.find(haystack).is_some(),
122        }
123    }
124
125    /// Searches for the first match of the regex in the `haystack`, and iterates over the capture
126    /// groups within that first match.
127    ///
128    /// `None` is returned if there is no match. Individual capture groups may be `None` if the
129    /// capture group wasn't included in the match.
130    ///
131    /// The first capture group is always present ([`Some`]) and represents the entire match.
132    ///
133    /// Capture groups are represented as string slices of the `haystack`, and live for the lifetime
134    /// of `haystack`.
135    pub fn captures<'h>(&self, haystack: &'h str) -> Option<Captures<'h>> {
136        let delegate = match &self.delegate {
137            EsRegexImpl::Regex(r) => CapturesImpl::Regex {
138                captures: r.captures(haystack)?,
139                idx: 0,
140            },
141            EsRegexImpl::Regress(r) => {
142                let re_match = r.find(haystack)?;
143                CapturesImpl::Regress {
144                    captures_iter: re_match.captures.into_iter(),
145                    haystack,
146                    match_range: Some(re_match.range),
147                }
148            }
149        };
150        Some(Captures { delegate })
151    }
152}
153
/// Iterator over the capture groups of a single match, as returned by `EsRegex::captures`.
///
/// Each item is an `Option<&str>` slice of the haystack: the entire match comes first, followed
/// by the capture groups. A group yields `None` when it did not take part in the match.
pub struct Captures<'h> {
    // Engine-specific iteration state; which variant is used depends on the engine that
    // compiled the regex.
    delegate: CapturesImpl<'h>,
}
157
/// Engine-specific state backing the `Captures` iterator.
enum CapturesImpl<'h> {
    // We have to use `regex::Captures` (which is not an iterator) here instead of
    // `regex::SubCaptureMatches` (an iterator) because `SubCaptureMatches` must have a reference
    // to `Captures`, and that would require a self-referential struct.
    //
    // Ideally, `regex::Captures` would implement `IntoIterator`, and we could use that here
    // instead.
    Regex {
        // The captures of the match, looked up by index on each call to `next`.
        captures: regex::Captures<'h>,
        // Index of the next group to yield; starts at 0 and is bumped after every item.
        idx: usize,
    },
    // We can't use the iterator from `regress::Match::groups()` due to similar lifetime issues.
    Regress {
        // Ranges of the capture groups, taken from the `captures` field of the regress match.
        captures_iter: vec::IntoIter<Option<regress::Range>>,
        // The searched string; the ranges are turned into slices of it.
        haystack: &'h str,
        // Range of the entire match. It is yielded first and then left as `None`.
        match_range: Option<regress::Range>,
    },
}
176
177impl<'h> Iterator for Captures<'h> {
178    type Item = Option<&'h str>;
179
180    fn next(&mut self) -> Option<Self::Item> {
181        match &mut self.delegate {
182            CapturesImpl::Regex { captures, idx } => {
183                if *idx >= captures.len() {
184                    None
185                } else {
186                    let capture = Some(captures.get(*idx).map(|sub_match| sub_match.as_str()));
187                    *idx += 1;
188                    capture
189                }
190            }
191            CapturesImpl::Regress {
192                captures_iter,
193                haystack,
194                match_range,
195            } => {
196                if let Some(range) = match_range.take() {
197                    // always yield range first
198                    Some(Some(&haystack[range]))
199                } else {
200                    Some(captures_iter.next()?.map(|range| &haystack[range]))
201                }
202            }
203        }
204    }
205}
206
#[cfg(test)]
mod tests {
    use super::{EsRegex, EsRegexImpl};

    /// Encoding then decoding gives back an equal regex and consumes every encoded byte.
    #[test]
    fn round_trip_bincode() {
        let config = bincode::config::standard();
        let original = EsRegex::new("[a-z]", "i").unwrap();
        let bytes = bincode::encode_to_vec(&original, config).unwrap();
        let (restored, consumed) =
            bincode::decode_from_slice::<EsRegex, _>(&bytes, config).unwrap();
        assert_eq!(original, restored);
        assert_eq!(consumed, bytes.len());
    }

    /// A plain pattern is handled by the `regex` crate.
    #[test]
    fn es_regex_matches_simple() {
        let regex = EsRegex::new("a", "").unwrap();
        assert!(matches!(regex.delegate, EsRegexImpl::Regex(_)));
        assert!(regex.is_match("a"));
    }

    /// Negative lookahead is not supported by the regex crate, so `regress` takes over.
    #[test]
    fn es_regex_matches_negative_lookahead() {
        let regex = EsRegex::new("a(?!b)", "").unwrap();
        assert!(matches!(regex.delegate, EsRegexImpl::Regress(_)));
        assert!(regex.is_match("ac"));
        assert!(!regex.is_match("ab"));
    }

    /// `*` is invalid since there is nothing being repeated. The message is not asserted on
    /// because it comes from the underlying implementations.
    #[test]
    fn invalid_regex() {
        assert!(EsRegex::new("*", "").is_err());
    }

    #[test]
    fn captures_with_regex() {
        let regex = EsRegex::new(r"(notmatched)|(\d{4})-(\d{2})-(\d{2})", "").unwrap();
        assert!(matches!(regex.delegate, EsRegexImpl::Regex(_)));

        let caps: Vec<_> = regex
            .captures("Today is 2024-01-15")
            .expect("the date should match")
            .collect();
        // full match + 4 groups; the 'notmatched' branch isn't taken, so its group is `None`
        assert_eq!(
            caps,
            [
                Some("2024-01-15"), // full match
                None,               // 'notmatched'
                Some("2024"),       // year
                Some("01"),         // month
                Some("15"),         // day
            ]
        );
    }

    #[test]
    fn captures_with_regress() {
        let regex = EsRegex::new(r"(\w+)(?=baz)", "").unwrap();
        assert!(matches!(regex.delegate, EsRegexImpl::Regress(_)));

        assert!(regex.captures("foobar").is_none());

        let caps: Vec<_> = regex
            .captures("foobaz")
            .expect("the lookahead should be satisfied")
            .collect();
        // full match + 1 group
        assert_eq!(caps, [Some("foo"), Some("foo")]);
    }
}