turbo_esregex/
lib.rs

1#![feature(arbitrary_self_types_pointers)]
2
3use std::vec;
4
5use anyhow::{Result, bail};
6
7/// A simple regular expression implementation following ecmascript semantics
8///
9/// Delegates to the `regex` crate when possible and `regress` otherwise.
10#[derive(Debug, Clone)]
11#[turbo_tasks::value(eq = "manual", shared)]
12#[serde(into = "RegexForm", try_from = "RegexForm")]
13pub struct EsRegex {
14    #[turbo_tasks(trace_ignore)]
15    delegate: EsRegexImpl,
16    // Store the original arguments used to construct
17    // this regex to support equality and serialization.
18    pub pattern: String,
19    pub flags: String,
20}
21
22#[derive(Debug, Clone)]
23enum EsRegexImpl {
24    Regex(regex::Regex),
25    Regress(regress::Regex),
26}
27
28/// Equality uses the source inputs since our delegate regex impls don't support
29/// equality natively.
30/// NOTE: there are multiple 'equivalent' ways to write a regex and this
31/// approach does _not_ attempt to equate them.
32impl PartialEq for EsRegex {
33    fn eq(&self, other: &Self) -> bool {
34        self.pattern == other.pattern && self.flags == other.flags
35    }
36}
37impl Eq for EsRegex {}
38
39impl TryFrom<RegexForm> for EsRegex {
40    type Error = anyhow::Error;
41
42    fn try_from(value: RegexForm) -> std::result::Result<Self, Self::Error> {
43        EsRegex::new(&value.pattern, &value.flags)
44    }
45}
46
47/// This is the serializable form for the `EsRegex` struct
48#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
49struct RegexForm {
50    pattern: String,
51    flags: String,
52}
53
54impl From<EsRegex> for RegexForm {
55    fn from(value: EsRegex) -> Self {
56        Self {
57            pattern: value.pattern,
58            flags: value.flags,
59        }
60    }
61}
62
63impl EsRegex {
64    /// Support ecmascript style regular expressions by selecting the `regex` crate when possible
65    /// and using regress when not.
66    pub fn new(pattern: &str, flags: &str) -> Result<Self> {
67        // rust regex doesn't allow escaped slashes, but they are necessary in js
68        let pattern = pattern.replace("\\/", "/");
69
70        let mut applied_flags = String::new();
71        for flag in flags.chars() {
72            match flag {
73                // indices for substring matches: not relevant for the regex itself
74                'd' => {}
75                // global: default in rust, ignore
76                'g' => {}
77                // case-insensitive: letters match both upper and lower case
78                'i' => applied_flags.push('i'),
79                // multi-line mode: ^ and $ match begin/end of line
80                'm' => applied_flags.push('m'),
81                // allow . to match \n
82                's' => applied_flags.push('s'),
83                // Unicode support (enabled by default)
84                'u' => applied_flags.push('u'),
85                // sticky search: not relevant for the regex itself
86                'y' => {}
87                _ => bail!("unsupported flag `{flag}` in regex: `{pattern}` with flags: `{flags}`"),
88            }
89        }
90
91        let regex = if !applied_flags.is_empty() {
92            regex::Regex::new(&format!("(?{applied_flags}){pattern}"))
93        } else {
94            regex::Regex::new(&pattern)
95        };
96
97        let delegate = match regex {
98            Ok(reg) => Ok(EsRegexImpl::Regex(reg)),
99            Err(_e) => {
100                // We failed to parse as an regex:Regex, try using regress. Regress uses the es
101                // flags format so we can pass the original flags value.
102                match regress::Regex::with_flags(&pattern, regress::Flags::from(flags)) {
103                    Ok(reg) => Ok(EsRegexImpl::Regress(reg)),
104                    // Propagate the error as is, regress has useful error messages.
105                    Err(e) => Err(e),
106                }
107            }
108        }?;
109        Ok(Self {
110            delegate,
111            pattern,
112            flags: flags.to_string(),
113        })
114    }
115
116    /// Returns true if there is any match for this regex in the `haystack`.
117    pub fn is_match(&self, haystack: &str) -> bool {
118        match &self.delegate {
119            EsRegexImpl::Regex(r) => r.is_match(haystack),
120            EsRegexImpl::Regress(r) => r.find(haystack).is_some(),
121        }
122    }
123
124    /// Searches for the first match of the regex in the `haystack`, and iterates over the capture
125    /// groups within that first match.
126    ///
127    /// `None` is returned if there is no match. Individual capture groups may be `None` if the
128    /// capture group wasn't included in the match.
129    ///
130    /// The first capture group is always present ([`Some`]) and represents the entire match.
131    ///
132    /// Capture groups are represented as string slices of the `haystack`, and live for the lifetime
133    /// of `haystack`.
134    pub fn captures<'h>(&self, haystack: &'h str) -> Option<Captures<'h>> {
135        let delegate = match &self.delegate {
136            EsRegexImpl::Regex(r) => CapturesImpl::Regex {
137                captures: r.captures(haystack)?,
138                idx: 0,
139            },
140            EsRegexImpl::Regress(r) => {
141                let re_match = r.find(haystack)?;
142                CapturesImpl::Regress {
143                    captures_iter: re_match.captures.into_iter(),
144                    haystack,
145                    match_range: Some(re_match.range),
146                }
147            }
148        };
149        Some(Captures { delegate })
150    }
151}
152
153pub struct Captures<'h> {
154    delegate: CapturesImpl<'h>,
155}
156
157enum CapturesImpl<'h> {
158    // We have to use `regex::Captures` (which is not an iterator) here instead of
159    // `regex::SubCaptureMatches` (an iterator) because `SubCaptureMatches` must have a reference
160    // to `Capture`, and that would require a self-referential struct.
161    //
162    // Ideally, `regex::Capture` would implement `IntoIterator`, and we could use that here
163    // instead.
164    Regex {
165        captures: regex::Captures<'h>,
166        idx: usize,
167    },
168    // We can't use the iterator from `regress::Match::groups()` due to similar lifetime issues.
169    Regress {
170        captures_iter: vec::IntoIter<Option<regress::Range>>,
171        haystack: &'h str,
172        match_range: Option<regress::Range>,
173    },
174}
175
176impl<'h> Iterator for Captures<'h> {
177    type Item = Option<&'h str>;
178
179    fn next(&mut self) -> Option<Self::Item> {
180        match &mut self.delegate {
181            CapturesImpl::Regex { captures, idx } => {
182                if *idx >= captures.len() {
183                    None
184                } else {
185                    let capture = Some(captures.get(*idx).map(|sub_match| sub_match.as_str()));
186                    *idx += 1;
187                    capture
188                }
189            }
190            CapturesImpl::Regress {
191                captures_iter,
192                haystack,
193                match_range,
194            } => {
195                if let Some(range) = match_range.take() {
196                    // always yield range first
197                    Some(Some(&haystack[range]))
198                } else {
199                    Some(captures_iter.next()?.map(|range| &haystack[range]))
200                }
201            }
202        }
203    }
204}
205
#[cfg(test)]
mod tests {
    use super::{EsRegex, EsRegexImpl};

    /// A regex must compare equal to itself after a JSON round trip.
    #[test]
    fn round_trip_serialize() {
        let original = EsRegex::new("[a-z]", "i").unwrap();
        let json = serde_json::to_string(&original).unwrap();
        let restored: EsRegex = serde_json::from_str(&json).unwrap();
        assert_eq!(original, restored);
    }

    /// A plain pattern is handled by the `regex` crate.
    #[test]
    fn es_regex_matches_simple() {
        let simple = EsRegex::new("a", "").unwrap();
        assert!(matches!(simple.delegate, EsRegexImpl::Regex(_)));
        assert!(simple.is_match("a"));
    }

    /// Negative lookahead is not supported by the `regex` crate, so `regress` takes over.
    #[test]
    fn es_regex_matches_negative_lookahead() {
        let lookahead = EsRegex::new("a(?!b)", "").unwrap();
        assert!(matches!(lookahead.delegate, EsRegexImpl::Regress(_)));
        assert!(!lookahead.is_match("ab"));
        assert!(lookahead.is_match("ac"));
    }

    /// `*` has nothing to repeat and is therefore invalid. The message is not asserted on
    /// because it comes from the underlying implementations.
    #[test]
    fn invalid_regex() {
        assert!(EsRegex::new("*", "").is_err());
    }

    #[test]
    fn captures_with_regex() {
        let date = EsRegex::new(r"(notmatched)|(\d{4})-(\d{2})-(\d{2})", "").unwrap();
        assert!(matches!(date.delegate, EsRegexImpl::Regex(_)));

        let found = date.captures("Today is 2024-01-15");
        assert!(found.is_some());
        let groups: Vec<_> = found.unwrap().collect();
        // Full match, then the four groups; the 'notmatched' branch isn't taken, so its
        // group is `None`.
        assert_eq!(
            groups,
            [
                Some("2024-01-15"),
                None,
                Some("2024"),
                Some("01"),
                Some("15"),
            ]
        );
    }

    #[test]
    fn captures_with_regress() {
        let lookahead = EsRegex::new(r"(\w+)(?=baz)", "").unwrap();
        assert!(matches!(lookahead.delegate, EsRegexImpl::Regress(_)));

        assert!(lookahead.captures("foobar").is_none());

        let found = lookahead.captures("foobaz");
        assert!(found.is_some());
        let groups: Vec<_> = found.unwrap().collect();
        // Full match followed by the single captured group.
        assert_eq!(groups, [Some("foo"), Some("foo")]);
    }
}
273}