turbo_esregex/
lib.rs

1#![feature(arbitrary_self_types_pointers)]
2
3use std::vec;
4
5use anyhow::{Result, bail};
6
/// Registers this crate's items.
///
/// Calls `turbo_tasks::register()` first and then expands the `register.rs` file found in the
/// build's `OUT_DIR`.
pub fn register() {
    turbo_tasks::register();
    // NOTE(review): `register.rs` is presumably emitted by this crate's build script — confirm.
    // Its contents are spliced in here verbatim, so they run after the call above.
    include!(concat!(env!("OUT_DIR"), "/register.rs"));
}
11
/// A simple regular expression implementation following ecmascript semantics
///
/// Delegates to the `regex` crate when possible and `regress` otherwise.
///
/// Equality and serialization only look at `pattern` and `flags`; serde converts through
/// `RegexForm`, so the compiled delegate is rebuilt by `EsRegex::new` when deserializing.
#[derive(Debug, Clone)]
#[turbo_tasks::value(eq = "manual", shared)]
#[serde(into = "RegexForm", try_from = "RegexForm")]
pub struct EsRegex {
    /// The compiled regex, backed by either the `regex` crate or `regress`.
    #[turbo_tasks(trace_ignore)]
    delegate: EsRegexImpl,
    // Store the arguments used to construct this regex to support equality and serialization.
    /// The pattern the delegate was compiled from. `EsRegex::new` rewrites escaped slashes
    /// (`\/`) to `/` before storing it, so it can differ from the string the caller passed in.
    pub pattern: String,
    /// The flags string exactly as it was passed to `EsRegex::new`.
    pub flags: String,
}
26
/// The engine that backs an `EsRegex`.
#[derive(Debug, Clone)]
enum EsRegexImpl {
    /// Compiled by the `regex` crate; `EsRegex::new` tries this engine first.
    Regex(regex::Regex),
    /// Compiled by `regress`; used when the `regex` crate rejects the pattern.
    Regress(regress::Regex),
}
32
33/// Equality uses the source inputs since our delegate regex impls don't support
34/// equality natively.
35/// NOTE: there are multiple 'equivalent' ways to write a regex and this
36/// approach does _not_ attempt to equate them.
37impl PartialEq for EsRegex {
38    fn eq(&self, other: &Self) -> bool {
39        self.pattern == other.pattern && self.flags == other.flags
40    }
41}
42impl Eq for EsRegex {}
43
44impl TryFrom<RegexForm> for EsRegex {
45    type Error = anyhow::Error;
46
47    fn try_from(value: RegexForm) -> std::result::Result<Self, Self::Error> {
48        EsRegex::new(&value.pattern, &value.flags)
49    }
50}
51
/// This is the serializable form for the `EsRegex` struct
///
/// It carries only the two strings an `EsRegex` stores. Converting back goes through
/// `EsRegex::new`, so the regex is recompiled (and the conversion can fail) on deserialization.
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
struct RegexForm {
    /// The `pattern` field of the `EsRegex` this form was created from.
    pattern: String,
    /// The `flags` field of the `EsRegex` this form was created from.
    flags: String,
}
58
59impl From<EsRegex> for RegexForm {
60    fn from(value: EsRegex) -> Self {
61        Self {
62            pattern: value.pattern,
63            flags: value.flags,
64        }
65    }
66}
67
68impl EsRegex {
69    /// Support ecmascript style regular expressions by selecting the `regex` crate when possible
70    /// and using regress when not.
71    pub fn new(pattern: &str, flags: &str) -> Result<Self> {
72        // rust regex doesn't allow escaped slashes, but they are necessary in js
73        let pattern = pattern.replace("\\/", "/");
74
75        let mut applied_flags = String::new();
76        for flag in flags.chars() {
77            match flag {
78                // indices for substring matches: not relevant for the regex itself
79                'd' => {}
80                // global: default in rust, ignore
81                'g' => {}
82                // case-insensitive: letters match both upper and lower case
83                'i' => applied_flags.push('i'),
84                // multi-line mode: ^ and $ match begin/end of line
85                'm' => applied_flags.push('m'),
86                // allow . to match \n
87                's' => applied_flags.push('s'),
88                // Unicode support (enabled by default)
89                'u' => applied_flags.push('u'),
90                // sticky search: not relevant for the regex itself
91                'y' => {}
92                _ => bail!("unsupported flag `{flag}` in regex: `{pattern}` with flags: `{flags}`"),
93            }
94        }
95
96        let regex = if !applied_flags.is_empty() {
97            regex::Regex::new(&format!("(?{applied_flags}){pattern}"))
98        } else {
99            regex::Regex::new(&pattern)
100        };
101
102        let delegate = match regex {
103            Ok(reg) => Ok(EsRegexImpl::Regex(reg)),
104            Err(_e) => {
105                // We failed to parse as an regex:Regex, try using regress. Regress uses the es
106                // flags format so we can pass the original flags value.
107                match regress::Regex::with_flags(&pattern, regress::Flags::from(flags)) {
108                    Ok(reg) => Ok(EsRegexImpl::Regress(reg)),
109                    // Propagate the error as is, regress has useful error messages.
110                    Err(e) => Err(e),
111                }
112            }
113        }?;
114        Ok(Self {
115            delegate,
116            pattern,
117            flags: flags.to_string(),
118        })
119    }
120
121    /// Returns true if there is any match for this regex in the `haystack`.
122    pub fn is_match(&self, haystack: &str) -> bool {
123        match &self.delegate {
124            EsRegexImpl::Regex(r) => r.is_match(haystack),
125            EsRegexImpl::Regress(r) => r.find(haystack).is_some(),
126        }
127    }
128
129    /// Searches for the first match of the regex in the `haystack`, and iterates over the capture
130    /// groups within that first match.
131    ///
132    /// `None` is returned if there is no match. Individual capture groups may be `None` if the
133    /// capture group wasn't included in the match.
134    ///
135    /// The first capture group is always present ([`Some`]) and represents the entire match.
136    ///
137    /// Capture groups are represented as string slices of the `haystack`, and live for the lifetime
138    /// of `haystack`.
139    pub fn captures<'h>(&self, haystack: &'h str) -> Option<Captures<'h>> {
140        let delegate = match &self.delegate {
141            EsRegexImpl::Regex(r) => CapturesImpl::Regex {
142                captures: r.captures(haystack)?,
143                idx: 0,
144            },
145            EsRegexImpl::Regress(r) => {
146                let re_match = r.find(haystack)?;
147                CapturesImpl::Regress {
148                    captures_iter: re_match.captures.into_iter(),
149                    haystack,
150                    match_range: Some(re_match.range),
151                }
152            }
153        };
154        Some(Captures { delegate })
155    }
156}
157
/// Iterator over the capture groups of a single match, as returned by `EsRegex::captures`.
///
/// The entire match is yielded first, followed by the capture groups. A group that was not
/// part of the match is yielded as `None`. All slices borrow from the haystack (`'h`).
pub struct Captures<'h> {
    // Engine-specific iteration state.
    delegate: CapturesImpl<'h>,
}
161
/// Engine-specific iteration state behind `Captures`.
enum CapturesImpl<'h> {
    // We have to use `regex::Captures` (which is not an iterator) here instead of
    // `regex::SubCaptureMatches` (an iterator) because `SubCaptureMatches` must have a reference
    // to `Captures`, and that would require a self-referential struct.
    //
    // Ideally, `regex::Captures` would implement `IntoIterator`, and we could use that here
    // instead.
    Regex {
        /// The capture groups of the match.
        captures: regex::Captures<'h>,
        /// Index of the next group to yield; iteration ends once it reaches `captures.len()`.
        idx: usize,
    },
    // We can't use the iterator from `regress::Match::groups()` due to similar lifetime issues.
    Regress {
        /// Ranges of the capture groups, yielded after the whole match.
        captures_iter: vec::IntoIter<Option<regress::Range>>,
        /// The text that was searched; the ranges index into it.
        haystack: &'h str,
        /// Range of the whole match. Taken (left as `None`) once it has been yielded.
        match_range: Option<regress::Range>,
    },
}
180
181impl<'h> Iterator for Captures<'h> {
182    type Item = Option<&'h str>;
183
184    fn next(&mut self) -> Option<Self::Item> {
185        match &mut self.delegate {
186            CapturesImpl::Regex { captures, idx } => {
187                if *idx >= captures.len() {
188                    None
189                } else {
190                    let capture = Some(captures.get(*idx).map(|sub_match| sub_match.as_str()));
191                    *idx += 1;
192                    capture
193                }
194            }
195            CapturesImpl::Regress {
196                captures_iter,
197                haystack,
198                match_range,
199            } => {
200                if let Some(range) = match_range.take() {
201                    // always yield range first
202                    Some(Some(&haystack[range]))
203                } else {
204                    Some(captures_iter.next()?.map(|range| &haystack[range]))
205                }
206            }
207        }
208    }
209}
210
#[cfg(test)]
mod tests {
    use super::{EsRegex, EsRegexImpl};

    /// A regex survives a JSON round trip and compares equal to the original.
    #[test]
    fn round_trip_serialize() {
        let original = EsRegex::new("[a-z]", "i").unwrap();
        let json = serde_json::to_string(&original).unwrap();
        let restored: EsRegex = serde_json::from_str(&json).unwrap();
        assert_eq!(original, restored);
    }

    /// A trivial pattern is handled by the `regex` crate.
    #[test]
    fn es_regex_matches_simple() {
        let regex = EsRegex::new("a", "").unwrap();
        assert!(matches!(regex.delegate, EsRegexImpl::Regex(_)));
        assert!(regex.is_match("a"));
    }

    /// Negative lookahead is not supported by the `regex` crate, so `regress` takes over.
    #[test]
    fn es_regex_matches_negative_lookahead() {
        let regex = EsRegex::new("a(?!b)", "").unwrap();
        assert!(matches!(regex.delegate, EsRegexImpl::Regress(_)));
        assert!(!regex.is_match("ab"));
        assert!(regex.is_match("ac"));
    }

    /// `*` has nothing to repeat, so both engines reject it. The message is not asserted
    /// because it comes from the underlying implementations.
    #[test]
    fn invalid_regex() {
        assert!(EsRegex::new("*", "").is_err());
    }

    /// Captures from the `regex` crate: whole match first, unmatched groups as `None`.
    #[test]
    fn captures_with_regex() {
        let regex = EsRegex::new(r"(notmatched)|(\d{4})-(\d{2})-(\d{2})", "").unwrap();
        assert!(matches!(regex.delegate, EsRegexImpl::Regex(_)));

        let caps: Vec<_> = regex
            .captures("Today is 2024-01-15")
            .expect("the date should match")
            .collect();
        // full match, the untaken 'notmatched' branch, then year / month / day
        assert_eq!(
            caps,
            [
                Some("2024-01-15"),
                None,
                Some("2024"),
                Some("01"),
                Some("15"),
            ]
        );
    }

    /// Captures from `regress`: whole match first, then the single group.
    #[test]
    fn captures_with_regress() {
        let regex = EsRegex::new(r"(\w+)(?=baz)", "").unwrap();
        assert!(matches!(regex.delegate, EsRegexImpl::Regress(_)));

        assert!(regex.captures("foobar").is_none());

        let caps: Vec<_> = regex
            .captures("foobaz")
            .expect("the lookahead should be satisfied")
            .collect();
        // full match + 1 group
        assert_eq!(caps, [Some("foo"), Some("foo")]);
    }
}