turbo_esregex/
lib.rs

1#![feature(arbitrary_self_types_pointers)]
2
3use anyhow::{Result, bail};
4
5pub fn register() {
6    turbo_tasks::register();
7    include!(concat!(env!("OUT_DIR"), "/register.rs"));
8}
9
10/// A simple regular expression implementation following ecmascript semantics
11///
12/// Delegates to the `regex` crate when possible and `regress` otherwise.
13#[derive(Debug, Clone)]
14#[turbo_tasks::value(eq = "manual", shared)]
15#[serde(into = "RegexForm", try_from = "RegexForm")]
16pub struct EsRegex {
17    #[turbo_tasks(trace_ignore)]
18    delegate: EsRegexImpl,
19    // Store the original arguments used to construct
20    // this regex to support equality and serialization.
21    pub pattern: String,
22    pub flags: String,
23}
24
25#[derive(Debug, Clone)]
26enum EsRegexImpl {
27    Regex(regex::Regex),
28    Regress(regress::Regex),
29}
30
31/// Equality uses the source inputs since our delegate regex impls don't support
32/// equality natively.
33/// NOTE: there are multiple 'equivalent' ways to write a regex and this
34/// approach does _not_ attempt to equate them.
35impl PartialEq for EsRegex {
36    fn eq(&self, other: &Self) -> bool {
37        self.pattern == other.pattern && self.flags == other.flags
38    }
39}
40impl Eq for EsRegex {}
41
42impl TryFrom<RegexForm> for EsRegex {
43    type Error = anyhow::Error;
44
45    fn try_from(value: RegexForm) -> std::result::Result<Self, Self::Error> {
46        EsRegex::new(&value.pattern, &value.flags)
47    }
48}
49
50/// This is the serializable form for the `EsRegex` struct
51#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
52struct RegexForm {
53    pattern: String,
54    flags: String,
55}
56
57impl From<EsRegex> for RegexForm {
58    fn from(value: EsRegex) -> Self {
59        Self {
60            pattern: value.pattern,
61            flags: value.flags,
62        }
63    }
64}
65
66impl EsRegex {
67    /// Support ecmascript style regular expressions by selecting the `regex` crate when possible
68    /// and using regress when not.
69    pub fn new(pattern: &str, flags: &str) -> Result<Self> {
70        // rust regex doesn't allow escaped slashes, but they are necessary in js
71        let pattern = pattern.replace("\\/", "/");
72
73        let mut applied_flags = String::new();
74        for flag in flags.chars() {
75            match flag {
76                // indices for substring matches: not relevant for the regex itself
77                'd' => {}
78                // global: default in rust, ignore
79                'g' => {}
80                // case-insensitive: letters match both upper and lower case
81                'i' => applied_flags.push('i'),
82                // multi-line mode: ^ and $ match begin/end of line
83                'm' => applied_flags.push('m'),
84                // allow . to match \n
85                's' => applied_flags.push('s'),
86                // Unicode support (enabled by default)
87                'u' => applied_flags.push('u'),
88                // sticky search: not relevant for the regex itself
89                'y' => {}
90                _ => bail!("unsupported flag `{flag}` in regex: `{pattern}` with flags: `{flags}`"),
91            }
92        }
93
94        let regex = if !applied_flags.is_empty() {
95            regex::Regex::new(&format!("(?{applied_flags}){pattern}"))
96        } else {
97            regex::Regex::new(&pattern)
98        };
99
100        let delegate = match regex {
101            Ok(reg) => Ok(EsRegexImpl::Regex(reg)),
102            Err(_e) => {
103                // We failed to parse as an regex:Regex, try using regress. Regress uses the es
104                // flags format so we can pass the original flags value.
105                match regress::Regex::with_flags(&pattern, regress::Flags::from(flags)) {
106                    Ok(reg) => Ok(EsRegexImpl::Regress(reg)),
107                    // Propagate the error as is, regress has useful error messages.
108                    Err(e) => Err(e),
109                }
110            }
111        }?;
112        Ok(Self {
113            delegate,
114            pattern,
115            flags: flags.to_string(),
116        })
117    }
118
119    /// Returns true if there is any match for this regex in the `haystac`
120    pub fn is_match(&self, haystack: &str) -> bool {
121        match &self.delegate {
122            EsRegexImpl::Regex(r) => r.is_match(haystack),
123            EsRegexImpl::Regress(r) => r.find(haystack).is_some(),
124        }
125    }
126
127    pub fn captures<'h>(&self, haystack: &'h str) -> Option<Vec<&'h str>> {
128        match &self.delegate {
129            EsRegexImpl::Regex(r) => r.captures(haystack).map(|caps| {
130                caps.iter()
131                    .map(|m| m.map(|m| m.as_str()).unwrap_or(""))
132                    .collect::<Vec<_>>()
133            }),
134            EsRegexImpl::Regress(r) => r.find(haystack).map(|m| {
135                m.groups()
136                    .map(|range_opt| range_opt.map(|range| &haystack[range]).unwrap_or(""))
137                    .collect::<Vec<_>>()
138            }),
139        }
140    }
141}
142
#[cfg(test)]
mod tests {
    use super::{EsRegex, EsRegexImpl};

    /// A regex survives a JSON serialize/deserialize cycle and compares equal afterwards.
    #[test]
    fn round_trip_serialize() {
        let original = EsRegex::new("[a-z]", "i").unwrap();
        let json = serde_json::to_string(&original).unwrap();
        let restored: EsRegex = serde_json::from_str(&json).unwrap();
        assert_eq!(original, restored);
    }

    /// A plain pattern is handled by the `regex` crate delegate.
    #[test]
    fn es_regex_matches_simple() {
        let re = EsRegex::new("a", "").unwrap();
        assert!(matches!(re.delegate, EsRegexImpl::Regex(_)));
        assert!(re.is_match("a"));
    }

    /// Negative lookahead is not supported by the regex crate, so `regress` takes over.
    #[test]
    fn es_regex_matches_negative_lookahead() {
        let re = EsRegex::new("a(?!b)", "").unwrap();
        assert!(matches!(re.delegate, EsRegexImpl::Regress(_)));
        assert!(!re.is_match("ab"));
        assert!(re.is_match("ac"));
    }

    /// `*` is invalid since there is nothing being repeated. The message is not asserted on
    /// because it comes from the underlying implementations.
    #[test]
    fn invalid_regex() {
        assert!(EsRegex::new("*", "").is_err());
    }

    /// Captures produced through the `regex` crate delegate.
    #[test]
    fn captures_with_regex() {
        let re = EsRegex::new(r"(\d{4})-(\d{2})-(\d{2})", "").unwrap();
        assert!(matches!(re.delegate, EsRegexImpl::Regex(_)));

        let caps: Vec<&str> = re.captures("Today is 2024-01-15").unwrap();
        // full match, then year, month and day
        assert_eq!(caps, ["2024-01-15", "2024", "01", "15"]);
    }

    /// Captures produced through the `regress` delegate (the pattern uses a lookahead).
    #[test]
    fn captures_with_regress() {
        let re = EsRegex::new(r"(\w+)(?=baz)", "").unwrap();
        assert!(matches!(re.delegate, EsRegexImpl::Regress(_)));

        assert!(re.captures("foobar").is_none());

        let caps: Vec<&str> = re.captures("foobaz").unwrap();
        // full match, then the single captured group
        assert_eq!(caps, ["foo", "foo"]);
    }
}