1use std::char;
2use std::cmp;
3use std::fmt::Debug;
4use std::slice;
5use std::u8;
6
7use crate::unicode;
8
9// This module contains an *internal* implementation of interval sets.
10//
// The primary invariant that an interval set guards is canonical ordering.
// That is, every interval set contains an ordered sequence of intervals where
// no two intervals are overlapping or adjacent. While this invariant is
// occasionally broken within the implementation, it should be impossible for
// callers to observe it.
16//
17// Since case folding (as implemented below) breaks that invariant, we roll
18// that into this API even though it is a little out of place in an otherwise
19// generic interval set. (Hence the reason why the `unicode` module is imported
20// here.)
21//
22// Some of the implementation complexity here is a result of me wanting to
23// preserve the sequential representation without using additional memory.
24// In many cases, we do use linear extra memory, but it is at most 2x and it
25// is amortized. If we relaxed the memory requirements, this implementation
26// could become much simpler. The extra memory is honestly probably OK, but
27// character classes (especially of the Unicode variety) can become quite
28// large, and it would be nice to keep regex compilation snappy even in debug
29// builds. (In the past, I have been careless with this area of code and it has
30// caused slow regex compilations in debug mode, so this isn't entirely
31// unwarranted.)
32//
33// Tests on this are relegated to the public API of HIR in src/hir.rs.
34
/// A set of intervals of type `I`, backed by a single vector.
///
/// The set-level operations in this module (`new`, `push`, `union`, ...)
/// restore canonical ordering before returning, so the stored intervals are
/// expected to be sorted, non-overlapping and non-adjacent whenever a caller
/// can observe them.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct IntervalSet<I> {
    // The intervals making up this set. Several operations temporarily
    // append their results to the end of this vector and then drain the old
    // prefix, rather than allocating a second vector.
    ranges: Vec<I>,
}
39
40impl<I: Interval> IntervalSet<I> {
41    /// Create a new set from a sequence of intervals. Each interval is
42    /// specified as a pair of bounds, where both bounds are inclusive.
43    ///
44    /// The given ranges do not need to be in any specific order, and ranges
45    /// may overlap.
46    pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
47        let mut set = IntervalSet { ranges: intervals.into_iter().collect() };
48        set.canonicalize();
49        set
50    }
51
52    /// Add a new interval to this set.
53    pub fn push(&mut self, interval: I) {
54        // TODO: This could be faster. e.g., Push the interval such that
55        // it preserves canonicalization.
56        self.ranges.push(interval);
57        self.canonicalize();
58    }
59
60    /// Return an iterator over all intervals in this set.
61    ///
62    /// The iterator yields intervals in ascending order.
63    pub fn iter(&self) -> IntervalSetIter<'_, I> {
64        IntervalSetIter(self.ranges.iter())
65    }
66
67    /// Return an immutable slice of intervals in this set.
68    ///
69    /// The sequence returned is in canonical ordering.
70    pub fn intervals(&self) -> &[I] {
71        &self.ranges
72    }
73
74    /// Expand this interval set such that it contains all case folded
75    /// characters. For example, if this class consists of the range `a-z`,
76    /// then applying case folding will result in the class containing both the
77    /// ranges `a-z` and `A-Z`.
78    ///
79    /// This returns an error if the necessary case mapping data is not
80    /// available.
81    pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
82        let len = self.ranges.len();
83        for i in 0..len {
84            let range = self.ranges[i];
85            if let Err(err) = range.case_fold_simple(&mut self.ranges) {
86                self.canonicalize();
87                return Err(err);
88            }
89        }
90        self.canonicalize();
91        Ok(())
92    }
93
94    /// Union this set with the given set, in place.
95    pub fn union(&mut self, other: &IntervalSet<I>) {
96        // This could almost certainly be done more efficiently.
97        self.ranges.extend(&other.ranges);
98        self.canonicalize();
99    }
100
101    /// Intersect this set with the given set, in place.
102    pub fn intersect(&mut self, other: &IntervalSet<I>) {
103        if self.ranges.is_empty() {
104            return;
105        }
106        if other.ranges.is_empty() {
107            self.ranges.clear();
108            return;
109        }
110
111        // There should be a way to do this in-place with constant memory,
112        // but I couldn't figure out a simple way to do it. So just append
113        // the intersection to the end of this range, and then drain it before
114        // we're done.
115        let drain_end = self.ranges.len();
116
117        let mut ita = 0..drain_end;
118        let mut itb = 0..other.ranges.len();
119        let mut a = ita.next().unwrap();
120        let mut b = itb.next().unwrap();
121        loop {
122            if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) {
123                self.ranges.push(ab);
124            }
125            let (it, aorb) =
126                if self.ranges[a].upper() < other.ranges[b].upper() {
127                    (&mut ita, &mut a)
128                } else {
129                    (&mut itb, &mut b)
130                };
131            match it.next() {
132                Some(v) => *aorb = v,
133                None => break,
134            }
135        }
136        self.ranges.drain(..drain_end);
137    }
138
    /// Subtract the given set from this set, in place.
    ///
    /// If either set is empty, this set is left unchanged. Otherwise the
    /// resulting intervals are appended after the existing ones and the old
    /// prefix is drained before returning.
    pub fn difference(&mut self, other: &IntervalSet<I>) {
        if self.ranges.is_empty() || other.ranges.is_empty() {
            return;
        }

        // This algorithm is (to me) surprisingly complex. A search of the
        // interwebs indicates that this is a potentially interesting problem.
        // Folks seem to suggest interval or segment trees, but I'd like to
        // avoid the overhead (both runtime and conceptual) of that.
        //
        // The following is basically my Shitty First Draft. Therefore, in
        // order to grok it, you probably need to read each line carefully.
        // Simplifications are most welcome!
        //
        // Remember, we can assume the canonical format invariant here, which
        // says that all ranges are sorted, not overlapping and not adjacent in
        // each class.
        //
        // `a` indexes the original ranges of `self` (everything below
        // `drain_end`), while `b` indexes the ranges of `other`.
        let drain_end = self.ranges.len();
        let (mut a, mut b) = (0, 0);
        'LOOP: while a < drain_end && b < other.ranges.len() {
            // Basically, the easy cases are when neither range overlaps with
            // each other. If the `b` range is less than our current `a`
            // range, then we can skip it and move on.
            if other.ranges[b].upper() < self.ranges[a].lower() {
                b += 1;
                continue;
            }
            // ... similarly for the `a` range. If it's less than the smallest
            // `b` range, then we can add it as-is.
            if self.ranges[a].upper() < other.ranges[b].lower() {
                let range = self.ranges[a];
                self.ranges.push(range);
                a += 1;
                continue;
            }
            // Otherwise, we have overlapping ranges.
            assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));

            // This part is tricky and was non-obvious to me without looking
            // at explicit examples (see the tests). The trickiness stems from
            // two things: 1) subtracting a range from another range could
            // yield two ranges and 2) after subtracting a range, it's possible
            // that future ranges can have an impact. The loop below advances
            // the `b` ranges until they can't possibly impact the current
            // range.
            //
            // For example, if our `a` range is `a-t` and our next four `b`
            // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
            // subtraction three times before moving on to the next `a` range
            // (`x-z` does not overlap with `a-t`).
            let mut range = self.ranges[a];
            while b < other.ranges.len()
                && !range.is_intersection_empty(&other.ranges[b])
            {
                let old_range = range;
                range = match range.difference(&other.ranges[b]) {
                    (None, None) => {
                        // We lost the entire range, so move on to the next
                        // without adding this one.
                        a += 1;
                        continue 'LOOP;
                    }
                    (Some(range1), None) | (None, Some(range1)) => range1,
                    (Some(range1), Some(range2)) => {
                        // The `b` range split our range in two. The lower
                        // piece is final, so emit it now and keep working on
                        // the upper piece.
                        self.ranges.push(range1);
                        range2
                    }
                };
                // It's possible that the `b` range has more to contribute
                // here. In particular, if it is greater than the original
                // range, then it might impact the next `a` range *and* it
                // has impacted the current `a` range as much as possible,
                // so we can quit. We don't bump `b` so that the next `a`
                // range can apply it.
                if other.ranges[b].upper() > old_range.upper() {
                    break;
                }
                // Otherwise, the next `b` range might apply to the current
                // `a` range.
                b += 1;
            }
            // Whatever is left of the current `a` range survives.
            self.ranges.push(range);
            a += 1;
        }
        // No `b` ranges remain, so every remaining `a` range is kept as-is.
        while a < drain_end {
            let range = self.ranges[a];
            self.ranges.push(range);
            a += 1;
        }
        // Drop the original ranges, leaving only the appended results.
        self.ranges.drain(..drain_end);
    }
230
231    /// Compute the symmetric difference of the two sets, in place.
232    ///
233    /// This computes the symmetric difference of two interval sets. This
234    /// removes all elements in this set that are also in the given set,
235    /// but also adds all elements from the given set that aren't in this
236    /// set. That is, the set will contain all elements in either set,
237    /// but will not contain any elements that are in both sets.
238    pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
239        // TODO(burntsushi): Fix this so that it amortizes allocation.
240        let mut intersection = self.clone();
241        intersection.intersect(other);
242        self.union(other);
243        self.difference(&intersection);
244    }
245
246    /// Negate this interval set.
247    ///
248    /// For all `x` where `x` is any element, if `x` was in this set, then it
249    /// will not be in this set after negation.
250    pub fn negate(&mut self) {
251        if self.ranges.is_empty() {
252            let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
253            self.ranges.push(I::create(min, max));
254            return;
255        }
256
257        // There should be a way to do this in-place with constant memory,
258        // but I couldn't figure out a simple way to do it. So just append
259        // the negation to the end of this range, and then drain it before
260        // we're done.
261        let drain_end = self.ranges.len();
262
263        // We do checked arithmetic below because of the canonical ordering
264        // invariant.
265        if self.ranges[0].lower() > I::Bound::min_value() {
266            let upper = self.ranges[0].lower().decrement();
267            self.ranges.push(I::create(I::Bound::min_value(), upper));
268        }
269        for i in 1..drain_end {
270            let lower = self.ranges[i - 1].upper().increment();
271            let upper = self.ranges[i].lower().decrement();
272            self.ranges.push(I::create(lower, upper));
273        }
274        if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
275            let lower = self.ranges[drain_end - 1].upper().increment();
276            self.ranges.push(I::create(lower, I::Bound::max_value()));
277        }
278        self.ranges.drain(..drain_end);
279    }
280
281    /// Converts this set into a canonical ordering.
282    fn canonicalize(&mut self) {
283        if self.is_canonical() {
284            return;
285        }
286        self.ranges.sort();
287        assert!(!self.ranges.is_empty());
288
289        // Is there a way to do this in-place with constant memory? I couldn't
290        // figure out a way to do it. So just append the canonicalization to
291        // the end of this range, and then drain it before we're done.
292        let drain_end = self.ranges.len();
293        for oldi in 0..drain_end {
294            // If we've added at least one new range, then check if we can
295            // merge this range in the previously added range.
296            if self.ranges.len() > drain_end {
297                let (last, rest) = self.ranges.split_last_mut().unwrap();
298                if let Some(union) = last.union(&rest[oldi]) {
299                    *last = union;
300                    continue;
301                }
302            }
303            let range = self.ranges[oldi];
304            self.ranges.push(range);
305        }
306        self.ranges.drain(..drain_end);
307    }
308
309    /// Returns true if and only if this class is in a canonical ordering.
310    fn is_canonical(&self) -> bool {
311        for pair in self.ranges.windows(2) {
312            if pair[0] >= pair[1] {
313                return false;
314            }
315            if pair[0].is_contiguous(&pair[1]) {
316                return false;
317            }
318        }
319        true
320    }
321}
322
/// An iterator over the intervals of an interval set.
///
/// This is a thin wrapper around a slice iterator; it yields shared
/// references to the intervals in the order in which they are stored.
#[derive(Debug)]
pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);

impl<'a, I> Iterator for IntervalSetIter<'a, I> {
    type Item = &'a I;

    /// Returns the next interval, or `None` once all have been yielded.
    fn next(&mut self) -> Option<&'a I> {
        self.0.next()
    }

    /// Forwards the exact remaining length known by the underlying slice
    /// iterator. Without this, the default `(0, None)` hint would prevent
    /// consumers such as `collect` from pre-allocating.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}
334
335pub trait Interval:
336    Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord
337{
    /// The type of the endpoints of this interval.
    type Bound: Bound;
    /// Returns the lower endpoint of this interval.
    fn lower(&self) -> Self::Bound;
    /// Returns the upper endpoint of this interval.
    fn upper(&self) -> Self::Bound;
    /// Overwrites the lower endpoint of this interval with `bound`.
    fn set_lower(&mut self, bound: Self::Bound);
    /// Overwrites the upper endpoint of this interval with `bound`.
    fn set_upper(&mut self, bound: Self::Bound);
    /// Case folding hook for a single interval.
    ///
    /// `IntervalSet::case_fold_simple` calls this once per interval and
    /// passes its own vector of ranges as `intervals`, then canonicalizes
    /// afterwards. Presumably implementations append the case folded
    /// counterparts of this interval to `intervals` (confirm against the
    /// implementors), and return an error when case folding data is
    /// unavailable.
    fn case_fold_simple(
        &self,
        intervals: &mut Vec<Self>,
    ) -> Result<(), unicode::CaseFoldError>;
348
349    /// Create a new interval.
350    fn create(lower: Self::Bound, upper: Self::Bound) -> Self {
351        let mut int = Self::default();
352        if lower <= upper {
353            int.set_lower(lower);
354            int.set_upper(upper);
355        } else {
356            int.set_lower(upper);
357            int.set_upper(lower);
358        }
359        int
360    }
361
362    /// Union the given overlapping range into this range.
363    ///
364    /// If the two ranges aren't contiguous, then this returns `None`.
365    fn union(&self, other: &Self) -> Option<Self> {
366        if !self.is_contiguous(other) {
367            return None;
368        }
369        let lower = cmp::min(self.lower(), other.lower());
370        let upper = cmp::max(self.upper(), other.upper());
371        Some(Self::create(lower, upper))
372    }
373
374    /// Intersect this range with the given range and return the result.
375    ///
376    /// If the intersection is empty, then this returns `None`.
377    fn intersect(&self, other: &Self) -> Option<Self> {
378        let lower = cmp::max(self.lower(), other.lower());
379        let upper = cmp::min(self.upper(), other.upper());
380        if lower <= upper {
381            Some(Self::create(lower, upper))
382        } else {
383            None
384        }
385    }
386
387    /// Subtract the given range from this range and return the resulting
388    /// ranges.
389    ///
390    /// If subtraction would result in an empty range, then no ranges are
391    /// returned.
392    fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
393        if self.is_subset(other) {
394            return (None, None);
395        }
396        if self.is_intersection_empty(other) {
397            return (Some(self.clone()), None);
398        }
399        let add_lower = other.lower() > self.lower();
400        let add_upper = other.upper() < self.upper();
401        // We know this because !self.is_subset(other) and the ranges have
402        // a non-empty intersection.
403        assert!(add_lower || add_upper);
404        let mut ret = (None, None);
405        if add_lower {
406            let upper = other.lower().decrement();
407            ret.0 = Some(Self::create(self.lower(), upper));
408        }
409        if add_upper {
410            let lower = other.upper().increment();
411            let range = Self::create(lower, self.upper());
412            if ret.0.is_none() {
413                ret.0 = Some(range);
414            } else {
415                ret.1 = Some(range);
416            }
417        }
418        ret
419    }
420
421    /// Compute the symmetric difference the given range from this range. This
422    /// returns the union of the two ranges minus its intersection.
423    fn symmetric_difference(
424        &self,
425        other: &Self,
426    ) -> (Option<Self>, Option<Self>) {
427        let union = match self.union(other) {
428            None => return (Some(self.clone()), Some(other.clone())),
429            Some(union) => union,
430        };
431        let intersection = match self.intersect(other) {
432            None => return (Some(self.clone()), Some(other.clone())),
433            Some(intersection) => intersection,
434        };
435        union.difference(&intersection)
436    }
437
438    /// Returns true if and only if the two ranges are contiguous. Two ranges
439    /// are contiguous if and only if the ranges are either overlapping or
440    /// adjacent.
441    fn is_contiguous(&self, other: &Self) -> bool {
442        let lower1 = self.lower().as_u32();
443        let upper1 = self.upper().as_u32();
444        let lower2 = other.lower().as_u32();
445        let upper2 = other.upper().as_u32();
446        cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1)
447    }
448
449    /// Returns true if and only if the intersection of this range and the
450    /// other range is empty.
451    fn is_intersection_empty(&self, other: &Self) -> bool {
452        let (lower1, upper1) = (self.lower(), self.upper());
453        let (lower2, upper2) = (other.lower(), other.upper());
454        cmp::max(lower1, lower2) > cmp::min(upper1, upper2)
455    }
456
457    /// Returns true if and only if this range is a subset of the other range.
458    fn is_subset(&self, other: &Self) -> bool {
459        let (lower1, upper1) = (self.lower(), self.upper());
460        let (lower2, upper2) = (other.lower(), other.upper());
461        (lower2 <= lower1 && lower1 <= upper2)
462            && (lower2 <= upper1 && upper1 <= upper2)
463    }
464}
465
/// An endpoint of an interval.
///
/// Implementors provide their extreme values, a conversion to `u32`, and
/// stepping to the neighbouring value in either direction.
pub trait Bound:
    Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
{
    /// The smallest value of this type.
    fn min_value() -> Self;
    /// The largest value of this type.
    fn max_value() -> Self;
    /// This value as a `u32`.
    fn as_u32(self) -> u32;
    /// The value immediately after this one.
    fn increment(self) -> Self;
    /// The value immediately before this one.
    fn decrement(self) -> Self;
}

impl Bound for u8 {
    fn min_value() -> Self {
        0x00
    }

    fn max_value() -> Self {
        0xFF
    }

    fn as_u32(self) -> u32 {
        u32::from(self)
    }

    // Panics when called on the maximum value.
    fn increment(self) -> Self {
        let next = self.checked_add(1);
        next.unwrap()
    }

    // Panics when called on the minimum value.
    fn decrement(self) -> Self {
        let prev = self.checked_sub(1);
        prev.unwrap()
    }
}

impl Bound for char {
    fn min_value() -> Self {
        '\0'
    }

    fn max_value() -> Self {
        '\u{10FFFF}'
    }

    fn as_u32(self) -> u32 {
        u32::from(self)
    }

    // Steps over the surrogate gap: U+D7FF is followed by U+E000. Panics
    // when called on the maximum value.
    fn increment(self) -> Self {
        if self == '\u{D7FF}' {
            return '\u{E000}';
        }
        let next = (self as u32).checked_add(1).unwrap();
        char::from_u32(next).unwrap()
    }

    // Steps over the surrogate gap: U+E000 is preceded by U+D7FF. Panics
    // when called on the minimum value.
    fn decrement(self) -> Self {
        if self == '\u{E000}' {
            return '\u{D7FF}';
        }
        let prev = (self as u32).checked_sub(1).unwrap();
        char::from_u32(prev).unwrap()
    }
}
519
520// Tests for interval sets are written in src/hir.rs against the public API.
521