// Copyright (c) 2017 Couchbase, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package utf8 import ( "fmt" "unicode/utf8" ) // Sequences is a collection of Sequence type Sequences []Sequence // NewSequences constructs a collection of Sequence which describe the // byte ranges covered between the start and end runes. func NewSequences(start, end rune) (Sequences, error) { rv, _, err := NewSequencesPrealloc(start, end, nil, nil, nil, nil) return rv, err } func NewSequencesPrealloc(start, end rune, preallocSequences Sequences, preallocRangeStack RangeStack, preallocStartBytes, preallocEndBytes []byte) (Sequences, RangeStack, error) { rv := preallocSequences[:0] startBytes := preallocStartBytes if cap(startBytes) < utf8.UTFMax { startBytes = make([]byte, utf8.UTFMax) } startBytes = startBytes[:utf8.UTFMax] endBytes := preallocEndBytes if cap(endBytes) < utf8.UTFMax { endBytes = make([]byte, utf8.UTFMax) } endBytes = endBytes[:utf8.UTFMax] rangeStack := preallocRangeStack[:0] rangeStack = rangeStack.Push(scalarRange{start, end}) rangeStack, r := rangeStack.Pop() TOP: for r != nilScalarRange { INNER: for { r1, r2 := r.split() if r1 != nilScalarRange { rangeStack = rangeStack.Push(scalarRange{r2.start, r2.end}) r.start = r1.start r.end = r1.end continue INNER } if !r.valid() { rangeStack, r = rangeStack.Pop() continue TOP } for i := 1; i < utf8.UTFMax; i++ { max := maxScalarValue(i) if r.start <= max && max < r.end { rangeStack = rangeStack.Push(scalarRange{max + 1, r.end}) r.end = max continue INNER } } asciiRange := r.ascii() if asciiRange != nilRange { rv = append(rv, Sequence{ asciiRange, }) rangeStack, r = rangeStack.Pop() continue TOP } for i := uint(1); i < utf8.UTFMax; i++ { m := rune((1 << (6 * i)) - 1) if (r.start & ^m) != (r.end & ^m) { if (r.start & m) != 0 { rangeStack = rangeStack.Push(scalarRange{(r.start | m) + 1, r.end}) r.end = r.start | m continue INNER } if (r.end & m) != m { rangeStack = rangeStack.Push(scalarRange{r.end & ^m, r.end}) r.end = (r.end & ^m) - 1 continue INNER } } } n, m := r.encode(startBytes, endBytes) seq, err := SequenceFromEncodedRange(startBytes[0:n], endBytes[0:m]) if err != nil { return nil, nil, err } rv = append(rv, seq) rangeStack, r = rangeStack.Pop() continue TOP } } return rv, rangeStack, nil } // Sequence is a collection of Range type Sequence []Range // SequenceFromEncodedRange creates sequence from the encoded bytes func SequenceFromEncodedRange(start, end []byte) (Sequence, error) { if len(start) != len(end) { return nil, fmt.Errorf("byte slices must be the same length") } switch len(start) { case 2: return Sequence{ Range{start[0], end[0]}, Range{start[1], end[1]}, }, nil case 3: return Sequence{ Range{start[0], end[0]}, Range{start[1], end[1]}, Range{start[2], end[2]}, }, nil case 4: return Sequence{ Range{start[0], end[0]}, Range{start[1], end[1]}, Range{start[2], end[2]}, Range{start[3], end[3]}, }, nil } return nil, fmt.Errorf("invalid encoded byte length") } // Matches checks to see if the provided byte slice matches the Sequence func (u Sequence) Matches(bytes []byte) bool { if len(bytes) < len(u) { return false } for i := 0; i < len(u); i++ { if !u[i].matches(bytes[i]) { return false } } return true } func (u Sequence) String() string { switch len(u) { case 1: return fmt.Sprintf("%v", u[0]) case 2: return fmt.Sprintf("%v%v", u[0], u[1]) case 3: return fmt.Sprintf("%v%v%v", u[0], u[1], u[2]) case 4: return fmt.Sprintf("%v%v%v%v", u[0], u[1], u[2], u[3]) default: return fmt.Sprintf("invalid utf8 sequence") } } // Range describes a single range of byte values type Range struct { Start byte End byte } var nilRange = Range{0xff, 0} func (u Range) matches(b byte) bool { if u.Start <= b && b <= u.End { return true } return false } func (u Range) String() string { if u.Start == u.End { return fmt.Sprintf("[%X]", u.Start) } return fmt.Sprintf("[%X-%X]", u.Start, u.End) } type scalarRange struct { start rune end rune } var nilScalarRange = scalarRange{0xffff, 0} func (s *scalarRange) String() string { return fmt.Sprintf("ScalarRange(%d,%d)", s.start, s.end) } // split this scalar range if it overlaps with a surrogate codepoint func (s *scalarRange) split() (scalarRange, scalarRange) { if s.start < 0xe000 && s.end > 0xd7ff { return scalarRange{ start: s.start, end: 0xd7ff, }, scalarRange{ start: 0xe000, end: s.end, } } return nilScalarRange, nilScalarRange } func (s *scalarRange) valid() bool { return s.start <= s.end } func (s *scalarRange) ascii() Range { if s.valid() && s.end <= 0x7f { return Range{ Start: byte(s.start), End: byte(s.end), } } return nilRange } // start and end MUST have capacity for utf8.UTFMax bytes func (s *scalarRange) encode(start, end []byte) (int, int) { n := utf8.EncodeRune(start, s.start) m := utf8.EncodeRune(end, s.end) return n, m } type RangeStack []scalarRange func (s RangeStack) Push(v scalarRange) RangeStack { return append(s, v) } func (s RangeStack) Pop() (RangeStack, scalarRange) { l := len(s) if l < 1 { return s, nilScalarRange } return s[:l-1], s[l-1] } func maxScalarValue(nbytes int) rune { switch nbytes { case 1: return 0x007f case 2: return 0x07FF case 3: return 0xFFFF default: return 0x10FFFF } }