// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package runes provides transforms for UTF-8 encoded text.
package runes // import "golang.org/x/text/runes"

import (
	"unicode"
	"unicode/utf8"

	"golang.org/x/text/transform"
)

// A Set is a collection of runes.
type Set interface {
	// Contains returns true if r is contained in the set.
	Contains(r rune) bool
}

// setFunc adapts a plain predicate function to the Set interface.
type setFunc func(rune) bool

// Contains reports whether r is in the set by calling the underlying
// predicate.
func (s setFunc) Contains(r rune) bool {
	return s(r)
}

// Note: using funcs here instead of wrapping types results in cleaner
// documentation and a smaller API.

// In creates a Set with a Contains method that returns true for all runes in
// the given RangeTable.
func In(rt *unicode.RangeTable) Set {
	return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
}

// NotIn creates a Set with a Contains method that returns true for all runes not
// in the given RangeTable.
func NotIn(rt *unicode.RangeTable) Set {
	return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
}

// Predicate creates a Set with a Contains method that returns f(r).
func Predicate(f func(rune) bool) Set {
	return setFunc(f)
}

// Transformer implements the transform.Transformer interface.
type Transformer struct {
	t transform.SpanningTransformer
}

// Transform forwards to the Transform method of the wrapped transformer and
// returns its results unchanged.
func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	return t.t.Transform(dst, src, atEOF)
}

// Span forwards to the Span method of the wrapped transformer and returns its
// results unchanged.
func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
	return t.t.Span(b, atEOF)
}

// Reset forwards to the Reset method of the wrapped transformer.
func (t Transformer) Reset() { t.t.Reset() }

// Bytes returns a new byte slice with the result of converting b using t. It
// calls Reset on t. It returns nil if any error was found. This can only happen
// if an error-producing Transformer is passed to If.
func (t Transformer) Bytes(b []byte) []byte {
	b, _, err := transform.Bytes(t, b)
	if err != nil {
		return nil
	}
	return b
}

// String returns a string with the result of converting s using t. It calls
// Reset on t. It returns the empty string if any error was found. This can only
// happen if an error-producing Transformer is passed to If.
func (t Transformer) String(s string) string {
	s, _, err := transform.String(t, s)
	if err != nil {
		return ""
	}
	return s
}

// TODO:
// - Copy: copying strings and bytes in whole-rune units.
// - Validation (maybe) // - Well-formed-ness (maybe) const runeErrorString = string(utf8.RuneError) // Remove returns a Transformer that removes runes r for which s.Contains(r). // Illegal input bytes are replaced by RuneError before being passed to f. func ( Set) Transformer { if , := .(setFunc); { // This little trick cuts the running time of BenchmarkRemove for sets // created by Predicate roughly in half. // TODO: special-case RangeTables as well. return Transformer{remove()} } return Transformer{remove(.Contains)} } // TODO: remove transform.RemoveFunc. type remove func(r rune) bool func (remove) () {} // Span implements transform.Spanner. func ( remove) ( []byte, bool) ( int, error) { for , := rune(0), 0; < len(); { if = rune([]); < utf8.RuneSelf { = 1 } else if , = utf8.DecodeRune([:]); == 1 { // Invalid rune. if ! && !utf8.FullRune([:]) { = transform.ErrShortSrc } else { = transform.ErrEndOfSpan } break } if () { = transform.ErrEndOfSpan break } += } return } // Transform implements transform.Transformer. func ( remove) (, []byte, bool) (, int, error) { for , := rune(0), 0; < len(); { if = rune([]); < utf8.RuneSelf { = 1 } else if , = utf8.DecodeRune([:]); == 1 { // Invalid rune. if ! && !utf8.FullRune([:]) { = transform.ErrShortSrc break } // We replace illegal bytes with RuneError. Not doing so might // otherwise turn a sequence of invalid UTF-8 into valid UTF-8. // The resulting byte sequence may subsequently contain runes // for which t(r) is true that were passed unnoticed. if !(utf8.RuneError) { if +3 > len() { = transform.ErrShortDst break } [+0] = runeErrorString[0] [+1] = runeErrorString[1] [+2] = runeErrorString[2] += 3 } ++ continue } if () { += continue } if + > len() { = transform.ErrShortDst break } for := 0; < ; ++ { [] = [] ++ ++ } } return } // Map returns a Transformer that maps the runes in the input using the given // mapping. Illegal bytes in the input are converted to utf8.RuneError before // being passed to the mapping func. 
func ( func(rune) rune) Transformer { return Transformer{mapper()} } type mapper func(rune) rune func (mapper) () {} // Span implements transform.Spanner. func ( mapper) ( []byte, bool) ( int, error) { for , := rune(0), 0; < len(); += { if = rune([]); < utf8.RuneSelf { = 1 } else if , = utf8.DecodeRune([:]); == 1 { // Invalid rune. if ! && !utf8.FullRune([:]) { = transform.ErrShortSrc } else { = transform.ErrEndOfSpan } break } if () != { = transform.ErrEndOfSpan break } } return , } // Transform implements transform.Transformer. func ( mapper) (, []byte, bool) (, int, error) { var rune var [utf8.UTFMax]byte for , := rune(0), 0; < len(); { if = rune([]); < utf8.RuneSelf { if = (); < utf8.RuneSelf { if == len() { = transform.ErrShortDst break } [] = byte() ++ ++ continue } = 1 } else if , = utf8.DecodeRune([:]); == 1 { // Invalid rune. if ! && !utf8.FullRune([:]) { = transform.ErrShortSrc break } if = (utf8.RuneError); == utf8.RuneError { if +3 > len() { = transform.ErrShortDst break } [+0] = runeErrorString[0] [+1] = runeErrorString[1] [+2] = runeErrorString[2] += 3 ++ continue } } else if = (); == { if + > len() { = transform.ErrShortDst break } for := 0; < ; ++ { [] = [] ++ ++ } continue } := utf8.EncodeRune([:], ) if + > len() { = transform.ErrShortDst break } for := 0; < ; ++ { [] = [] ++ } += } return } // ReplaceIllFormed returns a transformer that replaces all input bytes that are // not part of a well-formed UTF-8 code sequence with utf8.RuneError. func () Transformer { return Transformer{&replaceIllFormed{}} } type replaceIllFormed struct{ transform.NopResetter } func ( replaceIllFormed) ( []byte, bool) ( int, error) { for < len() { // ASCII fast path. if [] < utf8.RuneSelf { ++ continue } , := utf8.DecodeRune([:]) // Look for a valid non-ASCII rune. if != utf8.RuneError || != 1 { += continue } // Look for short source data. if ! && !utf8.FullRune([:]) { = transform.ErrShortSrc break } // We have an invalid rune. 
= transform.ErrEndOfSpan break } return , } func ( replaceIllFormed) (, []byte, bool) (, int, error) { for < len() { // ASCII fast path. if := []; < utf8.RuneSelf { if == len() { = transform.ErrShortDst break } [] = ++ ++ continue } // Look for a valid non-ASCII rune. if , := utf8.DecodeRune([:]); != 1 { if != copy([:], [:+]) { = transform.ErrShortDst break } += += continue } // Look for short source data. if ! && !utf8.FullRune([:]) { = transform.ErrShortSrc break } // We have an invalid rune. if +3 > len() { = transform.ErrShortDst break } [+0] = runeErrorString[0] [+1] = runeErrorString[1] [+2] = runeErrorString[2] += 3 ++ } return , , }