// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package encoding defines an interface for character encodings, such as Shift
// JIS and Windows 1252, that can convert to and from UTF-8.
//
// Encoding implementations are provided in other packages, such as
// golang.org/x/text/encoding/charmap and
// golang.org/x/text/encoding/japanese.
package encoding // import "golang.org/x/text/encoding"

import (
	"errors"
	"io"
	"strconv"
	"unicode/utf8"

	"golang.org/x/text/encoding/internal/identifier"
	"golang.org/x/text/transform"
)

// TODO:
// - There seems to be some inconsistency in when decoders return errors
//   and when not. Also documentation seems to suggest they shouldn't return
//   errors at all (except for UTF-16).
// - Encoders seem to rely on or at least benefit from the input being in NFC
//   normal form. Perhaps add an example how users could prepare their output.

// Encoding is a character set encoding that can be transformed to and from
// UTF-8.
type Encoding interface {
	// NewDecoder returns a Decoder.
	NewDecoder() *Decoder

	// NewEncoder returns an Encoder.
	NewEncoder() *Encoder
}

// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
//
// Transforming source bytes that are not of that encoding will not result in an
// error per se. Each byte that cannot be transcoded will be represented in the
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
type Decoder struct {
	transform.Transformer

	// This forces external creators of Decoders to use names in struct
	// initializers, allowing for future extendibility without having to break
	// code.
	_ struct{}
}

// Bytes converts the given encoded bytes to UTF-8. It returns the converted
// bytes or nil, err if any error occurred.
func (d *Decoder) Bytes(b []byte) ([]byte, error) {
	b, _, err := transform.Bytes(d, b)
	if err != nil {
		return nil, err
	}
	return b, nil
}

// String converts the given encoded string to UTF-8. It returns the converted
// string or "", err if any error occurred.
func (d *Decoder) String(s string) (string, error) {
	s, _, err := transform.String(d, s)
	if err != nil {
		return "", err
	}
	return s, nil
}

// Reader wraps another Reader to decode its bytes.
//
// The Decoder may not be used for any other operation as long as the returned
// Reader is in use.
func (d *Decoder) Reader(r io.Reader) io.Reader {
	return transform.NewReader(r, d)
}

// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
//
// Each rune that cannot be transcoded will result in an error. In this case,
// the transform will consume all source byte up to, not including the offending
// rune. Transforming source bytes that are not valid UTF-8 will be replaced by
// `\uFFFD`. To return early with an error instead, use transform.Chain to
// preprocess the data with a UTF8Validator.
type Encoder struct {
	transform.Transformer

	// This forces external creators of Encoders to use names in struct
	// initializers, allowing for future extendibility without having to break
	// code.
	_ struct{}
}

// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
// any error occurred.
func (e *Encoder) Bytes(b []byte) ([]byte, error) {
	b, _, err := transform.Bytes(e, b)
	if err != nil {
		return nil, err
	}
	return b, nil
}

// String converts a string from UTF-8. It returns the converted string or
// "", err if any error occurred.
func (e *Encoder) String(s string) (string, error) {
	s, _, err := transform.String(e, s)
	if err != nil {
		return "", err
	}
	return s, nil
}

// Writer wraps another Writer to encode its UTF-8 output.
//
// The Encoder may not be used for any other operation as long as the returned
// Writer is in use.
func (e *Encoder) Writer(w io.Writer) io.Writer {
	return transform.NewWriter(w, e)
}

// ASCIISub is the ASCII substitute character, as recommended by
// https://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'

// Nop is the nop encoding. Its transformed bytes are the same as the source
// bytes; it does not replace invalid UTF-8 sequences.
var Nop Encoding = nop{}

type nop struct{}

// NewDecoder returns a Decoder backed by transform.Nop.
func (nop) NewDecoder() *Decoder {
	return &Decoder{Transformer: transform.Nop}
}

// NewEncoder returns an Encoder backed by transform.Nop.
func (nop) NewEncoder() *Encoder {
	return &Encoder{Transformer: transform.Nop}
}

// Replacement is the replacement encoding. Decoding from the replacement
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
// the replacement encoding yields the same as the source bytes except that
// invalid UTF-8 is converted to '\uFFFD'.
//
// It is defined at http://encoding.spec.whatwg.org/#replacement
var Replacement Encoding = replacement{}

type replacement struct{}

// NewDecoder returns a Decoder backed by a replacementDecoder.
func (replacement) NewDecoder() *Decoder {
	return &Decoder{Transformer: replacementDecoder{}}
}

// NewEncoder returns an Encoder backed by a replacementEncoder.
func (replacement) NewEncoder() *Encoder {
	return &Encoder{Transformer: replacementEncoder{}}
}

// ID returns identifier.Replacement as the MIB; the second result is always
// the empty string.
func (replacement) ID() (mib identifier.MIB, other string) {
	return identifier.Replacement, ""
}

type replacementDecoder struct{ transform.NopResetter }

// Transform reports all of src as consumed and, once atEOF is set, writes the
// three-byte UTF-8 encoding of '\uFFFD' to dst. It returns
// transform.ErrShortDst, consuming nothing, whenever dst has fewer than three
// bytes available.
func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	if len(dst) < 3 {
		return 0, 0, transform.ErrShortDst
	}
	if atEOF {
		const fffd = "\ufffd"
		dst[0] = fffd[0]
		dst[1] = fffd[1]
		dst[2] = fffd[2]
		nDst = 3
	}
	return nDst, len(src), nil
}

type replacementEncoder struct{ transform.NopResetter }

// Transform copies src to dst one rune at a time, writing '\ufffd' in place
// of each invalid UTF-8 byte. It stops with transform.ErrShortSrc when src
// ends in an incomplete rune and atEOF is false, and with
// transform.ErrShortDst when the next rune does not fit in dst.
func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	r, size := rune(0), 0

	for ; nSrc < len(src); nSrc += size {
		r = rune(src[nSrc])

		// Decode a 1-byte rune.
		if r < utf8.RuneSelf {
			size = 1

		} else {
			// Decode a multi-byte rune.
			r, size = utf8.DecodeRune(src[nSrc:])
			if size == 1 {
				// All valid runes of size 1 (those below utf8.RuneSelf) were
				// handled above. We have invalid UTF-8 or we haven't seen the
				// full character yet.
				if !atEOF && !utf8.FullRune(src[nSrc:]) {
					err = transform.ErrShortSrc
					break
				}
				r = '\ufffd'
			}
		}

		if nDst+utf8.RuneLen(r) > len(dst) {
			err = transform.ErrShortDst
			break
		}
		nDst += utf8.EncodeRune(dst[nDst:], r)
	}
	return nDst, nSrc, err
}

// HTMLEscapeUnsupported wraps encoders to replace source runes outside the
// repertoire of the destination encoding with HTML escape sequences.
//
// This wrapper exists to comply to URL and HTML forms requiring a
// non-terminating legacy encoder. The produced sequences may lead to data
// loss as they are indistinguishable from legitimate input. To avoid this
// issue, use UTF-8 encodings whenever possible.
func HTMLEscapeUnsupported(e *Encoder) *Encoder {
	return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
}

// ReplaceUnsupported wraps encoders to replace source runes outside the
// repertoire of the destination encoding with an encoding-specific
// replacement.
//
// This wrapper is only provided for backwards compatibility and legacy
// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
func ReplaceUnsupported(e *Encoder) *Encoder {
	return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
}

// errorHandler wraps an Encoder and passes every repertoireError produced by
// the wrapped Transformer to handler, which writes a substitute into dst.
type errorHandler struct {
	*Encoder
	handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
}

// TODO: consider making this error public in some form.
type repertoireError interface {
	Replacement() byte
}

// Transform runs the wrapped Transformer and, each time it stops with a
// repertoireError, decodes the rune at the stopping position, lets handler
// write a substitute for it, and resumes after that rune. Any other error is
// returned unchanged. If handler reports that dst is too small, Transform
// returns transform.ErrShortDst with the rune left unconsumed.
func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
	for err != nil {
		rerr, ok := err.(repertoireError)
		if !ok {
			return nDst, nSrc, err
		}
		r, sz := utf8.DecodeRune(src[nSrc:])
		n, ok := h.handler(dst[nDst:], r, rerr)
		if !ok {
			return nDst, nSrc, transform.ErrShortDst
		}
		err = nil
		nDst += n
		// Skip the offending rune; only resume if input remains.
		if nSrc += sz; nSrc < len(src) {
			var dn, sn int
			dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
			nDst += dn
			nSrc += sn
		}
	}
	return nDst, nSrc, err
}

// errorToHTML writes r to dst as a decimal character reference of the form
// "&#N;" and returns the number of bytes written. It returns 0, false unless
// dst is strictly longer than the sequence.
func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
	buf := [8]byte{}
	b := strconv.AppendUint(buf[:0], uint64(r), 10)
	if n = len(b) + len("&#;"); n >= len(dst) {
		return 0, false
	}
	dst[0] = '&'
	dst[1] = '#'
	dst[copy(dst[2:], b)+2] = ';'
	return n, true
}

// errorToReplacement writes the single byte err.Replacement() to dst. It
// returns 0, false if dst is empty.
func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
	if len(dst) == 0 {
		return 0, false
	}
	dst[0] = err.Replacement()
	return 1, true
}

// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")

// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
// input byte that is not valid UTF-8.
var UTF8Validator transform.Transformer = utf8Validator{}

type utf8Validator struct{ transform.NopResetter }

// Transform copies src to dst unchanged for as long as it is valid UTF-8. At
// the first invalid byte it stops and returns ErrInvalidUTF8, or
// transform.ErrShortSrc when the bytes may be the start of a rune that is
// still incomplete and atEOF is false. It returns transform.ErrShortDst when
// dst cannot hold all of src.
func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	n := len(src)
	if n > len(dst) {
		n = len(dst)
	}
	for i := 0; i < n; {
		if c := src[i]; c < utf8.RuneSelf {
			dst[i] = c
			i++
			continue
		}
		_, size := utf8.DecodeRune(src[i:])
		if size == 1 {
			// All valid runes of size 1 (those below utf8.RuneSelf) were
			// handled above. We have invalid UTF-8 or we haven't seen the
			// full character yet.
			err = ErrInvalidUTF8
			if !atEOF && !utf8.FullRune(src[i:]) {
				err = transform.ErrShortSrc
			}
			return i, i, err
		}
		// A multi-byte rune is copied whole or not at all.
		if i+size > len(dst) {
			return i, i, transform.ErrShortDst
		}
		for ; size > 0; size-- {
			dst[i] = src[i]
			i++
		}
	}
	if len(src) > len(dst) {
		err = transform.ErrShortDst
	}
	return n, n, err
}