// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package charset provides common text encodings for HTML documents.
//
// The mapping from encoding labels to encodings is defined at
// https://encoding.spec.whatwg.org/.
package charset // import "golang.org/x/net/html/charset" import ( ) // Lookup returns the encoding with the specified label, and its canonical // name. It returns nil and the empty string if label is not one of the // standard encodings for HTML. Matching is case-insensitive and ignores // leading and trailing whitespace. Encoders will use HTML escape sequences for // runes that are not supported by the character set. func ( string) ( encoding.Encoding, string) { , := htmlindex.Get() if != nil { return nil, "" } , _ = htmlindex.Name() return &htmlEncoding{}, } type htmlEncoding struct{ encoding.Encoding } func ( *htmlEncoding) () *encoding.Encoder { // HTML requires a non-terminating legacy encoder. We use HTML escapes to // substitute unsupported code points. return encoding.HTMLEscapeUnsupported(.Encoding.NewEncoder()) } // DetermineEncoding determines the encoding of an HTML document by examining // up to the first 1024 bytes of content and the declared Content-Type. // // See http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding func ( []byte, string) ( encoding.Encoding, string, bool) { if len() > 1024 { = [:1024] } for , := range boms { if bytes.HasPrefix(, .bom) { , = Lookup(.enc) return , , true } } if , , := mime.ParseMediaType(); == nil { if , := ["charset"]; { if , = Lookup(); != nil { return , , true } } } if len() > 0 { , = prescan() if != nil { return , , false } } // Try to detect UTF-8. // First eliminate any partial rune at the end. for := len() - 1; >= 0 && > len()-4; -- { := [] if < 0x80 { break } if utf8.RuneStart() { = [:] break } } := false for , := range { if >= 0x80 { = true break } } if && utf8.Valid() { return encoding.Nop, "utf-8", false } // TODO: change default depending on user's locale? return charmap.Windows1252, "windows-1252", false } // NewReader returns an io.Reader that converts the content of r to UTF-8. // It calls DetermineEncoding to find out what r's encoding is. 
func ( io.Reader, string) (io.Reader, error) { := make([]byte, 1024) , := io.ReadFull(, ) switch { case == io.ErrUnexpectedEOF: = [:] = bytes.NewReader() case != nil: return nil, default: = io.MultiReader(bytes.NewReader(), ) } if , , := DetermineEncoding(, ); != encoding.Nop { = transform.NewReader(, .NewDecoder()) } return , nil } // NewReaderLabel returns a reader that converts from the specified charset to // UTF-8. It uses Lookup to find the encoding that corresponds to label, and // returns an error if Lookup returns nil. It is suitable for use as // encoding/xml.Decoder's CharsetReader function. func ( string, io.Reader) (io.Reader, error) { , := Lookup() if == nil { return nil, fmt.Errorf("unsupported charset: %q", ) } return transform.NewReader(, .NewDecoder()), nil } func prescan( []byte) ( encoding.Encoding, string) { := html.NewTokenizer(bytes.NewReader()) for { switch .Next() { case html.ErrorToken: return nil, "" case html.StartTagToken, html.SelfClosingTagToken: , := .TagName() if !bytes.Equal(, []byte("meta")) { continue } := make(map[string]bool) := false const ( = iota ) := = "" = nil for { var , []byte , , = .TagAttr() := string() if [] { continue } [] = true for , := range { if 'A' <= && <= 'Z' { [] = + 0x20 } } switch { case "http-equiv": if bytes.Equal(, []byte("content-type")) { = true } case "content": if == nil { = fromMetaElement(string()) if != "" { , = Lookup() if != nil { = } } } case "charset": , = Lookup(string()) = } } if == || == && ! 
{ continue } if strings.HasPrefix(, "utf-16") { = "utf-8" = encoding.Nop } if != nil { return , } } } } func fromMetaElement( string) string { for != "" { := strings.Index(, "charset") if == -1 { return "" } = [+len("charset"):] = strings.TrimLeft(, " \t\n\f\r") if !strings.HasPrefix(, "=") { continue } = [1:] = strings.TrimLeft(, " \t\n\f\r") if == "" { return "" } if := [0]; == '"' || == '\'' { = [1:] := strings.IndexRune(, rune()) if == -1 { return "" } return [:] } := strings.IndexAny(, "; \t\n\f\r") if == -1 { = len() } return [:] } return "" } var boms = []struct { bom []byte enc string }{ {[]byte{0xfe, 0xff}, "utf-16be"}, {[]byte{0xff, 0xfe}, "utf-16le"}, {[]byte{0xef, 0xbb, 0xbf}, "utf-8"}, }