// Copyright 2010 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.package htmlimport ()// These replacements permit compatibility with old numeric entities that// assumed Windows-1252 encoding.// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-referencevar replacementTable = [...]rune{'\u20AC', // First entry is what 0x80 should be replaced with.'\u0081','\u201A','\u0192','\u201E','\u2026','\u2020','\u2021','\u02C6','\u2030','\u0160','\u2039','\u0152','\u008D','\u017D','\u008F','\u0090','\u2018','\u2019','\u201C','\u201D','\u2022','\u2013','\u2014','\u02DC','\u2122','\u0161','\u203A','\u0153','\u009D','\u017E','\u0178', // Last entry is 0x9F.// 0x00->'\uFFFD' is handled programmatically. // 0x0D->'\u000D' is a no-op.}// unescapeEntity reads an entity like "<" from b[src:] and writes the// corresponding "<" to b[dst:], returning the incremented dst and src cursors.// Precondition: b[src] == '&' && dst <= src.// attribute should be true if parsing an attribute value.func unescapeEntity( []byte, , int, bool) (, int) {// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference// i starts at 1 because we already know that s[0] == '&'. , := 1, [:]iflen() <= 1 { [] = []return + 1, + 1 }if [] == '#' {iflen() <= 3 { // We need to have at least "&#.". [] = []return + 1, + 1 } ++ := [] := falseif == 'x' || == 'X' { = true ++ } := '\x00'for < len() { = [] ++if {if'0' <= && <= '9' { = 16* + rune() - '0'continue } elseif'a' <= && <= 'f' { = 16* + rune() - 'a' + 10continue } elseif'A' <= && <= 'F' { = 16* + rune() - 'A' + 10continue } } elseif'0' <= && <= '9' { = 10* + rune() - '0'continue }if != ';' { -- }break }if <= 3 { // No characters matched. [] = []return + 1, + 1 }if0x80 <= && <= 0x9F {// Replace characters from Windows-1252 with UTF-8 equivalents. 
= replacementTable[-0x80] } elseif == 0 || (0xD800 <= && <= 0xDFFF) || > 0x10FFFF {// Replace invalid characters with the replacement character. = '\uFFFD' }return + utf8.EncodeRune([:], ), + }// Consume the maximum number of characters possible, with the // consumed characters matching one of the named references.for < len() { := [] ++// Lower-cased characters are more common in entities, so we check for them first.if'a' <= && <= 'z' || 'A' <= && <= 'Z' || '0' <= && <= '9' {continue }if != ';' { -- }break } := string([1:])if == "" {// No-op. } elseif && [len()-1] != ';' && len() > && [] == '=' {// No-op. } elseif := entity[]; != 0 {return + utf8.EncodeRune([:], ), + } elseif := entity2[]; [0] != 0 { := + utf8.EncodeRune([:], [0])return + utf8.EncodeRune([:], [1]), + } elseif ! { := len() - 1if > longestEntityWithoutSemicolon { = longestEntityWithoutSemicolon }for := ; > 1; -- {if := entity[[:]]; != 0 {return + utf8.EncodeRune([:], ), + + 1 } } } , = +, +copy([:], [:])return , }// unescape unescapes b's entities in-place, so that "a<b" becomes "a<b".// attribute should be true if parsing an attribute value.func unescape( []byte, bool) []byte {for , := range {if == '&' { , := unescapeEntity(, , , )for < len() { := []if == '&' { , = unescapeEntity(, , , ) } else { [] = , = +1, +1 } }return [0:] } }return}// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".func lower( []byte) []byte {for , := range {if'A' <= && <= 'Z' { [] = + 'a' - 'A' } }return}// escapeComment is like func escape but escapes its input bytes less often.// Per https://github.com/golang/go/issues/58246 some HTML comments are (1)// meaningful and (2) contain angle brackets that we'd like to avoid escaping// unless we have to.//// "We have to" includes the '&' byte, since that introduces other escapes.//// It also includes those bytes (not including EOF) that would otherwise end// the comment. 
Per the summary table at the bottom of comment_test.go, this is// the '>' byte that, per above, we'd like to avoid escaping unless we have to.//// Studying the summary table (and T actions in its '>' column) closely, we// only need to escape in states 43, 44, 49, 51 and 52. State 43 is at the// start of the comment data. State 52 is after a '!'. The other three states// are after a '-'.//// Our algorithm is thus to escape every '&' and to escape '>' if and only if:// - The '>' is after a '!' or '-' (in the unescaped data) or// - The '>' is at the start of the comment data (after the opening "<!--").func escapeComment( writer, string) error {// When modifying this function, consider manually increasing the // maxSuffixLen constant in func TestComments, from 6 to e.g. 9 or more. // That increase should only be temporary, not committed, as it // exponentially affects the test running time.iflen() == 0 {returnnil }// Loop: // - Grow j such that s[i:j] does not need escaping. // - If s[j] does need escaping, output s[i:j] and an escaped s[j], // resetting i and j to point past that s[j] byte. := 0for := 0; < len(); ++ { := ""switch [] {case'&': = "&"case'>':if > 0 {if := [-1]; ( != '!') && ( != '-') {continue } } = ">"default:continue }if < {if , := .WriteString([:]); != nil {return } }if , := .WriteString(); != nil {return } = + 1 }if < len() {if , := .WriteString([:]); != nil {return } }returnnil}// escapeCommentString is to EscapeString as escapeComment is to escape.func escapeCommentString( string) string {ifstrings.IndexAny(, "&>") == -1 {return }varbytes.BufferescapeComment(&, )return .String()}const escapedChars = "&'<>\"\r"func escape( writer, string) error { := strings.IndexAny(, escapedChars)for != -1 {if , := .WriteString([:]); != nil {return }varstringswitch [] {case'&': = "&"case'\'':// "'" is shorter than "'" and apos was not in HTML until HTML5. = "'"case'<': = "<"case'>': = ">"case'"':// """ is shorter than """. 
= """case'\r': = " "default:panic("unrecognized escape character") } = [+1:]if , := .WriteString(); != nil {return } = strings.IndexAny(, escapedChars) } , := .WriteString()return}// EscapeString escapes special characters like "<" to become "<". It// escapes only five such characters: <, >, &, ' and ".// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't// always true.func ( string) string {ifstrings.IndexAny(, escapedChars) == -1 {return }varbytes.Bufferescape(&, )return .String()}// UnescapeString unescapes entities like "<" to become "<". It unescapes a// larger range of entities than EscapeString escapes. For example, "á"// unescapes to "รก", as does "á" and "&xE1;".// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't// always true.func ( string) string {for , := range {if == '&' {returnstring(unescape([]byte(), false)) } }return}
// The pages are generated with Golds v0.6.7. (GOOS=linux GOARCH=amd64)
// Golds is a Go 101 project developed by Tapir Liu.
// PR and bug reports are welcome and can be submitted to the issue list.
// Please follow @Go100and1 (reachable from the left QR code) to get the latest news of Golds.