// Copyright 2013 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.package languageimport ()// isAlpha returns true if the byte is not a digit.// b must be an ASCII letter or digit.func isAlpha( byte) bool {return > '9'}// isAlphaNum returns true if the string contains only ASCII letters or digits.func isAlphaNum( []byte) bool {for , := range {if !('a' <= && <= 'z' || 'A' <= && <= 'Z' || '0' <= && <= '9') {returnfalse } }returntrue}// ErrSyntax is returned by any of the parsing functions when the// input is not well-formed, according to BCP 47.// TODO: return the position at which the syntax error occurred?varErrSyntax = errors.New("language: tag is not well-formed")// ErrDuplicateKey is returned when a tag contains the same key twice with// different values in the -u section.varErrDuplicateKey = errors.New("language: different values for same key in -u extension")// ValueError is returned by any of the parsing functions when the// input is well-formed but the respective subtag is not recognized// as a valid value.typeValueErrorstruct { v [8]byte}// NewValueError creates a new ValueError.func ( []byte) ValueError {varValueErrorcopy(.v[:], )return}func ( ValueError) () []byte { := bytes.IndexByte(.v[:], 0)if == -1 { = 8 }return .v[:]}// Error implements the error interface.func ( ValueError) () string {returnfmt.Sprintf("language: subtag %q is well-formed but unknown", .tag())}// Subtag returns the subtag for which the error occurred.func ( ValueError) () string {returnstring(.tag())}// scanner is used to scan BCP 47 tokens, which are separated by _ or -.type scanner struct { b []byte bytes [max99thPercentileSize]byte token []byte start int// start position of the current token end int// end position of the current token next int// next point for scan err error done bool}func makeScannerString( string) scanner { := scanner{}iflen() <= len(.bytes) { .b = .bytes[:copy(.bytes[:], 
)] } else { .b = []byte() } .init()return}// makeScanner returns a scanner using b as the input buffer.// b is not copied and may be modified by the scanner routines.func makeScanner( []byte) scanner { := scanner{b: } .init()return}func ( *scanner) () {for , := range .b {if == '_' { .b[] = '-' } } .scan()}// restToLower converts the string between start and end to lower case.func ( *scanner) (, int) {for := ; < ; ++ { := .b[]if'A' <= && <= 'Z' { .b[] += 'a' - 'A' } }}func ( *scanner) ( error) {if .err == nil || ( == ErrSyntax && .err != ErrSyntax) { .err = }}// resizeRange shrinks or grows the array at position oldStart such that// a new string of size newSize can fit between oldStart and oldEnd.// Sets the scan point to after the resized range.func ( *scanner) (, , int) { .start = if := + ; != { := - var []byteif := len(.b) + ; > cap(.b) { = make([]byte, )copy(, .b[:]) } else { = .b[:] }copy([:], .b[:]) .b = .next = + (.next - .end) .end = }}// replace replaces the current token with repl.func ( *scanner) ( string) { .resizeRange(.start, .end, len())copy(.b[.start:], )}// gobble removes the current token from the input.// Caller must call scan after calling gobble.func ( *scanner) ( error) { .setError()if .start == 0 { .b = .b[:+copy(.b, .b[.next:])] .end = 0 } else { .b = .b[:.start-1+copy(.b[.start-1:], .b[.end:])] .end = .start - 1 } .next = .start}// deleteRange removes the given range from s.b before the current token.func ( *scanner) (, int) { .b = .b[:+copy(.b[:], .b[:])] := - .next -= .start -= .end -= }// scan parses the next token of a BCP 47 string. 
Tokens that are larger// than 8 characters or include non-alphanumeric characters result in an error// and are gobbled and removed from the output.// It returns the end position of the last token consumed.func ( *scanner) () ( int) { = .end .token = nilfor .start = .next; .next < len(.b); { := bytes.IndexByte(.b[.next:], '-')if == -1 { .end = len(.b) .next = len(.b) = .end - .start } else { .end = .next + .next = .end + 1 } := .b[.start:.end]if < 1 || > 8 || !isAlphaNum() { .gobble(ErrSyntax)continue } .token = return }if := len(.b); > 0 && .b[-1] == '-' { .setError(ErrSyntax) .b = .b[:len(.b)-1] } .done = truereturn}// acceptMinSize parses multiple tokens of the given size or greater.// It returns the end position of the last token consumed.func ( *scanner) ( int) ( int) { = .end .scan()for ; len(.token) >= ; .scan() { = .end }return}// Parse parses the given BCP 47 string and returns a valid Tag. If parsing// failed it returns an error and any part of the tag that could be parsed.// If parsing succeeded but an unknown value was found, it returns// ValueError. The Tag returned in this case is just stripped of the unknown// value. All other values are preserved. 
It accepts tags in the BCP 47 format// and extensions to this standard defined in// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.func ( string) ( Tag, error) {// TODO: consider supporting old-style locale key-value pairs.if == "" {returnUnd, ErrSyntax }deferfunc() {ifrecover() != nil { = Und = ErrSyntaxreturn } }()iflen() <= maxAltTaglen { := [maxAltTaglen]byte{}for , := range {// Generating invalid UTF-8 is okay as it won't match.if'A' <= && <= 'Z' { += 'a' - 'A' } elseif == '_' { = '-' } [] = byte() }if , := grandfathered(); {return , nil } } := makeScannerString()returnparse(&, )}func parse( *scanner, string) ( Tag, error) { = Undvarintif := len(.token); <= 1 { .toLower(0, len(.b))if == 0 || .token[0] != 'x' {return , ErrSyntax } = parseExtensions() } elseif >= 4 {returnUnd, ErrSyntax } else { // the usual case , = parseTag(, true)if := len(.token); == 1 { .pExt = uint16() = parseExtensions() } elseif < len(.b) { .setError(ErrSyntax) .b = .b[:] } }ifint(.pVariant) < len(.b) {if < len() { = [:] }iflen() > 0 && tag.Compare(, .b) == 0 { .str = } else { .str = string(.b) } } else { .pVariant, .pExt = 0, 0 }return , .err}// parseTag parses language, script, region and variants.// It returns a Tag and the end position in the input that was parsed.// If doNorm is true, then <lang>-<extlang> will be normalized to <extlang>.func parseTag( *scanner, bool) ( Tag, int) {varerror// TODO: set an error if an unknown lang, script or region is encountered. 
.LangID, = getLangID(.token) .setError() .replace(.LangID.String()) := .start = .scan()forlen(.token) == 3 && isAlpha(.token[0]) {// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent // to a tag of the form <extlang>.if { , := getLangID(.token)if != 0 { .LangID = := .String()copy(.b[:], ) .b[+len()] = '-' .start = + len() + 1 } .gobble() } = .scan() }iflen(.token) == 4 && isAlpha(.token[0]) { .ScriptID, = getScriptID(script, .token)if .ScriptID == 0 { .gobble() } = .scan() }if := len(.token); >= 2 && <= 3 { .RegionID, = getRegionID(.token)if .RegionID == 0 { .gobble() } else { .replace(.RegionID.String()) } = .scan() } .toLower(.start, len(.b)) .pVariant = byte() = parseVariants(, , ) .pExt = uint16()return , }var separator = []byte{'-'}// parseVariants scans tokens as long as each token is a valid variant string.// Duplicate variants are removed.func parseVariants( *scanner, int, Tag) int { := .start := [4]uint8{} := [4][]byte{} := [:0] := [:0] := -1 := falsefor ; len(.token) >= 4; .scan() {// TODO: measure the impact of needing this conversion and redesign // the data structure if there is an issue. , := variantIndex[string(.token)]if ! {// unknown variant // TODO: allow user-defined variants? .gobble(NewValueError(.token))continue } = append(, ) = append(, .token)if ! 
{if < int() { = int() } else { = true// There is no legal combinations of more than 7 variants // (and this is by no means a useful sequence).const = 8iflen() > {break } } } = .end }if {sort.Sort(variantsSort{, }) , := 0, -1for , := range { := int()if == {// Remove duplicates.continue } [] = [] [] = [] ++ = }if := bytes.Join([:], separator); len() == 0 { = - 1 } else { .resizeRange(, , len())copy(.b[.start:], ) = .end } }return}type variantsSort struct { i []uint8 v [][]byte}func ( variantsSort) () int {returnlen(.i)}func ( variantsSort) (, int) { .i[], .i[] = .i[], .i[] .v[], .v[] = .v[], .v[]}func ( variantsSort) (, int) bool {return .i[] < .i[]}type bytesSort struct { b [][]byte n int// first n bytes to compare}func ( bytesSort) () int {returnlen(.b)}func ( bytesSort) (, int) { .b[], .b[] = .b[], .b[]}func ( bytesSort) (, int) bool {for := 0; < .n; ++ {if .b[][] == .b[][] {continue }return .b[][] < .b[][] }returnfalse}// parseExtensions parses and normalizes the extensions in the buffer.// It returns the last position of scan.b that is part of any extension.// It also trims scan.b to remove excess parts accordingly.func parseExtensions( *scanner) int { := .start := [][]byte{} := []byte{} := .endforlen(.token) == 1 { := .start := .token[0] = parseExtension() := .b[:]iflen() < 3 || ( != 'x' && len() < 4) { .setError(ErrSyntax) = continue } elseif == && ( == 'x' || .start == len(.b)) { .b = .b[:]return } elseif == 'x' { = break } = append(, ) }sort.Sort(bytesSort{, 1})iflen() > 0 { = append(, ) } .b = .b[:]iflen() > 0 { .b = append(.b, bytes.Join(, separator)...) } elseif > 0 {// Strip trailing '-'. .b = .b[:-1] }return}// parseExtension parses a single extension and returns the position of// the extension end.func parseExtension( *scanner) int { , := .start, .endswitch .token[0] {case'u': // https://www.ietf.org/rfc/rfc6067.txt := .scan()for := []byte{}; len(.token) > 2; .scan() {ifbytes.Compare(.token, ) != -1 {// Attributes are unsorted. Start over from scratch. 
:= + 1 .next = := [][]byte{}for .scan(); len(.token) > 2; .scan() { = append(, .token) = .end }sort.Sort(bytesSort{, 3})copy(.b[:], bytes.Join(, separator))break } = .token = .end }// Scan key-type sequences. A key is of length 2 and may be followed // by 0 or more "type" subtags from 3 to the maximum of 8 letters.var , []bytefor := ; len(.token) == 2; = { = .token = .endfor .scan(); < .end && len(.token) > 2; .scan() { = .end }// TODO: check key value validityifbytes.Compare(, ) != 1 || .err != nil {// We have an invalid key or the keys are not sorted. // Start scanning keys from scratch and reorder. := + 1 .next = := [][]byte{}for .scan(); len(.token) == 2; { := .start = .endfor .scan(); < .end && len(.token) > 2; .scan() { = .end } = append(, .b[:]) }sort.Stable(bytesSort{, 2})if := len(); > 0 { := 0for := 1; < ; ++ {if !bytes.Equal([][:2], [][:2]) { ++ [] = [] } elseif !bytes.Equal([], []) { .setError(ErrDuplicateKey) } } = [:+1] } := bytes.Join(, separator)if := + len(); < { .deleteRange(, ) = }copy(.b[:], )break } }case't': // https://www.ietf.org/rfc/rfc6497.txt .scan()if := len(.token); >= 2 && <= 3 && isAlpha(.token[1]) { _, = parseTag(, false) .toLower(, ) }forlen(.token) == 2 && !isAlpha(.token[1]) { = .acceptMinSize(3) }case'x': = .acceptMinSize(1)default: = .acceptMinSize(2) }return}// getExtension returns the name, body and end position of the extension.func getExtension( string, int) ( int, string) {if [] == '-' { ++ }if [] == 'x' {returnlen(), [:] } = nextExtension(, )return , [:]}// nextExtension finds the next extension within the string, searching// for the -<char>- pattern from position p.// In the fast majority of cases, language tags will have at most// one extension and extensions tend to be small.func nextExtension( string, int) int {for := len() - 3; < ; {if [] == '-' {if [+2] == '-' {return } += 3 } else { ++ } }returnlen()}
The pages are generated with Golds v0.6.7. (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu.
PR and bug reports are welcome and can be submitted to the issue list.
Please follow @Go100and1 (reachable from the left QR code) to get the latest news of Golds.