// Copyright 2014 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.package cases// This file contains the definitions of case mappings for all supported// languages. The rules for the language-specific tailorings were taken and// modified from the CLDR transform definitions in common/transforms.import ()// A mapFunc takes a context set to the current rune and writes the mapped// version to the same context. It may advance the context to the next rune. It// returns whether a checkpoint is possible: whether the pDst bytes written to// dst so far won't need changing as we see more source bytes.type mapFunc func(*context) bool// A spanFunc takes a context set to the current rune and returns whether this// rune would be altered when written to the output. It may advance the context// to the next rune. It returns whether a checkpoint is possible.type spanFunc func(*context) bool// maxIgnorable defines the maximum number of ignorables to consider for// lookahead operations.const maxIgnorable = 30// supported lists the language tags for which we have tailorings.const supported = "und af az el lt nl tr"func init() { := []language.Tag{}for , := rangestrings.Split(supported, " ") { = append(, language.MustParse()) }matcher = internal.NewInheritanceMatcher()Supported = language.NewCoverage()}var ( matcher *internal.InheritanceMatcherSupportedlanguage.Coverage// We keep the following lists separate, instead of having a single per- // language struct, to give the compiler a chance to remove unused code.// Some uppercase mappers are stateless, so we can precompute the // Transformers and save a bit on runtime allocations. 
upperFunc = []struct { upper mapFunc span spanFunc }{ {nil, nil}, // und {nil, nil}, // af {aztrUpper(upper), isUpper}, // az {elUpper, noSpan}, // el {ltUpper(upper), noSpan}, // lt {nil, nil}, // nl {aztrUpper(upper), isUpper}, // tr } undUpper transform.SpanningTransformer = &undUpperCaser{} undLower transform.SpanningTransformer = &undLowerCaser{} undLowerIgnoreSigma transform.SpanningTransformer = &undLowerIgnoreSigmaCaser{} lowerFunc = []mapFunc{nil, // undnil, // afaztrLower, // aznil, // elltLower, // ltnil, // nlaztrLower, // tr } titleInfos = []struct { title mapFunc lower mapFunc titleSpan spanFunc rewrite func(*context) }{ {title, lower, isTitle, nil}, // und {title, lower, isTitle, afnlRewrite}, // af {aztrUpper(title), aztrLower, isTitle, nil}, // az {title, lower, isTitle, nil}, // el {ltUpper(title), ltLower, noSpan, nil}, // lt {nlTitle, lower, nlTitleSpan, afnlRewrite}, // nl {aztrUpper(title), aztrLower, isTitle, nil}, // tr })func makeUpper( language.Tag, options) transform.SpanningTransformer { , , := matcher.Match() := upperFunc[].upperif == nil {returnundUpper }return &simpleCaser{f: , span: upperFunc[].span}}func makeLower( language.Tag, options) transform.SpanningTransformer { , , := matcher.Match() := lowerFunc[]if == nil {if .ignoreFinalSigma {returnundLowerIgnoreSigma }returnundLower }if .ignoreFinalSigma {return &simpleCaser{f: , span: isLower} }return &lowerCaser{first: ,midWord: finalSigma(), }}func makeTitle( language.Tag, options) transform.SpanningTransformer { , , := matcher.Match() := &titleInfos[] := .lowerif .noLower { = (*context).copy } elseif !.ignoreFinalSigma { = finalSigma() }return &titleCaser{title: .title,lower: ,titleSpan: .titleSpan,rewrite: .rewrite, }}func noSpan( *context) bool { .err = transform.ErrEndOfSpanreturnfalse}// TODO: consider a similar special case for the fast majority lower case. 
This// is a bit more involved so will require some more precise benchmarking to// justify it.type undUpperCaser struct{ transform.NopResetter }// undUpperCaser implements the Transformer interface for doing an upper case// mapping for the root locale (und). It eliminates the need for an allocation// as it prevents escaping by not using function pointers.func ( undUpperCaser) (, []byte, bool) (, int, error) { := context{dst: , src: , atEOF: }for .next() {upper(&) .checkpoint() }return .ret()}func ( undUpperCaser) ( []byte, bool) ( int, error) { := context{src: , atEOF: }for .next() && isUpper(&) { .checkpoint() }return .retSpan()}// undLowerIgnoreSigmaCaser implements the Transformer interface for doing// a lower case mapping for the root locale (und) ignoring final sigma// handling. This casing algorithm is used in some performance-critical packages// like secure/precis and x/net/http/idna, which warrants its special-casing.type undLowerIgnoreSigmaCaser struct{ transform.NopResetter }func ( undLowerIgnoreSigmaCaser) (, []byte, bool) (, int, error) { := context{dst: , src: , atEOF: }for .next() && lower(&) { .checkpoint() }return .ret()}// Span implements a generic lower-casing. This is possible as isLower works// for all lowercasing variants. All lowercase variants only vary in how they// transform a non-lowercase letter. They will never change an already lowercase// letter. 
In addition, there is no state.func ( undLowerIgnoreSigmaCaser) ( []byte, bool) ( int, error) { := context{src: , atEOF: }for .next() && isLower(&) { .checkpoint() }return .retSpan()}type simpleCaser struct {context f mapFunc span spanFunc}// simpleCaser implements the Transformer interface for doing a case operation// on a rune-by-rune basis.func ( *simpleCaser) (, []byte, bool) (, int, error) { := context{dst: , src: , atEOF: }for .next() && .f(&) { .checkpoint() }return .ret()}func ( *simpleCaser) ( []byte, bool) ( int, error) { := context{src: , atEOF: }for .next() && .span(&) { .checkpoint() }return .retSpan()}// undLowerCaser implements the Transformer interface for doing a lower case// mapping for the root locale (und) ignoring final sigma handling. This casing// algorithm is used in some performance-critical packages like secure/precis// and x/net/http/idna, which warrants its special-casing.type undLowerCaser struct{ transform.NopResetter }func ( undLowerCaser) (, []byte, bool) (, int, error) { := context{dst: , src: , atEOF: }for := true; .next(); {if {if .info.isCased() {if !lower(&) {break } = false } elseif !.copy() {break } } else {if .info.isNotCasedAndNotCaseIgnorable() {if !.copy() {break } = true } elseif !.hasPrefix("Σ") {if !lower(&) {break } } elseif !finalSigmaBody(&) {break } } .checkpoint() }return .ret()}func ( undLowerCaser) ( []byte, bool) ( int, error) { := context{src: , atEOF: }for .next() && isLower(&) { .checkpoint() }return .retSpan()}// lowerCaser implements the Transformer interface. 
The default Unicode lower// casing requires different treatment for the first and subsequent characters// of a word, most notably to handle the Greek final Sigma.type lowerCaser struct {undLowerIgnoreSigmaCasercontext first, midWord mapFunc}func ( *lowerCaser) (, []byte, bool) (, int, error) { .context = context{dst: , src: , atEOF: } := &.contextfor := true; .next(); {if {if .info.isCased() {if !.first() {break } = false } elseif !.copy() {break } } else {if .info.isNotCasedAndNotCaseIgnorable() {if !.copy() {break } = true } elseif !.midWord() {break } } .checkpoint() }return .ret()}// titleCaser implements the Transformer interface. Title casing algorithms// distinguish between the first letter of a word and subsequent letters of the// same word. It uses state to avoid requiring a potentially infinite lookahead.type titleCaser struct {context// rune mappings used by the actual casing algorithms. title mapFunc lower mapFunc titleSpan spanFunc rewrite func(*context)}// Transform implements the standard Unicode title case algorithm as defined in// Chapter 3 of The Unicode Standard:// toTitlecase(X): Find the word boundaries in X according to Unicode Standard// Annex #29, "Unicode Text Segmentation." For each word boundary, find the// first cased character F following the word boundary. 
If F exists, map F to// Titlecase_Mapping(F); then map all characters C between F and the following// word boundary to Lowercase_Mapping(C).func ( *titleCaser) (, []byte, bool) (, int, error) { .context = context{dst: , src: , atEOF: , isMidWord: .isMidWord} := &.contextif !.next() {return .ret() }for { := .infoif .rewrite != nil { .rewrite() } := .isMid()// Break out of this loop on failure to ensure we do not modify the // state incorrectly.if .isCased() {if !.isMidWord {if !.title() {break } .isMidWord = true } elseif !.lower() {break } } elseif !.copy() {break } elseif .isBreak() { .isMidWord = false }// As we save the state of the transformer, it is safe to call // checkpoint after any successful write.if !(.isMidWord && ) { .checkpoint() }if !.next() {break }if && .info.isMid() { .isMidWord = false } }return .ret()}func ( *titleCaser) ( []byte, bool) ( int, error) { .context = context{src: , atEOF: , isMidWord: .isMidWord} := &.contextif !.next() {return .retSpan() }for { := .infoif .rewrite != nil { .rewrite() } := .isMid()// Break out of this loop on failure to ensure we do not modify the // state incorrectly.if .isCased() {if !.isMidWord {if !.titleSpan() {break } .isMidWord = true } elseif !isLower() {break } } elseif .isBreak() { .isMidWord = false }// As we save the state of the transformer, it is safe to call // checkpoint after any successful write.if !(.isMidWord && ) { .checkpoint() }if !.next() {break }if && .info.isMid() { .isMidWord = false } }return .retSpan()}// finalSigma adds Greek final Sigma handing to another casing function. 
It// determines whether a lowercased sigma should be σ or ς, by looking ahead for// case-ignorables and a cased letters.func finalSigma( mapFunc) mapFunc {returnfunc( *context) bool {if !.hasPrefix("Σ") {return () }returnfinalSigmaBody() }}func finalSigmaBody( *context) bool {// Current rune must be ∑.// ::NFD(); // # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA // Σ } [:case-ignorable:]* [:cased:] → σ; // [:cased:] [:case-ignorable:]* { Σ → ς; // ::Any-Lower; // ::NFC(); := .pDst .writeString("ς")// TODO: we should do this here, but right now this will never have an // effect as this is called when the prefix is Sigma, whereas Dutch and // Afrikaans only test for an apostrophe. // // if t.rewrite != nil { // t.rewrite(c) // }// We need to do one more iteration after maxIgnorable, as a cased // letter is not an ignorable and may modify the result. := falsefor := 0; < maxIgnorable+1; ++ {if !.next() {returnfalse }if !.info.isCaseIgnorable() {// All Midword runes are also case ignorable, so we are // guaranteed to have a letter or word break here. As we are // unreading the run, there is no need to unset c.isMidWord; // the title caser will handle this.if .info.isCased() {// p+1 is guaranteed to be in bounds: if writing ς was // successful, p+1 will contain the second byte of ς. If not, // this function will have returned after c.next returned false. .dst[+1]++ // ς → σ } .unreadRune()returntrue }// A case ignorable may also introduce a word break, so we may need // to continue searching even after detecting a break. := .info.isMid()if ( && ) || .info.isBreak() { .isMidWord = false } = .copy() }returntrue}// finalSigmaSpan would be the same as isLower.// elUpper implements Greek upper casing, which entails removing a predefined// set of non-blocked modifiers. Note that these accents should not be removed// for title casing!// Example: "Οδός" -> "ΟΔΟΣ".func elUpper( *context) bool {// From CLDR: // [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? 
{ [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ; // [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ; , := utf8.DecodeRune(.src[.pSrc:]) := .pDstif !upper() {returnfalse }if !unicode.Is(unicode.Greek, ) {returntrue } := 0// Take the properties of the uppercased rune that is already written to the // destination. This saves us the trouble of having to uppercase the // decomposed rune again.if := norm.NFD.Properties(.dst[:]).Decomposition(); != nil {// Restore the destination position and process the decomposed rune. , := utf8.DecodeRune()if <= 0xFF { // See A.6.1returntrue } .pDst = // Insert the first rune and ignore the modifiers. See A.6.2. .writeBytes([:]) = len([:]) / 2// Greek modifiers are always of length 2. }for ; < maxIgnorable && .next(); ++ {switch , := utf8.DecodeRune(.src[.pSrc:]); {// Above and Iota Subscriptcase0x0300, // U+0300 COMBINING GRAVE ACCENT0x0301, // U+0301 COMBINING ACUTE ACCENT0x0304, // U+0304 COMBINING MACRON0x0306, // U+0306 COMBINING BREVE0x0308, // U+0308 COMBINING DIAERESIS0x0313, // U+0313 COMBINING COMMA ABOVE0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE0x0342, // U+0342 COMBINING GREEK PERISPOMENI0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI// No-op. Gobble the modifier.default:switch , := trie.lookup(.src[.pSrc:]); info().cccType() {casecccZero: .unreadRune()returntrue// We don't need to test for IotaSubscript as the only rune that // qualifies (U+0345) was already excluded in the switch statement // above. See A.4.casecccAbove:return .copy()default:// Some other modifier. We're still allowed to gobble Greek // modifiers after this. .copy() } } }return == maxIgnorable}// TODO: implement elUpperSpan (low-priority: complex and infrequent).func ltLower( *context) bool {// From CLDR: // # Introduce an explicit dot above when lowercasing capital I's and J's // # whenever there are more accents above. 
// # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) // # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I // # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J // # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK // # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE // # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE // # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE // ::NFD(); // I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307; // J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307; // I \u0328 (Į) } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0328 \u0307; // I \u0300 (Ì) → i \u0307 \u0300; // I \u0301 (Í) → i \u0307 \u0301; // I \u0303 (Ĩ) → i \u0307 \u0303; // ::Any-Lower(); // ::NFC(); := 0if := .src[.pSrc]; < utf8.RuneSelf {lower()if != 'I' && != 'J' {returntrue } } else { := norm.NFD.Properties(.src[.pSrc:])if := .Decomposition(); len() >= 3 && ([0] == 'I' || [0] == 'J') {// UTF-8 optimization: the decomposition will only have an above // modifier if the last rune of the decomposition is in [U+300-U+311]. // In all other cases, a decomposition starting with I is always // an I followed by modifiers that are not cased themselves. See A.2.if [1] == 0xCC && [2] <= 0x91 { // A.2.4.if !.writeBytes([:1]) {returnfalse } .dst[.pDst-1] += 'a' - 'A'// lower// Assumption: modifier never changes on lowercase. See A.1. // Assumption: all modifiers added have CCC = Above. See A.2.3.return .writeString("\u0307") && .writeBytes([1:]) }// In all other cases the additional modifiers will have a CCC // that is less than 230 (Above). We will insert the U+0307, if // needed, after these modifiers so that a string in FCD form // will remain so. 
See A.2.2.lower() = 1 } else {returnlower() } }for ; < maxIgnorable && .next(); ++ {switch .info.cccType() {casecccZero: .unreadRune()returntruecasecccAbove:return .writeString("\u0307") && .copy() // See A.1.default: .copy() // See A.1. } }return == maxIgnorable}// ltLowerSpan would be the same as isLower.func ltUpper( mapFunc) mapFunc {returnfunc( *context) bool {// Unicode: // 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE // // From CLDR: // # Remove \u0307 following soft-dotteds (i, j, and the like), with possible // # intervening non-230 marks. // ::NFD(); // [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ; // ::Any-Upper(); // ::NFC();// TODO: See A.5. A soft-dotted rune never has an exception. This would // allow us to overload the exception bit and encode this property in // info. Need to measure performance impact of this. , := utf8.DecodeRune(.src[.pSrc:]) := .pDstif !() {returnfalse }if !unicode.Is(unicode.Soft_Dotted, ) {returntrue }// We don't need to do an NFD normalization, as a soft-dotted rune never // contains U+0307. See A.3. := 0for ; < maxIgnorable && .next(); ++ {switch .info.cccType() {casecccZero: .unreadRune()returntruecasecccAbove:if .hasPrefix("\u0307") {// We don't do a full NFC, but rather combine runes for // some of the common cases. 
(Returning NFC or // preserving normal form is neither a requirement nor // a possibility anyway).if !.next() {returnfalse }if .dst[] == 'I' && .pDst == +1 && .src[.pSrc] == 0xcc { := ""switch .src[.pSrc+1] {case0x80: // U+0300 COMBINING GRAVE ACCENT = "\u00cc"// U+00CC LATIN CAPITAL LETTER I WITH GRAVEcase0x81: // U+0301 COMBINING ACUTE ACCENT = "\u00cd"// U+00CD LATIN CAPITAL LETTER I WITH ACUTEcase0x83: // U+0303 COMBINING TILDE = "\u0128"// U+0128 LATIN CAPITAL LETTER I WITH TILDEcase0x88: // U+0308 COMBINING DIAERESIS = "\u00cf"// U+00CF LATIN CAPITAL LETTER I WITH DIAERESISdefault: }if != "" { .pDst = return .writeString() } } }return .copy()default: .copy() } }return == maxIgnorable }}// TODO: implement ltUpperSpan (low priority: complex and infrequent).func aztrUpper( mapFunc) mapFunc {returnfunc( *context) bool {// i→İ;if .src[.pSrc] == 'i' {return .writeString("İ") }return () }}func aztrLower( *context) ( bool) {// From CLDR: // # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri // # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE // İ→i; // # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. // # This matches the behavior of the canonically equivalent I-dot_above // # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE // # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. // # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I // I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ; // I→ı ; // ::Any-Lower();if .hasPrefix("\u0130") { // İreturn .writeString("i") }if .src[.pSrc] != 'I' {returnlower() }// We ignore the lower-case I for now, but insert it later when we know // which form we need. := .pSrc + .sz := 0:// We check for up to n ignorables before \u0307. 
As \u0307 is an // ignorable as well, n is maxIgnorable-1.for ; < maxIgnorable && .next(); ++ {switch .info.cccType() {casecccAbove:if .hasPrefix("\u0307") {return .writeString("i") && .writeBytes(.src[:.pSrc]) // ignore U+0307 } = truebreakcasecccZero: .unreadRune() = truebreakdefault:// We'll write this rune after we know which starter to use. } }if == maxIgnorable { = true }return .writeString("ı") && .writeBytes(.src[:.pSrc+.sz]) && }// aztrLowerSpan would be the same as isLower.func nlTitle( *context) bool {// From CLDR: // # Special titlecasing for Dutch initial "ij". // ::Any-Title(); // # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29) // [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;if .src[.pSrc] != 'I' && .src[.pSrc] != 'i' {returntitle() }if !.writeString("I") || !.next() {returnfalse }if .src[.pSrc] == 'j' || .src[.pSrc] == 'J' {return .writeString("J") } .unreadRune()returntrue}func nlTitleSpan( *context) bool {// From CLDR: // # Special titlecasing for Dutch initial "ij". // ::Any-Title(); // # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29) // [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;if .src[.pSrc] != 'I' {returnisTitle() }if !.next() || .src[.pSrc] == 'j' {returnfalse }if .src[.pSrc] != 'J' { .unreadRune() }returntrue}// Not part of CLDR, but see https://unicode.org/cldr/trac/ticket/7078.func afnlRewrite( *context) {if .hasPrefix("'") || .hasPrefix("’") { .isMidWord = true }}
// The pages are generated with Golds v0.6.7. (GOOS=linux GOARCH=amd64)
// Golds is a Go 101 project developed by Tapir Liu.
// PR and bug reports are welcome and can be submitted to the issue list.
// Please follow @Go100and1 (reachable from the left QR code) to get the latest news of Golds.