Source File
map.go
Belonging Package
golang.org/x/text/cases
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cases
// This file contains the definitions of case mappings for all supported
// languages. The rules for the language-specific tailorings were taken and
// modified from the CLDR transform definitions in common/transforms.
import (
	"strings"
	"unicode"
	"unicode/utf8"

	"golang.org/x/text/internal"
	"golang.org/x/text/language"
	"golang.org/x/text/transform"
	"golang.org/x/text/unicode/norm"
)
// A mapFunc takes a context set to the current rune and writes the mapped
// version to the same context. It may advance the context to the next rune. It
// returns whether a checkpoint is possible: whether the pDst bytes written to
// dst so far won't need changing as we see more source bytes.
//
// The Transform loops in this file stop at the first mapFunc call that
// returns false.
type mapFunc func(*context) bool
// A spanFunc takes a context set to the current rune and returns whether this
// rune would be altered when written to the output. It may advance the context
// to the next rune. It returns whether a checkpoint is possible.
//
// NOTE(review): the Span loops in this file keep going while the spanFunc
// returns true and stop at the first false (see noSpan, which always ends the
// span), so a true result effectively means "no change needed so far".
type spanFunc func(*context) bool
// maxIgnorable defines the maximum number of ignorables to consider for
// lookahead operations.
const maxIgnorable = 30
// supported lists the language tags for which we have tailorings.
// The tags are space-separated; the per-language tables in this file
// (upperFunc, lowerFunc, titleInfos) are annotated in this same order.
const supported = "und af az el lt nl tr"
func init() {
:= []language.Tag{}
for , := range strings.Split(supported, " ") {
= append(, language.MustParse())
}
matcher = internal.NewInheritanceMatcher()
Supported = language.NewCoverage()
}
var (
// matcher is assigned in init. The make* constructors call matcher.Match and
// use the result to index the per-language tables below.
matcher *internal.InheritanceMatcher
// Supported is the language coverage of this package; it is assigned in init.
Supported language.Coverage
// We keep the following lists separate, instead of having a single per-
// language struct, to give the compiler a chance to remove unused code.
// Some uppercase mappers are stateless, so we can precompute the
// Transformers and save a bit on runtime allocations.
//
// Each table below has one entry per tag of the supported constant, in the
// same order (see the trailing comment on every row). A nil entry means the
// language has no tailoring and uses the root (und) behavior.
upperFunc = []struct {
upper mapFunc
span spanFunc
}{
{nil, nil}, // und
{nil, nil}, // af
{aztrUpper(upper), isUpper}, // az
{elUpper, noSpan}, // el
{ltUpper(upper), noSpan}, // lt
{nil, nil}, // nl
{aztrUpper(upper), isUpper}, // tr
}
// Precomputed transformers for the root locale; returned directly by
// makeUpper and makeLower when no tailoring applies.
undUpper transform.SpanningTransformer = &undUpperCaser{}
undLower transform.SpanningTransformer = &undLowerCaser{}
undLowerIgnoreSigma transform.SpanningTransformer = &undLowerIgnoreSigmaCaser{}
// lowerFunc holds the tailored lower-case mapper per language, or nil.
lowerFunc = []mapFunc{
nil, // und
nil, // af
aztrLower, // az
nil, // el
ltLower, // lt
nil, // nl
aztrLower, // tr
}
// titleInfos holds, per language, the mappers used by titleCaser: the
// title and lower mapFuncs, the span check for the title-cased rune, and an
// optional rewrite hook that titleCaser calls before processing each rune.
titleInfos = []struct {
title mapFunc
lower mapFunc
titleSpan spanFunc
rewrite func(*context)
}{
{title, lower, isTitle, nil}, // und
{title, lower, isTitle, afnlRewrite}, // af
{aztrUpper(title), aztrLower, isTitle, nil}, // az
{title, lower, isTitle, nil}, // el
{ltUpper(title), ltLower, noSpan, nil}, // lt
{nlTitle, lower, nlTitleSpan, afnlRewrite}, // nl
{aztrUpper(title), aztrLower, isTitle, nil}, // tr
}
)
func makeUpper( language.Tag, options) transform.SpanningTransformer {
, , := matcher.Match()
:= upperFunc[].upper
if == nil {
return undUpper
}
return &simpleCaser{f: , span: upperFunc[].span}
}
func makeLower( language.Tag, options) transform.SpanningTransformer {
, , := matcher.Match()
:= lowerFunc[]
if == nil {
if .ignoreFinalSigma {
return undLowerIgnoreSigma
}
return undLower
}
if .ignoreFinalSigma {
return &simpleCaser{f: , span: isLower}
}
return &lowerCaser{
first: ,
midWord: finalSigma(),
}
}
func makeTitle( language.Tag, options) transform.SpanningTransformer {
, , := matcher.Match()
:= &titleInfos[]
:= .lower
if .noLower {
= (*context).copy
} else if !.ignoreFinalSigma {
= finalSigma()
}
return &titleCaser{
title: .title,
lower: ,
titleSpan: .titleSpan,
rewrite: .rewrite,
}
}
func noSpan( *context) bool {
.err = transform.ErrEndOfSpan
return false
}
// TODO: consider a similar special case for the fast majority lower case. This
// is a bit more involved so will require some more precise benchmarking to
// justify it.
type undUpperCaser struct{ transform.NopResetter }
// undUpperCaser implements the Transformer interface for doing an upper case
// mapping for the root locale (und). It eliminates the need for an allocation
// as it prevents escaping by not using function pointers.
func ( undUpperCaser) (, []byte, bool) (, int, error) {
:= context{dst: , src: , atEOF: }
for .next() {
upper(&)
.checkpoint()
}
return .ret()
}
func ( undUpperCaser) ( []byte, bool) ( int, error) {
:= context{src: , atEOF: }
for .next() && isUpper(&) {
.checkpoint()
}
return .retSpan()
}
// undLowerIgnoreSigmaCaser implements the Transformer interface for doing
// a lower case mapping for the root locale (und) ignoring final sigma
// handling. This casing algorithm is used in some performance-critical packages
// like secure/precis and x/net/http/idna, which warrants its special-casing.
type undLowerIgnoreSigmaCaser struct{ transform.NopResetter }
func ( undLowerIgnoreSigmaCaser) (, []byte, bool) (, int, error) {
:= context{dst: , src: , atEOF: }
for .next() && lower(&) {
.checkpoint()
}
return .ret()
}
// Span implements a generic lower-casing. This is possible as isLower works
// for all lowercasing variants. All lowercase variants only vary in how they
// transform a non-lowercase letter. They will never change an already lowercase
// letter. In addition, there is no state.
func ( undLowerIgnoreSigmaCaser) ( []byte, bool) ( int, error) {
:= context{src: , atEOF: }
for .next() && isLower(&) {
.checkpoint()
}
return .retSpan()
}
type simpleCaser struct {
context
f mapFunc
span spanFunc
}
// simpleCaser implements the Transformer interface for doing a case operation
// on a rune-by-rune basis.
func ( *simpleCaser) (, []byte, bool) (, int, error) {
:= context{dst: , src: , atEOF: }
for .next() && .f(&) {
.checkpoint()
}
return .ret()
}
func ( *simpleCaser) ( []byte, bool) ( int, error) {
:= context{src: , atEOF: }
for .next() && .span(&) {
.checkpoint()
}
return .retSpan()
}
// undLowerCaser implements the Transformer interface for doing a lower case
// mapping for the root locale (und) ignoring final sigma handling. This casing
// algorithm is used in some performance-critical packages like secure/precis
// and x/net/http/idna, which warrants its special-casing.
type undLowerCaser struct{ transform.NopResetter }
func ( undLowerCaser) (, []byte, bool) (, int, error) {
:= context{dst: , src: , atEOF: }
for := true; .next(); {
if {
if .info.isCased() {
if !lower(&) {
break
}
= false
} else if !.copy() {
break
}
} else {
if .info.isNotCasedAndNotCaseIgnorable() {
if !.copy() {
break
}
= true
} else if !.hasPrefix("Σ") {
if !lower(&) {
break
}
} else if !finalSigmaBody(&) {
break
}
}
.checkpoint()
}
return .ret()
}
func ( undLowerCaser) ( []byte, bool) ( int, error) {
:= context{src: , atEOF: }
for .next() && isLower(&) {
.checkpoint()
}
return .retSpan()
}
// lowerCaser implements the Transformer interface. The default Unicode lower
// casing requires different treatment for the first and subsequent characters
// of a word, most notably to handle the Greek final Sigma.
type lowerCaser struct {
undLowerIgnoreSigmaCaser
context
first, midWord mapFunc
}
func ( *lowerCaser) (, []byte, bool) (, int, error) {
.context = context{dst: , src: , atEOF: }
:= &.context
for := true; .next(); {
if {
if .info.isCased() {
if !.first() {
break
}
= false
} else if !.copy() {
break
}
} else {
if .info.isNotCasedAndNotCaseIgnorable() {
if !.copy() {
break
}
= true
} else if !.midWord() {
break
}
}
.checkpoint()
}
return .ret()
}
// titleCaser implements the Transformer interface. Title casing algorithms
// distinguish between the first letter of a word and subsequent letters of the
// same word. It uses state to avoid requiring a potentially infinite lookahead.
type titleCaser struct {
// The embedded context is reinitialized on every Transform and Span call,
// carrying over only isMidWord from the previous call.
context
// rune mappings used by the actual casing algorithms.
// title maps the first cased rune of a word.
title mapFunc
// lower maps the remaining cased runes of a word.
lower mapFunc
// titleSpan is the Span counterpart of title.
titleSpan spanFunc
// rewrite, if non-nil, is called before each rune is processed.
rewrite func(*context)
}
// Transform implements the standard Unicode title case algorithm as defined in
// Chapter 3 of The Unicode Standard:
// toTitlecase(X): Find the word boundaries in X according to Unicode Standard
// Annex #29, "Unicode Text Segmentation." For each word boundary, find the
// first cased character F following the word boundary. If F exists, map F to
// Titlecase_Mapping(F); then map all characters C between F and the following
// word boundary to Lowercase_Mapping(C).
func ( *titleCaser) (, []byte, bool) (, int, error) {
.context = context{dst: , src: , atEOF: , isMidWord: .isMidWord}
:= &.context
if !.next() {
return .ret()
}
for {
:= .info
if .rewrite != nil {
.rewrite()
}
:= .isMid()
// Break out of this loop on failure to ensure we do not modify the
// state incorrectly.
if .isCased() {
if !.isMidWord {
if !.title() {
break
}
.isMidWord = true
} else if !.lower() {
break
}
} else if !.copy() {
break
} else if .isBreak() {
.isMidWord = false
}
// As we save the state of the transformer, it is safe to call
// checkpoint after any successful write.
if !(.isMidWord && ) {
.checkpoint()
}
if !.next() {
break
}
if && .info.isMid() {
.isMidWord = false
}
}
return .ret()
}
func ( *titleCaser) ( []byte, bool) ( int, error) {
.context = context{src: , atEOF: , isMidWord: .isMidWord}
:= &.context
if !.next() {
return .retSpan()
}
for {
:= .info
if .rewrite != nil {
.rewrite()
}
:= .isMid()
// Break out of this loop on failure to ensure we do not modify the
// state incorrectly.
if .isCased() {
if !.isMidWord {
if !.titleSpan() {
break
}
.isMidWord = true
} else if !isLower() {
break
}
} else if .isBreak() {
.isMidWord = false
}
// As we save the state of the transformer, it is safe to call
// checkpoint after any successful write.
if !(.isMidWord && ) {
.checkpoint()
}
if !.next() {
break
}
if && .info.isMid() {
.isMidWord = false
}
}
return .retSpan()
}
// finalSigma adds Greek final Sigma handing to another casing function. It
// determines whether a lowercased sigma should be σ or ς, by looking ahead for
// case-ignorables and a cased letters.
func finalSigma( mapFunc) mapFunc {
return func( *context) bool {
if !.hasPrefix("Σ") {
return ()
}
return finalSigmaBody()
}
}
func finalSigmaBody( *context) bool {
// Current rune must be ∑.
// ::NFD();
// # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
// Σ } [:case-ignorable:]* [:cased:] → σ;
// [:cased:] [:case-ignorable:]* { Σ → ς;
// ::Any-Lower;
// ::NFC();
:= .pDst
.writeString("ς")
// TODO: we should do this here, but right now this will never have an
// effect as this is called when the prefix is Sigma, whereas Dutch and
// Afrikaans only test for an apostrophe.
//
// if t.rewrite != nil {
// t.rewrite(c)
// }
// We need to do one more iteration after maxIgnorable, as a cased
// letter is not an ignorable and may modify the result.
:= false
for := 0; < maxIgnorable+1; ++ {
if !.next() {
return false
}
if !.info.isCaseIgnorable() {
// All Midword runes are also case ignorable, so we are
// guaranteed to have a letter or word break here. As we are
// unreading the run, there is no need to unset c.isMidWord;
// the title caser will handle this.
if .info.isCased() {
// p+1 is guaranteed to be in bounds: if writing ς was
// successful, p+1 will contain the second byte of ς. If not,
// this function will have returned after c.next returned false.
.dst[+1]++ // ς → σ
}
.unreadRune()
return true
}
// A case ignorable may also introduce a word break, so we may need
// to continue searching even after detecting a break.
:= .info.isMid()
if ( && ) || .info.isBreak() {
.isMidWord = false
}
=
.copy()
}
return true
}
// finalSigmaSpan would be the same as isLower.
// elUpper implements Greek upper casing, which entails removing a predefined
// set of non-blocked modifiers. Note that these accents should not be removed
// for title casing!
// Example: "Οδός" -> "ΟΔΟΣ".
func elUpper( *context) bool {
// From CLDR:
// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ;
// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ;
, := utf8.DecodeRune(.src[.pSrc:])
:= .pDst
if !upper() {
return false
}
if !unicode.Is(unicode.Greek, ) {
return true
}
:= 0
// Take the properties of the uppercased rune that is already written to the
// destination. This saves us the trouble of having to uppercase the
// decomposed rune again.
if := norm.NFD.Properties(.dst[:]).Decomposition(); != nil {
// Restore the destination position and process the decomposed rune.
, := utf8.DecodeRune()
if <= 0xFF { // See A.6.1
return true
}
.pDst =
// Insert the first rune and ignore the modifiers. See A.6.2.
.writeBytes([:])
= len([:]) / 2 // Greek modifiers are always of length 2.
}
for ; < maxIgnorable && .next(); ++ {
switch , := utf8.DecodeRune(.src[.pSrc:]); {
// Above and Iota Subscript
case 0x0300, // U+0300 COMBINING GRAVE ACCENT
0x0301, // U+0301 COMBINING ACUTE ACCENT
0x0304, // U+0304 COMBINING MACRON
0x0306, // U+0306 COMBINING BREVE
0x0308, // U+0308 COMBINING DIAERESIS
0x0313, // U+0313 COMBINING COMMA ABOVE
0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE
0x0342, // U+0342 COMBINING GREEK PERISPOMENI
0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI
// No-op. Gobble the modifier.
default:
switch , := trie.lookup(.src[.pSrc:]); info().cccType() {
case cccZero:
.unreadRune()
return true
// We don't need to test for IotaSubscript as the only rune that
// qualifies (U+0345) was already excluded in the switch statement
// above. See A.4.
case cccAbove:
return .copy()
default:
// Some other modifier. We're still allowed to gobble Greek
// modifiers after this.
.copy()
}
}
}
return == maxIgnorable
}
// TODO: implement elUpperSpan (low-priority: complex and infrequent).
func ltLower( *context) bool {
// From CLDR:
// # Introduce an explicit dot above when lowercasing capital I's and J's
// # whenever there are more accents above.
// # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
// # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
// # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
// # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
// # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
// # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
// # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
// ::NFD();
// I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307;
// J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307;
// I \u0328 (Į) } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0328 \u0307;
// I \u0300 (Ì) → i \u0307 \u0300;
// I \u0301 (Í) → i \u0307 \u0301;
// I \u0303 (Ĩ) → i \u0307 \u0303;
// ::Any-Lower();
// ::NFC();
:= 0
if := .src[.pSrc]; < utf8.RuneSelf {
lower()
if != 'I' && != 'J' {
return true
}
} else {
:= norm.NFD.Properties(.src[.pSrc:])
if := .Decomposition(); len() >= 3 && ([0] == 'I' || [0] == 'J') {
// UTF-8 optimization: the decomposition will only have an above
// modifier if the last rune of the decomposition is in [U+300-U+311].
// In all other cases, a decomposition starting with I is always
// an I followed by modifiers that are not cased themselves. See A.2.
if [1] == 0xCC && [2] <= 0x91 { // A.2.4.
if !.writeBytes([:1]) {
return false
}
.dst[.pDst-1] += 'a' - 'A' // lower
// Assumption: modifier never changes on lowercase. See A.1.
// Assumption: all modifiers added have CCC = Above. See A.2.3.
return .writeString("\u0307") && .writeBytes([1:])
}
// In all other cases the additional modifiers will have a CCC
// that is less than 230 (Above). We will insert the U+0307, if
// needed, after these modifiers so that a string in FCD form
// will remain so. See A.2.2.
lower()
= 1
} else {
return lower()
}
}
for ; < maxIgnorable && .next(); ++ {
switch .info.cccType() {
case cccZero:
.unreadRune()
return true
case cccAbove:
return .writeString("\u0307") && .copy() // See A.1.
default:
.copy() // See A.1.
}
}
return == maxIgnorable
}
// ltLowerSpan would be the same as isLower.
func ltUpper( mapFunc) mapFunc {
return func( *context) bool {
// Unicode:
// 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
//
// From CLDR:
// # Remove \u0307 following soft-dotteds (i, j, and the like), with possible
// # intervening non-230 marks.
// ::NFD();
// [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ;
// ::Any-Upper();
// ::NFC();
// TODO: See A.5. A soft-dotted rune never has an exception. This would
// allow us to overload the exception bit and encode this property in
// info. Need to measure performance impact of this.
, := utf8.DecodeRune(.src[.pSrc:])
:= .pDst
if !() {
return false
}
if !unicode.Is(unicode.Soft_Dotted, ) {
return true
}
// We don't need to do an NFD normalization, as a soft-dotted rune never
// contains U+0307. See A.3.
:= 0
for ; < maxIgnorable && .next(); ++ {
switch .info.cccType() {
case cccZero:
.unreadRune()
return true
case cccAbove:
if .hasPrefix("\u0307") {
// We don't do a full NFC, but rather combine runes for
// some of the common cases. (Returning NFC or
// preserving normal form is neither a requirement nor
// a possibility anyway).
if !.next() {
return false
}
if .dst[] == 'I' && .pDst == +1 && .src[.pSrc] == 0xcc {
:= ""
switch .src[.pSrc+1] {
case 0x80: // U+0300 COMBINING GRAVE ACCENT
= "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE
case 0x81: // U+0301 COMBINING ACUTE ACCENT
= "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE
case 0x83: // U+0303 COMBINING TILDE
= "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE
case 0x88: // U+0308 COMBINING DIAERESIS
= "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
default:
}
if != "" {
.pDst =
return .writeString()
}
}
}
return .copy()
default:
.copy()
}
}
return == maxIgnorable
}
}
// TODO: implement ltUpperSpan (low priority: complex and infrequent).
func aztrUpper( mapFunc) mapFunc {
return func( *context) bool {
// i→İ;
if .src[.pSrc] == 'i' {
return .writeString("İ")
}
return ()
}
}
func aztrLower( *context) ( bool) {
// From CLDR:
// # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
// # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
// İ→i;
// # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
// # This matches the behavior of the canonically equivalent I-dot_above
// # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
// # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
// # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
// I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ;
// I→ı ;
// ::Any-Lower();
if .hasPrefix("\u0130") { // İ
return .writeString("i")
}
if .src[.pSrc] != 'I' {
return lower()
}
// We ignore the lower-case I for now, but insert it later when we know
// which form we need.
:= .pSrc + .sz
:= 0
:
// We check for up to n ignorables before \u0307. As \u0307 is an
// ignorable as well, n is maxIgnorable-1.
for ; < maxIgnorable && .next(); ++ {
switch .info.cccType() {
case cccAbove:
if .hasPrefix("\u0307") {
return .writeString("i") && .writeBytes(.src[:.pSrc]) // ignore U+0307
}
= true
break
case cccZero:
.unreadRune()
= true
break
default:
// We'll write this rune after we know which starter to use.
}
}
if == maxIgnorable {
= true
}
return .writeString("ı") && .writeBytes(.src[:.pSrc+.sz]) &&
}
// aztrLowerSpan would be the same as isLower.
func nlTitle( *context) bool {
// From CLDR:
// # Special titlecasing for Dutch initial "ij".
// ::Any-Title();
// # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
// [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
if .src[.pSrc] != 'I' && .src[.pSrc] != 'i' {
return title()
}
if !.writeString("I") || !.next() {
return false
}
if .src[.pSrc] == 'j' || .src[.pSrc] == 'J' {
return .writeString("J")
}
.unreadRune()
return true
}
func nlTitleSpan( *context) bool {
// From CLDR:
// # Special titlecasing for Dutch initial "ij".
// ::Any-Title();
// # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
// [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
if .src[.pSrc] != 'I' {
return isTitle()
}
if !.next() || .src[.pSrc] == 'j' {
return false
}
if .src[.pSrc] != 'J' {
.unreadRune()
}
return true
}
// Not part of CLDR, but see https://unicode.org/cldr/trac/ticket/7078.
func afnlRewrite( *context) {
if .hasPrefix("'") || .hasPrefix("’") {
.isMidWord = true
}
}
![]() |
The pages are generated with Golds v0.6.7. (GOOS=linux GOARCH=amd64) Golds is a Go 101 project developed by Tapir Liu. PR and bug reports are welcome and can be submitted to the issue list. Please follow @Go100and1 (reachable from the left QR code) to get the latest news of Golds. |