Source File
forminfo.go
Belonging Package
vendor/golang.org/x/text/unicode/norm
// Copyright 2011 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.package normimport// This file contains Form-specific logic and wrappers for data in tables.go.// Rune info is stored in a separate trie per composing form. A composing form// and its corresponding decomposing form share the same trie. Each trie maps// a rune to a uint16. The values take two forms. For v >= 0x8000:// bits// 15: 1 (inverse of NFD_QC bit of qcInfo)// 13..7: qcInfo (see below). isYesD is always true (no decomposition).// 6..0: ccc (compressed CCC value).// For v < 0x8000, the respective rune has a decomposition and v is an index// into a byte array of UTF-8 decomposition sequences and additional info and// has the form:// <header> <decomp_byte>* [<tccc> [<lccc>]]// The header contains the number of bytes in the decomposition (excluding this// length byte). The two most significant bits of this length byte correspond// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.// The byte sequence is followed by a trailing and leading CCC if the values// for these are not zero. The value of v determines which ccc are appended// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,// the sequence is followed by a trailing ccc, and for v >= firstLeadingCC// there is an additional leading ccc. The value of tccc itself is the// trailing CCC shifted left 2 bits. 
The two least-significant bits of tccc// are the number of trailing non-starters.const (qcInfoMask = 0x3F // to clear all but the relevant bits in a qcInfoheaderLenMask = 0x3F // extract the length value from the header byteheaderFlagsMask = 0xC0 // extract the qcInfo bits from the header byte)// Properties provides access to normalization properties of a rune.type Properties struct {pos uint8 // start position in reorderBuffer; used in composition.gosize uint8 // length of UTF-8 encoding of this runeccc uint8 // leading canonical combining class (ccc if not decomposition)tccc uint8 // trailing canonical combining class (ccc if not decomposition)nLead uint8 // number of leading non-starters.flags qcInfo // quick check flagsindex uint16}// functions dispatchable per formtype lookupFunc func(b input, i int) Properties// formInfo holds Form-specific functions and tables.type formInfo struct {form Formcomposing, compatibility bool // form typeinfo lookupFuncnextMain iterFunc}var formTable = []*formInfo{{form: NFC,composing: true,compatibility: false,info: lookupInfoNFC,nextMain: nextComposed,}, {form: NFD,composing: false,compatibility: false,info: lookupInfoNFC,nextMain: nextDecomposed,}, {form: NFKC,composing: true,compatibility: true,info: lookupInfoNFKC,nextMain: nextComposed,}, {form: NFKD,composing: false,compatibility: true,info: lookupInfoNFKC,nextMain: nextDecomposed,}}// We do not distinguish between boundaries for NFC, NFD, etc. to avoid// unexpected behavior for the user. For example, in NFD, there is a boundary// after 'a'. However, 'a' might combine with modifiers, so from the application's// perspective it is not a good boundary. 
We will therefore always use the// boundaries for the combining variants.// BoundaryBefore returns true if this rune starts a new segment and// cannot combine with any rune on the left.func ( Properties) () bool {if .ccc == 0 && !.combinesBackward() {return true}// We assume that the CCC of the first character in a decomposition// is always non-zero if different from info.ccc and that we can return// false at this point. This is verified by maketables.return false}// BoundaryAfter returns true if runes cannot combine with or otherwise// interact with this or previous runes.func ( Properties) () bool {// TODO: loosen these conditions.return .isInert()}// We pack quick check data in 4 bits://// 5: Combines forward (0 == false, 1 == true)// 4..3: NFC_QC Yes(00), No (10), or Maybe (11)// 2: NFD_QC Yes (0) or No (1). No also means there is a decomposition.// 1..0: Number of trailing non-starters.//// When all 4 bits are zero, the character is inert, meaning it is never// influenced by normalization.type qcInfo uint8func ( Properties) () bool { return .flags&0x10 == 0 }func ( Properties) () bool { return .flags&0x4 == 0 }func ( Properties) () bool { return .flags&0x20 != 0 }func ( Properties) () bool { return .flags&0x8 != 0 } // == isMaybefunc ( Properties) () bool { return .flags&0x4 != 0 } // == isNoDfunc ( Properties) () bool {return .flags&qcInfoMask == 0 && .ccc == 0}func ( Properties) () bool {return .index >= firstMulti && .index < endMulti}func ( Properties) () uint8 {return .nLead}func ( Properties) () uint8 {return uint8(.flags & 0x03)}// Decomposition returns the decomposition for the underlying rune// or nil if there is none.func ( Properties) () []byte {// TODO: create the decomposition for Hangul?if .index == 0 {return nil}:= .index:= decomps[] & headerLenMask++return decomps[ : +uint16()]}// Size returns the length of UTF-8 encoding of the rune.func ( Properties) () int {return int(.size)}// CCC returns the canonical combining class of the underlying 
rune.func ( Properties) () uint8 {if .index >= firstCCCZeroExcept {return 0}return ccc[.ccc]}// LeadCCC returns the CCC of the first rune in the decomposition.// If there is no decomposition, LeadCCC equals CCC.func ( Properties) () uint8 {return ccc[.ccc]}// TrailCCC returns the CCC of the last rune in the decomposition.// If there is no decomposition, TrailCCC equals CCC.func ( Properties) () uint8 {return ccc[.tccc]}func buildRecompMap() {recompMap = make(map[uint32]rune, len(recompMapPacked)/8)var [8]bytefor := 0; < len(recompMapPacked); += 8 {copy([:], recompMapPacked[:+8]):= binary.BigEndian.Uint32([:4]):= binary.BigEndian.Uint32([4:])recompMap[] = rune()}}// Recomposition// We use 32-bit keys instead of 64-bit for the two codepoint keys.// This clips off the bits of three entries, but we know this will not// result in a collision. In the unlikely event that changes to// UnicodeData.txt introduce collisions, the compiler will catch it.// Note that the recomposition map for NFC and NFKC are identical.// combine returns the combined rune or 0 if it doesn't exist.//// The caller is responsible for calling// recompMapOnce.Do(buildRecompMap) sometime before this is called.func combine(, rune) rune {:= uint32(uint16())<<16 + uint32(uint16())if recompMap == nil {panic("caller error") // see func comment}return recompMap[]}func lookupInfoNFC( input, int) Properties {, := .charinfoNFC()return compInfo(, )}func lookupInfoNFKC( input, int) Properties {, := .charinfoNFKC()return compInfo(, )}// Properties returns properties for the first rune in s.func ( Form) ( []byte) Properties {if == NFC || == NFD {return compInfo(nfcData.lookup())}return compInfo(nfkcData.lookup())}// PropertiesString returns properties for the first rune in s.func ( Form) ( string) Properties {if == NFC || == NFD {return compInfo(nfcData.lookupString())}return compInfo(nfkcData.lookupString())}// compInfo converts the information contained in v and sz// to a Properties. 
See the comment at the top of the file// for more information on the format.func compInfo( uint16, int) Properties {if == 0 {return Properties{size: uint8()}} else if >= 0x8000 {:= Properties{size: uint8(),ccc: uint8(),tccc: uint8(),flags: qcInfo( >> 8),}if .ccc > 0 || .combinesBackward() {.nLead = uint8(.flags & 0x3)}return}// has decomposition:= decomps[]:= (qcInfo(&headerFlagsMask) >> 2) | 0x4:= Properties{size: uint8(), flags: , index: }if >= firstCCC {+= uint16(&headerLenMask) + 1:= decomps[].tccc = >> 2.flags |= qcInfo( & 0x3)if >= firstLeadingCCC {.nLead = & 0x3if >= firstStarterWithNLead {// We were tricked. Remove the decomposition..flags &= 0x03.index = 0return}.ccc = decomps[+1]}}return}
The pages are generated with Golds v0.6.7. (GOOS=linux GOARCH=amd64) Golds is a Go 101 project developed by Tapir Liu. PRs and bug reports are welcome and can be submitted to the issue list. Please follow @Go100and1 (reachable from the left QR code) to get the latest news of Golds.