package uniseg

import "unicode/utf8"

// Graphemes implements an iterator over Unicode grapheme clusters, or
// user-perceived characters. While iterating, it also provides information
// about word boundaries, sentence boundaries, line breaks, and monospace
// character widths.
//
// After constructing the class via [NewGraphemes] for a given string "str",
// [Graphemes.Next] is called for every grapheme cluster in a loop until it
// returns false. Inside the loop, information about the grapheme cluster as
// well as boundary information and character width is available via the various
// methods (see examples below).
//
// Using this class to iterate over a string is convenient but it is much slower
// than using this package's [Step] or [StepString] functions or any of the
// other specialized functions starting with "First".
type Graphemes struct {
	// The original string.
	original string

	// The remaining string to be parsed.
	remaining string

	// The current grapheme cluster.
	cluster string

	// The byte offset of the current grapheme cluster relative to the original
	// string.
	offset int

	// The current boundary information of the [Step] parser.
	boundaries int

	// The current state of the [Step] parser. Negative values are sentinels
	// used by this type: -1 means [Graphemes.Next] has not been called yet,
	// -2 means the iterator has moved past the end.
	state int
}

// NewGraphemes returns a new grapheme cluster iterator.
func NewGraphemes(str string) *Graphemes {
	return &Graphemes{
		original:  str,
		remaining: str,
		state:     -1,
	}
}

// Next advances the iterator by one grapheme cluster and returns false if no
// clusters are left. This function must be called before the first cluster is
// accessed.
func (g *Graphemes) Next() bool {
	if len(g.remaining) == 0 {
		// We're already past the end.
		g.state = -2
		g.cluster = ""
		return false
	}
	// Skip over the previous cluster before parsing the next one.
	g.offset += len(g.cluster)
	g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state)
	return true
}

// Runes returns a slice of runes (code points) which corresponds to the current
// grapheme cluster. If the iterator is already past the end or [Graphemes.Next]
// has not yet been called, nil is returned.
func (g *Graphemes) Runes() []rune {
	if g.state < 0 {
		return nil
	}
	return []rune(g.cluster)
}

// Str returns a substring of the original string which corresponds to the
// current grapheme cluster. If the iterator is already past the end or
// [Graphemes.Next] has not yet been called, an empty string is returned.
func (g *Graphemes) Str() string {
	return g.cluster
}

// Bytes returns a byte slice which corresponds to the current grapheme cluster.
// If the iterator is already past the end or [Graphemes.Next] has not yet been
// called, nil is returned.
func (g *Graphemes) Bytes() []byte {
	if g.state < 0 {
		return nil
	}
	return []byte(g.cluster)
}

// Positions returns the interval of the current grapheme cluster as byte
// positions into the original string. The first returned value "from" indexes
// the first byte and the second returned value "to" indexes the first byte that
// is not included anymore, i.e. str[from:to] is the current grapheme cluster of
// the original string "str". If [Graphemes.Next] has not yet been called, both
// values are 0. If the iterator is already past the end, both values are 1.
func (g *Graphemes) Positions() (int, int) {
	if g.state == -1 {
		return 0, 0
	} else if g.state == -2 {
		return 1, 1
	}
	return g.offset, g.offset + len(g.cluster)
}

// IsWordBoundary returns true if a word ends after the current grapheme
// cluster.
func (g *Graphemes) IsWordBoundary() bool {
	if g.state < 0 {
		return true
	}
	return g.boundaries&MaskWord != 0
}

// IsSentenceBoundary returns true if a sentence ends after the current
// grapheme cluster.
func (g *Graphemes) IsSentenceBoundary() bool {
	if g.state < 0 {
		return true
	}
	return g.boundaries&MaskSentence != 0
}

// LineBreak returns whether the line can be broken after the current grapheme
// cluster. A value of [LineDontBreak] means the line may not be broken, a value
// of [LineMustBreak] means the line must be broken, and a value of
// [LineCanBreak] means the line may or may not be broken.
func (g *Graphemes) LineBreak() int {
	if g.state == -1 {
		return LineDontBreak
	}
	if g.state == -2 {
		return LineMustBreak
	}
	return g.boundaries & MaskLine
}

// Width returns the monospace width of the current grapheme cluster.
func (g *Graphemes) Width() int {
	if g.state < 0 {
		return 0
	}
	return g.boundaries >> ShiftWidth
}

// Reset puts the iterator into its initial state such that the next call to
// [Graphemes.Next] sets it to the first grapheme cluster again.
func (g *Graphemes) Reset() {
	g.state = -1
	g.offset = 0
	g.cluster = ""
	g.remaining = g.original
}

// GraphemeClusterCount returns the number of user-perceived characters
// (grapheme clusters) for the given string.
func GraphemeClusterCount(s string) (n int) {
	state := -1
	for len(s) > 0 {
		_, s, _, state = FirstGraphemeClusterInString(s, state)
		n++
	}
	return
}

// ReverseString reverses the given string while observing grapheme cluster
// boundaries.
func ReverseString(s string) string {
	str := []byte(s)
	reversed := make([]byte, len(str))
	state := -1
	index := len(str)
	for len(str) > 0 {
		var cluster []byte
		cluster, str, _, state = FirstGraphemeCluster(str, state)
		// Clusters are written back to front so their internal byte order is
		// preserved.
		index -= len(cluster)
		copy(reversed[index:], cluster)
		if index <= len(str)/2 {
			break
		}
	}
	return string(reversed)
}

// The number of bits the grapheme property must be shifted to make place for
// grapheme states.
const shiftGraphemePropState = 4

// FirstGraphemeCluster returns the first grapheme cluster found in the given
// byte slice according to the rules of [Unicode Standard Annex #29, Grapheme
// Cluster Boundaries]. This function can be called continuously to extract all
// grapheme clusters from a byte slice, as illustrated in the example below.
//
// If you don't know the current state, for example when calling the function
// for the first time, you must pass -1. For consecutive calls, pass the state
// and rest slice returned by the previous call.
//
// The "rest" slice is the sub-slice of the original byte slice "b" starting
// after the last byte of the identified grapheme cluster. If the length of the
// "rest" slice is 0, the entire byte slice "b" has been processed. The
// "cluster" byte slice is the sub-slice of the input slice containing the
// identified grapheme cluster.
//
// The returned width is the width of the grapheme cluster for most monospace
// fonts where a value of 1 represents one character cell.
//
// Given an empty byte slice "b", the function returns nil values.
//
// While slightly less convenient than using the Graphemes class, this function
// has much better performance and makes no allocations. It lends itself well to
// large byte slices.
//
// [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) {
	// An empty byte slice returns nothing.
	if len(b) == 0 {
		return
	}

	// Extract the first rune.
	r, length := utf8.DecodeRune(b)
	if len(b) <= length { // If we're already past the end, there is nothing else to parse.
		var prop int
		if state < 0 {
			prop = property(graphemeCodePoints, r)
		} else {
			prop = state >> shiftGraphemePropState
		}
		return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
	}

	// If we don't know the state, determine it now.
	var firstProp int
	if state < 0 {
		state, firstProp, _ = transitionGraphemeState(state, r)
	} else {
		firstProp = state >> shiftGraphemePropState
	}
	width += runeWidth(r, firstProp)

	// Transition until we find a boundary.
	for {
		var (
			prop     int
			boundary bool
		)

		r, l := utf8.DecodeRune(b[length:])
		state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)

		if boundary {
			return b[:length], b[length:], width, state | (prop << shiftGraphemePropState)
		}

		if r == vs16 {
			width = 2
		} else if firstProp != prExtendedPictographic && firstProp != prRegionalIndicator && firstProp != prL {
			width += runeWidth(r, prop)
		} else if firstProp == prExtendedPictographic {
			if r == vs15 {
				width = 1
			} else {
				width = 2
			}
		}

		length += l
		if len(b) <= length {
			return b, nil, width, grAny | (prop << shiftGraphemePropState)
		}
	}
}

// FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and
// outputs are strings.
func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) {
	// An empty string returns nothing.
	if len(str) == 0 {
		return
	}

	// Extract the first rune.
	r, length := utf8.DecodeRuneInString(str)
	if len(str) <= length { // If we're already past the end, there is nothing else to parse.
		var prop int
		if state < 0 {
			prop = property(graphemeCodePoints, r)
		} else {
			prop = state >> shiftGraphemePropState
		}
		return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
	}

	// If we don't know the state, determine it now.
	var firstProp int
	if state < 0 {
		state, firstProp, _ = transitionGraphemeState(state, r)
	} else {
		firstProp = state >> shiftGraphemePropState
	}
	width += runeWidth(r, firstProp)

	// Transition until we find a boundary.
	for {
		var (
			prop     int
			boundary bool
		)

		r, l := utf8.DecodeRuneInString(str[length:])
		state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)

		if boundary {
			return str[:length], str[length:], width, state | (prop << shiftGraphemePropState)
		}

		if r == vs16 {
			width = 2
		} else if firstProp != prExtendedPictographic && firstProp != prRegionalIndicator && firstProp != prL {
			width += runeWidth(r, prop)
		} else if firstProp == prExtendedPictographic {
			if r == vs15 {
				width = 1
			} else {
				width = 2
			}
		}

		length += l
		if len(str) <= length {
			return str, "", width, grAny | (prop << shiftGraphemePropState)
		}
	}
}
The pages are generated with Golds v0.6.7. (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu.
PR and bug reports are welcome and can be submitted to the issue list.
Please follow @Go100and1 (reachable from the left QR code) to get the latest news of Golds.