// Source file: grapheme.go
// Belonging package: github.com/rivo/uniseg

package uniseg
import "unicode/utf8"
// Graphemes is an iterator over the Unicode grapheme clusters (user-perceived
// characters) of a string. For every cluster it also exposes word boundaries,
// sentence boundaries, line break opportunities, and the monospace character
// width.
//
// Construct an iterator with [NewGraphemes] for a given string "str", then
// call [Graphemes.Next] in a loop until it returns false. Inside the loop, the
// current cluster, its boundary information, and its width can be queried via
// the iterator's methods.
//
// Iterating with this type is convenient but much slower than using this
// package's [Step] or [StepString] functions or any of the other specialized
// functions starting with "First".
type Graphemes struct {
	// original is the complete input string. It is kept so that the iterator
	// can be rewound to the beginning.
	original string

	// remaining is the part of the input that has not been parsed yet.
	remaining string

	// cluster is the grapheme cluster the iterator currently points at. It is
	// empty before iteration starts and after the end has been reached.
	cluster string

	// offset is the byte position of the current grapheme cluster within the
	// original string.
	offset int

	// boundaries holds the boundary information most recently reported by the
	// [Step] parser.
	boundaries int

	// state is the current state of the [Step] parser. The negative values -1
	// (iteration has not started) and -2 (iteration is past the end) are used
	// as markers by the iterator itself.
	state int
}
// NewGraphemes returns a new grapheme cluster iterator.
func ( string) *Graphemes {
return &Graphemes{
original: ,
remaining: ,
state: -1,
}
}
// Next advances the iterator by one grapheme cluster and returns false if no
// clusters are left. This function must be called before the first cluster is
// accessed.
func ( *Graphemes) () bool {
if len(.remaining) == 0 {
// We're already past the end.
.state = -2
.cluster = ""
return false
}
.offset += len(.cluster)
.cluster, .remaining, .boundaries, .state = StepString(.remaining, .state)
return true
}
// Runes returns a slice of runes (code points) which corresponds to the current
// grapheme cluster. If the iterator is already past the end or [Graphemes.Next]
// has not yet been called, nil is returned.
func ( *Graphemes) () []rune {
if .state < 0 {
return nil
}
return []rune(.cluster)
}
// Str returns a substring of the original string which corresponds to the
// current grapheme cluster. If the iterator is already past the end or
// [Graphemes.Next] has not yet been called, an empty string is returned.
func ( *Graphemes) () string {
return .cluster
}
// Bytes returns a byte slice which corresponds to the current grapheme cluster.
// If the iterator is already past the end or [Graphemes.Next] has not yet been
// called, nil is returned.
func ( *Graphemes) () []byte {
if .state < 0 {
return nil
}
return []byte(.cluster)
}
// Positions returns the interval of the current grapheme cluster as byte
// positions into the original string. The first returned value "from" indexes
// the first byte and the second returned value "to" indexes the first byte that
// is not included anymore, i.e. str[from:to] is the current grapheme cluster of
// the original string "str". If [Graphemes.Next] has not yet been called, both
// values are 0. If the iterator is already past the end, both values are 1.
func ( *Graphemes) () (int, int) {
if .state == -1 {
return 0, 0
} else if .state == -2 {
return 1, 1
}
return .offset, .offset + len(.cluster)
}
// IsWordBoundary returns true if a word ends after the current grapheme
// cluster.
func ( *Graphemes) () bool {
if .state < 0 {
return true
}
return .boundaries&MaskWord != 0
}
// IsSentenceBoundary returns true if a sentence ends after the current
// grapheme cluster.
func ( *Graphemes) () bool {
if .state < 0 {
return true
}
return .boundaries&MaskSentence != 0
}
// LineBreak returns whether the line can be broken after the current grapheme
// cluster. A value of [LineDontBreak] means the line may not be broken, a value
// of [LineMustBreak] means the line must be broken, and a value of
// [LineCanBreak] means the line may or may not be broken.
func ( *Graphemes) () int {
if .state == -1 {
return LineDontBreak
}
if .state == -2 {
return LineMustBreak
}
return .boundaries & MaskLine
}
// Width returns the monospace width of the current grapheme cluster.
func ( *Graphemes) () int {
if .state < 0 {
return 0
}
return .boundaries >> ShiftWidth
}
// Reset puts the iterator into its initial state such that the next call to
// [Graphemes.Next] sets it to the first grapheme cluster again.
func ( *Graphemes) () {
.state = -1
.offset = 0
.cluster = ""
.remaining = .original
}
// GraphemeClusterCount returns the number of user-perceived characters
// (grapheme clusters) for the given string.
func ( string) ( int) {
:= -1
for len() > 0 {
_, , _, = FirstGraphemeClusterInString(, )
++
}
return
}
// ReverseString reverses the given string while observing grapheme cluster
// boundaries.
func ( string) string {
:= []byte()
:= make([]byte, len())
:= -1
:= len()
for len() > 0 {
var []byte
, , _, = FirstGraphemeCluster(, )
-= len()
copy([:], )
if <= len()/2 {
break
}
}
return string()
}
// shiftGraphemePropState is the number of bits the grapheme property must be
// shifted to make place for grapheme states. The functions in this file pack a
// property into a returned state value by shifting it left by this amount and
// OR-ing it with the state, and recover it by shifting right by the same
// amount.
const shiftGraphemePropState = 4
// FirstGraphemeCluster returns the first grapheme cluster found in the given
// byte slice according to the rules of [Unicode Standard Annex #29, Grapheme
// Cluster Boundaries]. This function can be called continuously to extract all
// grapheme clusters from a byte slice, as illustrated in the example below.
//
// If you don't know the current state, for example when calling the function
// for the first time, you must pass -1. For consecutive calls, pass the state
// and rest slice returned by the previous call.
//
// The "rest" slice is the sub-slice of the original byte slice "b" starting
// after the last byte of the identified grapheme cluster. If the length of the
// "rest" slice is 0, the entire byte slice "b" has been processed. The
// "cluster" byte slice is the sub-slice of the input slice containing the
// identified grapheme cluster.
//
// The returned width is the width of the grapheme cluster for most monospace
// fonts where a value of 1 represents one character cell.
//
// Given an empty byte slice "b", the function returns nil values.
//
// While slightly less convenient than using the Graphemes class, this function
// has much better performance and makes no allocations. It lends itself well to
// large byte slices.
//
// [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
func ( []byte, int) (, []byte, , int) {
// An empty byte slice returns nothing.
if len() == 0 {
return
}
// Extract the first rune.
, := utf8.DecodeRune()
if len() <= { // If we're already past the end, there is nothing else to parse.
var int
if < 0 {
= property(graphemeCodePoints, )
} else {
= >> shiftGraphemePropState
}
return , nil, runeWidth(, ), grAny | ( << shiftGraphemePropState)
}
// If we don't know the state, determine it now.
var int
if < 0 {
, , _ = transitionGraphemeState(, )
} else {
= >> shiftGraphemePropState
}
+= runeWidth(, )
// Transition until we find a boundary.
for {
var (
int
bool
)
, := utf8.DecodeRune([:])
, , = transitionGraphemeState(&maskGraphemeState, )
if {
return [:], [:], , | ( << shiftGraphemePropState)
}
if == vs16 {
= 2
} else if != prExtendedPictographic && != prRegionalIndicator && != prL {
+= runeWidth(, )
} else if == prExtendedPictographic {
if == vs15 {
= 1
} else {
= 2
}
}
+=
if len() <= {
return , nil, , grAny | ( << shiftGraphemePropState)
}
}
}
// FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and
// outputs are strings.
func ( string, int) (, string, , int) {
// An empty string returns nothing.
if len() == 0 {
return
}
// Extract the first rune.
, := utf8.DecodeRuneInString()
if len() <= { // If we're already past the end, there is nothing else to parse.
var int
if < 0 {
= property(graphemeCodePoints, )
} else {
= >> shiftGraphemePropState
}
return , "", runeWidth(, ), grAny | ( << shiftGraphemePropState)
}
// If we don't know the state, determine it now.
var int
if < 0 {
, , _ = transitionGraphemeState(, )
} else {
= >> shiftGraphemePropState
}
+= runeWidth(, )
// Transition until we find a boundary.
for {
var (
int
bool
)
, := utf8.DecodeRuneInString([:])
, , = transitionGraphemeState(&maskGraphemeState, )
if {
return [:], [:], , | ( << shiftGraphemePropState)
}
if == vs16 {
= 2
} else if != prExtendedPictographic && != prRegionalIndicator && != prL {
+= runeWidth(, )
} else if == prExtendedPictographic {
if == vs15 {
= 1
} else {
= 2
}
}
+=
if len() <= {
return , "", , grAny | ( << shiftGraphemePropState)
}
}
}
// The pages are generated with Golds v0.6.7 (GOOS=linux GOARCH=amd64). Golds is
// a Go 101 project developed by Tapir Liu. PRs and bug reports are welcome and
// can be submitted to the issue list. Please follow @Go100and1 (reachable from
// the QR code on the original page) to get the latest news of Golds.