Source File
match.go
Belonging Package
golang.org/x/text/language
// Copyright 2013 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.package languageimport ()// A MatchOption configures a Matcher.type MatchOption func(*matcher)// PreferSameScript will, in the absence of a match, result in the first// preferred tag with the same script as a supported tag to match this supported// tag. The default is currently true, but this may change in the future.func ( bool) MatchOption {return func( *matcher) { .preferSameScript = }}// TODO(v1.0.0): consider making Matcher a concrete type, instead of interface.// There doesn't seem to be too much need for multiple types.// Making it a concrete type allows MatchStrings to be a method, which will// improve its discoverability.// MatchStrings parses and matches the given strings until one of them matches// the language in the Matcher. A string may be an Accept-Language header as// handled by ParseAcceptLanguage. The default language is returned if no// other language matched.func ( Matcher, ...string) ( Tag, int) {for , := range {, , := ParseAcceptLanguage()if != nil {continue}if , , := .Match(...); != No {return ,}}, , _ = .Match()return}// Matcher is the interface that wraps the Match method.//// Match returns the best match for any of the given tags, along with// a unique index associated with the returned tag and a confidence// score.type Matcher interface {Match(t ...Tag) (tag Tag, index int, c Confidence)}// Comprehends reports the confidence score for a speaker of a given language// to being able to comprehend the written form of an alternative language.func (, Tag) Confidence {, , := NewMatcher([]Tag{}).Match()return}// NewMatcher returns a Matcher that matches an ordered list of preferred tags// against a list of supported tags based on written intelligibility, closeness// of dialect, equivalence of subtags and various other rules. It is initialized// with the list of supported tags. 
The first element is used as the default// value in case no match is found.//// Its Match method matches the first of the given Tags to reach a certain// confidence threshold. The tags passed to Match should therefore be specified// in order of preference. Extensions are ignored for matching.//// The index returned by the Match method corresponds to the index of the// matched tag in t, but is augmented with the Unicode extension ('u')of the// corresponding preferred tag. This allows user locale options to be passed// transparently.func ( []Tag, ...MatchOption) Matcher {return newMatcher(, )}func ( *matcher) ( ...Tag) ( Tag, int, Confidence) {var language.Tag, , := .getBest(...)if != nil {, = .tag, .index} else {// TODO: this should be an option= .default_.tagif .preferSameScript {:for , := range {, := .Script()if .scriptID == 0 {// Don't do anything if there is no script, such as with// private subtags.continue}for , := range .supported {if .scriptID == .maxScript {, = .tag,break}}}}// TODO: select first language tag based on script.}if .RegionID != .RegionID && .RegionID != 0 {if .RegionID != 0 && .RegionID != 0 && .RegionID.Contains(.RegionID) {.RegionID = .RegionID.RemakeString()} else if := .RegionID.String(); len() == 2 {// TODO: also filter macro and deprecated., _ = .SetTypeForKey("rg", strings.ToLower()+"zzzz")}}// Copy options from the user-provided tag into the result tag. 
This is hard// to do after the fact, so we do it here.// TODO: add in alternative variants to -u-va-.// TODO: add preferred region to -u-rg-.if := .Extensions(); len() > 0 {:= language.Builder{}.SetTag()for , := range {.AddExt()}= .Make()}return makeTag(), ,}// ErrMissingLikelyTagsData indicates no information was available// to compute likely values of missing tags.var ErrMissingLikelyTagsData = errors.New("missing likely tags data")// func (t *Tag) setTagsFrom(id Tag) {// t.LangID = id.LangID// t.ScriptID = id.ScriptID// t.RegionID = id.RegionID// }// Tag Matching// CLDR defines an algorithm for finding the best match between two sets of language// tags. The basic algorithm defines how to score a possible match and then find// the match with the best score// (see https://www.unicode.org/reports/tr35/#LanguageMatching).// Using scoring has several disadvantages. The scoring obfuscates the importance of// the various factors considered, making the algorithm harder to understand. Using// scoring also requires the full score to be computed for each pair of tags.//// We will use a different algorithm which aims to have the following properties:// - clarity on the precedence of the various selection factors, and// - improved performance by allowing early termination of a comparison.//// Matching algorithm (overview)// Input:// - supported: a set of supported tags// - default: the default tag to return in case there is no match// - desired: list of desired tags, ordered by preference, starting with// the most-preferred.//// Algorithm:// 1) Set the best match to the lowest confidence level// 2) For each tag in "desired":// a) For each tag in "supported":// 1) compute the match between the two tags.// 2) if the match is better than the previous best match, replace it// with the new match. 
(see next section)// b) if the current best match is Exact and pin is true the result will be// frozen to the language found thus far, although better matches may// still be found for the same language.// 3) If the best match so far is below a certain threshold, return "default".//// Ranking:// We use two phases to determine whether one pair of tags is a better match// than another pair of tags. First, we determine a rough confidence level. If the// levels are different, the one with the highest confidence wins.// Second, if the rough confidence levels are identical, we use a set of tie-breaker// rules.//// The confidence level of matching a pair of tags is determined by finding the// lowest confidence level of any matches of the corresponding subtags (the// result is deemed as good as its weakest link).// We define the following levels:// Exact - An exact match of a subtag, before adding likely subtags.// MaxExact - An exact match of a subtag, after adding likely subtags.// [See Note 2].// High - High level of mutual intelligibility between different subtag// variants.// Low - Low level of mutual intelligibility between different subtag// variants.// No - No mutual intelligibility.//// The following levels can occur for each type of subtag:// Base: Exact, MaxExact, High, Low, No// Script: Exact, MaxExact [see Note 3], Low, No// Region: Exact, MaxExact, High// Variant: Exact, High// Private: Exact, No//// Any result with a confidence level of Low or higher is deemed a possible match.// Once a desired tag matches any of the supported tags with a level of MaxExact// or higher, the next desired tag is not considered (see Step 2.b).// Note that CLDR provides languageMatching data that defines close equivalence// classes for base languages, scripts and regions.//// Tie-breaking// If we get the same confidence level for two matches, we apply a sequence of// tie-breaking rules. The first that succeeds defines the result. 
The rules are// applied in the following order.// 1) Original language was defined and was identical.// 2) Original region was defined and was identical.// 3) Distance between two maximized regions was the smallest.// 4) Original script was defined and was identical.// 5) Distance from want tag to have tag using the parent relation [see Note 5.]// If there is still no winner after these rules are applied, the first match// found wins.//// Notes:// [2] In practice, as matching of Exact is done in a separate phase from// matching the other levels, we reuse the Exact level to mean MaxExact in// the second phase. As a consequence, we only need the levels defined by// the Confidence type. The MaxExact confidence level is mapped to High in// the public API.// [3] We do not differentiate between maximized script values that were derived// from suppressScript versus most likely tag data. We determined that in// ranking the two, one ranks just after the other. Moreover, the two cannot// occur concurrently. As a consequence, they are identical for practical// purposes.// [4] In case of deprecated, macro-equivalents and legacy mappings, we assign// the MaxExact level to allow iw vs he to still be a closer match than// en-AU vs en-US, for example.// [5] In CLDR a locale inherits fields that are unspecified for this locale// from its parent. Therefore, if a locale is a parent of another locale,// it is a strong measure for closeness, especially when no other tie// breaker rule applies. One could also argue it is inconsistent, for// example, when pt-AO matches pt (which CLDR equates with pt-BR), even// though its parent is pt-PT according to the inheritance rules.//// Implementation Details:// There are several performance considerations worth pointing out. Most notably,// we preprocess as much as possible (within reason) at the time of creation of a// matcher. 
This includes:// - creating a per-language map, which includes data for the raw base language// and its canonicalized variant (if applicable),// - expanding entries for the equivalence classes defined in CLDR's// languageMatch data.// The per-language map ensures that typically only a very small number of tags// need to be considered. The pre-expansion of canonicalized subtags and// equivalence classes reduces the amount of map lookups that need to be done at// runtime.// matcher keeps a set of supported language tags, indexed by language.type matcher struct {default_ *haveTagsupported []*haveTagindex map[language.Language]*matchHeaderpassSettings boolpreferSameScript bool}// matchHeader has the lists of tags for exact matches and matches based on// maximized and canonicalized tags for a given language.type matchHeader struct {haveTags []*haveTagoriginal bool}// haveTag holds a supported Tag and its maximized script and region. The maximized// or canonicalized language is not stored as it is not needed during matching.type haveTag struct {tag language.Tag// index of this tag in the original list of supported tags.index int// conf is the maximum confidence that can result from matching this haveTag.// When conf < Exact this means it was inserted after applying a CLDR equivalence rule.conf Confidence// Maximized region and script.maxRegion language.RegionmaxScript language.Script// altScript may be checked as an alternative match to maxScript. If altScript// matches, the confidence level for this match is Low. Theoretically there// could be multiple alternative scripts. 
This does not occur in practice.altScript language.Script// nextMax is the index of the next haveTag with the same maximized tags.nextMax uint16}func makeHaveTag( language.Tag, int) (haveTag, language.Language) {:=if .LangID != 0 || .RegionID != 0 || .ScriptID != 0 {, _ = canonicalize(All, ), _ = .Maximize().RemakeString()}return haveTag{, , Exact, .RegionID, .ScriptID, altScript(.LangID, .ScriptID), 0}, .LangID}// altScript returns an alternative script that may match the given script with// a low confidence. At the moment, the langMatch data allows for at most one// script to map to another and we rely on this to keep the code simple.func altScript( language.Language, language.Script) language.Script {for , := range matchScript {// TODO: also match cases where language is not the same.if (language.Language(.wantLang) == || language.Language(.haveLang) == ) &&language.Script(.haveScript) == {return language.Script(.wantScript)}}return 0}// addIfNew adds a haveTag to the list of tags only if it is a unique tag.// Tags that have the same maximized values are linked by index.func ( *matchHeader) ( haveTag, bool) {.original = .original ||// Don't add new exact matches.for , := range .haveTags {if equalsRest(.tag, .tag) {return}}// Allow duplicate maximized tags, but create a linked list to allow quickly// comparing the equivalents and bail out.for , := range .haveTags {if .maxScript == .maxScript &&.maxRegion == .maxRegion &&.tag.VariantOrPrivateUseTags() == .tag.VariantOrPrivateUseTags() {for .haveTags[].nextMax != 0 {= int(.haveTags[].nextMax)}.haveTags[].nextMax = uint16(len(.haveTags))break}}.haveTags = append(.haveTags, &)}// header returns the matchHeader for the given language. 
It creates one if// it doesn't already exist.func ( *matcher) ( language.Language) *matchHeader {if := .index[]; != nil {return}:= &matchHeader{}.index[] =return}func toConf( uint8) Confidence {if <= 10 {return High}if < 30 {return Low}return No}// newMatcher builds an index for the given supported tags and returns it as// a matcher. It also expands the index by considering various equivalence classes// for a given tag.func newMatcher( []Tag, []MatchOption) *matcher {:= &matcher{index: make(map[language.Language]*matchHeader),preferSameScript: true,}for , := range {()}if len() == 0 {.default_ = &haveTag{}return}// Add supported languages to the index. Add exact matches first to give// them precedence.for , := range {:= .tag(), := makeHaveTag(, ).header(.LangID).addIfNew(, true).supported = append(.supported, &)}.default_ = .header([0].lang()).haveTags[0]// Keep these in two different loops to support the case that two equivalent// languages are distinguished, such as iw and he.for , := range {:= .tag(), := makeHaveTag(, )if != .LangID {.header().addIfNew(, true)}}// update is used to add indexes in the map for equivalent languages.// update will only add entries to original indexes, thus not computing any// transitive relations.:= func(, uint16, Confidence) {if := .index[language.Language()]; != nil {if !.original {return}:= .header(language.Language())for , := range .haveTags {:= *if < .conf {.conf =}.nextMax = 0 // this value needs to be recomputedif .altScript != 0 {.altScript = altScript(language.Language(), .maxScript)}.addIfNew(, == Exact && .original)}}}// Add entries for languages with mutual intelligibility as defined by CLDR's// languageMatch data.for , := range matchLang {(.want, .have, toConf(.distance))if !.oneway {(.have, .want, toConf(.distance))}}// Add entries for possible canonicalizations. This is an optimization to// ensure that only one map lookup needs to be done at runtime per desired tag.// First we match deprecated equivalents. 
If they are perfect equivalents// (their canonicalization simply substitutes a different language code, but// nothing else), the match confidence is Exact, otherwise it is High.for , := range language.AliasMap {// If deprecated codes match and there is no fiddling with the script// or region, we consider it an exact match.:= Exactif language.AliasTypes[] != language.Macro {if !isExactEquivalent(language.Language(.From)) {= High}(.To, .From, )}(.From, .To, )}return}// getBest gets the best matching tag in m for any of the given tags, taking into// account the order of preference of the given tags.func ( *matcher) ( ...Tag) ( *haveTag, language.Tag, Confidence) {:= bestMatch{}for , := range {:= .tag()var language.Tag// Check for exact match first.:= .index[.LangID]if .LangID != 0 {if == nil {continue}// Base language is defined., _ = canonicalize(Legacy|Deprecated|Macro, )// A region that is added through canonicalization is stronger than// a maximized region: set it in the original (e.g. 
mo -> ro-MD).if .RegionID != .RegionID {.RegionID = .RegionID}// TODO: should we do the same for scripts?// See test case: en, sr, nl ; sh ; sr, _ = .Maximize()} else {// Base language is not defined.if != nil {for := range .haveTags {:= .haveTags[]if equalsRest(.tag, ) {return , , Exact}}}if .ScriptID == 0 && .RegionID == 0 {// We skip all tags matching und for approximate matching, including// private tags.continue}, _ = .Maximize()if = .index[.LangID]; == nil {continue}}:= truefor , := range [+1:] {if .LangID == .lang() {= falsebreak}}// Check for match based on maximized tag.for := range .haveTags {:= .haveTags[].update(, , .ScriptID, .RegionID, )if .conf == Exact {for .nextMax != 0 {= .haveTags[.nextMax].update(, , .ScriptID, .RegionID, )}return .have, .want, .conf}}}if .conf <= No {if len() != 0 {return nil, [0].tag(), No}return nil, language.Tag{}, No}return .have, .want, .conf}// bestMatch accumulates the best match so far.type bestMatch struct {have *haveTagwant language.Tagconf ConfidencepinnedRegion language.RegionpinLanguage boolsameRegionGroup bool// Cached results from applying tie-breaking rules.origLang boolorigReg boolparadigmReg boolregGroupDist uint8origScript bool}// update updates the existing best match if the new pair is considered to be a// better match. To determine if the given pair is a better match, it first// computes the rough confidence level. If this surpasses the current match, it// will replace it and update the tie-breaker rule cache. If there is a tie, it// proceeds with applying a series of tie-breaker rules. If there is no// conclusive winner after applying the tie-breaker rules, it leaves the current// match as the preferred match.//// If pin is true and have and tag are a strong match, it will henceforth only// consider matches for this language. This corresponds to the idea that most// users have a strong preference for the first defined language. 
A user can// still prefer a second language over a dialect of the preferred language by// explicitly specifying dialects, e.g. "en, nl, en-GB". In this case pin should// be false.func ( *bestMatch) ( *haveTag, language.Tag, language.Script, language.Region, bool) {// Bail if the maximum attainable confidence is below that of the current best match.:= .confif < .conf {return}// Don't change the language once we already have found an exact match.if .pinLanguage && .LangID != .want.LangID {return}// Pin the region group if we are comparing tags for the same language.if .LangID == .want.LangID && .sameRegionGroup {, := regionGroupDist(.pinnedRegion, .maxRegion, .maxScript, .want.LangID)if ! {return}}if == Exact && .maxScript == {// If there is another language and then another entry of this language,// don't pin anything, otherwise pin the language..pinLanguage =}if equalsRest(.tag, ) {} else if .maxScript != {// There is usually very little comprehension between different scripts.// In a few cases there may still be Low comprehension. This possibility// is pre-computed and stored in have.altScript.if Low < .conf || .altScript != {return}= Low} else if .maxRegion != {if High < {// There is usually a small difference between languages across regions.= High}}// We store the results of the computations of the tie-breaker rules along// with the best match. There is no need to do the checks once we determine// we have a winner, but we do still need to do the tie-breaker computations.// We use "beaten" to keep track if we still need to do the checks.:= false // true if the new pair defeats the current one.if != .conf {if < .conf {return}= true}// Tie-breaker rules:// We prefer if the pre-maximized language was specified and identical.:= .tag.LangID == .LangID && .LangID != 0if ! && .origLang != {if .origLang {return}= true}// We prefer if the pre-maximized region was specified and identical.:= .tag.RegionID == .RegionID && .RegionID != 0if ! 
&& .origReg != {if .origReg {return}= true}, := regionGroupDist(.maxRegion, , , .LangID)if ! && .regGroupDist != {if > .regGroupDist {return}= true}:= isParadigmLocale(.LangID, .maxRegion)if ! && .paradigmReg != {if ! {return}= true}// Next we prefer if the pre-maximized script was specified and identical.:= .tag.ScriptID == .ScriptID && .ScriptID != 0if ! && .origScript != {if .origScript {return}= true}// Update m to the newly found best match.if {.have =.want =.conf =.pinnedRegion =.sameRegionGroup =.origLang =.origReg =.paradigmReg =.origScript =.regGroupDist =}}func isParadigmLocale( language.Language, language.Region) bool {for , := range paradigmLocales {if language.Language([0]) == && ( == language.Region([1]) || == language.Region([2])) {return true}}return false}// regionGroupDist computes the distance between two regions based on their// CLDR grouping.func regionGroupDist(, language.Region, language.Script, language.Language) ( uint8, bool) {const = 4:= uint(regionToGroups[]) << 1:= uint(regionToGroups[]) << 1for , := range matchRegion {if language.Language(.lang) == && (.script == 0 || language.Script(.script) == ) {:= uint(1 << (.group &^ 0x80))if 0x80&.group == 0 {if && != 0 { // Both regions are in the group.return .distance, .distance ==}} else {if (|)& == 0 { // Both regions are not in the group.return .distance, .distance ==}}}}return , true}// equalsRest compares everything except the language.func equalsRest(, language.Tag) bool {// TODO: don't include extensions in this comparison. 
To do this efficiently,// though, we should handle private tags separately.return .ScriptID == .ScriptID && .RegionID == .RegionID && .VariantOrPrivateUseTags() == .VariantOrPrivateUseTags()}// isExactEquivalent returns true if canonicalizing the language will not alter// the script or region of a tag.func isExactEquivalent( language.Language) bool {for , := range notEquivalent {if == {return false}}return true}var notEquivalent []language.Languagefunc init() {// Create a list of all languages for which canonicalization may alter the// script or region.for , := range language.AliasMap {:= language.Tag{LangID: language.Language(.From)}if , _ = canonicalize(All, ); .ScriptID != 0 || .RegionID != 0 {notEquivalent = append(notEquivalent, language.Language(.From))}}// Maximize undefined regions of paradigm locales.for , := range paradigmLocales {:= language.Tag{LangID: language.Language([0])}, := .Maximize()if [1] == 0 {paradigmLocales[][1] = uint16(.RegionID)}if [2] == 0 {paradigmLocales[][2] = uint16(.RegionID)}}}
![]() |
The pages are generated with Golds v0.6.7. (GOOS=linux GOARCH=amd64) Golds is a Go 101 project developed by Tapir Liu. PR and bug reports are welcome and can be submitted to the issue list. Please follow @Go100and1 (reachable from the left QR code) to get the latest news of Golds. |