Update godeps
This commit is contained in:
parent
1c8773fc98
commit
1bc383f9c5
1723 changed files with 287976 additions and 411028 deletions
27
vendor/golang.org/x/text/LICENSE
generated
vendored
Normal file
27
vendor/golang.org/x/text/LICENSE
generated
vendored
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
Copyright (c) 2009 The Go Authors. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
22
vendor/golang.org/x/text/PATENTS
generated
vendored
Normal file
22
vendor/golang.org/x/text/PATENTS
generated
vendored
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
Additional IP Rights Grant (Patents)
|
||||
|
||||
"This implementation" means the copyrightable works distributed by
|
||||
Google as part of the Go project.
|
||||
|
||||
Google hereby grants to You a perpetual, worldwide, non-exclusive,
|
||||
no-charge, royalty-free, irrevocable (except as stated in this section)
|
||||
patent license to make, have made, use, offer to sell, sell, import,
|
||||
transfer and otherwise run, modify and propagate the contents of this
|
||||
implementation of Go, where such license applies only to those patent
|
||||
claims, both currently owned or controlled by Google and acquired in
|
||||
the future, licensable by Google that are necessarily infringed by this
|
||||
implementation of Go. This grant does not include claims that would be
|
||||
infringed only as a consequence of further modification of this
|
||||
implementation. If you or your agent or exclusive licensee institute or
|
||||
order or agree to the institution of patent litigation against any
|
||||
entity (including a cross-claim or counterclaim in a lawsuit) alleging
|
||||
that this implementation of Go or any code incorporated within this
|
||||
implementation of Go constitutes direct or contributory patent
|
||||
infringement, or inducement of patent infringement, then any patent
|
||||
rights granted to you under this License for this implementation of Go
|
||||
shall terminate as of the date such litigation is filed.
|
||||
129
vendor/golang.org/x/text/cases/cases.go
generated
vendored
Normal file
129
vendor/golang.org/x/text/cases/cases.go
generated
vendored
Normal file
|
|
@ -0,0 +1,129 @@
|
|||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run gen.go gen_trieval.go
|
||||
|
||||
// Package cases provides general and language-specific case mappers.
|
||||
package cases
|
||||
|
||||
import (
|
||||
"golang.org/x/text/language"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// References:
|
||||
// - Unicode Reference Manual Chapter 3.13, 4.2, and 5.18.
|
||||
// - http://www.unicode.org/reports/tr29/
|
||||
// - http://www.unicode.org/Public/6.3.0/ucd/CaseFolding.txt
|
||||
// - http://www.unicode.org/Public/6.3.0/ucd/SpecialCasing.txt
|
||||
// - http://www.unicode.org/Public/6.3.0/ucd/DerivedCoreProperties.txt
|
||||
// - http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
|
||||
// - http://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakTest.txt
|
||||
// - http://userguide.icu-project.org/transforms/casemappings
|
||||
|
||||
// TODO:
|
||||
// - Case folding
|
||||
// - Wide and Narrow?
|
||||
// - Segmenter option for title casing.
|
||||
// - ASCII fast paths
|
||||
// - Encode Soft-Dotted property within trie somehow.
|
||||
|
||||
// A Caser transforms given input to a certain case. It implements
|
||||
// transform.Transformer.
|
||||
//
|
||||
// A Caser may be stateful and should therefore not be shared between
|
||||
// goroutines.
|
||||
type Caser struct {
|
||||
t transform.Transformer
|
||||
}
|
||||
|
||||
// Bytes returns a new byte slice with the result of converting b to the case
|
||||
// form implemented by c.
|
||||
func (c Caser) Bytes(b []byte) []byte {
|
||||
b, _, _ = transform.Bytes(c.t, b)
|
||||
return b
|
||||
}
|
||||
|
||||
// String returns a string with the result of transforming s to the case form
|
||||
// implemented by c.
|
||||
func (c Caser) String(s string) string {
|
||||
s, _, _ = transform.String(c.t, s)
|
||||
return s
|
||||
}
|
||||
|
||||
// Reset resets the Caser to be reused for new input after a previous call to
|
||||
// Transform.
|
||||
func (c Caser) Reset() { c.t.Reset() }
|
||||
|
||||
// Transform implements the Transformer interface and transforms the given input
|
||||
// to the case form implemented by c.
|
||||
func (c Caser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
return c.t.Transform(dst, src, atEOF)
|
||||
}
|
||||
|
||||
// Upper returns a Caser for language-specific uppercasing.
|
||||
func Upper(t language.Tag, opts ...Option) Caser {
|
||||
return Caser{makeUpper(t, getOpts(opts...))}
|
||||
}
|
||||
|
||||
// Lower returns a Caser for language-specific lowercasing.
|
||||
func Lower(t language.Tag, opts ...Option) Caser {
|
||||
return Caser{makeLower(t, getOpts(opts...))}
|
||||
}
|
||||
|
||||
// Title returns a Caser for language-specific title casing. It uses an
|
||||
// approximation of the default Unicode Word Break algorithm.
|
||||
func Title(t language.Tag, opts ...Option) Caser {
|
||||
return Caser{makeTitle(t, getOpts(opts...))}
|
||||
}
|
||||
|
||||
// Fold returns a Caser that implements Unicode case folding. The returned Caser
|
||||
// is stateless and safe to use concurrently by multiple goroutines.
|
||||
//
|
||||
// Case folding does not normalize the input and may not preserve a normal form.
|
||||
// Use the collate or search package for more convenient and linguistically
|
||||
// sound comparisons. Use unicode/precis for string comparisons where security
|
||||
// aspects are a concern.
|
||||
func Fold(opts ...Option) Caser {
|
||||
return Caser{makeFold(getOpts(opts...))}
|
||||
}
|
||||
|
||||
// An Option is used to modify the behavior of a Caser.
|
||||
type Option func(o *options)
|
||||
|
||||
var (
|
||||
// NoLower disables the lowercasing of non-leading letters for a title
|
||||
// caser.
|
||||
NoLower Option = noLower
|
||||
|
||||
// Compact omits mappings in case folding for characters that would grow the
|
||||
// input. (Unimplemented.)
|
||||
Compact Option = compact
|
||||
)
|
||||
|
||||
// TODO: option to preserve a normal form, if applicable?
|
||||
|
||||
type options struct {
|
||||
noLower bool
|
||||
simple bool
|
||||
|
||||
// TODO: segmenter, max ignorable, alternative versions, etc.
|
||||
|
||||
noFinalSigma bool // Only used for testing.
|
||||
}
|
||||
|
||||
func getOpts(o ...Option) (res options) {
|
||||
for _, f := range o {
|
||||
f(&res)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func noLower(o *options) {
|
||||
o.noLower = true
|
||||
}
|
||||
|
||||
func compact(o *options) {
|
||||
o.simple = true
|
||||
}
|
||||
281
vendor/golang.org/x/text/cases/context.go
generated
vendored
Normal file
281
vendor/golang.org/x/text/cases/context.go
generated
vendored
Normal file
|
|
@ -0,0 +1,281 @@
|
|||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package cases
|
||||
|
||||
import (
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// A context is used for iterating over source bytes, fetching case info and
|
||||
// writing to a destination buffer.
|
||||
//
|
||||
// Casing operations may need more than one rune of context to decide how a rune
|
||||
// should be cased. Casing implementations should call checkpoint on context
|
||||
// whenever it is known to be safe to return the runes processed so far.
|
||||
//
|
||||
// It is recommended for implementations to not allow for more than 30 case
|
||||
// ignorables as lookahead (analogous to the limit in norm) and to use state if
|
||||
// unbounded lookahead is needed for cased runes.
|
||||
type context struct {
|
||||
dst, src []byte
|
||||
atEOF bool
|
||||
|
||||
pDst int // pDst points past the last written rune in dst.
|
||||
pSrc int // pSrc points to the start of the currently scanned rune.
|
||||
|
||||
// checkpoints safe to return in Transform, where nDst <= pDst and nSrc <= pSrc.
|
||||
nDst, nSrc int
|
||||
err error
|
||||
|
||||
sz int // size of current rune
|
||||
info info // case information of currently scanned rune
|
||||
|
||||
// State preserved across calls to Transform.
|
||||
isMidWord bool // false if next cased letter needs to be title-cased.
|
||||
}
|
||||
|
||||
func (c *context) Reset() {
|
||||
c.isMidWord = false
|
||||
}
|
||||
|
||||
// ret returns the return values for the Transform method. It checks whether
|
||||
// there were insufficient bytes in src to complete and introduces an error
|
||||
// accordingly, if necessary.
|
||||
func (c *context) ret() (nDst, nSrc int, err error) {
|
||||
if c.err != nil || c.nSrc == len(c.src) {
|
||||
return c.nDst, c.nSrc, c.err
|
||||
}
|
||||
// This point is only reached by mappers if there was no short destination
|
||||
// buffer. This means that the source buffer was exhausted and that c.sz was
|
||||
// set to 0 by next.
|
||||
if c.atEOF && c.pSrc == len(c.src) {
|
||||
return c.pDst, c.pSrc, nil
|
||||
}
|
||||
return c.nDst, c.nSrc, transform.ErrShortSrc
|
||||
}
|
||||
|
||||
// checkpoint sets the return value buffer points for Transform to the current
|
||||
// positions.
|
||||
func (c *context) checkpoint() {
|
||||
if c.err == nil {
|
||||
c.nDst, c.nSrc = c.pDst, c.pSrc+c.sz
|
||||
}
|
||||
}
|
||||
|
||||
// unreadRune causes the last rune read by next to be reread on the next
|
||||
// invocation of next. Only one unreadRune may be called after a call to next.
|
||||
func (c *context) unreadRune() {
|
||||
c.sz = 0
|
||||
}
|
||||
|
||||
func (c *context) next() bool {
|
||||
c.pSrc += c.sz
|
||||
if c.pSrc == len(c.src) || c.err != nil {
|
||||
c.info, c.sz = 0, 0
|
||||
return false
|
||||
}
|
||||
v, sz := trie.lookup(c.src[c.pSrc:])
|
||||
c.info, c.sz = info(v), sz
|
||||
if c.sz == 0 {
|
||||
if c.atEOF {
|
||||
// A zero size means we have an incomplete rune. If we are atEOF,
|
||||
// this means it is an illegal rune, which we will consume one
|
||||
// byte at a time.
|
||||
c.sz = 1
|
||||
} else {
|
||||
c.err = transform.ErrShortSrc
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// writeBytes adds bytes to dst.
|
||||
func (c *context) writeBytes(b []byte) bool {
|
||||
if len(c.dst)-c.pDst < len(b) {
|
||||
c.err = transform.ErrShortDst
|
||||
return false
|
||||
}
|
||||
// This loop is faster than using copy.
|
||||
for _, ch := range b {
|
||||
c.dst[c.pDst] = ch
|
||||
c.pDst++
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// writeString writes the given string to dst.
|
||||
func (c *context) writeString(s string) bool {
|
||||
if len(c.dst)-c.pDst < len(s) {
|
||||
c.err = transform.ErrShortDst
|
||||
return false
|
||||
}
|
||||
// This loop is faster than using copy.
|
||||
for i := 0; i < len(s); i++ {
|
||||
c.dst[c.pDst] = s[i]
|
||||
c.pDst++
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// copy writes the current rune to dst.
|
||||
func (c *context) copy() bool {
|
||||
return c.writeBytes(c.src[c.pSrc : c.pSrc+c.sz])
|
||||
}
|
||||
|
||||
// copyXOR copies the current rune to dst and modifies it by applying the XOR
|
||||
// pattern of the case info. It is the responsibility of the caller to ensure
|
||||
// that this is a rune with a XOR pattern defined.
|
||||
func (c *context) copyXOR() bool {
|
||||
if !c.copy() {
|
||||
return false
|
||||
}
|
||||
if c.info&xorIndexBit == 0 {
|
||||
// Fast path for 6-bit XOR pattern, which covers most cases.
|
||||
c.dst[c.pDst-1] ^= byte(c.info >> xorShift)
|
||||
} else {
|
||||
// Interpret XOR bits as an index.
|
||||
// TODO: test performance for unrolling this loop. Verify that we have
|
||||
// at least two bytes and at most three.
|
||||
idx := c.info >> xorShift
|
||||
for p := c.pDst - 1; ; p-- {
|
||||
c.dst[p] ^= xorData[idx]
|
||||
idx--
|
||||
if xorData[idx] == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// hasPrefix returns true if src[pSrc:] starts with the given string.
|
||||
func (c *context) hasPrefix(s string) bool {
|
||||
b := c.src[c.pSrc:]
|
||||
if len(b) < len(s) {
|
||||
return false
|
||||
}
|
||||
for i, c := range b[:len(s)] {
|
||||
if c != s[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// caseType returns an info with only the case bits, normalized to either
|
||||
// cLower, cUpper, cTitle or cUncased.
|
||||
func (c *context) caseType() info {
|
||||
cm := c.info & 0x7
|
||||
if cm < 4 {
|
||||
return cm
|
||||
}
|
||||
if cm >= cXORCase {
|
||||
// xor the last bit of the rune with the case type bits.
|
||||
b := c.src[c.pSrc+c.sz-1]
|
||||
return info(b&1) ^ cm&0x3
|
||||
}
|
||||
if cm == cIgnorableCased {
|
||||
return cLower
|
||||
}
|
||||
return cUncased
|
||||
}
|
||||
|
||||
// lower writes the lowercase version of the current rune to dst.
|
||||
func lower(c *context) bool {
|
||||
ct := c.caseType()
|
||||
if c.info&hasMappingMask == 0 || ct == cLower {
|
||||
return c.copy()
|
||||
}
|
||||
if c.info&exceptionBit == 0 {
|
||||
return c.copyXOR()
|
||||
}
|
||||
e := exceptions[c.info>>exceptionShift:]
|
||||
offset := 2 + e[0]&lengthMask // size of header + fold string
|
||||
if nLower := (e[1] >> lengthBits) & lengthMask; nLower != noChange {
|
||||
return c.writeString(e[offset : offset+nLower])
|
||||
}
|
||||
return c.copy()
|
||||
}
|
||||
|
||||
// upper writes the uppercase version of the current rune to dst.
|
||||
func upper(c *context) bool {
|
||||
ct := c.caseType()
|
||||
if c.info&hasMappingMask == 0 || ct == cUpper {
|
||||
return c.copy()
|
||||
}
|
||||
if c.info&exceptionBit == 0 {
|
||||
return c.copyXOR()
|
||||
}
|
||||
e := exceptions[c.info>>exceptionShift:]
|
||||
offset := 2 + e[0]&lengthMask // size of header + fold string
|
||||
// Get length of first special case mapping.
|
||||
n := (e[1] >> lengthBits) & lengthMask
|
||||
if ct == cTitle {
|
||||
// The first special case mapping is for lower. Set n to the second.
|
||||
if n == noChange {
|
||||
n = 0
|
||||
}
|
||||
n, e = e[1]&lengthMask, e[n:]
|
||||
}
|
||||
if n != noChange {
|
||||
return c.writeString(e[offset : offset+n])
|
||||
}
|
||||
return c.copy()
|
||||
}
|
||||
|
||||
// title writes the title case version of the current rune to dst.
|
||||
func title(c *context) bool {
|
||||
ct := c.caseType()
|
||||
if c.info&hasMappingMask == 0 || ct == cTitle {
|
||||
return c.copy()
|
||||
}
|
||||
if c.info&exceptionBit == 0 {
|
||||
if ct == cLower {
|
||||
return c.copyXOR()
|
||||
}
|
||||
return c.copy()
|
||||
}
|
||||
// Get the exception data.
|
||||
e := exceptions[c.info>>exceptionShift:]
|
||||
offset := 2 + e[0]&lengthMask // size of header + fold string
|
||||
|
||||
nFirst := (e[1] >> lengthBits) & lengthMask
|
||||
if nTitle := e[1] & lengthMask; nTitle != noChange {
|
||||
if nFirst != noChange {
|
||||
e = e[nFirst:]
|
||||
}
|
||||
return c.writeString(e[offset : offset+nTitle])
|
||||
}
|
||||
if ct == cLower && nFirst != noChange {
|
||||
// Use the uppercase version instead.
|
||||
return c.writeString(e[offset : offset+nFirst])
|
||||
}
|
||||
// Already in correct case.
|
||||
return c.copy()
|
||||
}
|
||||
|
||||
// foldFull writes the foldFull version of the current rune to dst.
|
||||
func foldFull(c *context) bool {
|
||||
if c.info&hasMappingMask == 0 {
|
||||
return c.copy()
|
||||
}
|
||||
ct := c.caseType()
|
||||
if c.info&exceptionBit == 0 {
|
||||
if ct != cLower || c.info&inverseFoldBit != 0 {
|
||||
return c.copyXOR()
|
||||
}
|
||||
return c.copy()
|
||||
}
|
||||
e := exceptions[c.info>>exceptionShift:]
|
||||
n := e[0] & lengthMask
|
||||
if n == 0 {
|
||||
if ct == cLower {
|
||||
return c.copy()
|
||||
}
|
||||
n = (e[1] >> lengthBits) & lengthMask
|
||||
}
|
||||
return c.writeString(e[2 : 2+n])
|
||||
}
|
||||
26
vendor/golang.org/x/text/cases/fold.go
generated
vendored
Normal file
26
vendor/golang.org/x/text/cases/fold.go
generated
vendored
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package cases
|
||||
|
||||
import "golang.org/x/text/transform"
|
||||
|
||||
type caseFolder struct{ transform.NopResetter }
|
||||
|
||||
// caseFolder implements the Transformer interface for doing case folding.
|
||||
func (t *caseFolder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
c := context{dst: dst, src: src, atEOF: atEOF}
|
||||
for c.next() {
|
||||
foldFull(&c)
|
||||
c.checkpoint()
|
||||
}
|
||||
return c.ret()
|
||||
}
|
||||
|
||||
func makeFold(o options) transform.Transformer {
|
||||
// TODO: Special case folding, through option Language, Special/Turkic, or
|
||||
// both.
|
||||
// TODO: Implement Compact options.
|
||||
return &caseFolder{}
|
||||
}
|
||||
831
vendor/golang.org/x/text/cases/gen.go
generated
vendored
Normal file
831
vendor/golang.org/x/text/cases/gen.go
generated
vendored
Normal file
|
|
@ -0,0 +1,831 @@
|
|||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
// This program generates the trie for casing operations. The Unicode casing
|
||||
// algorithm requires the lookup of various properties and mappings for each
|
||||
// rune. The table generated by this generator combines several of the most
|
||||
// frequently used of these into a single trie so that they can be accessed
|
||||
// with a single lookup.
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"reflect"
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"golang.org/x/text/internal/gen"
|
||||
"golang.org/x/text/internal/triegen"
|
||||
"golang.org/x/text/internal/ucd"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
)
|
||||
|
||||
func main() {
|
||||
gen.Init()
|
||||
genTables()
|
||||
genTablesTest()
|
||||
gen.Repackage("gen_trieval.go", "trieval.go", "cases")
|
||||
}
|
||||
|
||||
// runeInfo contains all information for a rune that we care about for casing
|
||||
// operations.
|
||||
type runeInfo struct {
|
||||
Rune rune
|
||||
|
||||
entry info // trie value for this rune.
|
||||
|
||||
CaseMode info
|
||||
|
||||
// Simple case mappings.
|
||||
Simple [1 + maxCaseMode][]rune
|
||||
|
||||
// Special casing
|
||||
HasSpecial bool
|
||||
Conditional bool
|
||||
Special [1 + maxCaseMode][]rune
|
||||
|
||||
// Folding
|
||||
FoldSimple rune
|
||||
FoldSpecial rune
|
||||
FoldFull []rune
|
||||
|
||||
// TODO: FC_NFKC, or equivalent data.
|
||||
|
||||
// Properties
|
||||
SoftDotted bool
|
||||
CaseIgnorable bool
|
||||
Cased bool
|
||||
DecomposeGreek bool
|
||||
BreakType string
|
||||
BreakCat breakCategory
|
||||
|
||||
// We care mostly about 0, Above, and IotaSubscript.
|
||||
CCC byte
|
||||
}
|
||||
|
||||
type breakCategory int
|
||||
|
||||
const (
|
||||
breakBreak breakCategory = iota
|
||||
breakLetter
|
||||
breakIgnored
|
||||
)
|
||||
|
||||
// mapping returns the case mapping for the given case type.
|
||||
func (r *runeInfo) mapping(c info) string {
|
||||
if r.HasSpecial {
|
||||
return string(r.Special[c])
|
||||
}
|
||||
if len(r.Simple[c]) != 0 {
|
||||
return string(r.Simple[c])
|
||||
}
|
||||
return string(r.Rune)
|
||||
}
|
||||
|
||||
func parse(file string, f func(p *ucd.Parser)) {
|
||||
ucd.Parse(gen.OpenUCDFile(file), f)
|
||||
}
|
||||
|
||||
func parseUCD() []runeInfo {
|
||||
chars := make([]runeInfo, unicode.MaxRune)
|
||||
|
||||
get := func(r rune) *runeInfo {
|
||||
c := &chars[r]
|
||||
c.Rune = r
|
||||
return c
|
||||
}
|
||||
|
||||
parse("UnicodeData.txt", func(p *ucd.Parser) {
|
||||
ri := get(p.Rune(0))
|
||||
ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass))
|
||||
ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping)
|
||||
ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping)
|
||||
ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping)
|
||||
if p.String(ucd.GeneralCategory) == "Lt" {
|
||||
ri.CaseMode = cTitle
|
||||
}
|
||||
})
|
||||
|
||||
// <code>; <property>
|
||||
parse("PropList.txt", func(p *ucd.Parser) {
|
||||
if p.String(1) == "Soft_Dotted" {
|
||||
chars[p.Rune(0)].SoftDotted = true
|
||||
}
|
||||
})
|
||||
|
||||
// <code>; <word break type>
|
||||
parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
|
||||
ri := get(p.Rune(0))
|
||||
switch p.String(1) {
|
||||
case "Case_Ignorable":
|
||||
ri.CaseIgnorable = true
|
||||
case "Cased":
|
||||
ri.Cased = true
|
||||
case "Lowercase":
|
||||
ri.CaseMode = cLower
|
||||
case "Uppercase":
|
||||
ri.CaseMode = cUpper
|
||||
}
|
||||
})
|
||||
|
||||
// <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
|
||||
parse("SpecialCasing.txt", func(p *ucd.Parser) {
|
||||
// We drop all conditional special casing and deal with them manually in
|
||||
// the language-specific case mappers. Rune 0x03A3 is the only one with
|
||||
// a conditional formatting that is not language-specific. However,
|
||||
// dealing with this letter is tricky, especially in a streaming
|
||||
// context, so we deal with it in the Caser for Greek specifically.
|
||||
ri := get(p.Rune(0))
|
||||
if p.String(4) == "" {
|
||||
ri.HasSpecial = true
|
||||
ri.Special[cLower] = p.Runes(1)
|
||||
ri.Special[cTitle] = p.Runes(2)
|
||||
ri.Special[cUpper] = p.Runes(3)
|
||||
} else {
|
||||
ri.Conditional = true
|
||||
}
|
||||
})
|
||||
|
||||
// TODO: Use text breaking according to UAX #29.
|
||||
// <code>; <word break type>
|
||||
parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
|
||||
ri := get(p.Rune(0))
|
||||
ri.BreakType = p.String(1)
|
||||
|
||||
// We collapse the word breaking properties onto the categories we need.
|
||||
switch p.String(1) { // TODO: officially we need to canonicalize.
|
||||
case "Format", "MidLetter", "MidNumLet", "Single_Quote":
|
||||
ri.BreakCat = breakIgnored
|
||||
case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet":
|
||||
ri.BreakCat = breakLetter
|
||||
}
|
||||
})
|
||||
|
||||
// <code>; <type>; <mapping>
|
||||
parse("CaseFolding.txt", func(p *ucd.Parser) {
|
||||
ri := get(p.Rune(0))
|
||||
switch p.String(1) {
|
||||
case "C":
|
||||
ri.FoldSimple = p.Rune(2)
|
||||
ri.FoldFull = p.Runes(2)
|
||||
case "S":
|
||||
ri.FoldSimple = p.Rune(2)
|
||||
case "T":
|
||||
ri.FoldSpecial = p.Rune(2)
|
||||
case "F":
|
||||
ri.FoldFull = p.Runes(2)
|
||||
default:
|
||||
log.Fatalf("%U: unknown type: %s", p.Rune(0), p.String(1))
|
||||
}
|
||||
})
|
||||
|
||||
return chars
|
||||
}
|
||||
|
||||
func genTables() {
|
||||
chars := parseUCD()
|
||||
verifyProperties(chars)
|
||||
|
||||
t := triegen.NewTrie("case")
|
||||
for i := range chars {
|
||||
c := &chars[i]
|
||||
makeEntry(c)
|
||||
t.Insert(rune(i), uint64(c.entry))
|
||||
}
|
||||
|
||||
w := gen.NewCodeWriter()
|
||||
defer w.WriteGoFile("tables.go", "cases")
|
||||
|
||||
gen.WriteUnicodeVersion(w)
|
||||
|
||||
// TODO: write CLDR version after adding a mechanism to detect that the
|
||||
// tables on which the manually created locale-sensitive casing code is
|
||||
// based hasn't changed.
|
||||
|
||||
w.WriteVar("xorData", string(xorData))
|
||||
w.WriteVar("exceptions", string(exceptionData))
|
||||
|
||||
sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{}))
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
w.Size += sz
|
||||
}
|
||||
|
||||
func makeEntry(ri *runeInfo) {
|
||||
if ri.CaseIgnorable {
|
||||
if ri.Cased {
|
||||
ri.entry = cIgnorableCased
|
||||
} else {
|
||||
ri.entry = cIgnorableUncased
|
||||
}
|
||||
} else {
|
||||
ri.entry = ri.CaseMode
|
||||
}
|
||||
|
||||
// TODO: handle soft-dotted.
|
||||
|
||||
ccc := cccOther
|
||||
switch ri.CCC {
|
||||
case 0: // Not_Reordered
|
||||
ccc = cccZero
|
||||
case above: // Above
|
||||
ccc = cccAbove
|
||||
}
|
||||
if ri.BreakCat == breakBreak {
|
||||
ccc = cccBreak
|
||||
}
|
||||
|
||||
ri.entry |= ccc
|
||||
|
||||
if ri.CaseMode == cUncased {
|
||||
return
|
||||
}
|
||||
|
||||
// Need to do something special.
|
||||
if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) {
|
||||
makeException(ri)
|
||||
return
|
||||
}
|
||||
if f := string(ri.FoldFull); len(f) > 0 && f != ri.mapping(cUpper) && f != ri.mapping(cLower) {
|
||||
makeException(ri)
|
||||
return
|
||||
}
|
||||
|
||||
// Rune is either lowercase or uppercase.
|
||||
|
||||
orig := string(ri.Rune)
|
||||
mapped := ""
|
||||
if ri.CaseMode == cUpper {
|
||||
mapped = ri.mapping(cLower)
|
||||
} else {
|
||||
mapped = ri.mapping(cUpper)
|
||||
}
|
||||
|
||||
if len(orig) != len(mapped) {
|
||||
makeException(ri)
|
||||
return
|
||||
}
|
||||
|
||||
if string(ri.FoldFull) == ri.mapping(cUpper) {
|
||||
ri.entry |= inverseFoldBit
|
||||
}
|
||||
|
||||
n := len(orig)
|
||||
|
||||
// Create per-byte XOR mask.
|
||||
var b []byte
|
||||
for i := 0; i < n; i++ {
|
||||
b = append(b, orig[i]^mapped[i])
|
||||
}
|
||||
|
||||
// Remove leading 0 bytes, but keep at least one byte.
|
||||
for ; len(b) > 1 && b[0] == 0; b = b[1:] {
|
||||
}
|
||||
|
||||
if len(b) == 1 && b[0]&0xc0 == 0 {
|
||||
ri.entry |= info(b[0]) << xorShift
|
||||
return
|
||||
}
|
||||
|
||||
key := string(b)
|
||||
x, ok := xorCache[key]
|
||||
if !ok {
|
||||
xorData = append(xorData, 0) // for detecting start of sequence
|
||||
xorData = append(xorData, b...)
|
||||
|
||||
x = len(xorData) - 1
|
||||
xorCache[key] = x
|
||||
}
|
||||
ri.entry |= info(x<<xorShift) | xorIndexBit
|
||||
}
|
||||
|
||||
var xorCache = map[string]int{}
|
||||
|
||||
// xorData contains byte-wise XOR data for the least significant bytes of a
|
||||
// UTF-8 encoded rune. An index points to the last byte. The sequence starts
|
||||
// with a zero terminator.
|
||||
var xorData = []byte{}
|
||||
|
||||
// See the comments in gen_trieval.go re "the exceptions slice".
|
||||
var exceptionData = []byte{0}
|
||||
|
||||
// makeException encodes case mappings that cannot be expressed in a simple
|
||||
// XOR diff.
|
||||
func makeException(ri *runeInfo) {
|
||||
ccc := ri.entry & cccMask
|
||||
// Set exception bit and retain case type.
|
||||
ri.entry &= 0x0007
|
||||
ri.entry |= exceptionBit
|
||||
|
||||
if len(exceptionData) >= 1<<numExceptionBits {
|
||||
log.Fatalf("%U:exceptionData too large %x > %d bits", ri.Rune, len(exceptionData), numExceptionBits)
|
||||
}
|
||||
|
||||
// Set the offset in the exceptionData array.
|
||||
ri.entry |= info(len(exceptionData) << exceptionShift)
|
||||
|
||||
orig := string(ri.Rune)
|
||||
tc := ri.mapping(cTitle)
|
||||
uc := ri.mapping(cUpper)
|
||||
lc := ri.mapping(cLower)
|
||||
ff := string(ri.FoldFull)
|
||||
|
||||
// addString sets the length of a string and adds it to the expansions array.
|
||||
addString := func(s string, b *byte) {
|
||||
if len(s) == 0 {
|
||||
// Zero-length mappings exist, but only for conditional casing,
|
||||
// which we are representing outside of this table.
|
||||
log.Fatalf("%U: has zero-length mapping.", ri.Rune)
|
||||
}
|
||||
*b <<= 3
|
||||
if s != orig {
|
||||
n := len(s)
|
||||
if n > 7 {
|
||||
log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n)
|
||||
}
|
||||
*b |= byte(n)
|
||||
exceptionData = append(exceptionData, s...)
|
||||
}
|
||||
}
|
||||
|
||||
// byte 0:
|
||||
exceptionData = append(exceptionData, byte(ccc)|byte(len(ff)))
|
||||
|
||||
// byte 1:
|
||||
p := len(exceptionData)
|
||||
exceptionData = append(exceptionData, 0)
|
||||
|
||||
if len(ff) > 7 { // May be zero-length.
|
||||
log.Fatalf("%U: fold string larger than 7 (%d)", ri.Rune, len(ff))
|
||||
}
|
||||
exceptionData = append(exceptionData, ff...)
|
||||
ct := ri.CaseMode
|
||||
if ct != cLower {
|
||||
addString(lc, &exceptionData[p])
|
||||
}
|
||||
if ct != cUpper {
|
||||
addString(uc, &exceptionData[p])
|
||||
}
|
||||
if ct != cTitle {
|
||||
// If title is the same as upper, we set it to the original string so
|
||||
// that it will be marked as not present. This implies title case is
|
||||
// the same as upper case.
|
||||
if tc == uc {
|
||||
tc = orig
|
||||
}
|
||||
addString(tc, &exceptionData[p])
|
||||
}
|
||||
}
|
||||
|
||||
// sparseCompacter is a trie value block Compacter. There are many cases where
|
||||
// successive runes alternate between lower- and upper-case. This Compacter
|
||||
// exploits this by adding a special case type where the case value is obtained
|
||||
// from or-ing it with the least-significant bit of the rune, creating large
|
||||
// ranges of equal case values that compress well.
|
||||
type sparseCompacter struct {
|
||||
sparseBlocks [][]uint16
|
||||
sparseOffsets []uint16
|
||||
sparseCount int
|
||||
}
|
||||
|
||||
// makeSparse returns the number of elements that compact block would contain
|
||||
// as well as the modified values.
|
||||
func makeSparse(vals []uint64) ([]uint16, int) {
|
||||
// Copy the values.
|
||||
values := make([]uint16, len(vals))
|
||||
for i, v := range vals {
|
||||
values[i] = uint16(v)
|
||||
}
|
||||
|
||||
alt := func(i int, v uint16) uint16 {
|
||||
if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower {
|
||||
// Convert cLower or cUpper to cXORCase value, which has the form 11x.
|
||||
xor := v
|
||||
xor &^= 1
|
||||
xor |= uint16(i&1) ^ (v & 1)
|
||||
xor |= 0x4
|
||||
return xor
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
var count int
|
||||
var previous uint16
|
||||
for i, v := range values {
|
||||
if v != 0 {
|
||||
// Try if the unmodified value is equal to the previous.
|
||||
if v == previous {
|
||||
continue
|
||||
}
|
||||
|
||||
// Try if the xor-ed value is equal to the previous value.
|
||||
a := alt(i, v)
|
||||
if a == previous {
|
||||
values[i] = a
|
||||
continue
|
||||
}
|
||||
|
||||
// This is a new value.
|
||||
count++
|
||||
|
||||
// Use the xor-ed value if it will be identical to the next value.
|
||||
if p := i + 1; p < len(values) && alt(p, values[p]) == a {
|
||||
values[i] = a
|
||||
v = a
|
||||
}
|
||||
}
|
||||
previous = v
|
||||
}
|
||||
return values, count
|
||||
}
|
||||
|
||||
func (s *sparseCompacter) Size(v []uint64) (int, bool) {
|
||||
_, n := makeSparse(v)
|
||||
|
||||
// We limit using this method to having 16 entries.
|
||||
if n > 16 {
|
||||
return 0, false
|
||||
}
|
||||
|
||||
return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true
|
||||
}
|
||||
|
||||
func (s *sparseCompacter) Store(v []uint64) uint32 {
|
||||
h := uint32(len(s.sparseOffsets))
|
||||
values, sz := makeSparse(v)
|
||||
s.sparseBlocks = append(s.sparseBlocks, values)
|
||||
s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
|
||||
s.sparseCount += sz
|
||||
return h
|
||||
}
|
||||
|
||||
func (s *sparseCompacter) Handler() string {
|
||||
// The sparse global variable and its lookup method is defined in gen_trieval.go.
|
||||
return "sparse.lookup"
|
||||
}
|
||||
|
||||
func (s *sparseCompacter) Print(w io.Writer) (retErr error) {
|
||||
p := func(format string, args ...interface{}) {
|
||||
_, err := fmt.Fprintf(w, format, args...)
|
||||
if retErr == nil && err != nil {
|
||||
retErr = err
|
||||
}
|
||||
}
|
||||
|
||||
ls := len(s.sparseBlocks)
|
||||
if ls == len(s.sparseOffsets) {
|
||||
s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
|
||||
}
|
||||
p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2)
|
||||
p("var sparseOffsets = %#v\n\n", s.sparseOffsets)
|
||||
|
||||
ns := s.sparseCount
|
||||
p("// sparseValues: %d entries, %d bytes\n", ns, ns*4)
|
||||
p("var sparseValues = [%d]valueRange {", ns)
|
||||
for i, values := range s.sparseBlocks {
|
||||
p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i])
|
||||
var v uint16
|
||||
for i, nv := range values {
|
||||
if nv != v {
|
||||
if v != 0 {
|
||||
p(",hi:%#02x},", 0x80+i-1)
|
||||
}
|
||||
if nv != 0 {
|
||||
p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
|
||||
}
|
||||
}
|
||||
v = nv
|
||||
}
|
||||
if v != 0 {
|
||||
p(",hi:%#02x},", 0x80+len(values)-1)
|
||||
}
|
||||
}
|
||||
p("\n}\n\n")
|
||||
return
|
||||
}
|
||||
|
||||
// verifyProperties that properties of the runes that are relied upon in the
|
||||
// implementation. Each property is marked with an identifier that is referred
|
||||
// to in the places where it is used.
|
||||
func verifyProperties(chars []runeInfo) {
|
||||
for i, c := range chars {
|
||||
r := rune(i)
|
||||
|
||||
// Rune properties.
|
||||
|
||||
// A.1: modifier never changes on lowercase. [ltLower]
|
||||
if c.CCC > 0 && unicode.ToLower(r) != r {
|
||||
log.Fatalf("%U: non-starter changes when lowercased", r)
|
||||
}
|
||||
|
||||
// A.2: properties of decompositions starting with I or J. [ltLower]
|
||||
d := norm.NFD.PropertiesString(string(r)).Decomposition()
|
||||
if len(d) > 0 {
|
||||
if d[0] == 'I' || d[0] == 'J' {
|
||||
// A.2.1: we expect at least an ASCII character and a modifier.
|
||||
if len(d) < 3 {
|
||||
log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d))
|
||||
}
|
||||
|
||||
// All subsequent runes are modifiers and all have the same CCC.
|
||||
runes := []rune(string(d[1:]))
|
||||
ccc := chars[runes[0]].CCC
|
||||
|
||||
for _, mr := range runes[1:] {
|
||||
mc := chars[mr]
|
||||
|
||||
// A.2.2: all modifiers have a CCC of Above or less.
|
||||
if ccc == 0 || ccc > above {
|
||||
log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc)
|
||||
}
|
||||
|
||||
// A.2.3: a sequence of modifiers all have the same CCC.
|
||||
if mc.CCC != ccc {
|
||||
log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc)
|
||||
}
|
||||
|
||||
// A.2.4: for each trailing r, r in [0x300, 0x311] <=> CCC == Above.
|
||||
if (ccc == above) != (0x300 <= mr && mr <= 0x311) {
|
||||
log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr)
|
||||
}
|
||||
|
||||
if i += len(string(mr)); i >= len(d) {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// A.3: no U+0307 in decomposition of Soft-Dotted rune. [ltUpper]
|
||||
if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") {
|
||||
log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r)
|
||||
}
|
||||
|
||||
// A.4: only rune U+0345 may be of CCC Iota_Subscript. [elUpper]
|
||||
if c.CCC == iotaSubscript && r != 0x0345 {
|
||||
log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r)
|
||||
}
|
||||
|
||||
// A.5: soft-dotted runes do not have exceptions.
|
||||
if c.SoftDotted && c.entry&exceptionBit != 0 {
|
||||
log.Fatalf("%U: soft-dotted has exception", r)
|
||||
}
|
||||
|
||||
// A.6: Greek decomposition. [elUpper]
|
||||
if unicode.Is(unicode.Greek, r) {
|
||||
if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil {
|
||||
runes := []rune(string(b))
|
||||
// A.6.1: If a Greek rune decomposes and the first rune of the
|
||||
// decomposition is greater than U+00FF, the rune is always
|
||||
// great and not a modifier.
|
||||
if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) {
|
||||
log.Fatalf("%U: expeced first rune of Greek decomposition to be letter, found %U", r, f)
|
||||
}
|
||||
// A.6.2: Any follow-up rune in a Greek decomposition is a
|
||||
// modifier of which the first should be gobbled in
|
||||
// decomposition.
|
||||
for _, m := range runes[1:] {
|
||||
switch m {
|
||||
case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345:
|
||||
default:
|
||||
log.Fatalf("%U: modifier %U is outside of expeced Greek modifier set", r, m)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Breaking properties.
|
||||
|
||||
// B.1: all runes with CCC > 0 are of break type Extend.
|
||||
if c.CCC > 0 && c.BreakType != "Extend" {
|
||||
log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType)
|
||||
}
|
||||
|
||||
// B.2: all cased runes with c.CCC == 0 are of break type ALetter.
|
||||
if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" {
|
||||
log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType)
|
||||
}
|
||||
|
||||
// B.3: letter category.
|
||||
if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable {
|
||||
if c.BreakCat != breakLetter {
|
||||
log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func genTablesTest() {
|
||||
w := &bytes.Buffer{}
|
||||
|
||||
fmt.Fprintln(w, "var (")
|
||||
printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore)
|
||||
|
||||
// We discard the output as we know we have perfect functions. We run them
|
||||
// just to verify the properties are correct.
|
||||
n := printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased)
|
||||
n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower)
|
||||
n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper)
|
||||
if n > 0 {
|
||||
log.Fatalf("One of the discarded properties does not have a perfect filter.")
|
||||
}
|
||||
|
||||
// <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
|
||||
fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{")
|
||||
parse("SpecialCasing.txt", func(p *ucd.Parser) {
|
||||
// Skip conditional entries.
|
||||
if p.String(4) != "" {
|
||||
return
|
||||
}
|
||||
r := p.Rune(0)
|
||||
fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n",
|
||||
r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3)))
|
||||
})
|
||||
fmt.Fprint(w, "\t}\n\n")
|
||||
|
||||
// <code>; <type>; <runes>
|
||||
table := map[rune]struct{ simple, full, special string }{}
|
||||
parse("CaseFolding.txt", func(p *ucd.Parser) {
|
||||
r := p.Rune(0)
|
||||
t := p.String(1)
|
||||
v := string(p.Runes(2))
|
||||
if t != "T" && v == string(unicode.ToLower(r)) {
|
||||
return
|
||||
}
|
||||
x := table[r]
|
||||
switch t {
|
||||
case "C":
|
||||
x.full = v
|
||||
x.simple = v
|
||||
case "S":
|
||||
x.simple = v
|
||||
case "F":
|
||||
x.full = v
|
||||
case "T":
|
||||
x.special = v
|
||||
}
|
||||
table[r] = x
|
||||
})
|
||||
fmt.Fprintln(w, "\tfoldMap = map[rune]struct{ simple, full, special string }{")
|
||||
for r := rune(0); r < 0x10FFFF; r++ {
|
||||
x, ok := table[r]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", r, x.simple, x.full, x.special)
|
||||
}
|
||||
fmt.Fprint(w, "\t}\n\n")
|
||||
|
||||
// Break property
|
||||
notBreak := map[rune]bool{}
|
||||
parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
|
||||
switch p.String(1) {
|
||||
case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote",
|
||||
"ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet":
|
||||
notBreak[p.Rune(0)] = true
|
||||
}
|
||||
})
|
||||
|
||||
fmt.Fprintln(w, "\tbreakProp = []struct{ lo, hi rune }{")
|
||||
inBreak := false
|
||||
for r := rune(0); r <= lastRuneForTesting; r++ {
|
||||
if isBreak := !notBreak[r]; isBreak != inBreak {
|
||||
if isBreak {
|
||||
fmt.Fprintf(w, "\t\t{0x%x, ", r)
|
||||
} else {
|
||||
fmt.Fprintf(w, "0x%x},\n", r-1)
|
||||
}
|
||||
inBreak = isBreak
|
||||
}
|
||||
}
|
||||
if inBreak {
|
||||
fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting)
|
||||
}
|
||||
fmt.Fprint(w, "\t}\n\n")
|
||||
|
||||
// Word break test
|
||||
// Filter out all samples that do not contain cased characters.
|
||||
cased := map[rune]bool{}
|
||||
parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
|
||||
if p.String(1) == "Cased" {
|
||||
cased[p.Rune(0)] = true
|
||||
}
|
||||
})
|
||||
|
||||
fmt.Fprintln(w, "\tbreakTest = []string{")
|
||||
parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) {
|
||||
c := strings.Split(p.String(0), " ")
|
||||
|
||||
const sep = '|'
|
||||
numCased := 0
|
||||
test := ""
|
||||
for ; len(c) >= 2; c = c[2:] {
|
||||
if c[0] == "÷" && test != "" {
|
||||
test += string(sep)
|
||||
}
|
||||
i, err := strconv.ParseUint(c[1], 16, 32)
|
||||
r := rune(i)
|
||||
if err != nil {
|
||||
log.Fatalf("Invalid rune %q.", c[1])
|
||||
}
|
||||
if r == sep {
|
||||
log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep)
|
||||
}
|
||||
if cased[r] {
|
||||
numCased++
|
||||
}
|
||||
test += string(r)
|
||||
}
|
||||
if numCased > 1 {
|
||||
fmt.Fprintf(w, "\t\t%q,\n", test)
|
||||
}
|
||||
})
|
||||
fmt.Fprintln(w, "\t}")
|
||||
|
||||
fmt.Fprintln(w, ")")
|
||||
|
||||
gen.WriteGoFile("tables_test.go", "cases", w.Bytes())
|
||||
}
|
||||
|
||||
// These functions are just used for verification that their definition have not
|
||||
// changed in the Unicode Standard.
|
||||
|
||||
func verifyCased(r rune) bool {
|
||||
return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r)
|
||||
}
|
||||
|
||||
func verifyLower(r rune) bool {
|
||||
return unicode.IsLower(r) || unicode.Is(unicode.Other_Lowercase, r)
|
||||
}
|
||||
|
||||
func verifyUpper(r rune) bool {
|
||||
return unicode.IsUpper(r) || unicode.Is(unicode.Other_Uppercase, r)
|
||||
}
|
||||
|
||||
// verifyIgnore is an approximation of the Case_Ignorable property using the
|
||||
// core unicode package. It is used to reduce the size of the test data.
|
||||
func verifyIgnore(r rune) bool {
|
||||
props := []*unicode.RangeTable{
|
||||
unicode.Mn,
|
||||
unicode.Me,
|
||||
unicode.Cf,
|
||||
unicode.Lm,
|
||||
unicode.Sk,
|
||||
}
|
||||
for _, p := range props {
|
||||
if unicode.Is(p, r) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// printProperties prints tables of rune properties from the given UCD file.
|
||||
// A filter func f can be given to exclude certain values. A rune r will have
|
||||
// the indicated property if it is in the generated table or if f(r).
|
||||
func printProperties(w io.Writer, file, property string, f func(r rune) bool) int {
|
||||
verify := map[rune]bool{}
|
||||
n := 0
|
||||
varNameParts := strings.Split(property, "_")
|
||||
varNameParts[0] = strings.ToLower(varNameParts[0])
|
||||
fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, ""))
|
||||
parse(file, func(p *ucd.Parser) {
|
||||
if p.String(1) == property {
|
||||
r := p.Rune(0)
|
||||
verify[r] = true
|
||||
if !f(r) {
|
||||
n++
|
||||
fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r)
|
||||
}
|
||||
}
|
||||
})
|
||||
fmt.Fprint(w, "\t}\n\n")
|
||||
|
||||
// Verify that f is correct, that is, it represents a subset of the property.
|
||||
for r := rune(0); r <= lastRuneForTesting; r++ {
|
||||
if !verify[r] && f(r) {
|
||||
log.Fatalf("Incorrect filter func for property %q.", property)
|
||||
}
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// The newCaseTrie, sparseValues and sparseOffsets definitions below are
|
||||
// placeholders referred to by gen_trieval.go. The real definitions are
|
||||
// generated by this program and written to tables.go.
|
||||
|
||||
func newCaseTrie(int) int { return 0 }
|
||||
|
||||
var (
|
||||
sparseValues [0]valueRange
|
||||
sparseOffsets [0]uint16
|
||||
)
|
||||
217
vendor/golang.org/x/text/cases/gen_trieval.go
generated
vendored
Normal file
217
vendor/golang.org/x/text/cases/gen_trieval.go
generated
vendored
Normal file
|
|
@ -0,0 +1,217 @@
|
|||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// This file contains definitions for interpreting the trie value of the case
|
||||
// trie generated by "go run gen*.go". It is shared by both the generator
|
||||
// program and the resultant package. Sharing is achieved by the generator
|
||||
// copying gen_trieval.go to trieval.go and changing what's above this comment.
|
||||
|
||||
// info holds case information for a single rune. It is the value returned
|
||||
// by a trie lookup. Most mapping information can be stored in a single 16-bit
|
||||
// value. If not, for example when a rune is mapped to multiple runes, the value
|
||||
// stores some basic case data and an index into an array with additional data.
|
||||
//
|
||||
// The per-rune values have the following format:
|
||||
//
|
||||
// if (exception) {
|
||||
// 15..5 unsigned exception index
|
||||
// 4 unused
|
||||
// } else {
|
||||
// 15..8 XOR pattern or index to XOR pattern for case mapping
|
||||
// Only 13..8 are used for XOR patterns.
|
||||
// 7 inverseFold (fold to upper, not to lower)
|
||||
// 6 index: interpret the XOR pattern as an index
|
||||
// 5..4 CCC: zero (normal or break), above or other
|
||||
// }
|
||||
// 3 exception: interpret this value as an exception index
|
||||
// (TODO: is this bit necessary? Probably implied from case mode.)
|
||||
// 2..0 case mode
|
||||
//
|
||||
// For the non-exceptional cases, a rune must be either uncased, lowercase or
|
||||
// uppercase. If the rune is cased, the XOR pattern maps either a lowercase
|
||||
// rune to uppercase or an uppercase rune to lowercase (applied to the 10
|
||||
// least-significant bits of the rune).
|
||||
//
|
||||
// See the definitions below for a more detailed description of the various
|
||||
// bits.
|
||||
type info uint16
|
||||
|
||||
const (
|
||||
casedMask = 0x0003
|
||||
fullCasedMask = 0x0007
|
||||
ignorableMask = 0x0006
|
||||
ignorableValue = 0x0004
|
||||
|
||||
inverseFoldBit = 1 << 7
|
||||
|
||||
exceptionBit = 1 << 3
|
||||
exceptionShift = 5
|
||||
numExceptionBits = 11
|
||||
|
||||
xorIndexBit = 1 << 6
|
||||
xorShift = 8
|
||||
|
||||
// There is no mapping if all xor bits and the exception bit are zero.
|
||||
hasMappingMask = 0xffc0 | exceptionBit
|
||||
)
|
||||
|
||||
// The case mode bits encodes the case type of a rune. This includes uncased,
|
||||
// title, upper and lower case and case ignorable. (For a definition of these
|
||||
// terms see Chapter 3 of The Unicode Standard Core Specification.) In some rare
|
||||
// cases, a rune can be both cased and case-ignorable. This is encoded by
|
||||
// cIgnorableCased. A rune of this type is always lower case. Some runes are
|
||||
// cased while not having a mapping.
|
||||
//
|
||||
// A common pattern for scripts in the Unicode standard is for upper and lower
|
||||
// case runes to alternate for increasing rune values (e.g. the accented Latin
|
||||
// ranges starting from U+0100 and U+1E00 among others and some Cyrillic
|
||||
// characters). We use this property by defining a cXORCase mode, where the case
|
||||
// mode (always upper or lower case) is derived from the rune value. As the XOR
|
||||
// pattern for case mappings is often identical for successive runes, using
|
||||
// cXORCase can result in large series of identical trie values. This, in turn,
|
||||
// allows us to better compress the trie blocks.
|
||||
const (
|
||||
cUncased info = iota // 000
|
||||
cTitle // 001
|
||||
cLower // 010
|
||||
cUpper // 011
|
||||
cIgnorableUncased // 100
|
||||
cIgnorableCased // 101 // lower case if mappings exist
|
||||
cXORCase // 11x // case is cLower | ((rune&1) ^ x)
|
||||
|
||||
maxCaseMode = cUpper
|
||||
)
|
||||
|
||||
func (c info) isCased() bool {
|
||||
return c&casedMask != 0
|
||||
}
|
||||
|
||||
func (c info) isCaseIgnorable() bool {
|
||||
return c&ignorableMask == ignorableValue
|
||||
}
|
||||
|
||||
func (c info) isCaseIgnorableAndNonBreakStarter() bool {
|
||||
return c&(fullCasedMask|cccMask) == (ignorableValue | cccZero)
|
||||
}
|
||||
|
||||
func (c info) isNotCasedAndNotCaseIgnorable() bool {
|
||||
return c&fullCasedMask == 0
|
||||
}
|
||||
|
||||
func (c info) isCaseIgnorableAndNotCased() bool {
|
||||
return c&fullCasedMask == cIgnorableUncased
|
||||
}
|
||||
|
||||
// The case mapping implementation will need to know about various Canonical
|
||||
// Combining Class (CCC) values. We encode two of these in the trie value:
|
||||
// cccZero (0) and cccAbove (230). If the value is cccOther, it means that
|
||||
// CCC(r) > 0, but not 230. A value of cccBreak means that CCC(r) == 0 and that
|
||||
// the rune also has the break category Break (see below).
|
||||
const (
|
||||
cccBreak info = iota << 4
|
||||
cccZero
|
||||
cccAbove
|
||||
cccOther
|
||||
|
||||
cccMask = cccBreak | cccZero | cccAbove | cccOther
|
||||
)
|
||||
|
||||
const (
|
||||
starter = 0
|
||||
above = 230
|
||||
iotaSubscript = 240
|
||||
)
|
||||
|
||||
// The exceptions slice holds data that does not fit in a normal info entry.
|
||||
// The entry is pointed to by the exception index in an entry. It has the
|
||||
// following format:
|
||||
//
|
||||
// Header
|
||||
// byte 0:
|
||||
// 7..6 unused
|
||||
// 5..4 CCC type (same bits as entry)
|
||||
// 3 unused
|
||||
// 2..0 length of fold
|
||||
//
|
||||
// byte 1:
|
||||
// 7..6 unused
|
||||
// 5..3 length of 1st mapping of case type
|
||||
// 2..0 length of 2nd mapping of case type
|
||||
//
|
||||
// case 1st 2nd
|
||||
// lower -> upper, title
|
||||
// upper -> lower, title
|
||||
// title -> lower, upper
|
||||
//
|
||||
// Lengths with the value 0x7 indicate no value and implies no change.
|
||||
// A length of 0 indicates a mapping to zero-length string.
|
||||
//
|
||||
// Body bytes:
|
||||
// case folding bytes
|
||||
// lowercase mapping bytes
|
||||
// uppercase mapping bytes
|
||||
// titlecase mapping bytes
|
||||
// closure mapping bytes (for NFKC_Casefold). (TODO)
|
||||
//
|
||||
// Fallbacks:
|
||||
// missing fold -> lower
|
||||
// missing title -> upper
|
||||
// all missing -> original rune
|
||||
//
|
||||
// exceptions starts with a dummy byte to enforce that there is no zero index
|
||||
// value.
|
||||
const (
|
||||
lengthMask = 0x07
|
||||
lengthBits = 3
|
||||
noChange = 0
|
||||
)
|
||||
|
||||
// References to generated trie.
|
||||
|
||||
var trie = newCaseTrie(0)
|
||||
|
||||
var sparse = sparseBlocks{
|
||||
values: sparseValues[:],
|
||||
offsets: sparseOffsets[:],
|
||||
}
|
||||
|
||||
// Sparse block lookup code.
|
||||
|
||||
// valueRange is an entry in a sparse block.
|
||||
type valueRange struct {
|
||||
value uint16
|
||||
lo, hi byte
|
||||
}
|
||||
|
||||
type sparseBlocks struct {
|
||||
values []valueRange
|
||||
offsets []uint16
|
||||
}
|
||||
|
||||
// lookup returns the value from values block n for byte b using binary search.
|
||||
func (s *sparseBlocks) lookup(n uint32, b byte) uint16 {
|
||||
lo := s.offsets[n]
|
||||
hi := s.offsets[n+1]
|
||||
for lo < hi {
|
||||
m := lo + (hi-lo)/2
|
||||
r := s.values[m]
|
||||
if r.lo <= b && b <= r.hi {
|
||||
return r.value
|
||||
}
|
||||
if b < r.lo {
|
||||
hi = m
|
||||
} else {
|
||||
lo = m + 1
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// lastRuneForTesting is the last rune used for testing. Everything after this
|
||||
// is boring.
|
||||
const lastRuneForTesting = rune(0x1FFFF)
|
||||
83
vendor/golang.org/x/text/cases/info.go
generated
vendored
Normal file
83
vendor/golang.org/x/text/cases/info.go
generated
vendored
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package cases
|
||||
|
||||
func (c info) cccVal() info {
|
||||
if c&exceptionBit != 0 {
|
||||
return info(exceptions[c>>exceptionShift]) & cccMask
|
||||
}
|
||||
return c & cccMask
|
||||
}
|
||||
|
||||
func (c info) cccType() info {
|
||||
ccc := c.cccVal()
|
||||
if ccc <= cccZero {
|
||||
return cccZero
|
||||
}
|
||||
return ccc
|
||||
}
|
||||
|
||||
// TODO: Implement full Unicode breaking algorithm:
|
||||
// 1) Implement breaking in separate package.
|
||||
// 2) Use the breaker here.
|
||||
// 3) Compare table size and performance of using the more generic breaker.
|
||||
//
|
||||
// Note that we can extend the current algorithm to be much more accurate. This
|
||||
// only makes sense, though, if the performance and/or space penalty of using
|
||||
// the generic breaker is big. Extra data will only be needed for non-cased
|
||||
// runes, which means there are sufficient bits left in the caseType.
|
||||
// Also note that the standard breaking algorithm doesn't always make sense
|
||||
// for title casing. For example, a4a -> A4a, but a"4a -> A"4A (where " stands
|
||||
// for modifier \u0308).
|
||||
// ICU prohibits breaking in such cases as well.
|
||||
|
||||
// For the purpose of title casing we use an approximation of the Unicode Word
|
||||
// Breaking algorithm defined in Annex #29:
|
||||
// http://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table.
|
||||
//
|
||||
// For our approximation, we group the Word Break types into the following
|
||||
// categories, with associated rules:
|
||||
//
|
||||
// 1) Letter:
|
||||
// ALetter, Hebrew_Letter, Numeric, ExtendNumLet, Extend.
|
||||
// Rule: Never break between consecutive runes of this category.
|
||||
//
|
||||
// 2) Mid:
|
||||
// Format, MidLetter, MidNumLet, Single_Quote.
|
||||
// (Cf. case-ignorable: MidLetter, MidNumLet or cat is Mn, Me, Cf, Lm or Sk).
|
||||
// Rule: Don't break between Letter and Mid, but break between two Mids.
|
||||
//
|
||||
// 3) Break:
|
||||
// Any other category, including NewLine, CR, LF and Double_Quote. These
|
||||
// categories should always result in a break between two cased letters.
|
||||
// Rule: Always break.
|
||||
//
|
||||
// Note 1: the Katakana and MidNum categories can, in esoteric cases, result in
|
||||
// preventing a break between two cased letters. For now we will ignore this
|
||||
// (e.g. [ALetter] [ExtendNumLet] [Katakana] [ExtendNumLet] [ALetter] and
|
||||
// [ALetter] [Numeric] [MidNum] [Numeric] [ALetter].)
|
||||
//
|
||||
// Note 2: the rule for Mid is very approximate, but works in most cases. To
|
||||
// improve, we could store the categories in the trie value and use a FA to
|
||||
// manage breaks. See TODO comment above.
|
||||
//
|
||||
// Note 3: according to the spec, it is possible for the Extend category to
|
||||
// introduce breaks between other categories grouped in Letter. However, this
|
||||
// is undesirable for our purposes. ICU prevents breaks in such cases as well.
|
||||
|
||||
// isBreak returns whether this rune should introduce a break.
|
||||
func (c info) isBreak() bool {
|
||||
return c.cccVal() == cccBreak
|
||||
}
|
||||
|
||||
// isLetter returns whether the rune is of break type ALetter, Hebrew_Letter,
|
||||
// Numeric, ExtendNumLet, or Extend.
|
||||
func (c info) isLetter() bool {
|
||||
ccc := c.cccVal()
|
||||
if ccc == cccZero {
|
||||
return !c.isCaseIgnorable()
|
||||
}
|
||||
return ccc != cccBreak
|
||||
}
|
||||
599
vendor/golang.org/x/text/cases/map.go
generated
vendored
Normal file
599
vendor/golang.org/x/text/cases/map.go
generated
vendored
Normal file
|
|
@ -0,0 +1,599 @@
|
|||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package cases
|
||||
|
||||
// This file contains the definitions of case mappings for all supported
|
||||
// languages. The rules for the language-specific tailorings were taken and
|
||||
// modified from the CLDR transform definitions in common/transforms.
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/language"
|
||||
"golang.org/x/text/transform"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
)
|
||||
|
||||
// A mapFunc takes a context set to the current rune and writes the mapped
|
||||
// version to the same context. It may advance the context to the next rune. It
|
||||
// returns whether a checkpoint is possible: whether the pDst bytes written to
|
||||
// dst so far won't need changing as we see more source bytes.
|
||||
type mapFunc func(*context) bool
|
||||
|
||||
// maxIgnorable defines the maximum number of ignorables to consider for
|
||||
// lookahead operations.
|
||||
const maxIgnorable = 30
|
||||
|
||||
// supported lists the language tags for which we have tailorings.
|
||||
const supported = "und af az el lt nl tr"
|
||||
|
||||
func init() {
|
||||
tags := []language.Tag{}
|
||||
for _, s := range strings.Split(supported, " ") {
|
||||
tags = append(tags, language.MustParse(s))
|
||||
}
|
||||
matcher = language.NewMatcher(tags)
|
||||
Supported = language.NewCoverage(tags)
|
||||
}
|
||||
|
||||
var (
|
||||
matcher language.Matcher
|
||||
|
||||
Supported language.Coverage
|
||||
|
||||
// We keep the following lists separate, instead of having a single per-
|
||||
// language struct, to give the compiler a chance to remove unused code.
|
||||
|
||||
// Some uppercase mappers are stateless, so we can precompute the
|
||||
// Transformers and save a bit on runtime allocations.
|
||||
upperFunc = []mapFunc{
|
||||
nil, // und
|
||||
nil, // af
|
||||
aztrUpper(upper), // az
|
||||
elUpper, // el
|
||||
ltUpper(upper), // lt
|
||||
nil, // nl
|
||||
aztrUpper(upper), // tr
|
||||
}
|
||||
|
||||
undUpper transform.Transformer = &undUpperCaser{}
|
||||
|
||||
lowerFunc = []mapFunc{
|
||||
lower, // und
|
||||
lower, // af
|
||||
aztrLower, // az
|
||||
lower, // el
|
||||
ltLower, // lt
|
||||
lower, // nl
|
||||
aztrLower, // tr
|
||||
}
|
||||
|
||||
titleInfos = []struct {
|
||||
title, lower mapFunc
|
||||
rewrite func(*context)
|
||||
}{
|
||||
{title, lower, nil}, // und
|
||||
{title, lower, afnlRewrite}, // af
|
||||
{aztrUpper(title), aztrLower, nil}, // az
|
||||
{title, lower, nil}, // el
|
||||
{ltUpper(title), ltLower, nil}, // lt
|
||||
{nlTitle, lower, afnlRewrite}, // nl
|
||||
{aztrUpper(title), aztrLower, nil}, // tr
|
||||
}
|
||||
)
|
||||
|
||||
func makeUpper(t language.Tag, o options) transform.Transformer {
|
||||
_, i, _ := matcher.Match(t)
|
||||
f := upperFunc[i]
|
||||
if f == nil {
|
||||
return undUpper
|
||||
}
|
||||
return &simpleCaser{f: f}
|
||||
}
|
||||
|
||||
func makeLower(t language.Tag, o options) transform.Transformer {
|
||||
_, i, _ := matcher.Match(t)
|
||||
f := lowerFunc[i]
|
||||
if o.noFinalSigma {
|
||||
return &simpleCaser{f: f}
|
||||
}
|
||||
return &lowerCaser{
|
||||
first: f,
|
||||
midWord: finalSigma(f),
|
||||
}
|
||||
}
|
||||
|
||||
func makeTitle(t language.Tag, o options) transform.Transformer {
|
||||
_, i, _ := matcher.Match(t)
|
||||
x := &titleInfos[i]
|
||||
lower := x.lower
|
||||
if o.noLower {
|
||||
lower = (*context).copy
|
||||
} else if !o.noFinalSigma {
|
||||
lower = finalSigma(lower)
|
||||
}
|
||||
return &titleCaser{
|
||||
title: x.title,
|
||||
lower: lower,
|
||||
rewrite: x.rewrite,
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: consider a similar special case for the fast majority lower case. This
|
||||
// is a bit more involved so will require some more precise benchmarking to
|
||||
// justify it.
|
||||
|
||||
type undUpperCaser struct{ transform.NopResetter }
|
||||
|
||||
// undUpperCaser implements the Transformer interface for doing an upper case
|
||||
// mapping for the root locale (und). It eliminates the need for an allocation
|
||||
// as it prevents escaping by not using function pointers.
|
||||
func (t *undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
c := context{dst: dst, src: src, atEOF: atEOF}
|
||||
for c.next() {
|
||||
upper(&c)
|
||||
c.checkpoint()
|
||||
}
|
||||
return c.ret()
|
||||
}
|
||||
|
||||
type simpleCaser struct {
|
||||
context
|
||||
f mapFunc
|
||||
}
|
||||
|
||||
// simpleCaser implements the Transformer interface for doing a case operation
|
||||
// on a rune-by-rune basis.
|
||||
func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
t.context = context{dst: dst, src: src, atEOF: atEOF}
|
||||
c := &t.context
|
||||
for c.next() && t.f(c) {
|
||||
c.checkpoint()
|
||||
}
|
||||
return c.ret()
|
||||
}
|
||||
|
||||
// lowerCaser implements the Transformer interface. The default Unicode lower
|
||||
// casing requires different treatment for the first and subsequent characters
|
||||
// of a word, most notably to handle the Greek final Sigma.
|
||||
type lowerCaser struct {
|
||||
context
|
||||
|
||||
first, midWord mapFunc
|
||||
}
|
||||
|
||||
func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
t.context = context{dst: dst, src: src, atEOF: atEOF}
|
||||
c := &t.context
|
||||
|
||||
for isInterWord := true; c.next(); {
|
||||
if isInterWord {
|
||||
if c.info.isCased() {
|
||||
if !t.first(c) {
|
||||
break
|
||||
}
|
||||
isInterWord = false
|
||||
} else if !c.copy() {
|
||||
break
|
||||
}
|
||||
} else {
|
||||
if c.info.isNotCasedAndNotCaseIgnorable() {
|
||||
if !c.copy() {
|
||||
break
|
||||
}
|
||||
isInterWord = true
|
||||
} else if !t.midWord(c) {
|
||||
break
|
||||
}
|
||||
}
|
||||
c.checkpoint()
|
||||
}
|
||||
return c.ret()
|
||||
}
|
||||
|
||||
// titleCaser implements the Transformer interface. Title casing algorithms
|
||||
// distinguish between the first letter of a word and subsequent letters of the
|
||||
// same word. It uses state to avoid requiring a potentially infinite lookahead.
|
||||
type titleCaser struct {
|
||||
context
|
||||
|
||||
// rune mappings used by the actual casing algorithms.
|
||||
title, lower mapFunc
|
||||
|
||||
rewrite func(*context)
|
||||
}
|
||||
|
||||
// Transform implements the standard Unicode title case algorithm as defined in
|
||||
// Chapter 3 of The Unicode Standard:
|
||||
// toTitlecase(X): Find the word boundaries in X according to Unicode Standard
|
||||
// Annex #29, "Unicode Text Segmentation." For each word boundary, find the
|
||||
// first cased character F following the word boundary. If F exists, map F to
|
||||
// Titlecase_Mapping(F); then map all characters C between F and the following
|
||||
// word boundary to Lowercase_Mapping(C).
|
||||
func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord}
|
||||
c := &t.context
|
||||
|
||||
if !c.next() {
|
||||
return c.ret()
|
||||
}
|
||||
|
||||
for {
|
||||
p := c.info
|
||||
if t.rewrite != nil {
|
||||
t.rewrite(c)
|
||||
}
|
||||
|
||||
wasMid := p.isCaseIgnorableAndNonBreakStarter()
|
||||
// Break out of this loop on failure to ensure we do not modify the
|
||||
// state incorrectly.
|
||||
if p.isCased() && !p.isCaseIgnorableAndNotCased() {
|
||||
if !c.isMidWord {
|
||||
if !t.title(c) {
|
||||
break
|
||||
}
|
||||
c.isMidWord = true
|
||||
} else if !t.lower(c) {
|
||||
break
|
||||
}
|
||||
} else if !c.copy() {
|
||||
break
|
||||
}
|
||||
|
||||
// TODO: make this an "else if" if we can prove that no rune that does
|
||||
// not match the first condition of the if statement can be a break.
|
||||
if p.isBreak() {
|
||||
c.isMidWord = false
|
||||
}
|
||||
|
||||
// As we save the state of the transformer, it is safe to call
|
||||
// checkpoint after any successful write.
|
||||
c.checkpoint()
|
||||
|
||||
if !c.next() {
|
||||
break
|
||||
}
|
||||
if wasMid && c.info.isCaseIgnorableAndNonBreakStarter() {
|
||||
c.isMidWord = false
|
||||
}
|
||||
}
|
||||
return c.ret()
|
||||
}
|
||||
|
||||
// finalSigma adds Greek final Sigma handing to another casing function. It
|
||||
// determines whether a lowercased sigma should be σ or ς, by looking ahead for
|
||||
// case-ignorables and a cased letters.
|
||||
func finalSigma(f mapFunc) mapFunc {
|
||||
return func(c *context) bool {
|
||||
// ::NFD();
|
||||
// # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
|
||||
// Σ } [:case-ignorable:]* [:cased:] → σ;
|
||||
// [:cased:] [:case-ignorable:]* { Σ → ς;
|
||||
// ::Any-Lower;
|
||||
// ::NFC();
|
||||
|
||||
if !c.hasPrefix("Σ") {
|
||||
return f(c)
|
||||
}
|
||||
|
||||
p := c.pDst
|
||||
c.writeString("ς")
|
||||
// We need to do one more iteration after maxIgnorable, as a cased
|
||||
// letter is not an ignorable and may modify the result.
|
||||
for i := 0; i < maxIgnorable+1; i++ {
|
||||
if !c.next() {
|
||||
return false
|
||||
}
|
||||
if !c.info.isCaseIgnorable() {
|
||||
if c.info.isCased() {
|
||||
// p+1 is guaranteed to be in bounds: if writing ς was
|
||||
// successful, p+1 will contain the second byte of ς. If not,
|
||||
// this function will have returned after c.next returned false.
|
||||
c.dst[p+1]++ // ς → σ
|
||||
}
|
||||
c.unreadRune()
|
||||
return true
|
||||
}
|
||||
// A case ignorable may also introduce a word break, so we may need
|
||||
// to continue searching even after detecting a break.
|
||||
c.isMidWord = c.isMidWord && !c.info.isBreak()
|
||||
c.copy()
|
||||
}
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// elUpper implements Greek upper casing, which entails removing a predefined
|
||||
// set of non-blocked modifiers. Note that these accents should not be removed
|
||||
// for title casing!
|
||||
// Example: "Οδός" -> "ΟΔΟΣ".
|
||||
func elUpper(c *context) bool {
|
||||
// From CLDR:
|
||||
// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ;
|
||||
// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ;
|
||||
|
||||
r, _ := utf8.DecodeRune(c.src[c.pSrc:])
|
||||
oldPDst := c.pDst
|
||||
if !upper(c) {
|
||||
return false
|
||||
}
|
||||
if !unicode.Is(unicode.Greek, r) {
|
||||
return true
|
||||
}
|
||||
i := 0
|
||||
// Take the properties of the uppercased rune that is already written to the
|
||||
// destination. This saves us the trouble of having to uppercase the
|
||||
// decomposed rune again.
|
||||
if b := norm.NFD.Properties(c.dst[oldPDst:]).Decomposition(); b != nil {
|
||||
// Restore the destination position and process the decomposed rune.
|
||||
r, sz := utf8.DecodeRune(b)
|
||||
if r <= 0xFF { // See A.6.1
|
||||
return true
|
||||
}
|
||||
c.pDst = oldPDst
|
||||
// Insert the first rune and ignore the modifiers. See A.6.2.
|
||||
c.writeBytes(b[:sz])
|
||||
i = len(b[sz:]) / 2 // Greek modifiers are always of length 2.
|
||||
}
|
||||
|
||||
for ; i < maxIgnorable && c.next(); i++ {
|
||||
switch r, _ := utf8.DecodeRune(c.src[c.pSrc:]); r {
|
||||
// Above and Iota Subscript
|
||||
case 0x0300, // U+0300 COMBINING GRAVE ACCENT
|
||||
0x0301, // U+0301 COMBINING ACUTE ACCENT
|
||||
0x0304, // U+0304 COMBINING MACRON
|
||||
0x0306, // U+0306 COMBINING BREVE
|
||||
0x0308, // U+0308 COMBINING DIAERESIS
|
||||
0x0313, // U+0313 COMBINING COMMA ABOVE
|
||||
0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE
|
||||
0x0342, // U+0342 COMBINING GREEK PERISPOMENI
|
||||
0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI
|
||||
// No-op. Gobble the modifier.
|
||||
|
||||
default:
|
||||
switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() {
|
||||
case cccZero:
|
||||
c.unreadRune()
|
||||
return true
|
||||
|
||||
// We don't need to test for IotaSubscript as the only rune that
|
||||
// qualifies (U+0345) was already excluded in the switch statement
|
||||
// above. See A.4.
|
||||
|
||||
case cccAbove:
|
||||
return c.copy()
|
||||
default:
|
||||
// Some other modifier. We're still allowed to gobble Greek
|
||||
// modifiers after this.
|
||||
c.copy()
|
||||
}
|
||||
}
|
||||
}
|
||||
return i == maxIgnorable
|
||||
}
|
||||
|
||||
func ltLower(c *context) bool {
|
||||
// From CLDR:
|
||||
// # Introduce an explicit dot above when lowercasing capital I's and J's
|
||||
// # whenever there are more accents above.
|
||||
// # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
|
||||
// # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
|
||||
// # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
|
||||
// # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
|
||||
// # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
|
||||
// # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
|
||||
// # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
|
||||
// ::NFD();
|
||||
// I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307;
|
||||
// J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307;
|
||||
// Į } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → į \u0307;
|
||||
// Ì → i \u0307 \u0300;
|
||||
// Í → i \u0307 \u0301;
|
||||
// Ĩ → i \u0307 \u0303;
|
||||
// ::Any-Lower();
|
||||
// ::NFC();
|
||||
|
||||
i := 0
|
||||
if r := c.src[c.pSrc]; r < utf8.RuneSelf {
|
||||
lower(c)
|
||||
if r != 'I' && r != 'J' {
|
||||
return true
|
||||
}
|
||||
} else {
|
||||
p := norm.NFD.Properties(c.src[c.pSrc:])
|
||||
if d := p.Decomposition(); len(d) >= 3 && (d[0] == 'I' || d[0] == 'J') {
|
||||
// UTF-8 optimization: the decomposition will only have an above
|
||||
// modifier if the last rune of the decomposition is in [U+300-U+311].
|
||||
// In all other cases, a decomposition starting with I is always
|
||||
// an I followed by modifiers that are not cased themselves. See A.2.
|
||||
if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4.
|
||||
if !c.writeBytes(d[:1]) {
|
||||
return false
|
||||
}
|
||||
c.dst[c.pDst-1] += 'a' - 'A' // lower
|
||||
|
||||
// Assumption: modifier never changes on lowercase. See A.1.
|
||||
// Assumption: all modifiers added have CCC = Above. See A.2.3.
|
||||
return c.writeString("\u0307") && c.writeBytes(d[1:])
|
||||
}
|
||||
// In all other cases the additional modifiers will have a CCC
|
||||
// that is less than 230 (Above). We will insert the U+0307, if
|
||||
// needed, after these modifiers so that a string in FCD form
|
||||
// will remain so. See A.2.2.
|
||||
lower(c)
|
||||
i = 1
|
||||
} else {
|
||||
return lower(c)
|
||||
}
|
||||
}
|
||||
|
||||
for ; i < maxIgnorable && c.next(); i++ {
|
||||
switch c.info.cccType() {
|
||||
case cccZero:
|
||||
c.unreadRune()
|
||||
return true
|
||||
case cccAbove:
|
||||
return c.writeString("\u0307") && c.copy() // See A.1.
|
||||
default:
|
||||
c.copy() // See A.1.
|
||||
}
|
||||
}
|
||||
return i == maxIgnorable
|
||||
}
|
||||
|
||||
func ltUpper(f mapFunc) mapFunc {
|
||||
return func(c *context) bool {
|
||||
// From CLDR:
|
||||
// ::NFD();
|
||||
// [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ;
|
||||
// ::Any-Upper();
|
||||
// ::NFC();
|
||||
|
||||
// TODO: See A.5. A soft-dotted rune never has an exception. This would
|
||||
// allow us to overload the exception bit and encode this property in
|
||||
// info. Need to measure performance impact of this.
|
||||
r, _ := utf8.DecodeRune(c.src[c.pSrc:])
|
||||
oldPDst := c.pDst
|
||||
if !f(c) {
|
||||
return false
|
||||
}
|
||||
if !unicode.Is(unicode.Soft_Dotted, r) {
|
||||
return true
|
||||
}
|
||||
|
||||
// We don't need to do an NFD normalization, as a soft-dotted rune never
|
||||
// contains U+0307. See A.3.
|
||||
|
||||
i := 0
|
||||
for ; i < maxIgnorable && c.next(); i++ {
|
||||
switch c.info.cccType() {
|
||||
case cccZero:
|
||||
c.unreadRune()
|
||||
return true
|
||||
case cccAbove:
|
||||
if c.hasPrefix("\u0307") {
|
||||
// We don't do a full NFC, but rather combine runes for
|
||||
// some of the common cases. (Returning NFC or
|
||||
// preserving normal form is neither a requirement nor
|
||||
// a possibility anyway).
|
||||
if !c.next() {
|
||||
return false
|
||||
}
|
||||
if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc {
|
||||
s := ""
|
||||
switch c.src[c.pSrc+1] {
|
||||
case 0x80: // U+0300 COMBINING GRAVE ACCENT
|
||||
s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE
|
||||
case 0x81: // U+0301 COMBINING ACUTE ACCENT
|
||||
s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE
|
||||
case 0x83: // U+0303 COMBINING TILDE
|
||||
s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE
|
||||
case 0x88: // U+0308 COMBINING DIAERESIS
|
||||
s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
|
||||
default:
|
||||
}
|
||||
if s != "" {
|
||||
c.pDst = oldPDst
|
||||
return c.writeString(s)
|
||||
}
|
||||
}
|
||||
}
|
||||
return c.copy()
|
||||
default:
|
||||
c.copy()
|
||||
}
|
||||
}
|
||||
return i == maxIgnorable
|
||||
}
|
||||
}
|
||||
|
||||
func aztrUpper(f mapFunc) mapFunc {
|
||||
return func(c *context) bool {
|
||||
// i→İ;
|
||||
if c.src[c.pSrc] == 'i' {
|
||||
return c.writeString("İ")
|
||||
}
|
||||
return f(c)
|
||||
}
|
||||
}
|
||||
|
||||
func aztrLower(c *context) (done bool) {
|
||||
// From CLDR:
|
||||
// # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
|
||||
// # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
// İ→i;
|
||||
// # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
|
||||
// # This matches the behavior of the canonically equivalent I-dot_above
|
||||
// # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
|
||||
// # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
|
||||
// # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
|
||||
// I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ;
|
||||
// I→ı ;
|
||||
// ::Any-Lower();
|
||||
if c.hasPrefix("\u0130") { // İ
|
||||
return c.writeString("i")
|
||||
}
|
||||
if c.src[c.pSrc] != 'I' {
|
||||
return lower(c)
|
||||
}
|
||||
|
||||
// We ignore the lower-case I for now, but insert it later when we know
|
||||
// which form we need.
|
||||
start := c.pSrc + c.sz
|
||||
|
||||
i := 0
|
||||
Loop:
|
||||
// We check for up to n ignorables before \u0307. As \u0307 is an
|
||||
// ignorable as well, n is maxIgnorable-1.
|
||||
for ; i < maxIgnorable && c.next(); i++ {
|
||||
switch c.info.cccType() {
|
||||
case cccAbove:
|
||||
if c.hasPrefix("\u0307") {
|
||||
return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307
|
||||
}
|
||||
done = true
|
||||
break Loop
|
||||
case cccZero:
|
||||
c.unreadRune()
|
||||
done = true
|
||||
break Loop
|
||||
default:
|
||||
// We'll write this rune after we know which starter to use.
|
||||
}
|
||||
}
|
||||
if i == maxIgnorable {
|
||||
done = true
|
||||
}
|
||||
return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done
|
||||
}
|
||||
|
||||
func nlTitle(c *context) bool {
|
||||
// From CLDR:
|
||||
// # Special titlecasing for Dutch initial "ij".
|
||||
// ::Any-Title();
|
||||
// # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
|
||||
// [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
|
||||
if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' {
|
||||
return title(c)
|
||||
}
|
||||
|
||||
if !c.writeString("I") || !c.next() {
|
||||
return false
|
||||
}
|
||||
if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' {
|
||||
return c.writeString("J")
|
||||
}
|
||||
c.unreadRune()
|
||||
return true
|
||||
}
|
||||
|
||||
// Not part of CLDR, but see http://unicode.org/cldr/trac/ticket/7078.
|
||||
func afnlRewrite(c *context) {
|
||||
if c.hasPrefix("'") || c.hasPrefix("’") {
|
||||
c.isMidWord = true
|
||||
}
|
||||
}
|
||||
2213
vendor/golang.org/x/text/cases/tables.go
generated
vendored
Normal file
2213
vendor/golang.org/x/text/cases/tables.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
213
vendor/golang.org/x/text/cases/trieval.go
generated
vendored
Normal file
213
vendor/golang.org/x/text/cases/trieval.go
generated
vendored
Normal file
|
|
@ -0,0 +1,213 @@
|
|||
// This file was generated by go generate; DO NOT EDIT
|
||||
|
||||
package cases
|
||||
|
||||
// This file contains definitions for interpreting the trie value of the case
|
||||
// trie generated by "go run gen*.go". It is shared by both the generator
|
||||
// program and the resultant package. Sharing is achieved by the generator
|
||||
// copying gen_trieval.go to trieval.go and changing what's above this comment.
|
||||
|
||||
// info holds case information for a single rune. It is the value returned
|
||||
// by a trie lookup. Most mapping information can be stored in a single 16-bit
|
||||
// value. If not, for example when a rune is mapped to multiple runes, the value
|
||||
// stores some basic case data and an index into an array with additional data.
|
||||
//
|
||||
// The per-rune values have the following format:
|
||||
//
|
||||
// if (exception) {
|
||||
// 15..5 unsigned exception index
|
||||
// 4 unused
|
||||
// } else {
|
||||
// 15..8 XOR pattern or index to XOR pattern for case mapping
|
||||
// Only 13..8 are used for XOR patterns.
|
||||
// 7 inverseFold (fold to upper, not to lower)
|
||||
// 6 index: interpret the XOR pattern as an index
|
||||
// 5..4 CCC: zero (normal or break), above or other
|
||||
// }
|
||||
// 3 exception: interpret this value as an exception index
|
||||
// (TODO: is this bit necessary? Probably implied from case mode.)
|
||||
// 2..0 case mode
|
||||
//
|
||||
// For the non-exceptional cases, a rune must be either uncased, lowercase or
|
||||
// uppercase. If the rune is cased, the XOR pattern maps either a lowercase
|
||||
// rune to uppercase or an uppercase rune to lowercase (applied to the 10
|
||||
// least-significant bits of the rune).
|
||||
//
|
||||
// See the definitions below for a more detailed description of the various
|
||||
// bits.
|
||||
type info uint16
|
||||
|
||||
const (
|
||||
casedMask = 0x0003
|
||||
fullCasedMask = 0x0007
|
||||
ignorableMask = 0x0006
|
||||
ignorableValue = 0x0004
|
||||
|
||||
inverseFoldBit = 1 << 7
|
||||
|
||||
exceptionBit = 1 << 3
|
||||
exceptionShift = 5
|
||||
numExceptionBits = 11
|
||||
|
||||
xorIndexBit = 1 << 6
|
||||
xorShift = 8
|
||||
|
||||
// There is no mapping if all xor bits and the exception bit are zero.
|
||||
hasMappingMask = 0xffc0 | exceptionBit
|
||||
)
|
||||
|
||||
// The case mode bits encodes the case type of a rune. This includes uncased,
|
||||
// title, upper and lower case and case ignorable. (For a definition of these
|
||||
// terms see Chapter 3 of The Unicode Standard Core Specification.) In some rare
|
||||
// cases, a rune can be both cased and case-ignorable. This is encoded by
|
||||
// cIgnorableCased. A rune of this type is always lower case. Some runes are
|
||||
// cased while not having a mapping.
|
||||
//
|
||||
// A common pattern for scripts in the Unicode standard is for upper and lower
|
||||
// case runes to alternate for increasing rune values (e.g. the accented Latin
|
||||
// ranges starting from U+0100 and U+1E00 among others and some Cyrillic
|
||||
// characters). We use this property by defining a cXORCase mode, where the case
|
||||
// mode (always upper or lower case) is derived from the rune value. As the XOR
|
||||
// pattern for case mappings is often identical for successive runes, using
|
||||
// cXORCase can result in large series of identical trie values. This, in turn,
|
||||
// allows us to better compress the trie blocks.
|
||||
const (
|
||||
cUncased info = iota // 000
|
||||
cTitle // 001
|
||||
cLower // 010
|
||||
cUpper // 011
|
||||
cIgnorableUncased // 100
|
||||
cIgnorableCased // 101 // lower case if mappings exist
|
||||
cXORCase // 11x // case is cLower | ((rune&1) ^ x)
|
||||
|
||||
maxCaseMode = cUpper
|
||||
)
|
||||
|
||||
func (c info) isCased() bool {
|
||||
return c&casedMask != 0
|
||||
}
|
||||
|
||||
func (c info) isCaseIgnorable() bool {
|
||||
return c&ignorableMask == ignorableValue
|
||||
}
|
||||
|
||||
func (c info) isCaseIgnorableAndNonBreakStarter() bool {
|
||||
return c&(fullCasedMask|cccMask) == (ignorableValue | cccZero)
|
||||
}
|
||||
|
||||
func (c info) isNotCasedAndNotCaseIgnorable() bool {
|
||||
return c&fullCasedMask == 0
|
||||
}
|
||||
|
||||
func (c info) isCaseIgnorableAndNotCased() bool {
|
||||
return c&fullCasedMask == cIgnorableUncased
|
||||
}
|
||||
|
||||
// The case mapping implementation will need to know about various Canonical
|
||||
// Combining Class (CCC) values. We encode two of these in the trie value:
|
||||
// cccZero (0) and cccAbove (230). If the value is cccOther, it means that
|
||||
// CCC(r) > 0, but not 230. A value of cccBreak means that CCC(r) == 0 and that
|
||||
// the rune also has the break category Break (see below).
|
||||
const (
|
||||
cccBreak info = iota << 4
|
||||
cccZero
|
||||
cccAbove
|
||||
cccOther
|
||||
|
||||
cccMask = cccBreak | cccZero | cccAbove | cccOther
|
||||
)
|
||||
|
||||
const (
|
||||
starter = 0
|
||||
above = 230
|
||||
iotaSubscript = 240
|
||||
)
|
||||
|
||||
// The exceptions slice holds data that does not fit in a normal info entry.
|
||||
// The entry is pointed to by the exception index in an entry. It has the
|
||||
// following format:
|
||||
//
|
||||
// Header
|
||||
// byte 0:
|
||||
// 7..6 unused
|
||||
// 5..4 CCC type (same bits as entry)
|
||||
// 3 unused
|
||||
// 2..0 length of fold
|
||||
//
|
||||
// byte 1:
|
||||
// 7..6 unused
|
||||
// 5..3 length of 1st mapping of case type
|
||||
// 2..0 length of 2nd mapping of case type
|
||||
//
|
||||
// case 1st 2nd
|
||||
// lower -> upper, title
|
||||
// upper -> lower, title
|
||||
// title -> lower, upper
|
||||
//
|
||||
// Lengths with the value 0x7 indicate no value and implies no change.
|
||||
// A length of 0 indicates a mapping to zero-length string.
|
||||
//
|
||||
// Body bytes:
|
||||
// case folding bytes
|
||||
// lowercase mapping bytes
|
||||
// uppercase mapping bytes
|
||||
// titlecase mapping bytes
|
||||
// closure mapping bytes (for NFKC_Casefold). (TODO)
|
||||
//
|
||||
// Fallbacks:
|
||||
// missing fold -> lower
|
||||
// missing title -> upper
|
||||
// all missing -> original rune
|
||||
//
|
||||
// exceptions starts with a dummy byte to enforce that there is no zero index
|
||||
// value.
|
||||
const (
|
||||
lengthMask = 0x07
|
||||
lengthBits = 3
|
||||
noChange = 0
|
||||
)
|
||||
|
||||
// References to generated trie.
|
||||
|
||||
var trie = newCaseTrie(0)
|
||||
|
||||
var sparse = sparseBlocks{
|
||||
values: sparseValues[:],
|
||||
offsets: sparseOffsets[:],
|
||||
}
|
||||
|
||||
// Sparse block lookup code.
|
||||
|
||||
// valueRange is an entry in a sparse block.
|
||||
type valueRange struct {
|
||||
value uint16
|
||||
lo, hi byte
|
||||
}
|
||||
|
||||
type sparseBlocks struct {
|
||||
values []valueRange
|
||||
offsets []uint16
|
||||
}
|
||||
|
||||
// lookup returns the value from values block n for byte b using binary search.
|
||||
func (s *sparseBlocks) lookup(n uint32, b byte) uint16 {
|
||||
lo := s.offsets[n]
|
||||
hi := s.offsets[n+1]
|
||||
for lo < hi {
|
||||
m := lo + (hi-lo)/2
|
||||
r := s.values[m]
|
||||
if r.lo <= b && b <= r.hi {
|
||||
return r.value
|
||||
}
|
||||
if b < r.lo {
|
||||
hi = m
|
||||
} else {
|
||||
lo = m + 1
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// lastRuneForTesting is the last rune used for testing. Everything after this
|
||||
// is boring.
|
||||
const lastRuneForTesting = rune(0x1FFFF)
|
||||
100
vendor/golang.org/x/text/internal/tag/tag.go
generated
vendored
Normal file
100
vendor/golang.org/x/text/internal/tag/tag.go
generated
vendored
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package tag contains functionality handling tags and related data.
|
||||
package tag
|
||||
|
||||
import "sort"
|
||||
|
||||
// An Index converts tags to a compact numeric value.
|
||||
//
|
||||
// All elements are of size 4. Tags may be up to 4 bytes long. Excess bytes can
|
||||
// be used to store additional information about the tag.
|
||||
type Index string
|
||||
|
||||
// Elem returns the element data at the given index.
|
||||
func (s Index) Elem(x int) string {
|
||||
return string(s[x*4 : x*4+4])
|
||||
}
|
||||
|
||||
// Index reports the index of the given key or -1 if it could not be found.
|
||||
// Only the first len(key) bytes from the start of the 4-byte entries will be
|
||||
// considered for the search and the first match in Index will be returned.
|
||||
func (s Index) Index(key []byte) int {
|
||||
n := len(key)
|
||||
// search the index of the first entry with an equal or higher value than
|
||||
// key in s.
|
||||
index := sort.Search(len(s)/4, func(i int) bool {
|
||||
return cmp(s[i*4:i*4+n], key) != -1
|
||||
})
|
||||
i := index * 4
|
||||
if cmp(s[i:i+len(key)], key) != 0 {
|
||||
return -1
|
||||
}
|
||||
return index
|
||||
}
|
||||
|
||||
// Next finds the next occurrence of key after index x, which must have been
|
||||
// obtained from a call to Index using the same key. It returns x+1 or -1.
|
||||
func (s Index) Next(key []byte, x int) int {
|
||||
if x++; x*4 < len(s) && cmp(s[x*4:x*4+len(key)], key) == 0 {
|
||||
return x
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// cmp returns an integer comparing a and b lexicographically.
|
||||
func cmp(a Index, b []byte) int {
|
||||
n := len(a)
|
||||
if len(b) < n {
|
||||
n = len(b)
|
||||
}
|
||||
for i, c := range b[:n] {
|
||||
switch {
|
||||
case a[i] > c:
|
||||
return 1
|
||||
case a[i] < c:
|
||||
return -1
|
||||
}
|
||||
}
|
||||
switch {
|
||||
case len(a) < len(b):
|
||||
return -1
|
||||
case len(a) > len(b):
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// Compare returns an integer comparing a and b lexicographically.
|
||||
func Compare(a string, b []byte) int {
|
||||
return cmp(Index(a), b)
|
||||
}
|
||||
|
||||
// FixCase reformats b to the same pattern of cases as form.
|
||||
// If returns false if string b is malformed.
|
||||
func FixCase(form string, b []byte) bool {
|
||||
if len(form) != len(b) {
|
||||
return false
|
||||
}
|
||||
for i, c := range b {
|
||||
if form[i] <= 'Z' {
|
||||
if c >= 'a' {
|
||||
c -= 'z' - 'Z'
|
||||
}
|
||||
if c < 'A' || 'Z' < c {
|
||||
return false
|
||||
}
|
||||
} else {
|
||||
if c <= 'Z' {
|
||||
c += 'z' - 'Z'
|
||||
}
|
||||
if c < 'a' || 'z' < c {
|
||||
return false
|
||||
}
|
||||
}
|
||||
b[i] = c
|
||||
}
|
||||
return true
|
||||
}
|
||||
16
vendor/golang.org/x/text/language/Makefile
generated
vendored
Normal file
16
vendor/golang.org/x/text/language/Makefile
generated
vendored
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
# Copyright 2013 The Go Authors. All rights reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
CLEANFILES+=maketables
|
||||
|
||||
maketables: maketables.go
|
||||
go build $^
|
||||
|
||||
tables: maketables
|
||||
./maketables > tables.go
|
||||
gofmt -w -s tables.go
|
||||
|
||||
# Build (but do not run) maketables during testing,
|
||||
# just to make sure it still compiles.
|
||||
testshort: maketables
|
||||
16
vendor/golang.org/x/text/language/common.go
generated
vendored
Normal file
16
vendor/golang.org/x/text/language/common.go
generated
vendored
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
// This file was generated by go generate; DO NOT EDIT
|
||||
|
||||
package language
|
||||
|
||||
// This file contains code common to the maketables.go and the package code.
|
||||
|
||||
// langAliasType is the type of an alias in langAliasMap.
|
||||
type langAliasType int8
|
||||
|
||||
const (
|
||||
langDeprecated langAliasType = iota
|
||||
langMacro
|
||||
langLegacy
|
||||
|
||||
langAliasTypeUnknown langAliasType = -1
|
||||
)
|
||||
197
vendor/golang.org/x/text/language/coverage.go
generated
vendored
Normal file
197
vendor/golang.org/x/text/language/coverage.go
generated
vendored
Normal file
|
|
@ -0,0 +1,197 @@
|
|||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// The Coverage interface is used to define the level of coverage of an
|
||||
// internationalization service. Note that not all types are supported by all
|
||||
// services. As lists may be generated on the fly, it is recommended that users
|
||||
// of a Coverage cache the results.
|
||||
type Coverage interface {
|
||||
// Tags returns the list of supported tags.
|
||||
Tags() []Tag
|
||||
|
||||
// BaseLanguages returns the list of supported base languages.
|
||||
BaseLanguages() []Base
|
||||
|
||||
// Scripts returns the list of supported scripts.
|
||||
Scripts() []Script
|
||||
|
||||
// Regions returns the list of supported regions.
|
||||
Regions() []Region
|
||||
}
|
||||
|
||||
var (
|
||||
// Supported defines a Coverage that lists all supported subtags. Tags
|
||||
// always returns nil.
|
||||
Supported Coverage = allSubtags{}
|
||||
)
|
||||
|
||||
// TODO:
|
||||
// - Support Variants, numbering systems.
|
||||
// - CLDR coverage levels.
|
||||
// - Set of common tags defined in this package.
|
||||
|
||||
type allSubtags struct{}
|
||||
|
||||
// Regions returns the list of supported regions. As all regions are in a
|
||||
// consecutive range, it simply returns a slice of numbers in increasing order.
|
||||
// The "undefined" region is not returned.
|
||||
func (s allSubtags) Regions() []Region {
|
||||
reg := make([]Region, numRegions)
|
||||
for i := range reg {
|
||||
reg[i] = Region{regionID(i + 1)}
|
||||
}
|
||||
return reg
|
||||
}
|
||||
|
||||
// Scripts returns the list of supported scripts. As all scripts are in a
|
||||
// consecutive range, it simply returns a slice of numbers in increasing order.
|
||||
// The "undefined" script is not returned.
|
||||
func (s allSubtags) Scripts() []Script {
|
||||
scr := make([]Script, numScripts)
|
||||
for i := range scr {
|
||||
scr[i] = Script{scriptID(i + 1)}
|
||||
}
|
||||
return scr
|
||||
}
|
||||
|
||||
// BaseLanguages returns the list of all supported base languages. It generates
|
||||
// the list by traversing the internal structures.
|
||||
func (s allSubtags) BaseLanguages() []Base {
|
||||
base := make([]Base, 0, numLanguages)
|
||||
for i := 0; i < langNoIndexOffset; i++ {
|
||||
// We included "und" already for the value 0.
|
||||
if i != nonCanonicalUnd {
|
||||
base = append(base, Base{langID(i)})
|
||||
}
|
||||
}
|
||||
i := langNoIndexOffset
|
||||
for _, v := range langNoIndex {
|
||||
for k := 0; k < 8; k++ {
|
||||
if v&1 == 1 {
|
||||
base = append(base, Base{langID(i)})
|
||||
}
|
||||
v >>= 1
|
||||
i++
|
||||
}
|
||||
}
|
||||
return base
|
||||
}
|
||||
|
||||
// Tags always returns nil.
|
||||
func (s allSubtags) Tags() []Tag {
|
||||
return nil
|
||||
}
|
||||
|
||||
// coverage is used used by NewCoverage which is used as a convenient way for
|
||||
// creating Coverage implementations for partially defined data. Very often a
|
||||
// package will only need to define a subset of slices. coverage provides a
|
||||
// convenient way to do this. Moreover, packages using NewCoverage, instead of
|
||||
// their own implementation, will not break if later new slice types are added.
|
||||
type coverage struct {
|
||||
tags func() []Tag
|
||||
bases func() []Base
|
||||
scripts func() []Script
|
||||
regions func() []Region
|
||||
}
|
||||
|
||||
func (s *coverage) Tags() []Tag {
|
||||
if s.tags == nil {
|
||||
return nil
|
||||
}
|
||||
return s.tags()
|
||||
}
|
||||
|
||||
// bases implements sort.Interface and is used to sort base languages.
|
||||
type bases []Base
|
||||
|
||||
func (b bases) Len() int {
|
||||
return len(b)
|
||||
}
|
||||
|
||||
func (b bases) Swap(i, j int) {
|
||||
b[i], b[j] = b[j], b[i]
|
||||
}
|
||||
|
||||
func (b bases) Less(i, j int) bool {
|
||||
return b[i].langID < b[j].langID
|
||||
}
|
||||
|
||||
// BaseLanguages returns the result from calling s.bases if it is specified or
|
||||
// otherwise derives the set of supported base languages from tags.
|
||||
func (s *coverage) BaseLanguages() []Base {
|
||||
if s.bases == nil {
|
||||
tags := s.Tags()
|
||||
if len(tags) == 0 {
|
||||
return nil
|
||||
}
|
||||
a := make([]Base, len(tags))
|
||||
for i, t := range tags {
|
||||
a[i] = Base{langID(t.lang)}
|
||||
}
|
||||
sort.Sort(bases(a))
|
||||
k := 0
|
||||
for i := 1; i < len(a); i++ {
|
||||
if a[k] != a[i] {
|
||||
k++
|
||||
a[k] = a[i]
|
||||
}
|
||||
}
|
||||
return a[:k+1]
|
||||
}
|
||||
return s.bases()
|
||||
}
|
||||
|
||||
func (s *coverage) Scripts() []Script {
|
||||
if s.scripts == nil {
|
||||
return nil
|
||||
}
|
||||
return s.scripts()
|
||||
}
|
||||
|
||||
func (s *coverage) Regions() []Region {
|
||||
if s.regions == nil {
|
||||
return nil
|
||||
}
|
||||
return s.regions()
|
||||
}
|
||||
|
||||
// NewCoverage returns a Coverage for the given lists. It is typically used by
|
||||
// packages providing internationalization services to define their level of
|
||||
// coverage. A list may be of type []T or func() []T, where T is either Tag,
|
||||
// Base, Script or Region. The returned Coverage derives the value for Bases
|
||||
// from Tags if no func or slice for []Base is specified. For other unspecified
|
||||
// types the returned Coverage will return nil for the respective methods.
|
||||
func NewCoverage(list ...interface{}) Coverage {
|
||||
s := &coverage{}
|
||||
for _, x := range list {
|
||||
switch v := x.(type) {
|
||||
case func() []Base:
|
||||
s.bases = v
|
||||
case func() []Script:
|
||||
s.scripts = v
|
||||
case func() []Region:
|
||||
s.regions = v
|
||||
case func() []Tag:
|
||||
s.tags = v
|
||||
case []Base:
|
||||
s.bases = func() []Base { return v }
|
||||
case []Script:
|
||||
s.scripts = func() []Script { return v }
|
||||
case []Region:
|
||||
s.regions = func() []Region { return v }
|
||||
case []Tag:
|
||||
s.tags = func() []Tag { return v }
|
||||
default:
|
||||
panic(fmt.Sprintf("language: unsupported set type %T", v))
|
||||
}
|
||||
}
|
||||
return s
|
||||
}
|
||||
20
vendor/golang.org/x/text/language/gen_common.go
generated
vendored
Normal file
20
vendor/golang.org/x/text/language/gen_common.go
generated
vendored
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// This file contains code common to the maketables.go and the package code.
|
||||
|
||||
// langAliasType is the type of an alias in langAliasMap.
|
||||
type langAliasType int8
|
||||
|
||||
const (
|
||||
langDeprecated langAliasType = iota
|
||||
langMacro
|
||||
langLegacy
|
||||
|
||||
langAliasTypeUnknown langAliasType = -1
|
||||
)
|
||||
162
vendor/golang.org/x/text/language/gen_index.go
generated
vendored
Normal file
162
vendor/golang.org/x/text/language/gen_index.go
generated
vendored
Normal file
|
|
@ -0,0 +1,162 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// This file generates derivative tables based on the language package itself.
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"reflect"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/text/internal/gen"
|
||||
"golang.org/x/text/language"
|
||||
"golang.org/x/text/unicode/cldr"
|
||||
)
|
||||
|
||||
var (
|
||||
test = flag.Bool("test", false,
|
||||
"test existing tables; can be used to compare web data with package data.")
|
||||
|
||||
draft = flag.String("draft",
|
||||
"contributed",
|
||||
`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
gen.Init()
|
||||
|
||||
// Read the CLDR zip file.
|
||||
r := gen.OpenCLDRCoreZip()
|
||||
defer r.Close()
|
||||
|
||||
d := &cldr.Decoder{}
|
||||
data, err := d.DecodeZip(r)
|
||||
if err != nil {
|
||||
log.Fatalf("DecodeZip: %v", err)
|
||||
}
|
||||
|
||||
w := gen.NewCodeWriter()
|
||||
defer func() {
|
||||
buf := &bytes.Buffer{}
|
||||
|
||||
if _, err = w.WriteGo(buf, "language"); err != nil {
|
||||
log.Fatalf("Error formatting file index.go: %v", err)
|
||||
}
|
||||
|
||||
// Since we're generating a table for our own package we need to rewrite
|
||||
// doing the equivalent of go fmt -r 'language.b -> b'. Using
|
||||
// bytes.Replace will do.
|
||||
out := bytes.Replace(buf.Bytes(), []byte("language."), nil, -1)
|
||||
if err := ioutil.WriteFile("index.go", out, 0600); err != nil {
|
||||
log.Fatalf("Could not create file index.go: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
m := map[language.Tag]bool{}
|
||||
for _, lang := range data.Locales() {
|
||||
// We include all locales unconditionally to be consistent with en_US.
|
||||
// We want en_US, even though it has no data associated with it.
|
||||
|
||||
// TODO: put any of the languages for which no data exists at the end
|
||||
// of the index. This allows all components based on ICU to use that
|
||||
// as the cutoff point.
|
||||
// if x := data.RawLDML(lang); false ||
|
||||
// x.LocaleDisplayNames != nil ||
|
||||
// x.Characters != nil ||
|
||||
// x.Delimiters != nil ||
|
||||
// x.Measurement != nil ||
|
||||
// x.Dates != nil ||
|
||||
// x.Numbers != nil ||
|
||||
// x.Units != nil ||
|
||||
// x.ListPatterns != nil ||
|
||||
// x.Collations != nil ||
|
||||
// x.Segmentations != nil ||
|
||||
// x.Rbnf != nil ||
|
||||
// x.Annotations != nil ||
|
||||
// x.Metadata != nil {
|
||||
|
||||
// TODO: support POSIX natively, albeit non-standard.
|
||||
tag := language.Make(strings.Replace(lang, "_POSIX", "-u-va-posix", 1))
|
||||
m[tag] = true
|
||||
// }
|
||||
}
|
||||
// Include locales for plural rules, which uses a different structure.
|
||||
for _, plurals := range data.Supplemental().Plurals {
|
||||
for _, rules := range plurals.PluralRules {
|
||||
for _, lang := range strings.Split(rules.Locales, " ") {
|
||||
m[language.Make(lang)] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var core, special []language.Tag
|
||||
|
||||
for t := range m {
|
||||
if x := t.Extensions(); len(x) != 0 && fmt.Sprint(x) != "[u-va-posix]" {
|
||||
log.Fatalf("Unexpected extension %v in %v", x, t)
|
||||
}
|
||||
if len(t.Variants()) == 0 && len(t.Extensions()) == 0 {
|
||||
core = append(core, t)
|
||||
} else {
|
||||
special = append(special, t)
|
||||
}
|
||||
}
|
||||
|
||||
w.WriteComment(`
|
||||
NumCompactTags is the number of common tags. The maximum tag is
|
||||
NumCompactTags-1.`)
|
||||
w.WriteConst("NumCompactTags", len(core)+len(special))
|
||||
|
||||
sort.Sort(byAlpha(special))
|
||||
w.WriteVar("specialTags", special)
|
||||
|
||||
// TODO: order by frequency?
|
||||
sort.Sort(byAlpha(core))
|
||||
|
||||
// Size computations are just an estimate.
|
||||
w.Size += int(reflect.TypeOf(map[uint32]uint16{}).Size())
|
||||
w.Size += len(core) * 6 // size of uint32 and uint16
|
||||
|
||||
fmt.Fprintln(w)
|
||||
fmt.Fprintln(w, "var coreTags = map[uint32]uint16{")
|
||||
fmt.Fprintln(w, "0x0: 0, // und")
|
||||
i := len(special) + 1 // Und and special tags already written.
|
||||
for _, t := range core {
|
||||
if t == language.Und {
|
||||
continue
|
||||
}
|
||||
fmt.Fprint(w.Hash, t, i)
|
||||
b, s, r := t.Raw()
|
||||
fmt.Fprintf(w, "0x%s%s%s: %d, // %s\n",
|
||||
getIndex(b, 3), // 3 is enough as it is guaranteed to be a compact number
|
||||
getIndex(s, 2),
|
||||
getIndex(r, 3),
|
||||
i, t)
|
||||
i++
|
||||
}
|
||||
fmt.Fprintln(w, "}")
|
||||
}
|
||||
|
||||
// getIndex prints the subtag type and extracts its index of size nibble.
|
||||
// If the index is less than n nibbles, the result is prefixed with 0s.
|
||||
func getIndex(x interface{}, n int) string {
|
||||
s := fmt.Sprintf("%#v", x) // s is of form Type{typeID: 0x00}
|
||||
s = s[strings.Index(s, "0x")+2 : len(s)-1]
|
||||
return strings.Repeat("0", n-len(s)) + s
|
||||
}
|
||||
|
||||
type byAlpha []language.Tag
|
||||
|
||||
func (a byAlpha) Len() int { return len(a) }
|
||||
func (a byAlpha) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
||||
func (a byAlpha) Less(i, j int) bool { return a[i].String() < a[j].String() }
|
||||
38
vendor/golang.org/x/text/language/go1_1.go
generated
vendored
Normal file
38
vendor/golang.org/x/text/language/go1_1.go
generated
vendored
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !go1.2
|
||||
|
||||
package language
|
||||
|
||||
import "sort"
|
||||
|
||||
func sortStable(s sort.Interface) {
|
||||
ss := stableSort{
|
||||
s: s,
|
||||
pos: make([]int, s.Len()),
|
||||
}
|
||||
for i := range ss.pos {
|
||||
ss.pos[i] = i
|
||||
}
|
||||
sort.Sort(&ss)
|
||||
}
|
||||
|
||||
type stableSort struct {
|
||||
s sort.Interface
|
||||
pos []int
|
||||
}
|
||||
|
||||
func (s *stableSort) Len() int {
|
||||
return len(s.pos)
|
||||
}
|
||||
|
||||
func (s *stableSort) Less(i, j int) bool {
|
||||
return s.s.Less(i, j) || !s.s.Less(j, i) && s.pos[i] < s.pos[j]
|
||||
}
|
||||
|
||||
func (s *stableSort) Swap(i, j int) {
|
||||
s.s.Swap(i, j)
|
||||
s.pos[i], s.pos[j] = s.pos[j], s.pos[i]
|
||||
}
|
||||
11
vendor/golang.org/x/text/language/go1_2.go
generated
vendored
Normal file
11
vendor/golang.org/x/text/language/go1_2.go
generated
vendored
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build go1.2
|
||||
|
||||
package language
|
||||
|
||||
import "sort"
|
||||
|
||||
var sortStable = sort.Stable
|
||||
762
vendor/golang.org/x/text/language/index.go
generated
vendored
Normal file
762
vendor/golang.org/x/text/language/index.go
generated
vendored
Normal file
|
|
@ -0,0 +1,762 @@
|
|||
// This file was generated by go generate; DO NOT EDIT
|
||||
|
||||
package language
|
||||
|
||||
// NumCompactTags is the number of common tags. The maximum tag is
|
||||
// NumCompactTags-1.
|
||||
const NumCompactTags = 747
|
||||
|
||||
var specialTags = []Tag{ // 2 elements
|
||||
0: {lang: 0x61, region: 0x6d, script: 0x0, pVariant: 0x5, pExt: 0xe, str: "ca-ES-valencia"},
|
||||
1: {lang: 0x9b, region: 0x132, script: 0x0, pVariant: 0x5, pExt: 0x5, str: "en-US-u-va-posix"},
|
||||
} // Size: 72 bytes
|
||||
|
||||
var coreTags = map[uint32]uint16{
|
||||
0x0: 0, // und
|
||||
0x00a00000: 3, // af
|
||||
0x00a000d0: 4, // af-NA
|
||||
0x00a0015e: 5, // af-ZA
|
||||
0x00b00000: 6, // agq
|
||||
0x00b00051: 7, // agq-CM
|
||||
0x00d00000: 8, // ak
|
||||
0x00d0007e: 9, // ak-GH
|
||||
0x01100000: 10, // am
|
||||
0x0110006e: 11, // am-ET
|
||||
0x01500000: 12, // ar
|
||||
0x01500001: 13, // ar-001
|
||||
0x01500022: 14, // ar-AE
|
||||
0x01500038: 15, // ar-BH
|
||||
0x01500061: 16, // ar-DJ
|
||||
0x01500066: 17, // ar-DZ
|
||||
0x0150006a: 18, // ar-EG
|
||||
0x0150006b: 19, // ar-EH
|
||||
0x0150006c: 20, // ar-ER
|
||||
0x01500095: 21, // ar-IL
|
||||
0x01500099: 22, // ar-IQ
|
||||
0x0150009f: 23, // ar-JO
|
||||
0x015000a6: 24, // ar-KM
|
||||
0x015000aa: 25, // ar-KW
|
||||
0x015000ae: 26, // ar-LB
|
||||
0x015000b7: 27, // ar-LY
|
||||
0x015000b8: 28, // ar-MA
|
||||
0x015000c7: 29, // ar-MR
|
||||
0x015000df: 30, // ar-OM
|
||||
0x015000eb: 31, // ar-PS
|
||||
0x015000f1: 32, // ar-QA
|
||||
0x01500106: 33, // ar-SA
|
||||
0x01500109: 34, // ar-SD
|
||||
0x01500113: 35, // ar-SO
|
||||
0x01500115: 36, // ar-SS
|
||||
0x0150011a: 37, // ar-SY
|
||||
0x0150011e: 38, // ar-TD
|
||||
0x01500126: 39, // ar-TN
|
||||
0x0150015b: 40, // ar-YE
|
||||
0x01c00000: 41, // as
|
||||
0x01c00097: 42, // as-IN
|
||||
0x01d00000: 43, // asa
|
||||
0x01d0012d: 44, // asa-TZ
|
||||
0x01f00000: 45, // ast
|
||||
0x01f0006d: 46, // ast-ES
|
||||
0x02400000: 47, // az
|
||||
0x0241e000: 48, // az-Cyrl
|
||||
0x0241e031: 49, // az-Cyrl-AZ
|
||||
0x02452000: 50, // az-Latn
|
||||
0x02452031: 51, // az-Latn-AZ
|
||||
0x02a00000: 52, // bas
|
||||
0x02a00051: 53, // bas-CM
|
||||
0x02f00000: 54, // be
|
||||
0x02f00046: 55, // be-BY
|
||||
0x03100000: 56, // bem
|
||||
0x0310015f: 57, // bem-ZM
|
||||
0x03300000: 58, // bez
|
||||
0x0330012d: 59, // bez-TZ
|
||||
0x03800000: 60, // bg
|
||||
0x03800037: 61, // bg-BG
|
||||
0x03c00000: 62, // bh
|
||||
0x04900000: 63, // bm
|
||||
0x049000c1: 64, // bm-ML
|
||||
0x04b00000: 65, // bn
|
||||
0x04b00034: 66, // bn-BD
|
||||
0x04b00097: 67, // bn-IN
|
||||
0x04c00000: 68, // bo
|
||||
0x04c00052: 69, // bo-CN
|
||||
0x04c00097: 70, // bo-IN
|
||||
0x05000000: 71, // br
|
||||
0x05000076: 72, // br-FR
|
||||
0x05300000: 73, // brx
|
||||
0x05300097: 74, // brx-IN
|
||||
0x05400000: 75, // bs
|
||||
0x0541e000: 76, // bs-Cyrl
|
||||
0x0541e032: 77, // bs-Cyrl-BA
|
||||
0x05452000: 78, // bs-Latn
|
||||
0x05452032: 79, // bs-Latn-BA
|
||||
0x06100000: 80, // ca
|
||||
0x06100021: 81, // ca-AD
|
||||
0x0610006d: 82, // ca-ES
|
||||
0x06100076: 83, // ca-FR
|
||||
0x0610009c: 84, // ca-IT
|
||||
0x06400000: 85, // ce
|
||||
0x06400104: 86, // ce-RU
|
||||
0x06600000: 87, // cgg
|
||||
0x0660012f: 88, // cgg-UG
|
||||
0x06c00000: 89, // chr
|
||||
0x06c00132: 90, // chr-US
|
||||
0x06f00000: 91, // ckb
|
||||
0x06f00099: 92, // ckb-IQ
|
||||
0x06f0009a: 93, // ckb-IR
|
||||
0x07900000: 94, // cs
|
||||
0x0790005d: 95, // cs-CZ
|
||||
0x07d00000: 96, // cu
|
||||
0x07d00104: 97, // cu-RU
|
||||
0x07f00000: 98, // cy
|
||||
0x07f00079: 99, // cy-GB
|
||||
0x08000000: 100, // da
|
||||
0x08000062: 101, // da-DK
|
||||
0x08000080: 102, // da-GL
|
||||
0x08300000: 103, // dav
|
||||
0x083000a2: 104, // dav-KE
|
||||
0x08500000: 105, // de
|
||||
0x0850002d: 106, // de-AT
|
||||
0x08500035: 107, // de-BE
|
||||
0x0850004d: 108, // de-CH
|
||||
0x0850005f: 109, // de-DE
|
||||
0x085000b0: 110, // de-LI
|
||||
0x085000b5: 111, // de-LU
|
||||
0x08800000: 112, // dje
|
||||
0x088000d2: 113, // dje-NE
|
||||
0x08b00000: 114, // dsb
|
||||
0x08b0005f: 115, // dsb-DE
|
||||
0x08f00000: 116, // dua
|
||||
0x08f00051: 117, // dua-CM
|
||||
0x09000000: 118, // dv
|
||||
0x09100000: 119, // dyo
|
||||
0x09100112: 120, // dyo-SN
|
||||
0x09300000: 121, // dz
|
||||
0x09300042: 122, // dz-BT
|
||||
0x09400000: 123, // ebu
|
||||
0x094000a2: 124, // ebu-KE
|
||||
0x09500000: 125, // ee
|
||||
0x0950007e: 126, // ee-GH
|
||||
0x09500120: 127, // ee-TG
|
||||
0x09a00000: 128, // el
|
||||
0x09a0005c: 129, // el-CY
|
||||
0x09a00085: 130, // el-GR
|
||||
0x09b00000: 131, // en
|
||||
0x09b00001: 132, // en-001
|
||||
0x09b0001a: 133, // en-150
|
||||
0x09b00024: 134, // en-AG
|
||||
0x09b00025: 135, // en-AI
|
||||
0x09b0002c: 136, // en-AS
|
||||
0x09b0002d: 137, // en-AT
|
||||
0x09b0002e: 138, // en-AU
|
||||
0x09b00033: 139, // en-BB
|
||||
0x09b00035: 140, // en-BE
|
||||
0x09b00039: 141, // en-BI
|
||||
0x09b0003c: 142, // en-BM
|
||||
0x09b00041: 143, // en-BS
|
||||
0x09b00045: 144, // en-BW
|
||||
0x09b00047: 145, // en-BZ
|
||||
0x09b00048: 146, // en-CA
|
||||
0x09b00049: 147, // en-CC
|
||||
0x09b0004d: 148, // en-CH
|
||||
0x09b0004f: 149, // en-CK
|
||||
0x09b00051: 150, // en-CM
|
||||
0x09b0005b: 151, // en-CX
|
||||
0x09b0005c: 152, // en-CY
|
||||
0x09b0005f: 153, // en-DE
|
||||
0x09b00060: 154, // en-DG
|
||||
0x09b00062: 155, // en-DK
|
||||
0x09b00063: 156, // en-DM
|
||||
0x09b0006c: 157, // en-ER
|
||||
0x09b00070: 158, // en-FI
|
||||
0x09b00071: 159, // en-FJ
|
||||
0x09b00072: 160, // en-FK
|
||||
0x09b00073: 161, // en-FM
|
||||
0x09b00079: 162, // en-GB
|
||||
0x09b0007a: 163, // en-GD
|
||||
0x09b0007d: 164, // en-GG
|
||||
0x09b0007e: 165, // en-GH
|
||||
0x09b0007f: 166, // en-GI
|
||||
0x09b00081: 167, // en-GM
|
||||
0x09b00088: 168, // en-GU
|
||||
0x09b0008a: 169, // en-GY
|
||||
0x09b0008b: 170, // en-HK
|
||||
0x09b00094: 171, // en-IE
|
||||
0x09b00095: 172, // en-IL
|
||||
0x09b00096: 173, // en-IM
|
||||
0x09b00097: 174, // en-IN
|
||||
0x09b00098: 175, // en-IO
|
||||
0x09b0009d: 176, // en-JE
|
||||
0x09b0009e: 177, // en-JM
|
||||
0x09b000a2: 178, // en-KE
|
||||
0x09b000a5: 179, // en-KI
|
||||
0x09b000a7: 180, // en-KN
|
||||
0x09b000ab: 181, // en-KY
|
||||
0x09b000af: 182, // en-LC
|
||||
0x09b000b2: 183, // en-LR
|
||||
0x09b000b3: 184, // en-LS
|
||||
0x09b000bd: 185, // en-MG
|
||||
0x09b000be: 186, // en-MH
|
||||
0x09b000c4: 187, // en-MO
|
||||
0x09b000c5: 188, // en-MP
|
||||
0x09b000c8: 189, // en-MS
|
||||
0x09b000c9: 190, // en-MT
|
||||
0x09b000ca: 191, // en-MU
|
||||
0x09b000cc: 192, // en-MW
|
||||
0x09b000ce: 193, // en-MY
|
||||
0x09b000d0: 194, // en-NA
|
||||
0x09b000d3: 195, // en-NF
|
||||
0x09b000d4: 196, // en-NG
|
||||
0x09b000d7: 197, // en-NL
|
||||
0x09b000db: 198, // en-NR
|
||||
0x09b000dd: 199, // en-NU
|
||||
0x09b000de: 200, // en-NZ
|
||||
0x09b000e4: 201, // en-PG
|
||||
0x09b000e5: 202, // en-PH
|
||||
0x09b000e6: 203, // en-PK
|
||||
0x09b000e9: 204, // en-PN
|
||||
0x09b000ea: 205, // en-PR
|
||||
0x09b000ee: 206, // en-PW
|
||||
0x09b00105: 207, // en-RW
|
||||
0x09b00107: 208, // en-SB
|
||||
0x09b00108: 209, // en-SC
|
||||
0x09b00109: 210, // en-SD
|
||||
0x09b0010a: 211, // en-SE
|
||||
0x09b0010b: 212, // en-SG
|
||||
0x09b0010c: 213, // en-SH
|
||||
0x09b0010d: 214, // en-SI
|
||||
0x09b00110: 215, // en-SL
|
||||
0x09b00115: 216, // en-SS
|
||||
0x09b00119: 217, // en-SX
|
||||
0x09b0011b: 218, // en-SZ
|
||||
0x09b0011d: 219, // en-TC
|
||||
0x09b00123: 220, // en-TK
|
||||
0x09b00127: 221, // en-TO
|
||||
0x09b0012a: 222, // en-TT
|
||||
0x09b0012b: 223, // en-TV
|
||||
0x09b0012d: 224, // en-TZ
|
||||
0x09b0012f: 225, // en-UG
|
||||
0x09b00131: 226, // en-UM
|
||||
0x09b00132: 227, // en-US
|
||||
0x09b00136: 228, // en-VC
|
||||
0x09b00139: 229, // en-VG
|
||||
0x09b0013a: 230, // en-VI
|
||||
0x09b0013c: 231, // en-VU
|
||||
0x09b0013f: 232, // en-WS
|
||||
0x09b0015e: 233, // en-ZA
|
||||
0x09b0015f: 234, // en-ZM
|
||||
0x09b00161: 235, // en-ZW
|
||||
0x09c00000: 236, // eo
|
||||
0x09c00001: 237, // eo-001
|
||||
0x09d00000: 238, // es
|
||||
0x09d0001e: 239, // es-419
|
||||
0x09d0002b: 240, // es-AR
|
||||
0x09d0003e: 241, // es-BO
|
||||
0x09d00040: 242, // es-BR
|
||||
0x09d00050: 243, // es-CL
|
||||
0x09d00053: 244, // es-CO
|
||||
0x09d00055: 245, // es-CR
|
||||
0x09d00058: 246, // es-CU
|
||||
0x09d00064: 247, // es-DO
|
||||
0x09d00067: 248, // es-EA
|
||||
0x09d00068: 249, // es-EC
|
||||
0x09d0006d: 250, // es-ES
|
||||
0x09d00084: 251, // es-GQ
|
||||
0x09d00087: 252, // es-GT
|
||||
0x09d0008d: 253, // es-HN
|
||||
0x09d00092: 254, // es-IC
|
||||
0x09d000cd: 255, // es-MX
|
||||
0x09d000d6: 256, // es-NI
|
||||
0x09d000e0: 257, // es-PA
|
||||
0x09d000e2: 258, // es-PE
|
||||
0x09d000e5: 259, // es-PH
|
||||
0x09d000ea: 260, // es-PR
|
||||
0x09d000ef: 261, // es-PY
|
||||
0x09d00118: 262, // es-SV
|
||||
0x09d00132: 263, // es-US
|
||||
0x09d00133: 264, // es-UY
|
||||
0x09d00138: 265, // es-VE
|
||||
0x09f00000: 266, // et
|
||||
0x09f00069: 267, // et-EE
|
||||
0x0a100000: 268, // eu
|
||||
0x0a10006d: 269, // eu-ES
|
||||
0x0a200000: 270, // ewo
|
||||
0x0a200051: 271, // ewo-CM
|
||||
0x0a400000: 272, // fa
|
||||
0x0a400023: 273, // fa-AF
|
||||
0x0a40009a: 274, // fa-IR
|
||||
0x0a600000: 275, // ff
|
||||
0x0a600051: 276, // ff-CM
|
||||
0x0a600082: 277, // ff-GN
|
||||
0x0a6000c7: 278, // ff-MR
|
||||
0x0a600112: 279, // ff-SN
|
||||
0x0a800000: 280, // fi
|
||||
0x0a800070: 281, // fi-FI
|
||||
0x0aa00000: 282, // fil
|
||||
0x0aa000e5: 283, // fil-PH
|
||||
0x0ad00000: 284, // fo
|
||||
0x0ad00062: 285, // fo-DK
|
||||
0x0ad00074: 286, // fo-FO
|
||||
0x0af00000: 287, // fr
|
||||
0x0af00035: 288, // fr-BE
|
||||
0x0af00036: 289, // fr-BF
|
||||
0x0af00039: 290, // fr-BI
|
||||
0x0af0003a: 291, // fr-BJ
|
||||
0x0af0003b: 292, // fr-BL
|
||||
0x0af00048: 293, // fr-CA
|
||||
0x0af0004a: 294, // fr-CD
|
||||
0x0af0004b: 295, // fr-CF
|
||||
0x0af0004c: 296, // fr-CG
|
||||
0x0af0004d: 297, // fr-CH
|
||||
0x0af0004e: 298, // fr-CI
|
||||
0x0af00051: 299, // fr-CM
|
||||
0x0af00061: 300, // fr-DJ
|
||||
0x0af00066: 301, // fr-DZ
|
||||
0x0af00076: 302, // fr-FR
|
||||
0x0af00078: 303, // fr-GA
|
||||
0x0af0007c: 304, // fr-GF
|
||||
0x0af00082: 305, // fr-GN
|
||||
0x0af00083: 306, // fr-GP
|
||||
0x0af00084: 307, // fr-GQ
|
||||
0x0af0008f: 308, // fr-HT
|
||||
0x0af000a6: 309, // fr-KM
|
||||
0x0af000b5: 310, // fr-LU
|
||||
0x0af000b8: 311, // fr-MA
|
||||
0x0af000b9: 312, // fr-MC
|
||||
0x0af000bc: 313, // fr-MF
|
||||
0x0af000bd: 314, // fr-MG
|
||||
0x0af000c1: 315, // fr-ML
|
||||
0x0af000c6: 316, // fr-MQ
|
||||
0x0af000c7: 317, // fr-MR
|
||||
0x0af000ca: 318, // fr-MU
|
||||
0x0af000d1: 319, // fr-NC
|
||||
0x0af000d2: 320, // fr-NE
|
||||
0x0af000e3: 321, // fr-PF
|
||||
0x0af000e8: 322, // fr-PM
|
||||
0x0af00100: 323, // fr-RE
|
||||
0x0af00105: 324, // fr-RW
|
||||
0x0af00108: 325, // fr-SC
|
||||
0x0af00112: 326, // fr-SN
|
||||
0x0af0011a: 327, // fr-SY
|
||||
0x0af0011e: 328, // fr-TD
|
||||
0x0af00120: 329, // fr-TG
|
||||
0x0af00126: 330, // fr-TN
|
||||
0x0af0013c: 331, // fr-VU
|
||||
0x0af0013d: 332, // fr-WF
|
||||
0x0af0015c: 333, // fr-YT
|
||||
0x0b600000: 334, // fur
|
||||
0x0b60009c: 335, // fur-IT
|
||||
0x0b900000: 336, // fy
|
||||
0x0b9000d7: 337, // fy-NL
|
||||
0x0ba00000: 338, // ga
|
||||
0x0ba00094: 339, // ga-IE
|
||||
0x0c200000: 340, // gd
|
||||
0x0c200079: 341, // gd-GB
|
||||
0x0c800000: 342, // gl
|
||||
0x0c80006d: 343, // gl-ES
|
||||
0x0d200000: 344, // gsw
|
||||
0x0d20004d: 345, // gsw-CH
|
||||
0x0d200076: 346, // gsw-FR
|
||||
0x0d2000b0: 347, // gsw-LI
|
||||
0x0d300000: 348, // gu
|
||||
0x0d300097: 349, // gu-IN
|
||||
0x0d700000: 350, // guw
|
||||
0x0d800000: 351, // guz
|
||||
0x0d8000a2: 352, // guz-KE
|
||||
0x0d900000: 353, // gv
|
||||
0x0d900096: 354, // gv-IM
|
||||
0x0dc00000: 355, // ha
|
||||
0x0dc0007e: 356, // ha-GH
|
||||
0x0dc000d2: 357, // ha-NE
|
||||
0x0dc000d4: 358, // ha-NG
|
||||
0x0de00000: 359, // haw
|
||||
0x0de00132: 360, // haw-US
|
||||
0x0e000000: 361, // he
|
||||
0x0e000095: 362, // he-IL
|
||||
0x0e100000: 363, // hi
|
||||
0x0e100097: 364, // hi-IN
|
||||
0x0ee00000: 365, // hr
|
||||
0x0ee00032: 366, // hr-BA
|
||||
0x0ee0008e: 367, // hr-HR
|
||||
0x0ef00000: 368, // hsb
|
||||
0x0ef0005f: 369, // hsb-DE
|
||||
0x0f200000: 370, // hu
|
||||
0x0f200090: 371, // hu-HU
|
||||
0x0f300000: 372, // hy
|
||||
0x0f300027: 373, // hy-AM
|
||||
0x0f800000: 374, // id
|
||||
0x0f800093: 375, // id-ID
|
||||
0x0fa00000: 376, // ig
|
||||
0x0fa000d4: 377, // ig-NG
|
||||
0x0fb00000: 378, // ii
|
||||
0x0fb00052: 379, // ii-CN
|
||||
0x10200000: 380, // is
|
||||
0x1020009b: 381, // is-IS
|
||||
0x10300000: 382, // it
|
||||
0x1030004d: 383, // it-CH
|
||||
0x1030009c: 384, // it-IT
|
||||
0x10300111: 385, // it-SM
|
||||
0x10400000: 386, // iu
|
||||
0x10700000: 387, // ja
|
||||
0x107000a0: 388, // ja-JP
|
||||
0x10900000: 389, // jbo
|
||||
0x10a00000: 390, // jgo
|
||||
0x10a00051: 391, // jgo-CM
|
||||
0x10c00000: 392, // jmc
|
||||
0x10c0012d: 393, // jmc-TZ
|
||||
0x10f00000: 394, // jv
|
||||
0x11100000: 395, // ka
|
||||
0x1110007b: 396, // ka-GE
|
||||
0x11300000: 397, // kab
|
||||
0x11300066: 398, // kab-DZ
|
||||
0x11500000: 399, // kaj
|
||||
0x11600000: 400, // kam
|
||||
0x116000a2: 401, // kam-KE
|
||||
0x11900000: 402, // kcg
|
||||
0x11b00000: 403, // kde
|
||||
0x11b0012d: 404, // kde-TZ
|
||||
0x11d00000: 405, // kea
|
||||
0x11d00059: 406, // kea-CV
|
||||
0x12800000: 407, // khq
|
||||
0x128000c1: 408, // khq-ML
|
||||
0x12b00000: 409, // ki
|
||||
0x12b000a2: 410, // ki-KE
|
||||
0x12f00000: 411, // kk
|
||||
0x12f000ac: 412, // kk-KZ
|
||||
0x13000000: 413, // kkj
|
||||
0x13000051: 414, // kkj-CM
|
||||
0x13100000: 415, // kl
|
||||
0x13100080: 416, // kl-GL
|
||||
0x13200000: 417, // kln
|
||||
0x132000a2: 418, // kln-KE
|
||||
0x13300000: 419, // km
|
||||
0x133000a4: 420, // km-KH
|
||||
0x13500000: 421, // kn
|
||||
0x13500097: 422, // kn-IN
|
||||
0x13600000: 423, // ko
|
||||
0x136000a8: 424, // ko-KP
|
||||
0x136000a9: 425, // ko-KR
|
||||
0x13800000: 426, // kok
|
||||
0x13800097: 427, // kok-IN
|
||||
0x14100000: 428, // ks
|
||||
0x14100097: 429, // ks-IN
|
||||
0x14200000: 430, // ksb
|
||||
0x1420012d: 431, // ksb-TZ
|
||||
0x14300000: 432, // ksf
|
||||
0x14300051: 433, // ksf-CM
|
||||
0x14400000: 434, // ksh
|
||||
0x1440005f: 435, // ksh-DE
|
||||
0x14500000: 436, // ku
|
||||
0x14a00000: 437, // kw
|
||||
0x14a00079: 438, // kw-GB
|
||||
0x14d00000: 439, // ky
|
||||
0x14d000a3: 440, // ky-KG
|
||||
0x15100000: 441, // lag
|
||||
0x1510012d: 442, // lag-TZ
|
||||
0x15400000: 443, // lb
|
||||
0x154000b5: 444, // lb-LU
|
||||
0x15a00000: 445, // lg
|
||||
0x15a0012f: 446, // lg-UG
|
||||
0x16100000: 447, // lkt
|
||||
0x16100132: 448, // lkt-US
|
||||
0x16400000: 449, // ln
|
||||
0x16400029: 450, // ln-AO
|
||||
0x1640004a: 451, // ln-CD
|
||||
0x1640004b: 452, // ln-CF
|
||||
0x1640004c: 453, // ln-CG
|
||||
0x16500000: 454, // lo
|
||||
0x165000ad: 455, // lo-LA
|
||||
0x16800000: 456, // lrc
|
||||
0x16800099: 457, // lrc-IQ
|
||||
0x1680009a: 458, // lrc-IR
|
||||
0x16900000: 459, // lt
|
||||
0x169000b4: 460, // lt-LT
|
||||
0x16b00000: 461, // lu
|
||||
0x16b0004a: 462, // lu-CD
|
||||
0x16d00000: 463, // luo
|
||||
0x16d000a2: 464, // luo-KE
|
||||
0x16e00000: 465, // luy
|
||||
0x16e000a2: 466, // luy-KE
|
||||
0x17000000: 467, // lv
|
||||
0x170000b6: 468, // lv-LV
|
||||
0x17a00000: 469, // mas
|
||||
0x17a000a2: 470, // mas-KE
|
||||
0x17a0012d: 471, // mas-TZ
|
||||
0x18000000: 472, // mer
|
||||
0x180000a2: 473, // mer-KE
|
||||
0x18200000: 474, // mfe
|
||||
0x182000ca: 475, // mfe-MU
|
||||
0x18300000: 476, // mg
|
||||
0x183000bd: 477, // mg-MG
|
||||
0x18400000: 478, // mgh
|
||||
0x184000cf: 479, // mgh-MZ
|
||||
0x18500000: 480, // mgo
|
||||
0x18500051: 481, // mgo-CM
|
||||
0x18c00000: 482, // mk
|
||||
0x18c000c0: 483, // mk-MK
|
||||
0x18d00000: 484, // ml
|
||||
0x18d00097: 485, // ml-IN
|
||||
0x18f00000: 486, // mn
|
||||
0x18f000c3: 487, // mn-MN
|
||||
0x19600000: 488, // mr
|
||||
0x19600097: 489, // mr-IN
|
||||
0x19a00000: 490, // ms
|
||||
0x19a0003d: 491, // ms-BN
|
||||
0x19a000ce: 492, // ms-MY
|
||||
0x19a0010b: 493, // ms-SG
|
||||
0x19b00000: 494, // mt
|
||||
0x19b000c9: 495, // mt-MT
|
||||
0x19d00000: 496, // mua
|
||||
0x19d00051: 497, // mua-CM
|
||||
0x1a500000: 498, // my
|
||||
0x1a5000c2: 499, // my-MM
|
||||
0x1a900000: 500, // mzn
|
||||
0x1a90009a: 501, // mzn-IR
|
||||
0x1ab00000: 502, // nah
|
||||
0x1ae00000: 503, // naq
|
||||
0x1ae000d0: 504, // naq-NA
|
||||
0x1af00000: 505, // nb
|
||||
0x1af000d8: 506, // nb-NO
|
||||
0x1af0010e: 507, // nb-SJ
|
||||
0x1b100000: 508, // nd
|
||||
0x1b100161: 509, // nd-ZW
|
||||
0x1b400000: 510, // ne
|
||||
0x1b400097: 511, // ne-IN
|
||||
0x1b4000d9: 512, // ne-NP
|
||||
0x1bd00000: 513, // nl
|
||||
0x1bd0002f: 514, // nl-AW
|
||||
0x1bd00035: 515, // nl-BE
|
||||
0x1bd0003f: 516, // nl-BQ
|
||||
0x1bd0005a: 517, // nl-CW
|
||||
0x1bd000d7: 518, // nl-NL
|
||||
0x1bd00114: 519, // nl-SR
|
||||
0x1bd00119: 520, // nl-SX
|
||||
0x1be00000: 521, // nmg
|
||||
0x1be00051: 522, // nmg-CM
|
||||
0x1bf00000: 523, // nn
|
||||
0x1bf000d8: 524, // nn-NO
|
||||
0x1c000000: 525, // nnh
|
||||
0x1c000051: 526, // nnh-CM
|
||||
0x1c100000: 527, // no
|
||||
0x1c500000: 528, // nqo
|
||||
0x1c600000: 529, // nr
|
||||
0x1c800000: 530, // nso
|
||||
0x1c900000: 531, // nus
|
||||
0x1c900115: 532, // nus-SS
|
||||
0x1cc00000: 533, // ny
|
||||
0x1ce00000: 534, // nyn
|
||||
0x1ce0012f: 535, // nyn-UG
|
||||
0x1d200000: 536, // om
|
||||
0x1d20006e: 537, // om-ET
|
||||
0x1d2000a2: 538, // om-KE
|
||||
0x1d300000: 539, // or
|
||||
0x1d300097: 540, // or-IN
|
||||
0x1d400000: 541, // os
|
||||
0x1d40007b: 542, // os-GE
|
||||
0x1d400104: 543, // os-RU
|
||||
0x1d700000: 544, // pa
|
||||
0x1d705000: 545, // pa-Arab
|
||||
0x1d7050e6: 546, // pa-Arab-PK
|
||||
0x1d72f000: 547, // pa-Guru
|
||||
0x1d72f097: 548, // pa-Guru-IN
|
||||
0x1db00000: 549, // pap
|
||||
0x1e700000: 550, // pl
|
||||
0x1e7000e7: 551, // pl-PL
|
||||
0x1ed00000: 552, // prg
|
||||
0x1ed00001: 553, // prg-001
|
||||
0x1ee00000: 554, // ps
|
||||
0x1ee00023: 555, // ps-AF
|
||||
0x1ef00000: 556, // pt
|
||||
0x1ef00029: 557, // pt-AO
|
||||
0x1ef00040: 558, // pt-BR
|
||||
0x1ef0004d: 559, // pt-CH
|
||||
0x1ef00059: 560, // pt-CV
|
||||
0x1ef00084: 561, // pt-GQ
|
||||
0x1ef00089: 562, // pt-GW
|
||||
0x1ef000b5: 563, // pt-LU
|
||||
0x1ef000c4: 564, // pt-MO
|
||||
0x1ef000cf: 565, // pt-MZ
|
||||
0x1ef000ec: 566, // pt-PT
|
||||
0x1ef00116: 567, // pt-ST
|
||||
0x1ef00124: 568, // pt-TL
|
||||
0x1f100000: 569, // qu
|
||||
0x1f10003e: 570, // qu-BO
|
||||
0x1f100068: 571, // qu-EC
|
||||
0x1f1000e2: 572, // qu-PE
|
||||
0x1fc00000: 573, // rm
|
||||
0x1fc0004d: 574, // rm-CH
|
||||
0x20100000: 575, // rn
|
||||
0x20100039: 576, // rn-BI
|
||||
0x20300000: 577, // ro
|
||||
0x203000ba: 578, // ro-MD
|
||||
0x20300102: 579, // ro-RO
|
||||
0x20500000: 580, // rof
|
||||
0x2050012d: 581, // rof-TZ
|
||||
0x20700000: 582, // ru
|
||||
0x20700046: 583, // ru-BY
|
||||
0x207000a3: 584, // ru-KG
|
||||
0x207000ac: 585, // ru-KZ
|
||||
0x207000ba: 586, // ru-MD
|
||||
0x20700104: 587, // ru-RU
|
||||
0x2070012e: 588, // ru-UA
|
||||
0x20a00000: 589, // rw
|
||||
0x20a00105: 590, // rw-RW
|
||||
0x20b00000: 591, // rwk
|
||||
0x20b0012d: 592, // rwk-TZ
|
||||
0x20f00000: 593, // sah
|
||||
0x20f00104: 594, // sah-RU
|
||||
0x21000000: 595, // saq
|
||||
0x210000a2: 596, // saq-KE
|
||||
0x21400000: 597, // sbp
|
||||
0x2140012d: 598, // sbp-TZ
|
||||
0x21c00000: 599, // sdh
|
||||
0x21d00000: 600, // se
|
||||
0x21d00070: 601, // se-FI
|
||||
0x21d000d8: 602, // se-NO
|
||||
0x21d0010a: 603, // se-SE
|
||||
0x21f00000: 604, // seh
|
||||
0x21f000cf: 605, // seh-MZ
|
||||
0x22100000: 606, // ses
|
||||
0x221000c1: 607, // ses-ML
|
||||
0x22200000: 608, // sg
|
||||
0x2220004b: 609, // sg-CF
|
||||
0x22600000: 610, // shi
|
||||
0x22652000: 611, // shi-Latn
|
||||
0x226520b8: 612, // shi-Latn-MA
|
||||
0x226d2000: 613, // shi-Tfng
|
||||
0x226d20b8: 614, // shi-Tfng-MA
|
||||
0x22800000: 615, // si
|
||||
0x228000b1: 616, // si-LK
|
||||
0x22a00000: 617, // sk
|
||||
0x22a0010f: 618, // sk-SK
|
||||
0x22c00000: 619, // sl
|
||||
0x22c0010d: 620, // sl-SI
|
||||
0x23000000: 621, // sma
|
||||
0x23100000: 622, // smi
|
||||
0x23200000: 623, // smj
|
||||
0x23300000: 624, // smn
|
||||
0x23300070: 625, // smn-FI
|
||||
0x23500000: 626, // sms
|
||||
0x23600000: 627, // sn
|
||||
0x23600161: 628, // sn-ZW
|
||||
0x23800000: 629, // so
|
||||
0x23800061: 630, // so-DJ
|
||||
0x2380006e: 631, // so-ET
|
||||
0x238000a2: 632, // so-KE
|
||||
0x23800113: 633, // so-SO
|
||||
0x23a00000: 634, // sq
|
||||
0x23a00026: 635, // sq-AL
|
||||
0x23a000c0: 636, // sq-MK
|
||||
0x23a0014a: 637, // sq-XK
|
||||
0x23b00000: 638, // sr
|
||||
0x23b1e000: 639, // sr-Cyrl
|
||||
0x23b1e032: 640, // sr-Cyrl-BA
|
||||
0x23b1e0bb: 641, // sr-Cyrl-ME
|
||||
0x23b1e103: 642, // sr-Cyrl-RS
|
||||
0x23b1e14a: 643, // sr-Cyrl-XK
|
||||
0x23b52000: 644, // sr-Latn
|
||||
0x23b52032: 645, // sr-Latn-BA
|
||||
0x23b520bb: 646, // sr-Latn-ME
|
||||
0x23b52103: 647, // sr-Latn-RS
|
||||
0x23b5214a: 648, // sr-Latn-XK
|
||||
0x24000000: 649, // ss
|
||||
0x24100000: 650, // ssy
|
||||
0x24200000: 651, // st
|
||||
0x24700000: 652, // sv
|
||||
0x24700030: 653, // sv-AX
|
||||
0x24700070: 654, // sv-FI
|
||||
0x2470010a: 655, // sv-SE
|
||||
0x24800000: 656, // sw
|
||||
0x2480004a: 657, // sw-CD
|
||||
0x248000a2: 658, // sw-KE
|
||||
0x2480012d: 659, // sw-TZ
|
||||
0x2480012f: 660, // sw-UG
|
||||
0x24f00000: 661, // syr
|
||||
0x25100000: 662, // ta
|
||||
0x25100097: 663, // ta-IN
|
||||
0x251000b1: 664, // ta-LK
|
||||
0x251000ce: 665, // ta-MY
|
||||
0x2510010b: 666, // ta-SG
|
||||
0x25800000: 667, // te
|
||||
0x25800097: 668, // te-IN
|
||||
0x25a00000: 669, // teo
|
||||
0x25a000a2: 670, // teo-KE
|
||||
0x25a0012f: 671, // teo-UG
|
||||
0x25d00000: 672, // th
|
||||
0x25d00121: 673, // th-TH
|
||||
0x26100000: 674, // ti
|
||||
0x2610006c: 675, // ti-ER
|
||||
0x2610006e: 676, // ti-ET
|
||||
0x26200000: 677, // tig
|
||||
0x26400000: 678, // tk
|
||||
0x26400125: 679, // tk-TM
|
||||
0x26b00000: 680, // tn
|
||||
0x26c00000: 681, // to
|
||||
0x26c00127: 682, // to-TO
|
||||
0x26f00000: 683, // tr
|
||||
0x26f0005c: 684, // tr-CY
|
||||
0x26f00129: 685, // tr-TR
|
||||
0x27200000: 686, // ts
|
||||
0x27e00000: 687, // twq
|
||||
0x27e000d2: 688, // twq-NE
|
||||
0x28200000: 689, // tzm
|
||||
0x282000b8: 690, // tzm-MA
|
||||
0x28400000: 691, // ug
|
||||
0x28400052: 692, // ug-CN
|
||||
0x28600000: 693, // uk
|
||||
0x2860012e: 694, // uk-UA
|
||||
0x28c00000: 695, // ur
|
||||
0x28c00097: 696, // ur-IN
|
||||
0x28c000e6: 697, // ur-PK
|
||||
0x28d00000: 698, // uz
|
||||
0x28d05000: 699, // uz-Arab
|
||||
0x28d05023: 700, // uz-Arab-AF
|
||||
0x28d1e000: 701, // uz-Cyrl
|
||||
0x28d1e134: 702, // uz-Cyrl-UZ
|
||||
0x28d52000: 703, // uz-Latn
|
||||
0x28d52134: 704, // uz-Latn-UZ
|
||||
0x28e00000: 705, // vai
|
||||
0x28e52000: 706, // vai-Latn
|
||||
0x28e520b2: 707, // vai-Latn-LR
|
||||
0x28ed9000: 708, // vai-Vaii
|
||||
0x28ed90b2: 709, // vai-Vaii-LR
|
||||
0x28f00000: 710, // ve
|
||||
0x29200000: 711, // vi
|
||||
0x2920013b: 712, // vi-VN
|
||||
0x29700000: 713, // vo
|
||||
0x29700001: 714, // vo-001
|
||||
0x29a00000: 715, // vun
|
||||
0x29a0012d: 716, // vun-TZ
|
||||
0x29b00000: 717, // wa
|
||||
0x29c00000: 718, // wae
|
||||
0x29c0004d: 719, // wae-CH
|
||||
0x2a400000: 720, // wo
|
||||
0x2a900000: 721, // xh
|
||||
0x2b100000: 722, // xog
|
||||
0x2b10012f: 723, // xog-UG
|
||||
0x2b700000: 724, // yav
|
||||
0x2b700051: 725, // yav-CM
|
||||
0x2b900000: 726, // yi
|
||||
0x2b900001: 727, // yi-001
|
||||
0x2ba00000: 728, // yo
|
||||
0x2ba0003a: 729, // yo-BJ
|
||||
0x2ba000d4: 730, // yo-NG
|
||||
0x2bd00000: 731, // yue
|
||||
0x2bd0008b: 732, // yue-HK
|
||||
0x2c300000: 733, // zgh
|
||||
0x2c3000b8: 734, // zgh-MA
|
||||
0x2c400000: 735, // zh
|
||||
0x2c434000: 736, // zh-Hans
|
||||
0x2c434052: 737, // zh-Hans-CN
|
||||
0x2c43408b: 738, // zh-Hans-HK
|
||||
0x2c4340c4: 739, // zh-Hans-MO
|
||||
0x2c43410b: 740, // zh-Hans-SG
|
||||
0x2c435000: 741, // zh-Hant
|
||||
0x2c43508b: 742, // zh-Hant-HK
|
||||
0x2c4350c4: 743, // zh-Hant-MO
|
||||
0x2c43512c: 744, // zh-Hant-TW
|
||||
0x2c600000: 745, // zu
|
||||
0x2c60015e: 746, // zu-ZA
|
||||
}
|
||||
|
||||
// Total table size 4550 bytes (4KiB); checksum: B6D49547
|
||||
975
vendor/golang.org/x/text/language/language.go
generated
vendored
Normal file
975
vendor/golang.org/x/text/language/language.go
generated
vendored
Normal file
|
|
@ -0,0 +1,975 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run maketables.go gen_common.go -output tables.go
|
||||
//go:generate go run gen_index.go
|
||||
|
||||
// Package language implements BCP 47 language tags and related functionality.
|
||||
//
|
||||
// The Tag type, which is used to represent languages, is agnostic to the
|
||||
// meaning of its subtags. Tags are not fully canonicalized to preserve
|
||||
// information that may be valuable in certain contexts. As a consequence, two
|
||||
// different tags may represent identical languages.
|
||||
//
|
||||
// Initializing language- or locale-specific components usually consists of
|
||||
// two steps. The first step is to select a display language based on the
|
||||
// preferred languages of the user and the languages supported by an application.
|
||||
// The second step is to create the language-specific services based on
|
||||
// this selection. Each is discussed in more details below.
|
||||
//
|
||||
// Matching preferred against supported languages
|
||||
//
|
||||
// An application may support various languages. This list is typically limited
|
||||
// by the languages for which there exists translations of the user interface.
|
||||
// Similarly, a user may provide a list of preferred languages which is limited
|
||||
// by the languages understood by this user.
|
||||
// An application should use a Matcher to find the best supported language based
|
||||
// on the user's preferred list.
|
||||
// Matchers are aware of the intricacies of equivalence between languages.
|
||||
// The default Matcher implementation takes into account things such as
|
||||
// deprecated subtags, legacy tags, and mutual intelligibility between scripts
|
||||
// and languages.
|
||||
//
|
||||
// A Matcher for English, Australian English, Danish, and standard Mandarin can
|
||||
// be defined as follows:
|
||||
//
|
||||
// var matcher = language.NewMatcher([]language.Tag{
|
||||
// language.English, // The first language is used as fallback.
|
||||
// language.MustParse("en-AU"),
|
||||
// language.Danish,
|
||||
// language.Chinese,
|
||||
// })
|
||||
//
|
||||
// The following code selects the best match for someone speaking Spanish and
|
||||
// Norwegian:
|
||||
//
|
||||
// preferred := []language.Tag{ language.Spanish, language.Norwegian }
|
||||
// tag, _, _ := matcher.Match(preferred...)
|
||||
//
|
||||
// In this case, the best match is Danish, as Danish is sufficiently a match to
|
||||
// Norwegian to not have to fall back to the default.
|
||||
// See ParseAcceptLanguage on how to handle the Accept-Language HTTP header.
|
||||
//
|
||||
// Selecting language-specific services
|
||||
//
|
||||
// One should always use the Tag returned by the Matcher to create an instance
|
||||
// of any of the language-specific services provided by the text repository.
|
||||
// This prevents the mixing of languages, such as having a different language for
|
||||
// messages and display names, as well as improper casing or sorting order for
|
||||
// the selected language.
|
||||
// Using the returned Tag also allows user-defined settings, such as collation
|
||||
// order or numbering system to be transparently passed as options.
|
||||
//
|
||||
// If you have language-specific data in your application, however, it will in
|
||||
// most cases suffice to use the index returned by the matcher to identify
|
||||
// the user language.
|
||||
// The following loop provides an alternative in case this is not sufficient:
|
||||
//
|
||||
// supported := map[language.Tag]data{
|
||||
// language.English: enData,
|
||||
// language.MustParse("en-AU"): enAUData,
|
||||
// language.Danish: daData,
|
||||
// language.Chinese: zhData,
|
||||
// }
|
||||
// tag, _, _ := matcher.Match(preferred...)
|
||||
// for ; tag != language.Und; tag = tag.Parent() {
|
||||
// if v, ok := supported[tag]; ok {
|
||||
// return v
|
||||
// }
|
||||
// }
|
||||
// return enData // should not reach here
|
||||
//
|
||||
// Repeatedly taking the Parent of the tag returned by Match will eventually
|
||||
// match one of the tags used to initialize the Matcher.
|
||||
//
|
||||
// Canonicalization
|
||||
//
|
||||
// By default, only legacy and deprecated tags are converted into their
|
||||
// canonical equivalent. All other information is preserved. This approach makes
|
||||
// the confidence scores more accurate and allows matchers to distinguish
|
||||
// between variants that are otherwise lost.
|
||||
//
|
||||
// As a consequence, two tags that should be treated as identical according to
|
||||
// BCP 47 or CLDR, like "en-Latn" and "en", will be represented differently. The
|
||||
// Matchers will handle such distinctions, though, and are aware of the
|
||||
// equivalence relations. The CanonType type can be used to alter the
|
||||
// canonicalization form.
|
||||
//
|
||||
// References
|
||||
//
|
||||
// BCP 47 - Tags for Identifying Languages
|
||||
// http://tools.ietf.org/html/bcp47
|
||||
package language
|
||||
|
||||
// TODO: Remove above NOTE after:
|
||||
// - verifying that tables are dropped correctly (most notably matcher tables).
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
|
||||
// maxCoreSize is the maximum size of a BCP 47 tag without variants and
|
||||
// extensions. Equals max lang (3) + script (4) + max reg (3) + 2 dashes.
|
||||
maxCoreSize = 12
|
||||
|
||||
// max99thPercentileSize is a somewhat arbitrary buffer size that presumably
|
||||
// is large enough to hold at least 99% of the BCP 47 tags.
|
||||
max99thPercentileSize = 32
|
||||
|
||||
// maxSimpleUExtensionSize is the maximum size of a -u extension with one
|
||||
// key-type pair. Equals len("-u-") + key (2) + dash + max value (8).
|
||||
maxSimpleUExtensionSize = 14
|
||||
)
|
||||
|
||||
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
|
||||
// specific language or locale. All language tag values are guaranteed to be
|
||||
// well-formed.
|
||||
type Tag struct {
|
||||
lang langID
|
||||
region regionID
|
||||
script scriptID
|
||||
pVariant byte // offset in str, includes preceding '-'
|
||||
pExt uint16 // offset of first extension, includes preceding '-'
|
||||
|
||||
// str is the string representation of the Tag. It will only be used if the
|
||||
// tag has variants or extensions.
|
||||
str string
|
||||
}
|
||||
|
||||
// Make is a convenience wrapper for Parse that omits the error.
|
||||
// In case of an error, a sensible default is returned.
|
||||
func Make(s string) Tag {
|
||||
return Default.Make(s)
|
||||
}
|
||||
|
||||
// Make is a convenience wrapper for c.Parse that omits the error.
|
||||
// In case of an error, a sensible default is returned.
|
||||
func (c CanonType) Make(s string) Tag {
|
||||
t, _ := c.Parse(s)
|
||||
return t
|
||||
}
|
||||
|
||||
// Raw returns the raw base language, script and region, without making an
|
||||
// attempt to infer their values.
|
||||
func (t Tag) Raw() (b Base, s Script, r Region) {
|
||||
return Base{t.lang}, Script{t.script}, Region{t.region}
|
||||
}
|
||||
|
||||
// equalTags compares language, script and region subtags only.
|
||||
func (t Tag) equalTags(a Tag) bool {
|
||||
return t.lang == a.lang && t.script == a.script && t.region == a.region
|
||||
}
|
||||
|
||||
// IsRoot returns true if t is equal to language "und".
|
||||
func (t Tag) IsRoot() bool {
|
||||
if int(t.pVariant) < len(t.str) {
|
||||
return false
|
||||
}
|
||||
return t.equalTags(und)
|
||||
}
|
||||
|
||||
// private reports whether the Tag consists solely of a private use tag.
|
||||
func (t Tag) private() bool {
|
||||
return t.str != "" && t.pVariant == 0
|
||||
}
|
||||
|
||||
// CanonType can be used to enable or disable various types of canonicalization.
|
||||
type CanonType int
|
||||
|
||||
const (
|
||||
// Replace deprecated base languages with their preferred replacements.
|
||||
DeprecatedBase CanonType = 1 << iota
|
||||
// Replace deprecated scripts with their preferred replacements.
|
||||
DeprecatedScript
|
||||
// Replace deprecated regions with their preferred replacements.
|
||||
DeprecatedRegion
|
||||
// Remove redundant scripts.
|
||||
SuppressScript
|
||||
// Normalize legacy encodings. This includes legacy languages defined in
|
||||
// CLDR as well as bibliographic codes defined in ISO-639.
|
||||
Legacy
|
||||
// Map the dominant language of a macro language group to the macro language
|
||||
// subtag. For example cmn -> zh.
|
||||
Macro
|
||||
// The CLDR flag should be used if full compatibility with CLDR is required.
|
||||
// There are a few cases where language.Tag may differ from CLDR. To follow all
|
||||
// of CLDR's suggestions, use All|CLDR.
|
||||
CLDR
|
||||
|
||||
// Raw can be used to Compose or Parse without Canonicalization.
|
||||
Raw CanonType = 0
|
||||
|
||||
// Replace all deprecated tags with their preferred replacements.
|
||||
Deprecated = DeprecatedBase | DeprecatedScript | DeprecatedRegion
|
||||
|
||||
// All canonicalizations recommended by BCP 47.
|
||||
BCP47 = Deprecated | SuppressScript
|
||||
|
||||
// All canonicalizations.
|
||||
All = BCP47 | Legacy | Macro
|
||||
|
||||
// Default is the canonicalization used by Parse, Make and Compose. To
|
||||
// preserve as much information as possible, canonicalizations that remove
|
||||
// potentially valuable information are not included. The Matcher is
|
||||
// designed to recognize similar tags that would be the same if
|
||||
// they were canonicalized using All.
|
||||
Default = Deprecated | Legacy
|
||||
|
||||
canonLang = DeprecatedBase | Legacy | Macro
|
||||
|
||||
// TODO: LikelyScript, LikelyRegion: suppress similar to ICU.
|
||||
)
|
||||
|
||||
// canonicalize returns the canonicalized equivalent of the tag and
|
||||
// whether there was any change.
|
||||
func (t Tag) canonicalize(c CanonType) (Tag, bool) {
|
||||
if c == Raw {
|
||||
return t, false
|
||||
}
|
||||
changed := false
|
||||
if c&SuppressScript != 0 {
|
||||
if t.lang < langNoIndexOffset && uint8(t.script) == suppressScript[t.lang] {
|
||||
t.script = 0
|
||||
changed = true
|
||||
}
|
||||
}
|
||||
if c&canonLang != 0 {
|
||||
for {
|
||||
if l, aliasType := normLang(t.lang); l != t.lang {
|
||||
switch aliasType {
|
||||
case langLegacy:
|
||||
if c&Legacy != 0 {
|
||||
if t.lang == _sh && t.script == 0 {
|
||||
t.script = _Latn
|
||||
}
|
||||
t.lang = l
|
||||
changed = true
|
||||
}
|
||||
case langMacro:
|
||||
if c&Macro != 0 {
|
||||
// We deviate here from CLDR. The mapping "nb" -> "no"
|
||||
// qualifies as a typical Macro language mapping. However,
|
||||
// for legacy reasons, CLDR maps "no", the macro language
|
||||
// code for Norwegian, to the dominant variant "nb". This
|
||||
// change is currently under consideration for CLDR as well.
|
||||
// See http://unicode.org/cldr/trac/ticket/2698 and also
|
||||
// http://unicode.org/cldr/trac/ticket/1790 for some of the
|
||||
// practical implications. TODO: this check could be removed
|
||||
// if CLDR adopts this change.
|
||||
if c&CLDR == 0 || t.lang != _nb {
|
||||
changed = true
|
||||
t.lang = l
|
||||
}
|
||||
}
|
||||
case langDeprecated:
|
||||
if c&DeprecatedBase != 0 {
|
||||
if t.lang == _mo && t.region == 0 {
|
||||
t.region = _MD
|
||||
}
|
||||
t.lang = l
|
||||
changed = true
|
||||
// Other canonicalization types may still apply.
|
||||
continue
|
||||
}
|
||||
}
|
||||
} else if c&Legacy != 0 && t.lang == _no && c&CLDR != 0 {
|
||||
t.lang = _nb
|
||||
changed = true
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
if c&DeprecatedScript != 0 {
|
||||
if t.script == _Qaai {
|
||||
changed = true
|
||||
t.script = _Zinh
|
||||
}
|
||||
}
|
||||
if c&DeprecatedRegion != 0 {
|
||||
if r := normRegion(t.region); r != 0 {
|
||||
changed = true
|
||||
t.region = r
|
||||
}
|
||||
}
|
||||
return t, changed
|
||||
}
|
||||
|
||||
// Canonicalize returns the canonicalized equivalent of the tag.
|
||||
func (c CanonType) Canonicalize(t Tag) (Tag, error) {
|
||||
t, changed := t.canonicalize(c)
|
||||
if changed {
|
||||
t.remakeString()
|
||||
}
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// Confidence indicates the level of certainty for a given return value.
|
||||
// For example, Serbian may be written in Cyrillic or Latin script.
|
||||
// The confidence level indicates whether a value was explicitly specified,
|
||||
// whether it is typically the only possible value, or whether there is
|
||||
// an ambiguity.
|
||||
type Confidence int
|
||||
|
||||
const (
|
||||
No Confidence = iota // full confidence that there was no match
|
||||
Low // most likely value picked out of a set of alternatives
|
||||
High // value is generally assumed to be the correct match
|
||||
Exact // exact match or explicitly specified value
|
||||
)
|
||||
|
||||
var confName = []string{"No", "Low", "High", "Exact"}
|
||||
|
||||
func (c Confidence) String() string {
|
||||
return confName[c]
|
||||
}
|
||||
|
||||
// remakeString is used to update t.str in case lang, script or region changed.
|
||||
// It is assumed that pExt and pVariant still point to the start of the
|
||||
// respective parts.
|
||||
func (t *Tag) remakeString() {
|
||||
if t.str == "" {
|
||||
return
|
||||
}
|
||||
extra := t.str[t.pVariant:]
|
||||
if t.pVariant > 0 {
|
||||
extra = extra[1:]
|
||||
}
|
||||
if t.equalTags(und) && strings.HasPrefix(extra, "x-") {
|
||||
t.str = extra
|
||||
t.pVariant = 0
|
||||
t.pExt = 0
|
||||
return
|
||||
}
|
||||
var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases.
|
||||
b := buf[:t.genCoreBytes(buf[:])]
|
||||
if extra != "" {
|
||||
diff := len(b) - int(t.pVariant)
|
||||
b = append(b, '-')
|
||||
b = append(b, extra...)
|
||||
t.pVariant = uint8(int(t.pVariant) + diff)
|
||||
t.pExt = uint16(int(t.pExt) + diff)
|
||||
} else {
|
||||
t.pVariant = uint8(len(b))
|
||||
t.pExt = uint16(len(b))
|
||||
}
|
||||
t.str = string(b)
|
||||
}
|
||||
|
||||
// genCoreBytes writes a string for the base languages, script and region tags
|
||||
// to the given buffer and returns the number of bytes written. It will never
|
||||
// write more than maxCoreSize bytes.
|
||||
func (t *Tag) genCoreBytes(buf []byte) int {
|
||||
n := t.lang.stringToBuf(buf[:])
|
||||
if t.script != 0 {
|
||||
n += copy(buf[n:], "-")
|
||||
n += copy(buf[n:], t.script.String())
|
||||
}
|
||||
if t.region != 0 {
|
||||
n += copy(buf[n:], "-")
|
||||
n += copy(buf[n:], t.region.String())
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// String returns the canonical string representation of the language tag.
|
||||
func (t Tag) String() string {
|
||||
if t.str != "" {
|
||||
return t.str
|
||||
}
|
||||
if t.script == 0 && t.region == 0 {
|
||||
return t.lang.String()
|
||||
}
|
||||
buf := [maxCoreSize]byte{}
|
||||
return string(buf[:t.genCoreBytes(buf[:])])
|
||||
}
|
||||
|
||||
// Base returns the base language of the language tag. If the base language is
|
||||
// unspecified, an attempt will be made to infer it from the context.
|
||||
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
|
||||
func (t Tag) Base() (Base, Confidence) {
|
||||
if t.lang != 0 {
|
||||
return Base{t.lang}, Exact
|
||||
}
|
||||
c := High
|
||||
if t.script == 0 && !(Region{t.region}).IsCountry() {
|
||||
c = Low
|
||||
}
|
||||
if tag, err := addTags(t); err == nil && tag.lang != 0 {
|
||||
return Base{tag.lang}, c
|
||||
}
|
||||
return Base{0}, No
|
||||
}
|
||||
|
||||
// Script infers the script for the language tag. If it was not explicitly given, it will infer
|
||||
// a most likely candidate.
|
||||
// If more than one script is commonly used for a language, the most likely one
|
||||
// is returned with a low confidence indication. For example, it returns (Cyrl, Low)
|
||||
// for Serbian.
|
||||
// If a script cannot be inferred (Zzzz, No) is returned. We do not use Zyyy (undetermined)
|
||||
// as one would suspect from the IANA registry for BCP 47. In a Unicode context Zyyy marks
|
||||
// common characters (like 1, 2, 3, '.', etc.) and is therefore more like multiple scripts.
|
||||
// See http://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for
|
||||
// unknown value in CLDR. (Zzzz, Exact) is returned if Zzzz was explicitly specified.
|
||||
// Note that an inferred script is never guaranteed to be the correct one. Latin is
|
||||
// almost exclusively used for Afrikaans, but Arabic has been used for some texts
|
||||
// in the past. Also, the script that is commonly used may change over time.
|
||||
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
|
||||
func (t Tag) Script() (Script, Confidence) {
|
||||
if t.script != 0 {
|
||||
return Script{t.script}, Exact
|
||||
}
|
||||
sc, c := scriptID(_Zzzz), No
|
||||
if t.lang < langNoIndexOffset {
|
||||
if scr := scriptID(suppressScript[t.lang]); scr != 0 {
|
||||
// Note: it is not always the case that a language with a suppress
|
||||
// script value is only written in one script (e.g. kk, ms, pa).
|
||||
if t.region == 0 {
|
||||
return Script{scriptID(scr)}, High
|
||||
}
|
||||
sc, c = scr, High
|
||||
}
|
||||
}
|
||||
if tag, err := addTags(t); err == nil {
|
||||
if tag.script != sc {
|
||||
sc, c = tag.script, Low
|
||||
}
|
||||
} else {
|
||||
t, _ = (Deprecated | Macro).Canonicalize(t)
|
||||
if tag, err := addTags(t); err == nil && tag.script != sc {
|
||||
sc, c = tag.script, Low
|
||||
}
|
||||
}
|
||||
return Script{sc}, c
|
||||
}
|
||||
|
||||
// Region returns the region for the language tag. If it was not explicitly given, it will
|
||||
// infer a most likely candidate from the context.
|
||||
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
|
||||
func (t Tag) Region() (Region, Confidence) {
|
||||
if t.region != 0 {
|
||||
return Region{t.region}, Exact
|
||||
}
|
||||
if t, err := addTags(t); err == nil {
|
||||
return Region{t.region}, Low // TODO: differentiate between high and low.
|
||||
}
|
||||
t, _ = (Deprecated | Macro).Canonicalize(t)
|
||||
if tag, err := addTags(t); err == nil {
|
||||
return Region{tag.region}, Low
|
||||
}
|
||||
return Region{_ZZ}, No // TODO: return world instead of undetermined?
|
||||
}
|
||||
|
||||
// Variant returns the variants specified explicitly for this language tag.
|
||||
// or nil if no variant was specified.
|
||||
func (t Tag) Variants() []Variant {
|
||||
v := []Variant{}
|
||||
if int(t.pVariant) < int(t.pExt) {
|
||||
for x, str := "", t.str[t.pVariant:t.pExt]; str != ""; {
|
||||
x, str = nextToken(str)
|
||||
v = append(v, Variant{x})
|
||||
}
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
|
||||
// specific language are substituted with fields from the parent language.
|
||||
// The parent for a language may change for newer versions of CLDR.
|
||||
func (t Tag) Parent() Tag {
|
||||
if t.str != "" {
|
||||
// Strip the variants and extensions.
|
||||
t, _ = Raw.Compose(t.Raw())
|
||||
if t.region == 0 && t.script != 0 && t.lang != 0 {
|
||||
base, _ := addTags(Tag{lang: t.lang})
|
||||
if base.script == t.script {
|
||||
return Tag{lang: t.lang}
|
||||
}
|
||||
}
|
||||
return t
|
||||
}
|
||||
if t.lang != 0 {
|
||||
if t.region != 0 {
|
||||
maxScript := t.script
|
||||
if maxScript == 0 {
|
||||
max, _ := addTags(t)
|
||||
maxScript = max.script
|
||||
}
|
||||
|
||||
for i := range parents {
|
||||
if langID(parents[i].lang) == t.lang && scriptID(parents[i].maxScript) == maxScript {
|
||||
for _, r := range parents[i].fromRegion {
|
||||
if regionID(r) == t.region {
|
||||
return Tag{
|
||||
lang: t.lang,
|
||||
script: scriptID(parents[i].script),
|
||||
region: regionID(parents[i].toRegion),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Strip the script if it is the default one.
|
||||
base, _ := addTags(Tag{lang: t.lang})
|
||||
if base.script != maxScript {
|
||||
return Tag{lang: t.lang, script: maxScript}
|
||||
}
|
||||
return Tag{lang: t.lang}
|
||||
} else if t.script != 0 {
|
||||
// The parent for an base-script pair with a non-default script is
|
||||
// "und" instead of the base language.
|
||||
base, _ := addTags(Tag{lang: t.lang})
|
||||
if base.script != t.script {
|
||||
return und
|
||||
}
|
||||
return Tag{lang: t.lang}
|
||||
}
|
||||
}
|
||||
return und
|
||||
}
|
||||
|
||||
// returns token t and the rest of the string.
|
||||
func nextToken(s string) (t, tail string) {
|
||||
p := strings.Index(s[1:], "-")
|
||||
if p == -1 {
|
||||
return s[1:], ""
|
||||
}
|
||||
p++
|
||||
return s[1:p], s[p:]
|
||||
}
|
||||
|
||||
// Extension is a single BCP 47 extension.
|
||||
type Extension struct {
|
||||
s string
|
||||
}
|
||||
|
||||
// String returns the string representation of the extension, including the
|
||||
// type tag.
|
||||
func (e Extension) String() string {
|
||||
return e.s
|
||||
}
|
||||
|
||||
// ParseExtension parses s as an extension and returns it on success.
|
||||
func ParseExtension(s string) (e Extension, err error) {
|
||||
scan := makeScannerString(s)
|
||||
var end int
|
||||
if n := len(scan.token); n != 1 {
|
||||
return Extension{}, errSyntax
|
||||
}
|
||||
scan.toLower(0, len(scan.b))
|
||||
end = parseExtension(&scan)
|
||||
if end != len(s) {
|
||||
return Extension{}, errSyntax
|
||||
}
|
||||
return Extension{string(scan.b)}, nil
|
||||
}
|
||||
|
||||
// Type returns the one-byte extension type of e. It returns 0 for the zero
|
||||
// exception.
|
||||
func (e Extension) Type() byte {
|
||||
if e.s == "" {
|
||||
return 0
|
||||
}
|
||||
return e.s[0]
|
||||
}
|
||||
|
||||
// Tokens returns the list of tokens of e.
|
||||
func (e Extension) Tokens() []string {
|
||||
return strings.Split(e.s, "-")
|
||||
}
|
||||
|
||||
// Extension returns the extension of type x for tag t. It will return
|
||||
// false for ok if t does not have the requested extension. The returned
|
||||
// extension will be invalid in this case.
|
||||
func (t Tag) Extension(x byte) (ext Extension, ok bool) {
|
||||
for i := int(t.pExt); i < len(t.str)-1; {
|
||||
var ext string
|
||||
i, ext = getExtension(t.str, i)
|
||||
if ext[0] == x {
|
||||
return Extension{ext}, true
|
||||
}
|
||||
}
|
||||
return Extension{string(x)}, false
|
||||
}
|
||||
|
||||
// Extensions returns all extensions of t.
|
||||
func (t Tag) Extensions() []Extension {
|
||||
e := []Extension{}
|
||||
for i := int(t.pExt); i < len(t.str)-1; {
|
||||
var ext string
|
||||
i, ext = getExtension(t.str, i)
|
||||
e = append(e, Extension{ext})
|
||||
}
|
||||
return e
|
||||
}
|
||||
|
||||
// TypeForKey returns the type associated with the given key, where key and type
|
||||
// are of the allowed values defined for the Unicode locale extension ('u') in
|
||||
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
|
||||
// TypeForKey will traverse the inheritance chain to get the correct value.
|
||||
func (t Tag) TypeForKey(key string) string {
|
||||
if start, end, _ := t.findTypeForKey(key); end != start {
|
||||
return t.str[start:end]
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
var (
|
||||
errPrivateUse = errors.New("cannot set a key on a private use tag")
|
||||
errInvalidArguments = errors.New("invalid key or type")
|
||||
)
|
||||
|
||||
// SetTypeForKey returns a new Tag with the key set to type, where key and type
|
||||
// are of the allowed values defined for the Unicode locale extension ('u') in
|
||||
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
|
||||
// An empty value removes an existing pair with the same key.
|
||||
func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
|
||||
if t.private() {
|
||||
return t, errPrivateUse
|
||||
}
|
||||
if len(key) != 2 {
|
||||
return t, errInvalidArguments
|
||||
}
|
||||
|
||||
// Remove the setting if value is "".
|
||||
if value == "" {
|
||||
start, end, _ := t.findTypeForKey(key)
|
||||
if start != end {
|
||||
// Remove key tag and leading '-'.
|
||||
start -= 4
|
||||
|
||||
// Remove a possible empty extension.
|
||||
if (end == len(t.str) || t.str[end+2] == '-') && t.str[start-2] == '-' {
|
||||
start -= 2
|
||||
}
|
||||
if start == int(t.pVariant) && end == len(t.str) {
|
||||
t.str = ""
|
||||
t.pVariant, t.pExt = 0, 0
|
||||
} else {
|
||||
t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])
|
||||
}
|
||||
}
|
||||
return t, nil
|
||||
}
|
||||
|
||||
if len(value) < 3 || len(value) > 8 {
|
||||
return t, errInvalidArguments
|
||||
}
|
||||
|
||||
var (
|
||||
buf [maxCoreSize + maxSimpleUExtensionSize]byte
|
||||
uStart int // start of the -u extension.
|
||||
)
|
||||
|
||||
// Generate the tag string if needed.
|
||||
if t.str == "" {
|
||||
uStart = t.genCoreBytes(buf[:])
|
||||
buf[uStart] = '-'
|
||||
uStart++
|
||||
}
|
||||
|
||||
// Create new key-type pair and parse it to verify.
|
||||
b := buf[uStart:]
|
||||
copy(b, "u-")
|
||||
copy(b[2:], key)
|
||||
b[4] = '-'
|
||||
b = b[:5+copy(b[5:], value)]
|
||||
scan := makeScanner(b)
|
||||
if parseExtensions(&scan); scan.err != nil {
|
||||
return t, scan.err
|
||||
}
|
||||
|
||||
// Assemble the replacement string.
|
||||
if t.str == "" {
|
||||
t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)
|
||||
t.str = string(buf[:uStart+len(b)])
|
||||
} else {
|
||||
s := t.str
|
||||
start, end, hasExt := t.findTypeForKey(key)
|
||||
if start == end {
|
||||
if hasExt {
|
||||
b = b[2:]
|
||||
}
|
||||
t.str = fmt.Sprintf("%s-%s%s", s[:start], b, s[end:])
|
||||
} else {
|
||||
t.str = fmt.Sprintf("%s%s%s", s[:start], value, s[end:])
|
||||
}
|
||||
}
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// findKeyAndType returns the start and end position for the type corresponding
|
||||
// to key or the point at which to insert the key-value pair if the type
|
||||
// wasn't found. The hasExt return value reports whether an -u extension was present.
|
||||
// Note: the extensions are typically very small and are likely to contain
|
||||
// only one key-type pair.
|
||||
func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {
|
||||
p := int(t.pExt)
|
||||
if len(key) != 2 || p == len(t.str) || p == 0 {
|
||||
return p, p, false
|
||||
}
|
||||
s := t.str
|
||||
|
||||
// Find the correct extension.
|
||||
for p++; s[p] != 'u'; p++ {
|
||||
if s[p] > 'u' {
|
||||
p--
|
||||
return p, p, false
|
||||
}
|
||||
if p = nextExtension(s, p); p == len(s) {
|
||||
return len(s), len(s), false
|
||||
}
|
||||
}
|
||||
// Proceed to the hyphen following the extension name.
|
||||
p++
|
||||
|
||||
// curKey is the key currently being processed.
|
||||
curKey := ""
|
||||
|
||||
// Iterate over keys until we get the end of a section.
|
||||
for {
|
||||
// p points to the hyphen preceding the current token.
|
||||
if p3 := p + 3; s[p3] == '-' {
|
||||
// Found a key.
|
||||
// Check whether we just processed the key that was requested.
|
||||
if curKey == key {
|
||||
return start, p, true
|
||||
}
|
||||
// Set to the next key and continue scanning type tokens.
|
||||
curKey = s[p+1 : p3]
|
||||
if curKey > key {
|
||||
return p, p, true
|
||||
}
|
||||
// Start of the type token sequence.
|
||||
start = p + 4
|
||||
// A type is at least 3 characters long.
|
||||
p += 7 // 4 + 3
|
||||
} else {
|
||||
// Attribute or type, which is at least 3 characters long.
|
||||
p += 4
|
||||
}
|
||||
// p points past the third character of a type or attribute.
|
||||
max := p + 5 // maximum length of token plus hyphen.
|
||||
if len(s) < max {
|
||||
max = len(s)
|
||||
}
|
||||
for ; p < max && s[p] != '-'; p++ {
|
||||
}
|
||||
// Bail if we have exhausted all tokens or if the next token starts
|
||||
// a new extension.
|
||||
if p == len(s) || s[p+2] == '-' {
|
||||
if curKey == key {
|
||||
return start, p, true
|
||||
}
|
||||
return p, p, true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
|
||||
// for which data exists in the text repository. The index will change over time
|
||||
// and should not be stored in persistent storage. Extensions, except for the
|
||||
// 'va' type of the 'u' extension, are ignored. It will return 0, false if no
|
||||
// compact tag exists, where 0 is the index for the root language (Und).
|
||||
func CompactIndex(t Tag) (index int, ok bool) {
|
||||
// TODO: perhaps give more frequent tags a lower index.
|
||||
// TODO: we could make the indexes stable. This will excluded some
|
||||
// possibilities for optimization, so don't do this quite yet.
|
||||
b, s, r := t.Raw()
|
||||
if len(t.str) > 0 {
|
||||
if strings.HasPrefix(t.str, "x-") {
|
||||
// We have no entries for user-defined tags.
|
||||
return 0, false
|
||||
}
|
||||
if uint16(t.pVariant) != t.pExt {
|
||||
// There are no tags with variants and an u-va type.
|
||||
if t.TypeForKey("va") != "" {
|
||||
return 0, false
|
||||
}
|
||||
t, _ = Raw.Compose(b, s, r, t.Variants())
|
||||
} else if _, ok := t.Extension('u'); ok {
|
||||
// Strip all but the 'va' entry.
|
||||
variant := t.TypeForKey("va")
|
||||
t, _ = Raw.Compose(b, s, r)
|
||||
t, _ = t.SetTypeForKey("va", variant)
|
||||
}
|
||||
if len(t.str) > 0 {
|
||||
// We have some variants.
|
||||
for i, s := range specialTags {
|
||||
if s == t {
|
||||
return i + 1, true
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
}
|
||||
// No variants specified: just compare core components.
|
||||
// The key has the form lllssrrr, where l, s, and r are nibbles for
|
||||
// respectively the langID, scriptID, and regionID.
|
||||
key := uint32(b.langID) << (8 + 12)
|
||||
key |= uint32(s.scriptID) << 12
|
||||
key |= uint32(r.regionID)
|
||||
x, ok := coreTags[key]
|
||||
return int(x), ok
|
||||
}
|
||||
|
||||
// Base is an ISO 639 language code, used for encoding the base language
|
||||
// of a language tag.
|
||||
type Base struct {
|
||||
langID
|
||||
}
|
||||
|
||||
// ParseBase parses a 2- or 3-letter ISO 639 code.
|
||||
// It returns a ValueError if s is a well-formed but unknown language identifier
|
||||
// or another error if another error occurred.
|
||||
func ParseBase(s string) (Base, error) {
|
||||
if n := len(s); n < 2 || 3 < n {
|
||||
return Base{}, errSyntax
|
||||
}
|
||||
var buf [3]byte
|
||||
l, err := getLangID(buf[:copy(buf[:], s)])
|
||||
return Base{l}, err
|
||||
}
|
||||
|
||||
// Script is a 4-letter ISO 15924 code for representing scripts.
|
||||
// It is idiomatically represented in title case.
|
||||
type Script struct {
|
||||
scriptID
|
||||
}
|
||||
|
||||
// ParseScript parses a 4-letter ISO 15924 code.
|
||||
// It returns a ValueError if s is a well-formed but unknown script identifier
|
||||
// or another error if another error occurred.
|
||||
func ParseScript(s string) (Script, error) {
|
||||
if len(s) != 4 {
|
||||
return Script{}, errSyntax
|
||||
}
|
||||
var buf [4]byte
|
||||
sc, err := getScriptID(script, buf[:copy(buf[:], s)])
|
||||
return Script{sc}, err
|
||||
}
|
||||
|
||||
// Region is an ISO 3166-1 or UN M.49 code for representing countries and regions.
|
||||
type Region struct {
|
||||
regionID
|
||||
}
|
||||
|
||||
// EncodeM49 returns the Region for the given UN M.49 code.
|
||||
// It returns an error if r is not a valid code.
|
||||
func EncodeM49(r int) (Region, error) {
|
||||
rid, err := getRegionM49(r)
|
||||
return Region{rid}, err
|
||||
}
|
||||
|
||||
// ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.
|
||||
// It returns a ValueError if s is a well-formed but unknown region identifier
|
||||
// or another error if another error occurred.
|
||||
func ParseRegion(s string) (Region, error) {
|
||||
if n := len(s); n < 2 || 3 < n {
|
||||
return Region{}, errSyntax
|
||||
}
|
||||
var buf [3]byte
|
||||
r, err := getRegionID(buf[:copy(buf[:], s)])
|
||||
return Region{r}, err
|
||||
}
|
||||
|
||||
// IsCountry returns whether this region is a country or autonomous area. This
|
||||
// includes non-standard definitions from CLDR.
|
||||
func (r Region) IsCountry() bool {
|
||||
if r.regionID == 0 || r.IsGroup() || r.IsPrivateUse() && r.regionID != _XK {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// IsGroup returns whether this region defines a collection of regions. This
|
||||
// includes non-standard definitions from CLDR.
|
||||
func (r Region) IsGroup() bool {
|
||||
if r.regionID == 0 {
|
||||
return false
|
||||
}
|
||||
return int(regionInclusion[r.regionID]) < len(regionContainment)
|
||||
}
|
||||
|
||||
// Contains returns whether Region c is contained by Region r. It returns true
|
||||
// if c == r.
|
||||
func (r Region) Contains(c Region) bool {
|
||||
return r.regionID.contains(c.regionID)
|
||||
}
|
||||
|
||||
func (r regionID) contains(c regionID) bool {
|
||||
if r == c {
|
||||
return true
|
||||
}
|
||||
g := regionInclusion[r]
|
||||
if g >= nRegionGroups {
|
||||
return false
|
||||
}
|
||||
m := regionContainment[g]
|
||||
|
||||
d := regionInclusion[c]
|
||||
b := regionInclusionBits[d]
|
||||
|
||||
// A contained country may belong to multiple disjoint groups. Matching any
|
||||
// of these indicates containment. If the contained region is a group, it
|
||||
// must strictly be a subset.
|
||||
if d >= nRegionGroups {
|
||||
return b&m != 0
|
||||
}
|
||||
return b&^m == 0
|
||||
}
|
||||
|
||||
var errNoTLD = errors.New("language: region is not a valid ccTLD")
|
||||
|
||||
// TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
|
||||
// In all other cases it returns either the region itself or an error.
|
||||
//
|
||||
// This method may return an error for a region for which there exists a
|
||||
// canonical form with a ccTLD. To get that ccTLD canonicalize r first. The
|
||||
// region will already be canonicalized it was obtained from a Tag that was
|
||||
// obtained using any of the default methods.
|
||||
func (r Region) TLD() (Region, error) {
|
||||
// See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the
|
||||
// difference between ISO 3166-1 and IANA ccTLD.
|
||||
if r.regionID == _GB {
|
||||
r = Region{_UK}
|
||||
}
|
||||
if (r.typ() & ccTLD) == 0 {
|
||||
return Region{}, errNoTLD
|
||||
}
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// Canonicalize returns the region or a possible replacement if the region is
|
||||
// deprecated. It will not return a replacement for deprecated regions that
|
||||
// are split into multiple regions.
|
||||
func (r Region) Canonicalize() Region {
|
||||
if cr := normRegion(r.regionID); cr != 0 {
|
||||
return Region{cr}
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// Variant represents a registered variant of a language as defined by BCP 47.
|
||||
type Variant struct {
|
||||
variant string
|
||||
}
|
||||
|
||||
// ParseVariant parses and returns a Variant. An error is returned if s is not
|
||||
// a valid variant.
|
||||
func ParseVariant(s string) (Variant, error) {
|
||||
s = strings.ToLower(s)
|
||||
if _, ok := variantIndex[s]; ok {
|
||||
return Variant{s}, nil
|
||||
}
|
||||
return Variant{}, mkErrInvalid([]byte(s))
|
||||
}
|
||||
|
||||
// String returns the string representation of the variant.
|
||||
func (v Variant) String() string {
|
||||
return v.variant
|
||||
}
|
||||
396
vendor/golang.org/x/text/language/lookup.go
generated
vendored
Normal file
396
vendor/golang.org/x/text/language/lookup.go
generated
vendored
Normal file
|
|
@ -0,0 +1,396 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strconv"
|
||||
|
||||
"golang.org/x/text/internal/tag"
|
||||
)
|
||||
|
||||
// findIndex tries to find the given tag in idx and returns a standardized error
|
||||
// if it could not be found.
|
||||
func findIndex(idx tag.Index, key []byte, form string) (index int, err error) {
|
||||
if !tag.FixCase(form, key) {
|
||||
return 0, errSyntax
|
||||
}
|
||||
i := idx.Index(key)
|
||||
if i == -1 {
|
||||
return 0, mkErrInvalid(key)
|
||||
}
|
||||
return i, nil
|
||||
}
|
||||
|
||||
func searchUint(imap []uint16, key uint16) int {
|
||||
return sort.Search(len(imap), func(i int) bool {
|
||||
return imap[i] >= key
|
||||
})
|
||||
}
|
||||
|
||||
type langID uint16
|
||||
|
||||
// getLangID returns the langID of s if s is a canonical subtag
|
||||
// or langUnknown if s is not a canonical subtag.
|
||||
func getLangID(s []byte) (langID, error) {
|
||||
if len(s) == 2 {
|
||||
return getLangISO2(s)
|
||||
}
|
||||
return getLangISO3(s)
|
||||
}
|
||||
|
||||
// mapLang returns the mapped langID of id according to mapping m.
|
||||
func normLang(id langID) (langID, langAliasType) {
|
||||
k := sort.Search(len(langAliasMap), func(i int) bool {
|
||||
return langAliasMap[i].from >= uint16(id)
|
||||
})
|
||||
if k < len(langAliasMap) && langAliasMap[k].from == uint16(id) {
|
||||
return langID(langAliasMap[k].to), langAliasTypes[k]
|
||||
}
|
||||
return id, langAliasTypeUnknown
|
||||
}
|
||||
|
||||
// getLangISO2 returns the langID for the given 2-letter ISO language code
|
||||
// or unknownLang if this does not exist.
|
||||
func getLangISO2(s []byte) (langID, error) {
|
||||
if !tag.FixCase("zz", s) {
|
||||
return 0, errSyntax
|
||||
}
|
||||
if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 {
|
||||
return langID(i), nil
|
||||
}
|
||||
return 0, mkErrInvalid(s)
|
||||
}
|
||||
|
||||
const base = 'z' - 'a' + 1
|
||||
|
||||
func strToInt(s []byte) uint {
|
||||
v := uint(0)
|
||||
for i := 0; i < len(s); i++ {
|
||||
v *= base
|
||||
v += uint(s[i] - 'a')
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// converts the given integer to the original ASCII string passed to strToInt.
|
||||
// len(s) must match the number of characters obtained.
|
||||
func intToStr(v uint, s []byte) {
|
||||
for i := len(s) - 1; i >= 0; i-- {
|
||||
s[i] = byte(v%base) + 'a'
|
||||
v /= base
|
||||
}
|
||||
}
|
||||
|
||||
// getLangISO3 returns the langID for the given 3-letter ISO language code
|
||||
// or unknownLang if this does not exist.
|
||||
func getLangISO3(s []byte) (langID, error) {
|
||||
if tag.FixCase("und", s) {
|
||||
// first try to match canonical 3-letter entries
|
||||
for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) {
|
||||
if e := lang.Elem(i); e[3] == 0 && e[2] == s[2] {
|
||||
// We treat "und" as special and always translate it to "unspecified".
|
||||
// Note that ZZ and Zzzz are private use and are not treated as
|
||||
// unspecified by default.
|
||||
id := langID(i)
|
||||
if id == nonCanonicalUnd {
|
||||
return 0, nil
|
||||
}
|
||||
return id, nil
|
||||
}
|
||||
}
|
||||
if i := altLangISO3.Index(s); i != -1 {
|
||||
return langID(altLangIndex[altLangISO3.Elem(i)[3]]), nil
|
||||
}
|
||||
n := strToInt(s)
|
||||
if langNoIndex[n/8]&(1<<(n%8)) != 0 {
|
||||
return langID(n) + langNoIndexOffset, nil
|
||||
}
|
||||
// Check for non-canonical uses of ISO3.
|
||||
for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) {
|
||||
if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] {
|
||||
return langID(i), nil
|
||||
}
|
||||
}
|
||||
return 0, mkErrInvalid(s)
|
||||
}
|
||||
return 0, errSyntax
|
||||
}
|
||||
|
||||
// stringToBuf writes the string to b and returns the number of bytes
|
||||
// written. cap(b) must be >= 3.
|
||||
func (id langID) stringToBuf(b []byte) int {
|
||||
if id >= langNoIndexOffset {
|
||||
intToStr(uint(id)-langNoIndexOffset, b[:3])
|
||||
return 3
|
||||
} else if id == 0 {
|
||||
return copy(b, "und")
|
||||
}
|
||||
l := lang[id<<2:]
|
||||
if l[3] == 0 {
|
||||
return copy(b, l[:3])
|
||||
}
|
||||
return copy(b, l[:2])
|
||||
}
|
||||
|
||||
// String returns the BCP 47 representation of the langID.
|
||||
// Use b as variable name, instead of id, to ensure the variable
|
||||
// used is consistent with that of Base in which this type is embedded.
|
||||
func (b langID) String() string {
|
||||
if b == 0 {
|
||||
return "und"
|
||||
} else if b >= langNoIndexOffset {
|
||||
b -= langNoIndexOffset
|
||||
buf := [3]byte{}
|
||||
intToStr(uint(b), buf[:])
|
||||
return string(buf[:])
|
||||
}
|
||||
l := lang.Elem(int(b))
|
||||
if l[3] == 0 {
|
||||
return l[:3]
|
||||
}
|
||||
return l[:2]
|
||||
}
|
||||
|
||||
// ISO3 returns the ISO 639-3 language code.
|
||||
func (b langID) ISO3() string {
|
||||
if b == 0 || b >= langNoIndexOffset {
|
||||
return b.String()
|
||||
}
|
||||
l := lang.Elem(int(b))
|
||||
if l[3] == 0 {
|
||||
return l[:3]
|
||||
} else if l[2] == 0 {
|
||||
return altLangISO3.Elem(int(l[3]))[:3]
|
||||
}
|
||||
// This allocation will only happen for 3-letter ISO codes
|
||||
// that are non-canonical BCP 47 language identifiers.
|
||||
return l[0:1] + l[2:4]
|
||||
}
|
||||
|
||||
// IsPrivateUse reports whether this language code is reserved for private use.
|
||||
func (b langID) IsPrivateUse() bool {
|
||||
return langPrivateStart <= b && b <= langPrivateEnd
|
||||
}
|
||||
|
||||
type regionID uint16
|
||||
|
||||
// getRegionID returns the region id for s if s is a valid 2-letter region code
|
||||
// or unknownRegion.
|
||||
func getRegionID(s []byte) (regionID, error) {
|
||||
if len(s) == 3 {
|
||||
if isAlpha(s[0]) {
|
||||
return getRegionISO3(s)
|
||||
}
|
||||
if i, err := strconv.ParseUint(string(s), 10, 10); err == nil {
|
||||
return getRegionM49(int(i))
|
||||
}
|
||||
}
|
||||
return getRegionISO2(s)
|
||||
}
|
||||
|
||||
// getRegionISO2 returns the regionID for the given 2-letter ISO country code
|
||||
// or unknownRegion if this does not exist.
|
||||
func getRegionISO2(s []byte) (regionID, error) {
|
||||
i, err := findIndex(regionISO, s, "ZZ")
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return regionID(i) + isoRegionOffset, nil
|
||||
}
|
||||
|
||||
// getRegionISO3 returns the regionID for the given 3-letter ISO country code
|
||||
// or unknownRegion if this does not exist.
|
||||
func getRegionISO3(s []byte) (regionID, error) {
|
||||
if tag.FixCase("ZZZ", s) {
|
||||
for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) {
|
||||
if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] {
|
||||
return regionID(i) + isoRegionOffset, nil
|
||||
}
|
||||
}
|
||||
for i := 0; i < len(altRegionISO3); i += 3 {
|
||||
if tag.Compare(altRegionISO3[i:i+3], s) == 0 {
|
||||
return regionID(altRegionIDs[i/3]), nil
|
||||
}
|
||||
}
|
||||
return 0, mkErrInvalid(s)
|
||||
}
|
||||
return 0, errSyntax
|
||||
}
|
||||
|
||||
func getRegionM49(n int) (regionID, error) {
|
||||
if 0 < n && n <= 999 {
|
||||
const (
|
||||
searchBits = 7
|
||||
regionBits = 9
|
||||
regionMask = 1<<regionBits - 1
|
||||
)
|
||||
idx := n >> searchBits
|
||||
buf := fromM49[m49Index[idx]:m49Index[idx+1]]
|
||||
val := uint16(n) << regionBits // we rely on bits shifting out
|
||||
i := sort.Search(len(buf), func(i int) bool {
|
||||
return buf[i] >= val
|
||||
})
|
||||
if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val {
|
||||
return regionID(r & regionMask), nil
|
||||
}
|
||||
}
|
||||
var e ValueError
|
||||
fmt.Fprint(bytes.NewBuffer([]byte(e.v[:])), n)
|
||||
return 0, e
|
||||
}
|
||||
|
||||
// normRegion returns a region if r is deprecated or 0 otherwise.
|
||||
// TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ).
|
||||
// TODO: consider mapping split up regions to new most populous one (like CLDR).
|
||||
func normRegion(r regionID) regionID {
|
||||
m := regionOldMap
|
||||
k := sort.Search(len(m), func(i int) bool {
|
||||
return m[i].from >= uint16(r)
|
||||
})
|
||||
if k < len(m) && m[k].from == uint16(r) {
|
||||
return regionID(m[k].to)
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
const (
|
||||
iso3166UserAssigned = 1 << iota
|
||||
ccTLD
|
||||
bcp47Region
|
||||
)
|
||||
|
||||
func (r regionID) typ() byte {
|
||||
return regionTypes[r]
|
||||
}
|
||||
|
||||
// String returns the BCP 47 representation for the region.
|
||||
// It returns "ZZ" for an unspecified region.
|
||||
func (r regionID) String() string {
|
||||
if r < isoRegionOffset {
|
||||
if r == 0 {
|
||||
return "ZZ"
|
||||
}
|
||||
return fmt.Sprintf("%03d", r.M49())
|
||||
}
|
||||
r -= isoRegionOffset
|
||||
return regionISO.Elem(int(r))[:2]
|
||||
}
|
||||
|
||||
// ISO3 returns the 3-letter ISO code of r.
|
||||
// Note that not all regions have a 3-letter ISO code.
|
||||
// In such cases this method returns "ZZZ".
|
||||
func (r regionID) ISO3() string {
|
||||
if r < isoRegionOffset {
|
||||
return "ZZZ"
|
||||
}
|
||||
r -= isoRegionOffset
|
||||
reg := regionISO.Elem(int(r))
|
||||
switch reg[2] {
|
||||
case 0:
|
||||
return altRegionISO3[reg[3]:][:3]
|
||||
case ' ':
|
||||
return "ZZZ"
|
||||
}
|
||||
return reg[0:1] + reg[2:4]
|
||||
}
|
||||
|
||||
// M49 returns the UN M.49 encoding of r, or 0 if this encoding
|
||||
// is not defined for r.
|
||||
func (r regionID) M49() int {
|
||||
return int(m49[r])
|
||||
}
|
||||
|
||||
// IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
|
||||
// may include private-use tags that are assigned by CLDR and used in this
|
||||
// implementation. So IsPrivateUse and IsCountry can be simultaneously true.
|
||||
func (r regionID) IsPrivateUse() bool {
|
||||
return r.typ()&iso3166UserAssigned != 0
|
||||
}
|
||||
|
||||
type scriptID uint8
|
||||
|
||||
// getScriptID returns the script id for string s. It assumes that s
|
||||
// is of the format [A-Z][a-z]{3}.
|
||||
func getScriptID(idx tag.Index, s []byte) (scriptID, error) {
|
||||
i, err := findIndex(idx, s, "Zzzz")
|
||||
return scriptID(i), err
|
||||
}
|
||||
|
||||
// String returns the script code in title case.
|
||||
// It returns "Zzzz" for an unspecified script.
|
||||
func (s scriptID) String() string {
|
||||
if s == 0 {
|
||||
return "Zzzz"
|
||||
}
|
||||
return script.Elem(int(s))
|
||||
}
|
||||
|
||||
// IsPrivateUse reports whether this script code is reserved for private use.
|
||||
func (s scriptID) IsPrivateUse() bool {
|
||||
return _Qaaa <= s && s <= _Qabx
|
||||
}
|
||||
|
||||
const (
|
||||
maxAltTaglen = len("en-US-POSIX")
|
||||
maxLen = maxAltTaglen
|
||||
)
|
||||
|
||||
var (
|
||||
// grandfatheredMap holds a mapping from legacy and grandfathered tags to
|
||||
// their base language or index to more elaborate tag.
|
||||
grandfatheredMap = map[[maxLen]byte]int16{
|
||||
[maxLen]byte{'a', 'r', 't', '-', 'l', 'o', 'j', 'b', 'a', 'n'}: _jbo, // art-lojban
|
||||
[maxLen]byte{'i', '-', 'a', 'm', 'i'}: _ami, // i-ami
|
||||
[maxLen]byte{'i', '-', 'b', 'n', 'n'}: _bnn, // i-bnn
|
||||
[maxLen]byte{'i', '-', 'h', 'a', 'k'}: _hak, // i-hak
|
||||
[maxLen]byte{'i', '-', 'k', 'l', 'i', 'n', 'g', 'o', 'n'}: _tlh, // i-klingon
|
||||
[maxLen]byte{'i', '-', 'l', 'u', 'x'}: _lb, // i-lux
|
||||
[maxLen]byte{'i', '-', 'n', 'a', 'v', 'a', 'j', 'o'}: _nv, // i-navajo
|
||||
[maxLen]byte{'i', '-', 'p', 'w', 'n'}: _pwn, // i-pwn
|
||||
[maxLen]byte{'i', '-', 't', 'a', 'o'}: _tao, // i-tao
|
||||
[maxLen]byte{'i', '-', 't', 'a', 'y'}: _tay, // i-tay
|
||||
[maxLen]byte{'i', '-', 't', 's', 'u'}: _tsu, // i-tsu
|
||||
[maxLen]byte{'n', 'o', '-', 'b', 'o', 'k'}: _nb, // no-bok
|
||||
[maxLen]byte{'n', 'o', '-', 'n', 'y', 'n'}: _nn, // no-nyn
|
||||
[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'f', 'r'}: _sfb, // sgn-BE-FR
|
||||
[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'n', 'l'}: _vgt, // sgn-BE-NL
|
||||
[maxLen]byte{'s', 'g', 'n', '-', 'c', 'h', '-', 'd', 'e'}: _sgg, // sgn-CH-DE
|
||||
[maxLen]byte{'z', 'h', '-', 'g', 'u', 'o', 'y', 'u'}: _cmn, // zh-guoyu
|
||||
[maxLen]byte{'z', 'h', '-', 'h', 'a', 'k', 'k', 'a'}: _hak, // zh-hakka
|
||||
[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n', '-', 'n', 'a', 'n'}: _nan, // zh-min-nan
|
||||
[maxLen]byte{'z', 'h', '-', 'x', 'i', 'a', 'n', 'g'}: _hsn, // zh-xiang
|
||||
|
||||
// Grandfathered tags with no modern replacement will be converted as
|
||||
// follows:
|
||||
[maxLen]byte{'c', 'e', 'l', '-', 'g', 'a', 'u', 'l', 'i', 's', 'h'}: -1, // cel-gaulish
|
||||
[maxLen]byte{'e', 'n', '-', 'g', 'b', '-', 'o', 'e', 'd'}: -2, // en-GB-oed
|
||||
[maxLen]byte{'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}: -3, // i-default
|
||||
[maxLen]byte{'i', '-', 'e', 'n', 'o', 'c', 'h', 'i', 'a', 'n'}: -4, // i-enochian
|
||||
[maxLen]byte{'i', '-', 'm', 'i', 'n', 'g', 'o'}: -5, // i-mingo
|
||||
[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n'}: -6, // zh-min
|
||||
|
||||
// CLDR-specific tag.
|
||||
[maxLen]byte{'r', 'o', 'o', 't'}: 0, // root
|
||||
[maxLen]byte{'e', 'n', '-', 'u', 's', '-', 'p', 'o', 's', 'i', 'x'}: -7, // en_US_POSIX"
|
||||
}
|
||||
|
||||
altTagIndex = [...]uint8{0, 17, 31, 45, 61, 74, 86, 102}
|
||||
|
||||
altTags = "xtg-x-cel-gaulishen-GB-oxendicten-x-i-defaultund-x-i-enochiansee-x-i-mingonan-x-zh-minen-US-u-va-posix"
|
||||
)
|
||||
|
||||
func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) {
|
||||
if v, ok := grandfatheredMap[s]; ok {
|
||||
if v < 0 {
|
||||
return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true
|
||||
}
|
||||
t.lang = langID(v)
|
||||
return t, true
|
||||
}
|
||||
return t, false
|
||||
}
|
||||
1635
vendor/golang.org/x/text/language/maketables.go
generated
vendored
Normal file
1635
vendor/golang.org/x/text/language/maketables.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
840
vendor/golang.org/x/text/language/match.go
generated
vendored
Normal file
840
vendor/golang.org/x/text/language/match.go
generated
vendored
Normal file
|
|
@ -0,0 +1,840 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
import "errors"
|
||||
|
||||
// Matcher is the interface that wraps the Match method.
|
||||
//
|
||||
// Match returns the best match for any of the given tags, along with
|
||||
// a unique index associated with the returned tag and a confidence
|
||||
// score.
|
||||
type Matcher interface {
|
||||
Match(t ...Tag) (tag Tag, index int, c Confidence)
|
||||
}
|
||||
|
||||
// Comprehends reports the confidence score for a speaker of a given language
|
||||
// to being able to comprehend the written form of an alternative language.
|
||||
func Comprehends(speaker, alternative Tag) Confidence {
|
||||
_, _, c := NewMatcher([]Tag{alternative}).Match(speaker)
|
||||
return c
|
||||
}
|
||||
|
||||
// NewMatcher returns a Matcher that matches an ordered list of preferred tags
|
||||
// against a list of supported tags based on written intelligibility, closeness
|
||||
// of dialect, equivalence of subtags and various other rules. It is initialized
|
||||
// with the list of supported tags. The first element is used as the default
|
||||
// value in case no match is found.
|
||||
//
|
||||
// Its Match method matches the first of the given Tags to reach a certain
|
||||
// confidence threshold. The tags passed to Match should therefore be specified
|
||||
// in order of preference. Extensions are ignored for matching.
|
||||
//
|
||||
// The index returned by the Match method corresponds to the index of the
|
||||
// matched tag in t, but is augmented with the Unicode extension ('u')of the
|
||||
// corresponding preferred tag. This allows user locale options to be passed
|
||||
// transparently.
|
||||
func NewMatcher(t []Tag) Matcher {
|
||||
return newMatcher(t)
|
||||
}
|
||||
|
||||
func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
|
||||
match, w, c := m.getBest(want...)
|
||||
if match == nil {
|
||||
t = m.default_.tag
|
||||
} else {
|
||||
t, index = match.tag, match.index
|
||||
}
|
||||
// Copy options from the user-provided tag into the result tag. This is hard
|
||||
// to do after the fact, so we do it here.
|
||||
// TODO: consider also adding in variants that are compatible with the
|
||||
// matched language.
|
||||
// TODO: Add back region if it is non-ambiguous? Or create another tag to
|
||||
// preserve the region?
|
||||
if u, ok := w.Extension('u'); ok {
|
||||
t, _ = Raw.Compose(t, u)
|
||||
}
|
||||
return t, index, c
|
||||
}
|
||||
|
||||
type scriptRegionFlags uint8
|
||||
|
||||
const (
|
||||
isList = 1 << iota
|
||||
scriptInFrom
|
||||
regionInFrom
|
||||
)
|
||||
|
||||
func (t *Tag) setUndefinedLang(id langID) {
|
||||
if t.lang == 0 {
|
||||
t.lang = id
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tag) setUndefinedScript(id scriptID) {
|
||||
if t.script == 0 {
|
||||
t.script = id
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tag) setUndefinedRegion(id regionID) {
|
||||
if t.region == 0 || t.region.contains(id) {
|
||||
t.region = id
|
||||
}
|
||||
}
|
||||
|
||||
// ErrMissingLikelyTagsData indicates no information was available
|
||||
// to compute likely values of missing tags.
|
||||
var ErrMissingLikelyTagsData = errors.New("missing likely tags data")
|
||||
|
||||
// addLikelySubtags sets subtags to their most likely value, given the locale.
|
||||
// In most cases this means setting fields for unknown values, but in some
|
||||
// cases it may alter a value. It returns a ErrMissingLikelyTagsData error
|
||||
// if the given locale cannot be expanded.
|
||||
func (t Tag) addLikelySubtags() (Tag, error) {
|
||||
id, err := addTags(t)
|
||||
if err != nil {
|
||||
return t, err
|
||||
} else if id.equalTags(t) {
|
||||
return t, nil
|
||||
}
|
||||
id.remakeString()
|
||||
return id, nil
|
||||
}
|
||||
|
||||
// specializeRegion attempts to specialize a group region.
|
||||
func specializeRegion(t *Tag) bool {
|
||||
if i := regionInclusion[t.region]; i < nRegionGroups {
|
||||
x := likelyRegionGroup[i]
|
||||
if langID(x.lang) == t.lang && scriptID(x.script) == t.script {
|
||||
t.region = regionID(x.region)
|
||||
}
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func addTags(t Tag) (Tag, error) {
|
||||
// We leave private use identifiers alone.
|
||||
if t.private() {
|
||||
return t, nil
|
||||
}
|
||||
if t.script != 0 && t.region != 0 {
|
||||
if t.lang != 0 {
|
||||
// already fully specified
|
||||
specializeRegion(&t)
|
||||
return t, nil
|
||||
}
|
||||
// Search matches for und-script-region. Note that for these cases
|
||||
// region will never be a group so there is no need to check for this.
|
||||
list := likelyRegion[t.region : t.region+1]
|
||||
if x := list[0]; x.flags&isList != 0 {
|
||||
list = likelyRegionList[x.lang : x.lang+uint16(x.script)]
|
||||
}
|
||||
for _, x := range list {
|
||||
// Deviating from the spec. See match_test.go for details.
|
||||
if scriptID(x.script) == t.script {
|
||||
t.setUndefinedLang(langID(x.lang))
|
||||
return t, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
if t.lang != 0 {
|
||||
// Search matches for lang-script and lang-region, where lang != und.
|
||||
if t.lang < langNoIndexOffset {
|
||||
x := likelyLang[t.lang]
|
||||
if x.flags&isList != 0 {
|
||||
list := likelyLangList[x.region : x.region+uint16(x.script)]
|
||||
if t.script != 0 {
|
||||
for _, x := range list {
|
||||
if scriptID(x.script) == t.script && x.flags&scriptInFrom != 0 {
|
||||
t.setUndefinedRegion(regionID(x.region))
|
||||
return t, nil
|
||||
}
|
||||
}
|
||||
} else if t.region != 0 {
|
||||
count := 0
|
||||
goodScript := true
|
||||
tt := t
|
||||
for _, x := range list {
|
||||
// We visit all entries for which the script was not
|
||||
// defined, including the ones where the region was not
|
||||
// defined. This allows for proper disambiguation within
|
||||
// regions.
|
||||
if x.flags&scriptInFrom == 0 && t.region.contains(regionID(x.region)) {
|
||||
tt.region = regionID(x.region)
|
||||
tt.setUndefinedScript(scriptID(x.script))
|
||||
goodScript = goodScript && tt.script == scriptID(x.script)
|
||||
count++
|
||||
}
|
||||
}
|
||||
if count == 1 {
|
||||
return tt, nil
|
||||
}
|
||||
// Even if we fail to find a unique Region, we might have
|
||||
// an unambiguous script.
|
||||
if goodScript {
|
||||
t.script = tt.script
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Search matches for und-script.
|
||||
if t.script != 0 {
|
||||
x := likelyScript[t.script]
|
||||
if x.region != 0 {
|
||||
t.setUndefinedRegion(regionID(x.region))
|
||||
t.setUndefinedLang(langID(x.lang))
|
||||
return t, nil
|
||||
}
|
||||
}
|
||||
// Search matches for und-region. If und-script-region exists, it would
|
||||
// have been found earlier.
|
||||
if t.region != 0 {
|
||||
if i := regionInclusion[t.region]; i < nRegionGroups {
|
||||
x := likelyRegionGroup[i]
|
||||
if x.region != 0 {
|
||||
t.setUndefinedLang(langID(x.lang))
|
||||
t.setUndefinedScript(scriptID(x.script))
|
||||
t.region = regionID(x.region)
|
||||
}
|
||||
} else {
|
||||
x := likelyRegion[t.region]
|
||||
if x.flags&isList != 0 {
|
||||
x = likelyRegionList[x.lang]
|
||||
}
|
||||
if x.script != 0 && x.flags != scriptInFrom {
|
||||
t.setUndefinedLang(langID(x.lang))
|
||||
t.setUndefinedScript(scriptID(x.script))
|
||||
return t, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Search matches for lang.
|
||||
if t.lang < langNoIndexOffset {
|
||||
x := likelyLang[t.lang]
|
||||
if x.flags&isList != 0 {
|
||||
x = likelyLangList[x.region]
|
||||
}
|
||||
if x.region != 0 {
|
||||
t.setUndefinedScript(scriptID(x.script))
|
||||
t.setUndefinedRegion(regionID(x.region))
|
||||
}
|
||||
specializeRegion(&t)
|
||||
if t.lang == 0 {
|
||||
t.lang = _en // default language
|
||||
}
|
||||
return t, nil
|
||||
}
|
||||
return t, ErrMissingLikelyTagsData
|
||||
}
|
||||
|
||||
func (t *Tag) setTagsFrom(id Tag) {
|
||||
t.lang = id.lang
|
||||
t.script = id.script
|
||||
t.region = id.region
|
||||
}
|
||||
|
||||
// minimize removes the region or script subtags from t such that
|
||||
// t.addLikelySubtags() == t.minimize().addLikelySubtags().
|
||||
func (t Tag) minimize() (Tag, error) {
|
||||
t, err := minimizeTags(t)
|
||||
if err != nil {
|
||||
return t, err
|
||||
}
|
||||
t.remakeString()
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// minimizeTags mimics the behavior of the ICU 51 C implementation.
|
||||
func minimizeTags(t Tag) (Tag, error) {
|
||||
if t.equalTags(und) {
|
||||
return t, nil
|
||||
}
|
||||
max, err := addTags(t)
|
||||
if err != nil {
|
||||
return t, err
|
||||
}
|
||||
for _, id := range [...]Tag{
|
||||
{lang: t.lang},
|
||||
{lang: t.lang, region: t.region},
|
||||
{lang: t.lang, script: t.script},
|
||||
} {
|
||||
if x, err := addTags(id); err == nil && max.equalTags(x) {
|
||||
t.setTagsFrom(id)
|
||||
break
|
||||
}
|
||||
}
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// Tag Matching
|
||||
// CLDR defines an algorithm for finding the best match between two sets of language
|
||||
// tags. The basic algorithm defines how to score a possible match and then find
|
||||
// the match with the best score
|
||||
// (see http://www.unicode.org/reports/tr35/#LanguageMatching).
|
||||
// Using scoring has several disadvantages. The scoring obfuscates the importance of
|
||||
// the various factors considered, making the algorithm harder to understand. Using
|
||||
// scoring also requires the full score to be computed for each pair of tags.
|
||||
//
|
||||
// We will use a different algorithm which aims to have the following properties:
|
||||
// - clarity on the precedence of the various selection factors, and
|
||||
// - improved performance by allowing early termination of a comparison.
|
||||
//
|
||||
// Matching algorithm (overview)
|
||||
// Input:
|
||||
// - supported: a set of supported tags
|
||||
// - default: the default tag to return in case there is no match
|
||||
// - desired: list of desired tags, ordered by preference, starting with
|
||||
// the most-preferred.
|
||||
//
|
||||
// Algorithm:
|
||||
// 1) Set the best match to the lowest confidence level
|
||||
// 2) For each tag in "desired":
|
||||
// a) For each tag in "supported":
|
||||
// 1) compute the match between the two tags.
|
||||
// 2) if the match is better than the previous best match, replace it
|
||||
// with the new match. (see next section)
|
||||
// b) if the current best match is above a certain threshold, return this
|
||||
// match without proceeding to the next tag in "desired". [See Note 1]
|
||||
// 3) If the best match so far is below a certain threshold, return "default".
|
||||
//
|
||||
// Ranking:
|
||||
// We use two phases to determine whether one pair of tags are a better match
|
||||
// than another pair of tags. First, we determine a rough confidence level. If the
|
||||
// levels are different, the one with the highest confidence wins.
|
||||
// Second, if the rough confidence levels are identical, we use a set of tie-breaker
|
||||
// rules.
|
||||
//
|
||||
// The confidence level of matching a pair of tags is determined by finding the
|
||||
// lowest confidence level of any matches of the corresponding subtags (the
|
||||
// result is deemed as good as its weakest link).
|
||||
// We define the following levels:
|
||||
// Exact - An exact match of a subtag, before adding likely subtags.
|
||||
// MaxExact - An exact match of a subtag, after adding likely subtags.
|
||||
// [See Note 2].
|
||||
// High - High level of mutual intelligibility between different subtag
|
||||
// variants.
|
||||
// Low - Low level of mutual intelligibility between different subtag
|
||||
// variants.
|
||||
// No - No mutual intelligibility.
|
||||
//
|
||||
// The following levels can occur for each type of subtag:
|
||||
// Base: Exact, MaxExact, High, Low, No
|
||||
// Script: Exact, MaxExact [see Note 3], Low, No
|
||||
// Region: Exact, MaxExact, High
|
||||
// Variant: Exact, High
|
||||
// Private: Exact, No
|
||||
//
|
||||
// Any result with a confidence level of Low or higher is deemed a possible match.
|
||||
// Once a desired tag matches any of the supported tags with a level of MaxExact
|
||||
// or higher, the next desired tag is not considered (see Step 2.b).
|
||||
// Note that CLDR provides languageMatching data that defines close equivalence
|
||||
// classes for base languages, scripts and regions.
|
||||
//
|
||||
// Tie-breaking
|
||||
// If we get the same confidence level for two matches, we apply a sequence of
|
||||
// tie-breaking rules. The first that succeeds defines the result. The rules are
|
||||
// applied in the following order.
|
||||
// 1) Original language was defined and was identical.
|
||||
// 2) Original region was defined and was identical.
|
||||
// 3) Distance between two maximized regions was the smallest.
|
||||
// 4) Original script was defined and was identical.
|
||||
// 5) Distance from want tag to have tag using the parent relation [see Note 5.]
|
||||
// If there is still no winner after these rules are applied, the first match
|
||||
// found wins.
|
||||
//
|
||||
// Notes:
|
||||
// [1] Note that even if we may not have a perfect match, if a match is above a
|
||||
// certain threshold, it is considered a better match than any other match
|
||||
// to a tag later in the list of preferred language tags.
|
||||
// [2] In practice, as matching of Exact is done in a separate phase from
|
||||
// matching the other levels, we reuse the Exact level to mean MaxExact in
|
||||
// the second phase. As a consequence, we only need the levels defined by
|
||||
// the Confidence type. The MaxExact confidence level is mapped to High in
|
||||
// the public API.
|
||||
// [3] We do not differentiate between maximized script values that were derived
|
||||
// from suppressScript versus most likely tag data. We determined that in
|
||||
// ranking the two, one ranks just after the other. Moreover, the two cannot
|
||||
// occur concurrently. As a consequence, they are identical for practical
|
||||
// purposes.
|
||||
// [4] In case of deprecated, macro-equivalents and legacy mappings, we assign
|
||||
// the MaxExact level to allow iw vs he to still be a closer match than
|
||||
// en-AU vs en-US, for example.
|
||||
// [5] In CLDR a locale inherits fields that are unspecified for this locale
|
||||
// from its parent. Therefore, if a locale is a parent of another locale,
|
||||
// it is a strong measure for closeness, especially when no other tie
|
||||
// breaker rule applies. One could also argue it is inconsistent, for
|
||||
// example, when pt-AO matches pt (which CLDR equates with pt-BR), even
|
||||
// though its parent is pt-PT according to the inheritance rules.
|
||||
//
|
||||
// Implementation Details:
|
||||
// There are several performance considerations worth pointing out. Most notably,
|
||||
// we preprocess as much as possible (within reason) at the time of creation of a
|
||||
// matcher. This includes:
|
||||
// - creating a per-language map, which includes data for the raw base language
|
||||
// and its canonicalized variant (if applicable),
|
||||
// - expanding entries for the equivalence classes defined in CLDR's
|
||||
// languageMatch data.
|
||||
// The per-language map ensures that typically only a very small number of tags
|
||||
// need to be considered. The pre-expansion of canonicalized subtags and
|
||||
// equivalence classes reduces the amount of map lookups that need to be done at
|
||||
// runtime.
|
||||
|
||||
// matcher keeps a set of supported language tags, indexed by language.
|
||||
type matcher struct {
|
||||
default_ *haveTag
|
||||
index map[langID]*matchHeader
|
||||
passSettings bool
|
||||
}
|
||||
|
||||
// matchHeader has the lists of tags for exact matches and matches based on
|
||||
// maximized and canonicalized tags for a given language.
|
||||
type matchHeader struct {
|
||||
exact []haveTag
|
||||
max []haveTag
|
||||
}
|
||||
|
||||
// haveTag holds a supported Tag and its maximized script and region. The maximized
|
||||
// or canonicalized language is not stored as it is not needed during matching.
|
||||
type haveTag struct {
|
||||
tag Tag
|
||||
|
||||
// index of this tag in the original list of supported tags.
|
||||
index int
|
||||
|
||||
// conf is the maximum confidence that can result from matching this haveTag.
|
||||
// When conf < Exact this means it was inserted after applying a CLDR equivalence rule.
|
||||
conf Confidence
|
||||
|
||||
// Maximized region and script.
|
||||
maxRegion regionID
|
||||
maxScript scriptID
|
||||
|
||||
// altScript may be checked as an alternative match to maxScript. If altScript
|
||||
// matches, the confidence level for this match is Low. Theoretically there
|
||||
// could be multiple alternative scripts. This does not occur in practice.
|
||||
altScript scriptID
|
||||
|
||||
// nextMax is the index of the next haveTag with the same maximized tags.
|
||||
nextMax uint16
|
||||
}
|
||||
|
||||
func makeHaveTag(tag Tag, index int) (haveTag, langID) {
|
||||
max := tag
|
||||
if tag.lang != 0 {
|
||||
max, _ = max.canonicalize(All)
|
||||
max, _ = addTags(max)
|
||||
max.remakeString()
|
||||
}
|
||||
return haveTag{tag, index, Exact, max.region, max.script, altScript(max.lang, max.script), 0}, max.lang
|
||||
}
|
||||
|
||||
// altScript returns an alternative script that may match the given script with
|
||||
// a low confidence. At the moment, the langMatch data allows for at most one
|
||||
// script to map to another and we rely on this to keep the code simple.
|
||||
func altScript(l langID, s scriptID) scriptID {
|
||||
for _, alt := range matchScript {
|
||||
if (alt.lang == 0 || langID(alt.lang) == l) && scriptID(alt.have) == s {
|
||||
return scriptID(alt.want)
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// addIfNew adds a haveTag to the list of tags only if it is a unique tag.
|
||||
// Tags that have the same maximized values are linked by index.
|
||||
func (h *matchHeader) addIfNew(n haveTag, exact bool) {
|
||||
// Don't add new exact matches.
|
||||
for _, v := range h.exact {
|
||||
if v.tag.equalsRest(n.tag) {
|
||||
return
|
||||
}
|
||||
}
|
||||
if exact {
|
||||
h.exact = append(h.exact, n)
|
||||
}
|
||||
// Allow duplicate maximized tags, but create a linked list to allow quickly
|
||||
// comparing the equivalents and bail out.
|
||||
for i, v := range h.max {
|
||||
if v.maxScript == n.maxScript &&
|
||||
v.maxRegion == n.maxRegion &&
|
||||
v.tag.variantOrPrivateTagStr() == n.tag.variantOrPrivateTagStr() {
|
||||
for h.max[i].nextMax != 0 {
|
||||
i = int(h.max[i].nextMax)
|
||||
}
|
||||
h.max[i].nextMax = uint16(len(h.max))
|
||||
break
|
||||
}
|
||||
}
|
||||
h.max = append(h.max, n)
|
||||
}
|
||||
|
||||
// header returns the matchHeader for the given language. It creates one if
|
||||
// it doesn't already exist.
|
||||
func (m *matcher) header(l langID) *matchHeader {
|
||||
if h := m.index[l]; h != nil {
|
||||
return h
|
||||
}
|
||||
h := &matchHeader{}
|
||||
m.index[l] = h
|
||||
return h
|
||||
}
|
||||
|
||||
// newMatcher builds an index for the given supported tags and returns it as
|
||||
// a matcher. It also expands the index by considering various equivalence classes
|
||||
// for a given tag.
|
||||
func newMatcher(supported []Tag) *matcher {
|
||||
m := &matcher{
|
||||
index: make(map[langID]*matchHeader),
|
||||
}
|
||||
if len(supported) == 0 {
|
||||
m.default_ = &haveTag{}
|
||||
return m
|
||||
}
|
||||
// Add supported languages to the index. Add exact matches first to give
|
||||
// them precedence.
|
||||
for i, tag := range supported {
|
||||
pair, _ := makeHaveTag(tag, i)
|
||||
m.header(tag.lang).addIfNew(pair, true)
|
||||
}
|
||||
m.default_ = &m.header(supported[0].lang).exact[0]
|
||||
for i, tag := range supported {
|
||||
pair, max := makeHaveTag(tag, i)
|
||||
if max != tag.lang {
|
||||
m.header(max).addIfNew(pair, false)
|
||||
}
|
||||
}
|
||||
|
||||
// update is used to add indexes in the map for equivalent languages.
|
||||
// If force is true, the update will also apply to derived entries. To
|
||||
// avoid applying a "transitive closure", use false.
|
||||
update := func(want, have uint16, conf Confidence, force bool) {
|
||||
if hh := m.index[langID(have)]; hh != nil {
|
||||
if !force && len(hh.exact) == 0 {
|
||||
return
|
||||
}
|
||||
hw := m.header(langID(want))
|
||||
for _, v := range hh.max {
|
||||
if conf < v.conf {
|
||||
v.conf = conf
|
||||
}
|
||||
v.nextMax = 0 // this value needs to be recomputed
|
||||
if v.altScript != 0 {
|
||||
v.altScript = altScript(langID(want), v.maxScript)
|
||||
}
|
||||
hw.addIfNew(v, conf == Exact && len(hh.exact) > 0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add entries for languages with mutual intelligibility as defined by CLDR's
|
||||
// languageMatch data.
|
||||
for _, ml := range matchLang {
|
||||
update(ml.want, ml.have, Confidence(ml.conf), false)
|
||||
if !ml.oneway {
|
||||
update(ml.have, ml.want, Confidence(ml.conf), false)
|
||||
}
|
||||
}
|
||||
|
||||
// Add entries for possible canonicalizations. This is an optimization to
|
||||
// ensure that only one map lookup needs to be done at runtime per desired tag.
|
||||
// First we match deprecated equivalents. If they are perfect equivalents
|
||||
// (their canonicalization simply substitutes a different language code, but
|
||||
// nothing else), the match confidence is Exact, otherwise it is High.
|
||||
for i, lm := range langAliasMap {
|
||||
if lm.from == _sh {
|
||||
continue
|
||||
}
|
||||
|
||||
// If deprecated codes match and there is no fiddling with the script or
|
||||
// or region, we consider it an exact match.
|
||||
conf := Exact
|
||||
if langAliasTypes[i] != langMacro {
|
||||
if !isExactEquivalent(langID(lm.from)) {
|
||||
conf = High
|
||||
}
|
||||
update(lm.to, lm.from, conf, true)
|
||||
}
|
||||
update(lm.from, lm.to, conf, true)
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
// getBest gets the best matching tag in m for any of the given tags, taking into
|
||||
// account the order of preference of the given tags.
|
||||
func (m *matcher) getBest(want ...Tag) (got *haveTag, orig Tag, c Confidence) {
|
||||
best := bestMatch{}
|
||||
for _, w := range want {
|
||||
var max Tag
|
||||
// Check for exact match first.
|
||||
h := m.index[w.lang]
|
||||
if w.lang != 0 {
|
||||
// Base language is defined.
|
||||
if h == nil {
|
||||
continue
|
||||
}
|
||||
for i := range h.exact {
|
||||
have := &h.exact[i]
|
||||
if have.tag.equalsRest(w) {
|
||||
return have, w, Exact
|
||||
}
|
||||
}
|
||||
max, _ = w.canonicalize(Legacy | Deprecated)
|
||||
max, _ = addTags(max)
|
||||
} else {
|
||||
// Base language is not defined.
|
||||
if h != nil {
|
||||
for i := range h.exact {
|
||||
have := &h.exact[i]
|
||||
if have.tag.equalsRest(w) {
|
||||
return have, w, Exact
|
||||
}
|
||||
}
|
||||
}
|
||||
if w.script == 0 && w.region == 0 {
|
||||
// We skip all tags matching und for approximate matching, including
|
||||
// private tags.
|
||||
continue
|
||||
}
|
||||
max, _ = addTags(w)
|
||||
if h = m.index[max.lang]; h == nil {
|
||||
continue
|
||||
}
|
||||
}
|
||||
// Check for match based on maximized tag.
|
||||
for i := range h.max {
|
||||
have := &h.max[i]
|
||||
best.update(have, w, max.script, max.region)
|
||||
if best.conf == Exact {
|
||||
for have.nextMax != 0 {
|
||||
have = &h.max[have.nextMax]
|
||||
best.update(have, w, max.script, max.region)
|
||||
}
|
||||
return best.have, best.want, High
|
||||
}
|
||||
}
|
||||
}
|
||||
if best.conf <= No {
|
||||
if len(want) != 0 {
|
||||
return nil, want[0], No
|
||||
}
|
||||
return nil, Tag{}, No
|
||||
}
|
||||
return best.have, best.want, best.conf
|
||||
}
|
||||
|
||||
// bestMatch accumulates the best match so far.
|
||||
type bestMatch struct {
|
||||
have *haveTag
|
||||
want Tag
|
||||
conf Confidence
|
||||
// Cached results from applying tie-breaking rules.
|
||||
origLang bool
|
||||
origReg bool
|
||||
regDist uint8
|
||||
origScript bool
|
||||
parentDist uint8 // 255 if have is not an ancestor of want tag.
|
||||
}
|
||||
|
||||
// update updates the existing best match if the new pair is considered to be a
|
||||
// better match.
|
||||
// To determine if the given pair is a better match, it first computes the rough
|
||||
// confidence level. If this surpasses the current match, it will replace it and
|
||||
// update the tie-breaker rule cache. If there is a tie, it proceeds with applying
|
||||
// a series of tie-breaker rules. If there is no conclusive winner after applying
|
||||
// the tie-breaker rules, it leaves the current match as the preferred match.
|
||||
func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion regionID) {
|
||||
// Bail if the maximum attainable confidence is below that of the current best match.
|
||||
c := have.conf
|
||||
if c < m.conf {
|
||||
return
|
||||
}
|
||||
if have.maxScript != maxScript {
|
||||
// There is usually very little comprehension between different scripts.
|
||||
// In a few cases there may still be Low comprehension. This possibility is
|
||||
// pre-computed and stored in have.altScript.
|
||||
if Low < m.conf || have.altScript != maxScript {
|
||||
return
|
||||
}
|
||||
c = Low
|
||||
} else if have.maxRegion != maxRegion {
|
||||
// There is usually a small difference between languages across regions.
|
||||
// We use the region distance (below) to disambiguate between equal matches.
|
||||
if High < c {
|
||||
c = High
|
||||
}
|
||||
}
|
||||
|
||||
// We store the results of the computations of the tie-breaker rules along
|
||||
// with the best match. There is no need to do the checks once we determine
|
||||
// we have a winner, but we do still need to do the tie-breaker computations.
|
||||
// We use "beaten" to keep track if we still need to do the checks.
|
||||
beaten := false // true if the new pair defeats the current one.
|
||||
if c != m.conf {
|
||||
if c < m.conf {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
// Tie-breaker rules:
|
||||
// We prefer if the pre-maximized language was specified and identical.
|
||||
origLang := have.tag.lang == tag.lang && tag.lang != 0
|
||||
if !beaten && m.origLang != origLang {
|
||||
if m.origLang {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
// We prefer if the pre-maximized region was specified and identical.
|
||||
origReg := have.tag.region == tag.region && tag.region != 0
|
||||
if !beaten && m.origReg != origReg {
|
||||
if m.origReg {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
// Next we prefer smaller distances between regions, as defined by regionDist.
|
||||
regDist := regionDist(have.maxRegion, maxRegion, tag.lang)
|
||||
if !beaten && m.regDist != regDist {
|
||||
if regDist > m.regDist {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
// Next we prefer if the pre-maximized script was specified and identical.
|
||||
origScript := have.tag.script == tag.script && tag.script != 0
|
||||
if !beaten && m.origScript != origScript {
|
||||
if m.origScript {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
// Finally we prefer tags which have a closer parent relationship.
|
||||
parentDist := parentDistance(have.tag.region, tag)
|
||||
if !beaten && m.parentDist != parentDist {
|
||||
if parentDist > m.parentDist {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
// Update m to the newly found best match.
|
||||
if beaten {
|
||||
m.have = have
|
||||
m.want = tag
|
||||
m.conf = c
|
||||
m.origLang = origLang
|
||||
m.origReg = origReg
|
||||
m.origScript = origScript
|
||||
m.regDist = regDist
|
||||
m.parentDist = parentDist
|
||||
}
|
||||
}
|
||||
|
||||
// parentDistance returns the number of times Parent must be called before the
|
||||
// regions match. It is assumed that it has already been checked that lang and
|
||||
// script are identical. If haveRegion does not occur in the ancestor chain of
|
||||
// tag, it returns 255.
|
||||
func parentDistance(haveRegion regionID, tag Tag) uint8 {
|
||||
p := tag.Parent()
|
||||
d := uint8(1)
|
||||
for haveRegion != p.region {
|
||||
if p.region == 0 {
|
||||
return 255
|
||||
}
|
||||
p = p.Parent()
|
||||
d++
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
// regionDist wraps regionDistance with some exceptions to the algorithmic distance.
|
||||
func regionDist(a, b regionID, lang langID) uint8 {
|
||||
if lang == _en {
|
||||
// Two variants of non-US English are close to each other, regardless of distance.
|
||||
if a != _US && b != _US {
|
||||
return 2
|
||||
}
|
||||
}
|
||||
return uint8(regionDistance(a, b))
|
||||
}
|
||||
|
||||
// regionDistance computes the distance between two regions based on the
|
||||
// distance in the graph of region containments as defined in CLDR. It iterates
|
||||
// over increasingly inclusive sets of groups, represented as bit vectors, until
|
||||
// the source bit vector has bits in common with the destination vector.
|
||||
func regionDistance(a, b regionID) int {
|
||||
if a == b {
|
||||
return 0
|
||||
}
|
||||
p, q := regionInclusion[a], regionInclusion[b]
|
||||
if p < nRegionGroups {
|
||||
p, q = q, p
|
||||
}
|
||||
set := regionInclusionBits
|
||||
if q < nRegionGroups && set[p]&(1<<q) != 0 {
|
||||
return 1
|
||||
}
|
||||
d := 2
|
||||
for goal := set[q]; set[p]&goal == 0; p = regionInclusionNext[p] {
|
||||
d++
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
func (t Tag) variants() string {
|
||||
if t.pVariant == 0 {
|
||||
return ""
|
||||
}
|
||||
return t.str[t.pVariant:t.pExt]
|
||||
}
|
||||
|
||||
// variantOrPrivateTagStr returns variants or private use tags.
|
||||
func (t Tag) variantOrPrivateTagStr() string {
|
||||
if t.pExt > 0 {
|
||||
return t.str[t.pVariant:t.pExt]
|
||||
}
|
||||
return t.str[t.pVariant:]
|
||||
}
|
||||
|
||||
// equalsRest compares everything except the language.
|
||||
func (a Tag) equalsRest(b Tag) bool {
|
||||
// TODO: don't include extensions in this comparison. To do this efficiently,
|
||||
// though, we should handle private tags separately.
|
||||
return a.script == b.script && a.region == b.region && a.variantOrPrivateTagStr() == b.variantOrPrivateTagStr()
|
||||
}
|
||||
|
||||
// isExactEquivalent returns true if canonicalizing the language will not alter
|
||||
// the script or region of a tag.
|
||||
func isExactEquivalent(l langID) bool {
|
||||
for _, o := range notEquivalent {
|
||||
if o == l {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
var notEquivalent []langID
|
||||
|
||||
func init() {
|
||||
// Create a list of all languages for which canonicalization may alter the
|
||||
// script or region.
|
||||
for _, lm := range langAliasMap {
|
||||
tag := Tag{lang: langID(lm.from)}
|
||||
if tag, _ = tag.canonicalize(All); tag.script != 0 || tag.region != 0 {
|
||||
notEquivalent = append(notEquivalent, langID(lm.from))
|
||||
}
|
||||
}
|
||||
}
|
||||
859
vendor/golang.org/x/text/language/parse.go
generated
vendored
Normal file
859
vendor/golang.org/x/text/language/parse.go
generated
vendored
Normal file
|
|
@ -0,0 +1,859 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/text/internal/tag"
|
||||
)
|
||||
|
||||
// isAlpha returns true if the byte is not a digit.
|
||||
// b must be an ASCII letter or digit.
|
||||
func isAlpha(b byte) bool {
|
||||
return b > '9'
|
||||
}
|
||||
|
||||
// isAlphaNum returns true if the string contains only ASCII letters or digits.
|
||||
func isAlphaNum(s []byte) bool {
|
||||
for _, c := range s {
|
||||
if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// errSyntax is returned by any of the parsing functions when the
|
||||
// input is not well-formed, according to BCP 47.
|
||||
// TODO: return the position at which the syntax error occurred?
|
||||
var errSyntax = errors.New("language: tag is not well-formed")
|
||||
|
||||
// ValueError is returned by any of the parsing functions when the
|
||||
// input is well-formed but the respective subtag is not recognized
|
||||
// as a valid value.
|
||||
type ValueError struct {
|
||||
v [8]byte
|
||||
}
|
||||
|
||||
func mkErrInvalid(s []byte) error {
|
||||
var e ValueError
|
||||
copy(e.v[:], s)
|
||||
return e
|
||||
}
|
||||
|
||||
func (e ValueError) tag() []byte {
|
||||
n := bytes.IndexByte(e.v[:], 0)
|
||||
if n == -1 {
|
||||
n = 8
|
||||
}
|
||||
return e.v[:n]
|
||||
}
|
||||
|
||||
// Error implements the error interface.
|
||||
func (e ValueError) Error() string {
|
||||
return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
|
||||
}
|
||||
|
||||
// Subtag returns the subtag for which the error occurred.
|
||||
func (e ValueError) Subtag() string {
|
||||
return string(e.tag())
|
||||
}
|
||||
|
||||
// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
|
||||
type scanner struct {
|
||||
b []byte
|
||||
bytes [max99thPercentileSize]byte
|
||||
token []byte
|
||||
start int // start position of the current token
|
||||
end int // end position of the current token
|
||||
next int // next point for scan
|
||||
err error
|
||||
done bool
|
||||
}
|
||||
|
||||
func makeScannerString(s string) scanner {
|
||||
scan := scanner{}
|
||||
if len(s) <= len(scan.bytes) {
|
||||
scan.b = scan.bytes[:copy(scan.bytes[:], s)]
|
||||
} else {
|
||||
scan.b = []byte(s)
|
||||
}
|
||||
scan.init()
|
||||
return scan
|
||||
}
|
||||
|
||||
// makeScanner returns a scanner using b as the input buffer.
|
||||
// b is not copied and may be modified by the scanner routines.
|
||||
func makeScanner(b []byte) scanner {
|
||||
scan := scanner{b: b}
|
||||
scan.init()
|
||||
return scan
|
||||
}
|
||||
|
||||
func (s *scanner) init() {
|
||||
for i, c := range s.b {
|
||||
if c == '_' {
|
||||
s.b[i] = '-'
|
||||
}
|
||||
}
|
||||
s.scan()
|
||||
}
|
||||
|
||||
// restToLower converts the string between start and end to lower case.
|
||||
func (s *scanner) toLower(start, end int) {
|
||||
for i := start; i < end; i++ {
|
||||
c := s.b[i]
|
||||
if 'A' <= c && c <= 'Z' {
|
||||
s.b[i] += 'a' - 'A'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *scanner) setError(e error) {
|
||||
if s.err == nil || (e == errSyntax && s.err != errSyntax) {
|
||||
s.err = e
|
||||
}
|
||||
}
|
||||
|
||||
// resizeRange shrinks or grows the array at position oldStart such that
|
||||
// a new string of size newSize can fit between oldStart and oldEnd.
|
||||
// Sets the scan point to after the resized range.
|
||||
func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
|
||||
s.start = oldStart
|
||||
if end := oldStart + newSize; end != oldEnd {
|
||||
diff := end - oldEnd
|
||||
if end < cap(s.b) {
|
||||
b := make([]byte, len(s.b)+diff)
|
||||
copy(b, s.b[:oldStart])
|
||||
copy(b[end:], s.b[oldEnd:])
|
||||
s.b = b
|
||||
} else {
|
||||
s.b = append(s.b[end:], s.b[oldEnd:]...)
|
||||
}
|
||||
s.next = end + (s.next - s.end)
|
||||
s.end = end
|
||||
}
|
||||
}
|
||||
|
||||
// replace replaces the current token with repl.
|
||||
func (s *scanner) replace(repl string) {
|
||||
s.resizeRange(s.start, s.end, len(repl))
|
||||
copy(s.b[s.start:], repl)
|
||||
}
|
||||
|
||||
// gobble removes the current token from the input.
|
||||
// Caller must call scan after calling gobble.
|
||||
func (s *scanner) gobble(e error) {
|
||||
s.setError(e)
|
||||
if s.start == 0 {
|
||||
s.b = s.b[:+copy(s.b, s.b[s.next:])]
|
||||
s.end = 0
|
||||
} else {
|
||||
s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
|
||||
s.end = s.start - 1
|
||||
}
|
||||
s.next = s.start
|
||||
}
|
||||
|
||||
// deleteRange removes the given range from s.b before the current token.
|
||||
func (s *scanner) deleteRange(start, end int) {
|
||||
s.setError(errSyntax)
|
||||
s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
|
||||
diff := end - start
|
||||
s.next -= diff
|
||||
s.start -= diff
|
||||
s.end -= diff
|
||||
}
|
||||
|
||||
// scan parses the next token of a BCP 47 string. Tokens that are larger
|
||||
// than 8 characters or include non-alphanumeric characters result in an error
|
||||
// and are gobbled and removed from the output.
|
||||
// It returns the end position of the last token consumed.
|
||||
func (s *scanner) scan() (end int) {
|
||||
end = s.end
|
||||
s.token = nil
|
||||
for s.start = s.next; s.next < len(s.b); {
|
||||
i := bytes.IndexByte(s.b[s.next:], '-')
|
||||
if i == -1 {
|
||||
s.end = len(s.b)
|
||||
s.next = len(s.b)
|
||||
i = s.end - s.start
|
||||
} else {
|
||||
s.end = s.next + i
|
||||
s.next = s.end + 1
|
||||
}
|
||||
token := s.b[s.start:s.end]
|
||||
if i < 1 || i > 8 || !isAlphaNum(token) {
|
||||
s.gobble(errSyntax)
|
||||
continue
|
||||
}
|
||||
s.token = token
|
||||
return end
|
||||
}
|
||||
if n := len(s.b); n > 0 && s.b[n-1] == '-' {
|
||||
s.setError(errSyntax)
|
||||
s.b = s.b[:len(s.b)-1]
|
||||
}
|
||||
s.done = true
|
||||
return end
|
||||
}
|
||||
|
||||
// acceptMinSize parses multiple tokens of the given size or greater.
|
||||
// It returns the end position of the last token consumed.
|
||||
func (s *scanner) acceptMinSize(min int) (end int) {
|
||||
end = s.end
|
||||
s.scan()
|
||||
for ; len(s.token) >= min; s.scan() {
|
||||
end = s.end
|
||||
}
|
||||
return end
|
||||
}
|
||||
|
||||
// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
|
||||
// failed it returns an error and any part of the tag that could be parsed.
|
||||
// If parsing succeeded but an unknown value was found, it returns
|
||||
// ValueError. The Tag returned in this case is just stripped of the unknown
|
||||
// value. All other values are preserved. It accepts tags in the BCP 47 format
|
||||
// and extensions to this standard defined in
|
||||
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
|
||||
// The resulting tag is canonicalized using the default canonicalization type.
|
||||
func Parse(s string) (t Tag, err error) {
|
||||
return Default.Parse(s)
|
||||
}
|
||||
|
||||
// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
|
||||
// failed it returns an error and any part of the tag that could be parsed.
|
||||
// If parsing succeeded but an unknown value was found, it returns
|
||||
// ValueError. The Tag returned in this case is just stripped of the unknown
|
||||
// value. All other values are preserved. It accepts tags in the BCP 47 format
|
||||
// and extensions to this standard defined in
|
||||
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
|
||||
// The resulting tag is canonicalized using the the canonicalization type c.
|
||||
func (c CanonType) Parse(s string) (t Tag, err error) {
|
||||
// TODO: consider supporting old-style locale key-value pairs.
|
||||
if s == "" {
|
||||
return und, errSyntax
|
||||
}
|
||||
if len(s) <= maxAltTaglen {
|
||||
b := [maxAltTaglen]byte{}
|
||||
for i, c := range s {
|
||||
// Generating invalid UTF-8 is okay as it won't match.
|
||||
if 'A' <= c && c <= 'Z' {
|
||||
c += 'a' - 'A'
|
||||
} else if c == '_' {
|
||||
c = '-'
|
||||
}
|
||||
b[i] = byte(c)
|
||||
}
|
||||
if t, ok := grandfathered(b); ok {
|
||||
return t, nil
|
||||
}
|
||||
}
|
||||
scan := makeScannerString(s)
|
||||
t, err = parse(&scan, s)
|
||||
t, changed := t.canonicalize(c)
|
||||
if changed {
|
||||
t.remakeString()
|
||||
}
|
||||
return t, err
|
||||
}
|
||||
|
||||
func parse(scan *scanner, s string) (t Tag, err error) {
|
||||
t = und
|
||||
var end int
|
||||
if n := len(scan.token); n <= 1 {
|
||||
scan.toLower(0, len(scan.b))
|
||||
if n == 0 || scan.token[0] != 'x' {
|
||||
return t, errSyntax
|
||||
}
|
||||
end = parseExtensions(scan)
|
||||
} else if n >= 4 {
|
||||
return und, errSyntax
|
||||
} else { // the usual case
|
||||
t, end = parseTag(scan)
|
||||
if n := len(scan.token); n == 1 {
|
||||
t.pExt = uint16(end)
|
||||
end = parseExtensions(scan)
|
||||
} else if end < len(scan.b) {
|
||||
scan.setError(errSyntax)
|
||||
scan.b = scan.b[:end]
|
||||
}
|
||||
}
|
||||
if int(t.pVariant) < len(scan.b) {
|
||||
if end < len(s) {
|
||||
s = s[:end]
|
||||
}
|
||||
if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
|
||||
t.str = s
|
||||
} else {
|
||||
t.str = string(scan.b)
|
||||
}
|
||||
} else {
|
||||
t.pVariant, t.pExt = 0, 0
|
||||
}
|
||||
return t, scan.err
|
||||
}
|
||||
|
||||
// parseTag parses language, script, region and variants.
|
||||
// It returns a Tag and the end position in the input that was parsed.
|
||||
func parseTag(scan *scanner) (t Tag, end int) {
|
||||
var e error
|
||||
// TODO: set an error if an unknown lang, script or region is encountered.
|
||||
t.lang, e = getLangID(scan.token)
|
||||
scan.setError(e)
|
||||
scan.replace(t.lang.String())
|
||||
langStart := scan.start
|
||||
end = scan.scan()
|
||||
for len(scan.token) == 3 && isAlpha(scan.token[0]) {
|
||||
// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
|
||||
// to a tag of the form <extlang>.
|
||||
lang, e := getLangID(scan.token)
|
||||
if lang != 0 {
|
||||
t.lang = lang
|
||||
copy(scan.b[langStart:], lang.String())
|
||||
scan.b[langStart+3] = '-'
|
||||
scan.start = langStart + 4
|
||||
}
|
||||
scan.gobble(e)
|
||||
end = scan.scan()
|
||||
}
|
||||
if len(scan.token) == 4 && isAlpha(scan.token[0]) {
|
||||
t.script, e = getScriptID(script, scan.token)
|
||||
if t.script == 0 {
|
||||
scan.gobble(e)
|
||||
}
|
||||
end = scan.scan()
|
||||
}
|
||||
if n := len(scan.token); n >= 2 && n <= 3 {
|
||||
t.region, e = getRegionID(scan.token)
|
||||
if t.region == 0 {
|
||||
scan.gobble(e)
|
||||
} else {
|
||||
scan.replace(t.region.String())
|
||||
}
|
||||
end = scan.scan()
|
||||
}
|
||||
scan.toLower(scan.start, len(scan.b))
|
||||
t.pVariant = byte(end)
|
||||
end = parseVariants(scan, end, t)
|
||||
t.pExt = uint16(end)
|
||||
return t, end
|
||||
}
|
||||
|
||||
var separator = []byte{'-'}
|
||||
|
||||
// parseVariants scans tokens as long as each token is a valid variant string.
|
||||
// Duplicate variants are removed.
|
||||
func parseVariants(scan *scanner, end int, t Tag) int {
|
||||
start := scan.start
|
||||
varIDBuf := [4]uint8{}
|
||||
variantBuf := [4][]byte{}
|
||||
varID := varIDBuf[:0]
|
||||
variant := variantBuf[:0]
|
||||
last := -1
|
||||
needSort := false
|
||||
for ; len(scan.token) >= 4; scan.scan() {
|
||||
// TODO: measure the impact of needing this conversion and redesign
|
||||
// the data structure if there is an issue.
|
||||
v, ok := variantIndex[string(scan.token)]
|
||||
if !ok {
|
||||
// unknown variant
|
||||
// TODO: allow user-defined variants?
|
||||
scan.gobble(mkErrInvalid(scan.token))
|
||||
continue
|
||||
}
|
||||
varID = append(varID, v)
|
||||
variant = append(variant, scan.token)
|
||||
if !needSort {
|
||||
if last < int(v) {
|
||||
last = int(v)
|
||||
} else {
|
||||
needSort = true
|
||||
// There is no legal combinations of more than 7 variants
|
||||
// (and this is by no means a useful sequence).
|
||||
const maxVariants = 8
|
||||
if len(varID) > maxVariants {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
end = scan.end
|
||||
}
|
||||
if needSort {
|
||||
sort.Sort(variantsSort{varID, variant})
|
||||
k, l := 0, -1
|
||||
for i, v := range varID {
|
||||
w := int(v)
|
||||
if l == w {
|
||||
// Remove duplicates.
|
||||
continue
|
||||
}
|
||||
varID[k] = varID[i]
|
||||
variant[k] = variant[i]
|
||||
k++
|
||||
l = w
|
||||
}
|
||||
if str := bytes.Join(variant[:k], separator); len(str) == 0 {
|
||||
end = start - 1
|
||||
} else {
|
||||
scan.resizeRange(start, end, len(str))
|
||||
copy(scan.b[scan.start:], str)
|
||||
end = scan.end
|
||||
}
|
||||
}
|
||||
return end
|
||||
}
|
||||
|
||||
type variantsSort struct {
|
||||
i []uint8
|
||||
v [][]byte
|
||||
}
|
||||
|
||||
func (s variantsSort) Len() int {
|
||||
return len(s.i)
|
||||
}
|
||||
|
||||
func (s variantsSort) Swap(i, j int) {
|
||||
s.i[i], s.i[j] = s.i[j], s.i[i]
|
||||
s.v[i], s.v[j] = s.v[j], s.v[i]
|
||||
}
|
||||
|
||||
func (s variantsSort) Less(i, j int) bool {
|
||||
return s.i[i] < s.i[j]
|
||||
}
|
||||
|
||||
type bytesSort [][]byte
|
||||
|
||||
func (b bytesSort) Len() int {
|
||||
return len(b)
|
||||
}
|
||||
|
||||
func (b bytesSort) Swap(i, j int) {
|
||||
b[i], b[j] = b[j], b[i]
|
||||
}
|
||||
|
||||
func (b bytesSort) Less(i, j int) bool {
|
||||
return bytes.Compare(b[i], b[j]) == -1
|
||||
}
|
||||
|
||||
// parseExtensions parses and normalizes the extensions in the buffer.
|
||||
// It returns the last position of scan.b that is part of any extension.
|
||||
// It also trims scan.b to remove excess parts accordingly.
|
||||
func parseExtensions(scan *scanner) int {
|
||||
start := scan.start
|
||||
exts := [][]byte{}
|
||||
private := []byte{}
|
||||
end := scan.end
|
||||
for len(scan.token) == 1 {
|
||||
extStart := scan.start
|
||||
ext := scan.token[0]
|
||||
end = parseExtension(scan)
|
||||
extension := scan.b[extStart:end]
|
||||
if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
|
||||
scan.setError(errSyntax)
|
||||
end = extStart
|
||||
continue
|
||||
} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
|
||||
scan.b = scan.b[:end]
|
||||
return end
|
||||
} else if ext == 'x' {
|
||||
private = extension
|
||||
break
|
||||
}
|
||||
exts = append(exts, extension)
|
||||
}
|
||||
sort.Sort(bytesSort(exts))
|
||||
if len(private) > 0 {
|
||||
exts = append(exts, private)
|
||||
}
|
||||
scan.b = scan.b[:start]
|
||||
if len(exts) > 0 {
|
||||
scan.b = append(scan.b, bytes.Join(exts, separator)...)
|
||||
} else if start > 0 {
|
||||
// Strip trailing '-'.
|
||||
scan.b = scan.b[:start-1]
|
||||
}
|
||||
return end
|
||||
}
|
||||
|
||||
// parseExtension parses a single extension and returns the position of
|
||||
// the extension end.
|
||||
func parseExtension(scan *scanner) int {
|
||||
start, end := scan.start, scan.end
|
||||
switch scan.token[0] {
|
||||
case 'u':
|
||||
attrStart := end
|
||||
scan.scan()
|
||||
for last := []byte{}; len(scan.token) > 2; scan.scan() {
|
||||
if bytes.Compare(scan.token, last) != -1 {
|
||||
// Attributes are unsorted. Start over from scratch.
|
||||
p := attrStart + 1
|
||||
scan.next = p
|
||||
attrs := [][]byte{}
|
||||
for scan.scan(); len(scan.token) > 2; scan.scan() {
|
||||
attrs = append(attrs, scan.token)
|
||||
end = scan.end
|
||||
}
|
||||
sort.Sort(bytesSort(attrs))
|
||||
copy(scan.b[p:], bytes.Join(attrs, separator))
|
||||
break
|
||||
}
|
||||
last = scan.token
|
||||
end = scan.end
|
||||
}
|
||||
var last, key []byte
|
||||
for attrEnd := end; len(scan.token) == 2; last = key {
|
||||
key = scan.token
|
||||
keyEnd := scan.end
|
||||
end = scan.acceptMinSize(3)
|
||||
// TODO: check key value validity
|
||||
if keyEnd == end || bytes.Compare(key, last) != 1 {
|
||||
// We have an invalid key or the keys are not sorted.
|
||||
// Start scanning keys from scratch and reorder.
|
||||
p := attrEnd + 1
|
||||
scan.next = p
|
||||
keys := [][]byte{}
|
||||
for scan.scan(); len(scan.token) == 2; {
|
||||
keyStart, keyEnd := scan.start, scan.end
|
||||
end = scan.acceptMinSize(3)
|
||||
if keyEnd != end {
|
||||
keys = append(keys, scan.b[keyStart:end])
|
||||
} else {
|
||||
scan.setError(errSyntax)
|
||||
end = keyStart
|
||||
}
|
||||
}
|
||||
sort.Sort(bytesSort(keys))
|
||||
reordered := bytes.Join(keys, separator)
|
||||
if e := p + len(reordered); e < end {
|
||||
scan.deleteRange(e, end)
|
||||
end = e
|
||||
}
|
||||
copy(scan.b[p:], bytes.Join(keys, separator))
|
||||
break
|
||||
}
|
||||
}
|
||||
case 't':
|
||||
scan.scan()
|
||||
if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
|
||||
_, end = parseTag(scan)
|
||||
scan.toLower(start, end)
|
||||
}
|
||||
for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
|
||||
end = scan.acceptMinSize(3)
|
||||
}
|
||||
case 'x':
|
||||
end = scan.acceptMinSize(1)
|
||||
default:
|
||||
end = scan.acceptMinSize(2)
|
||||
}
|
||||
return end
|
||||
}
|
||||
|
||||
// Compose creates a Tag from individual parts, which may be of type Tag, Base,
|
||||
// Script, Region, Variant, []Variant, Extension, []Extension or error. If a
|
||||
// Base, Script or Region or slice of type Variant or Extension is passed more
|
||||
// than once, the latter will overwrite the former. Variants and Extensions are
|
||||
// accumulated, but if two extensions of the same type are passed, the latter
|
||||
// will replace the former. A Tag overwrites all former values and typically
|
||||
// only makes sense as the first argument. The resulting tag is returned after
|
||||
// canonicalizing using the Default CanonType. If one or more errors are
|
||||
// encountered, one of the errors is returned.
|
||||
func Compose(part ...interface{}) (t Tag, err error) {
|
||||
return Default.Compose(part...)
|
||||
}
|
||||
|
||||
// Compose creates a Tag from individual parts, which may be of type Tag, Base,
|
||||
// Script, Region, Variant, []Variant, Extension, []Extension or error. If a
|
||||
// Base, Script or Region or slice of type Variant or Extension is passed more
|
||||
// than once, the latter will overwrite the former. Variants and Extensions are
|
||||
// accumulated, but if two extensions of the same type are passed, the latter
|
||||
// will replace the former. A Tag overwrites all former values and typically
|
||||
// only makes sense as the first argument. The resulting tag is returned after
|
||||
// canonicalizing using CanonType c. If one or more errors are encountered,
|
||||
// one of the errors is returned.
|
||||
func (c CanonType) Compose(part ...interface{}) (t Tag, err error) {
|
||||
var b builder
|
||||
if err = b.update(part...); err != nil {
|
||||
return und, err
|
||||
}
|
||||
t, _ = b.tag.canonicalize(c)
|
||||
|
||||
if len(b.ext) > 0 || len(b.variant) > 0 {
|
||||
sort.Sort(sortVariant(b.variant))
|
||||
sort.Strings(b.ext)
|
||||
if b.private != "" {
|
||||
b.ext = append(b.ext, b.private)
|
||||
}
|
||||
n := maxCoreSize + tokenLen(b.variant...) + tokenLen(b.ext...)
|
||||
buf := make([]byte, n)
|
||||
p := t.genCoreBytes(buf)
|
||||
t.pVariant = byte(p)
|
||||
p += appendTokens(buf[p:], b.variant...)
|
||||
t.pExt = uint16(p)
|
||||
p += appendTokens(buf[p:], b.ext...)
|
||||
t.str = string(buf[:p])
|
||||
} else if b.private != "" {
|
||||
t.str = b.private
|
||||
t.remakeString()
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
type builder struct {
|
||||
tag Tag
|
||||
|
||||
private string // the x extension
|
||||
ext []string
|
||||
variant []string
|
||||
|
||||
err error
|
||||
}
|
||||
|
||||
func (b *builder) addExt(e string) {
|
||||
if e == "" {
|
||||
} else if e[0] == 'x' {
|
||||
b.private = e
|
||||
} else {
|
||||
b.ext = append(b.ext, e)
|
||||
}
|
||||
}
|
||||
|
||||
var errInvalidArgument = errors.New("invalid Extension or Variant")
|
||||
|
||||
func (b *builder) update(part ...interface{}) (err error) {
|
||||
replace := func(l *[]string, s string, eq func(a, b string) bool) bool {
|
||||
if s == "" {
|
||||
b.err = errInvalidArgument
|
||||
return true
|
||||
}
|
||||
for i, v := range *l {
|
||||
if eq(v, s) {
|
||||
(*l)[i] = s
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
for _, x := range part {
|
||||
switch v := x.(type) {
|
||||
case Tag:
|
||||
b.tag.lang = v.lang
|
||||
b.tag.region = v.region
|
||||
b.tag.script = v.script
|
||||
if v.str != "" {
|
||||
b.variant = nil
|
||||
for x, s := "", v.str[v.pVariant:v.pExt]; s != ""; {
|
||||
x, s = nextToken(s)
|
||||
b.variant = append(b.variant, x)
|
||||
}
|
||||
b.ext, b.private = nil, ""
|
||||
for i, e := int(v.pExt), ""; i < len(v.str); {
|
||||
i, e = getExtension(v.str, i)
|
||||
b.addExt(e)
|
||||
}
|
||||
}
|
||||
case Base:
|
||||
b.tag.lang = v.langID
|
||||
case Script:
|
||||
b.tag.script = v.scriptID
|
||||
case Region:
|
||||
b.tag.region = v.regionID
|
||||
case Variant:
|
||||
if !replace(&b.variant, v.variant, func(a, b string) bool { return a == b }) {
|
||||
b.variant = append(b.variant, v.variant)
|
||||
}
|
||||
case Extension:
|
||||
if !replace(&b.ext, v.s, func(a, b string) bool { return a[0] == b[0] }) {
|
||||
b.addExt(v.s)
|
||||
}
|
||||
case []Variant:
|
||||
b.variant = nil
|
||||
for _, x := range v {
|
||||
b.update(x)
|
||||
}
|
||||
case []Extension:
|
||||
b.ext, b.private = nil, ""
|
||||
for _, e := range v {
|
||||
b.update(e)
|
||||
}
|
||||
// TODO: support parsing of raw strings based on morphology or just extensions?
|
||||
case error:
|
||||
err = v
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func tokenLen(token ...string) (n int) {
|
||||
for _, t := range token {
|
||||
n += len(t) + 1
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func appendTokens(b []byte, token ...string) int {
|
||||
p := 0
|
||||
for _, t := range token {
|
||||
b[p] = '-'
|
||||
copy(b[p+1:], t)
|
||||
p += 1 + len(t)
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
type sortVariant []string
|
||||
|
||||
func (s sortVariant) Len() int {
|
||||
return len(s)
|
||||
}
|
||||
|
||||
func (s sortVariant) Swap(i, j int) {
|
||||
s[j], s[i] = s[i], s[j]
|
||||
}
|
||||
|
||||
func (s sortVariant) Less(i, j int) bool {
|
||||
return variantIndex[s[i]] < variantIndex[s[j]]
|
||||
}
|
||||
|
||||
func findExt(list []string, x byte) int {
|
||||
for i, e := range list {
|
||||
if e[0] == x {
|
||||
return i
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// getExtension returns the name, body and end position of the extension.
|
||||
func getExtension(s string, p int) (end int, ext string) {
|
||||
if s[p] == '-' {
|
||||
p++
|
||||
}
|
||||
if s[p] == 'x' {
|
||||
return len(s), s[p:]
|
||||
}
|
||||
end = nextExtension(s, p)
|
||||
return end, s[p:end]
|
||||
}
|
||||
|
||||
// nextExtension finds the next extension within the string, searching
|
||||
// for the -<char>- pattern from position p.
|
||||
// In the fast majority of cases, language tags will have at most
|
||||
// one extension and extensions tend to be small.
|
||||
func nextExtension(s string, p int) int {
|
||||
for n := len(s) - 3; p < n; {
|
||||
if s[p] == '-' {
|
||||
if s[p+2] == '-' {
|
||||
return p
|
||||
}
|
||||
p += 3
|
||||
} else {
|
||||
p++
|
||||
}
|
||||
}
|
||||
return len(s)
|
||||
}
|
||||
|
||||
var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight")
|
||||
|
||||
// ParseAcceptLanguage parses the contents of a Accept-Language header as
|
||||
// defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and
|
||||
// a list of corresponding quality weights. It is more permissive than RFC 2616
|
||||
// and may return non-nil slices even if the input is not valid.
|
||||
// The Tags will be sorted by highest weight first and then by first occurrence.
|
||||
// Tags with a weight of zero will be dropped. An error will be returned if the
|
||||
// input could not be parsed.
|
||||
func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) {
|
||||
var entry string
|
||||
for s != "" {
|
||||
if entry, s = split(s, ','); entry == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
entry, weight := split(entry, ';')
|
||||
|
||||
// Scan the language.
|
||||
t, err := Parse(entry)
|
||||
if err != nil {
|
||||
id, ok := acceptFallback[entry]
|
||||
if !ok {
|
||||
return nil, nil, err
|
||||
}
|
||||
t = Tag{lang: id}
|
||||
}
|
||||
|
||||
// Scan the optional weight.
|
||||
w := 1.0
|
||||
if weight != "" {
|
||||
weight = consume(weight, 'q')
|
||||
weight = consume(weight, '=')
|
||||
// consume returns the empty string when a token could not be
|
||||
// consumed, resulting in an error for ParseFloat.
|
||||
if w, err = strconv.ParseFloat(weight, 32); err != nil {
|
||||
return nil, nil, errInvalidWeight
|
||||
}
|
||||
// Drop tags with a quality weight of 0.
|
||||
if w <= 0 {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
tag = append(tag, t)
|
||||
q = append(q, float32(w))
|
||||
}
|
||||
sortStable(&tagSort{tag, q})
|
||||
return tag, q, nil
|
||||
}
|
||||
|
||||
// consume removes a leading token c from s and returns the result or the empty
|
||||
// string if there is no such token.
|
||||
func consume(s string, c byte) string {
|
||||
if s == "" || s[0] != c {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(s[1:])
|
||||
}
|
||||
|
||||
func split(s string, c byte) (head, tail string) {
|
||||
if i := strings.IndexByte(s, c); i >= 0 {
|
||||
return strings.TrimSpace(s[:i]), strings.TrimSpace(s[i+1:])
|
||||
}
|
||||
return strings.TrimSpace(s), ""
|
||||
}
|
||||
|
||||
// Add hack mapping to deal with a small number of cases that that occur
|
||||
// in Accept-Language (with reasonable frequency).
|
||||
var acceptFallback = map[string]langID{
|
||||
"english": _en,
|
||||
"deutsch": _de,
|
||||
"italian": _it,
|
||||
"french": _fr,
|
||||
"*": _mul, // defined in the spec to match all languages.
|
||||
}
|
||||
|
||||
type tagSort struct {
|
||||
tag []Tag
|
||||
q []float32
|
||||
}
|
||||
|
||||
func (s *tagSort) Len() int {
|
||||
return len(s.q)
|
||||
}
|
||||
|
||||
func (s *tagSort) Less(i, j int) bool {
|
||||
return s.q[i] > s.q[j]
|
||||
}
|
||||
|
||||
func (s *tagSort) Swap(i, j int) {
|
||||
s.tag[i], s.tag[j] = s.tag[j], s.tag[i]
|
||||
s.q[i], s.q[j] = s.q[j], s.q[i]
|
||||
}
|
||||
2791
vendor/golang.org/x/text/language/tables.go
generated
vendored
Normal file
2791
vendor/golang.org/x/text/language/tables.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
143
vendor/golang.org/x/text/language/tags.go
generated
vendored
Normal file
143
vendor/golang.org/x/text/language/tags.go
generated
vendored
Normal file
|
|
@ -0,0 +1,143 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
// TODO: Various sets of commonly use tags and regions.
|
||||
|
||||
// MustParse is like Parse, but panics if the given BCP 47 tag cannot be parsed.
|
||||
// It simplifies safe initialization of Tag values.
|
||||
func MustParse(s string) Tag {
|
||||
t, err := Parse(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return t
|
||||
}
|
||||
|
||||
// MustParse is like Parse, but panics if the given BCP 47 tag cannot be parsed.
|
||||
// It simplifies safe initialization of Tag values.
|
||||
func (c CanonType) MustParse(s string) Tag {
|
||||
t, err := c.Parse(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return t
|
||||
}
|
||||
|
||||
// MustParseBase is like ParseBase, but panics if the given base cannot be parsed.
|
||||
// It simplifies safe initialization of Base values.
|
||||
func MustParseBase(s string) Base {
|
||||
b, err := ParseBase(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
// MustParseScript is like ParseScript, but panics if the given script cannot be
|
||||
// parsed. It simplifies safe initialization of Script values.
|
||||
func MustParseScript(s string) Script {
|
||||
scr, err := ParseScript(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return scr
|
||||
}
|
||||
|
||||
// MustParseRegion is like ParseRegion, but panics if the given region cannot be
|
||||
// parsed. It simplifies safe initialization of Region values.
|
||||
func MustParseRegion(s string) Region {
|
||||
r, err := ParseRegion(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
var (
|
||||
und = Tag{}
|
||||
|
||||
Und Tag = Tag{}
|
||||
|
||||
Afrikaans Tag = Tag{lang: _af} // af
|
||||
Amharic Tag = Tag{lang: _am} // am
|
||||
Arabic Tag = Tag{lang: _ar} // ar
|
||||
ModernStandardArabic Tag = Tag{lang: _ar, region: _001} // ar-001
|
||||
Azerbaijani Tag = Tag{lang: _az} // az
|
||||
Bulgarian Tag = Tag{lang: _bg} // bg
|
||||
Bengali Tag = Tag{lang: _bn} // bn
|
||||
Catalan Tag = Tag{lang: _ca} // ca
|
||||
Czech Tag = Tag{lang: _cs} // cs
|
||||
Danish Tag = Tag{lang: _da} // da
|
||||
German Tag = Tag{lang: _de} // de
|
||||
Greek Tag = Tag{lang: _el} // el
|
||||
English Tag = Tag{lang: _en} // en
|
||||
AmericanEnglish Tag = Tag{lang: _en, region: _US} // en-US
|
||||
BritishEnglish Tag = Tag{lang: _en, region: _GB} // en-GB
|
||||
Spanish Tag = Tag{lang: _es} // es
|
||||
EuropeanSpanish Tag = Tag{lang: _es, region: _ES} // es-ES
|
||||
LatinAmericanSpanish Tag = Tag{lang: _es, region: _419} // es-419
|
||||
Estonian Tag = Tag{lang: _et} // et
|
||||
Persian Tag = Tag{lang: _fa} // fa
|
||||
Finnish Tag = Tag{lang: _fi} // fi
|
||||
Filipino Tag = Tag{lang: _fil} // fil
|
||||
French Tag = Tag{lang: _fr} // fr
|
||||
CanadianFrench Tag = Tag{lang: _fr, region: _CA} // fr-CA
|
||||
Gujarati Tag = Tag{lang: _gu} // gu
|
||||
Hebrew Tag = Tag{lang: _he} // he
|
||||
Hindi Tag = Tag{lang: _hi} // hi
|
||||
Croatian Tag = Tag{lang: _hr} // hr
|
||||
Hungarian Tag = Tag{lang: _hu} // hu
|
||||
Armenian Tag = Tag{lang: _hy} // hy
|
||||
Indonesian Tag = Tag{lang: _id} // id
|
||||
Icelandic Tag = Tag{lang: _is} // is
|
||||
Italian Tag = Tag{lang: _it} // it
|
||||
Japanese Tag = Tag{lang: _ja} // ja
|
||||
Georgian Tag = Tag{lang: _ka} // ka
|
||||
Kazakh Tag = Tag{lang: _kk} // kk
|
||||
Khmer Tag = Tag{lang: _km} // km
|
||||
Kannada Tag = Tag{lang: _kn} // kn
|
||||
Korean Tag = Tag{lang: _ko} // ko
|
||||
Kirghiz Tag = Tag{lang: _ky} // ky
|
||||
Lao Tag = Tag{lang: _lo} // lo
|
||||
Lithuanian Tag = Tag{lang: _lt} // lt
|
||||
Latvian Tag = Tag{lang: _lv} // lv
|
||||
Macedonian Tag = Tag{lang: _mk} // mk
|
||||
Malayalam Tag = Tag{lang: _ml} // ml
|
||||
Mongolian Tag = Tag{lang: _mn} // mn
|
||||
Marathi Tag = Tag{lang: _mr} // mr
|
||||
Malay Tag = Tag{lang: _ms} // ms
|
||||
Burmese Tag = Tag{lang: _my} // my
|
||||
Nepali Tag = Tag{lang: _ne} // ne
|
||||
Dutch Tag = Tag{lang: _nl} // nl
|
||||
Norwegian Tag = Tag{lang: _no} // no
|
||||
Punjabi Tag = Tag{lang: _pa} // pa
|
||||
Polish Tag = Tag{lang: _pl} // pl
|
||||
Portuguese Tag = Tag{lang: _pt} // pt
|
||||
BrazilianPortuguese Tag = Tag{lang: _pt, region: _BR} // pt-BR
|
||||
EuropeanPortuguese Tag = Tag{lang: _pt, region: _PT} // pt-PT
|
||||
Romanian Tag = Tag{lang: _ro} // ro
|
||||
Russian Tag = Tag{lang: _ru} // ru
|
||||
Sinhala Tag = Tag{lang: _si} // si
|
||||
Slovak Tag = Tag{lang: _sk} // sk
|
||||
Slovenian Tag = Tag{lang: _sl} // sl
|
||||
Albanian Tag = Tag{lang: _sq} // sq
|
||||
Serbian Tag = Tag{lang: _sr} // sr
|
||||
SerbianLatin Tag = Tag{lang: _sr, script: _Latn} // sr-Latn
|
||||
Swedish Tag = Tag{lang: _sv} // sv
|
||||
Swahili Tag = Tag{lang: _sw} // sw
|
||||
Tamil Tag = Tag{lang: _ta} // ta
|
||||
Telugu Tag = Tag{lang: _te} // te
|
||||
Thai Tag = Tag{lang: _th} // th
|
||||
Turkish Tag = Tag{lang: _tr} // tr
|
||||
Ukrainian Tag = Tag{lang: _uk} // uk
|
||||
Urdu Tag = Tag{lang: _ur} // ur
|
||||
Uzbek Tag = Tag{lang: _uz} // uz
|
||||
Vietnamese Tag = Tag{lang: _vi} // vi
|
||||
Chinese Tag = Tag{lang: _zh} // zh
|
||||
SimplifiedChinese Tag = Tag{lang: _zh, script: _Hans} // zh-Hans
|
||||
TraditionalChinese Tag = Tag{lang: _zh, script: _Hant} // zh-Hant
|
||||
Zulu Tag = Tag{lang: _zu} // zu
|
||||
)
|
||||
126
vendor/golang.org/x/text/runes/cond.go
generated
vendored
Normal file
126
vendor/golang.org/x/text/runes/cond.go
generated
vendored
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package runes
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// Note: below we pass invalid UTF-8 to the tIn and tNotIn transformers as is.
|
||||
// This is done for various reasons:
|
||||
// - To retain the semantics of the Nop transformer: if input is passed to a Nop
|
||||
// one would expect it to be unchanged.
|
||||
// - It would be very expensive to pass a converted RuneError to a transformer:
|
||||
// a transformer might need more source bytes after RuneError, meaning that
|
||||
// the only way to pass it safely is to create a new buffer and manage the
|
||||
// intermingling of RuneErrors and normal input.
|
||||
// - Many transformers leave ill-formed UTF-8 as is, so this is not
|
||||
// inconsistent. Generally ill-formed UTF-8 is only replaced if it is a
|
||||
// logical consequence of the operation (as for Map) or if it otherwise would
|
||||
// pose security concerns (as for Remove).
|
||||
// - An alternative would be to return an error on ill-formed UTF-8, but this
|
||||
// would be inconsistent with other operations.
|
||||
|
||||
// If returns a transformer that applies tIn to consecutive runes for which
|
||||
// s.Contains(r) and tNotIn to consecutive runes for which !s.Contains(r). Reset
|
||||
// is called on tIn and tNotIn at the start of each run. A Nop transformer will
|
||||
// substitute a nil value passed to tIn or tNotIn. Invalid UTF-8 is translated
|
||||
// to RuneError to determine which transformer to apply, but is passed as is to
|
||||
// the respective transformer.
|
||||
func If(s Set, tIn, tNotIn transform.Transformer) Transformer {
|
||||
if tIn == nil && tNotIn == nil {
|
||||
return Transformer{transform.Nop}
|
||||
}
|
||||
if tIn == nil {
|
||||
tIn = transform.Nop
|
||||
}
|
||||
if tNotIn == nil {
|
||||
tNotIn = transform.Nop
|
||||
}
|
||||
a := &cond{
|
||||
tIn: tIn,
|
||||
tNotIn: tNotIn,
|
||||
f: s.Contains,
|
||||
}
|
||||
a.Reset()
|
||||
return Transformer{a}
|
||||
}
|
||||
|
||||
type cond struct {
|
||||
tIn, tNotIn transform.Transformer
|
||||
f func(rune) bool
|
||||
check func(rune) bool // current check to perform
|
||||
t transform.Transformer // current transformer to use
|
||||
}
|
||||
|
||||
// Reset implements transform.Transformer.
|
||||
func (t *cond) Reset() {
|
||||
t.check = t.is
|
||||
t.t = t.tIn
|
||||
t.t.Reset() // notIn will be reset on first usage.
|
||||
}
|
||||
|
||||
func (t *cond) is(r rune) bool {
|
||||
if t.f(r) {
|
||||
return true
|
||||
}
|
||||
t.check = t.isNot
|
||||
t.t = t.tNotIn
|
||||
t.tNotIn.Reset()
|
||||
return false
|
||||
}
|
||||
|
||||
func (t *cond) isNot(r rune) bool {
|
||||
if !t.f(r) {
|
||||
return true
|
||||
}
|
||||
t.check = t.is
|
||||
t.t = t.tIn
|
||||
t.tIn.Reset()
|
||||
return false
|
||||
}
|
||||
|
||||
func (t *cond) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
p := 0
|
||||
for nSrc < len(src) && err == nil {
|
||||
// Don't process too much at a time, as the work might be wasted if the
|
||||
// destination buffer isn't large enough to hold the result or a
|
||||
// transform returns an error early.
|
||||
const maxChunk = 4096
|
||||
max := len(src)
|
||||
if n := nSrc + maxChunk; n < len(src) {
|
||||
max = n
|
||||
}
|
||||
atEnd := false
|
||||
size := 0
|
||||
current := t.t
|
||||
for ; p < max; p += size {
|
||||
var r rune
|
||||
r, size = utf8.DecodeRune(src[p:])
|
||||
if r == utf8.RuneError && size == 1 {
|
||||
if !atEOF && !utf8.FullRune(src[p:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
if !t.check(r) {
|
||||
// The next rune will be the start of a new run.
|
||||
atEnd = true
|
||||
break
|
||||
}
|
||||
}
|
||||
nDst2, nSrc2, err2 := current.Transform(dst[nDst:], src[nSrc:p], atEnd || (atEOF && p == len(src)))
|
||||
nDst += nDst2
|
||||
nSrc += nSrc2
|
||||
if err2 != nil {
|
||||
return nDst, nSrc, err2
|
||||
}
|
||||
// At this point either err != nil or t.check will pass for the rune at p.
|
||||
p = nSrc + size
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
278
vendor/golang.org/x/text/runes/runes.go
generated
vendored
Normal file
278
vendor/golang.org/x/text/runes/runes.go
generated
vendored
Normal file
|
|
@ -0,0 +1,278 @@
|
|||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package runes provide transforms for UTF-8 encoded text.
|
||||
package runes
|
||||
|
||||
import (
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// A Set is a collection of runes.
|
||||
type Set interface {
|
||||
// Contains returns true if r is contained in the set.
|
||||
Contains(r rune) bool
|
||||
}
|
||||
|
||||
type setFunc func(rune) bool
|
||||
|
||||
func (s setFunc) Contains(r rune) bool {
|
||||
return s(r)
|
||||
}
|
||||
|
||||
// Note: using funcs here instead of wrapping types result in cleaner
|
||||
// documentation and a smaller API.
|
||||
|
||||
// In creates a Set with a Contains method that returns true for all runes in
|
||||
// the given RangeTable.
|
||||
func In(rt *unicode.RangeTable) Set {
|
||||
return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
|
||||
}
|
||||
|
||||
// In creates a Set with a Contains method that returns true for all runes not
|
||||
// in the given RangeTable.
|
||||
func NotIn(rt *unicode.RangeTable) Set {
|
||||
return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
|
||||
}
|
||||
|
||||
// Predicate creates a Set with a Contains method that returns f(r).
|
||||
func Predicate(f func(rune) bool) Set {
|
||||
return setFunc(f)
|
||||
}
|
||||
|
||||
// Transformer implements the transform.Transformer interface.
|
||||
type Transformer struct {
|
||||
transform.Transformer
|
||||
}
|
||||
|
||||
// Bytes returns a new byte slice with the result of converting b using t. It
|
||||
// calls Reset on t. It returns nil if any error was found. This can only happen
|
||||
// if an error-producing Transformer is passed to If.
|
||||
func (t Transformer) Bytes(b []byte) []byte {
|
||||
b, _, err := transform.Bytes(t, b)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
// String returns a string with the result of converting s using t. It calls
|
||||
// Reset on t. It returns the empty string if any error was found. This can only
|
||||
// happen if an error-producing Transformer is passed to If.
|
||||
func (t Transformer) String(s string) string {
|
||||
s, _, err := transform.String(t, s)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
// TODO:
|
||||
// - Copy: copying strings and bytes in whole-rune units.
|
||||
// - Validation (maybe)
|
||||
// - Well-formed-ness (maybe)
|
||||
|
||||
const runeErrorString = string(utf8.RuneError)
|
||||
|
||||
// Remove returns a Transformer that removes runes r for which s.Contains(r).
|
||||
// Illegal input bytes are replaced by RuneError before being passed to f.
|
||||
func Remove(s Set) Transformer {
|
||||
if f, ok := s.(setFunc); ok {
|
||||
// This little trick cuts the running time of BenchmarkRemove for sets
|
||||
// created by Predicate roughly in half.
|
||||
// TODO: special-case RangeTables as well.
|
||||
return Transformer{remove(f)}
|
||||
}
|
||||
return Transformer{remove(s.Contains)}
|
||||
}
|
||||
|
||||
// TODO: remove transform.RemoveFunc.
|
||||
|
||||
type remove func(r rune) bool
|
||||
|
||||
func (remove) Reset() {}
|
||||
|
||||
// Transform implements transform.Transformer.
|
||||
func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
for r, size := rune(0), 0; nSrc < len(src); {
|
||||
if r = rune(src[nSrc]); r < utf8.RuneSelf {
|
||||
size = 1
|
||||
} else {
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
|
||||
if size == 1 {
|
||||
// Invalid rune.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
// We replace illegal bytes with RuneError. Not doing so might
|
||||
// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
|
||||
// The resulting byte sequence may subsequently contain runes
|
||||
// for which t(r) is true that were passed unnoticed.
|
||||
if !t(utf8.RuneError) {
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = runeErrorString[0]
|
||||
dst[nDst+1] = runeErrorString[1]
|
||||
dst[nDst+2] = runeErrorString[2]
|
||||
nDst += 3
|
||||
}
|
||||
nSrc++
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
if t(r) {
|
||||
nSrc += size
|
||||
continue
|
||||
}
|
||||
if nDst+size > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
for i := 0; i < size; i++ {
|
||||
dst[nDst] = src[nSrc]
|
||||
nDst++
|
||||
nSrc++
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Map returns a Transformer that maps the runes in the input using the given
|
||||
// mapping. Illegal bytes in the input are converted to utf8.RuneError before
|
||||
// being passed to the mapping func.
|
||||
func Map(mapping func(rune) rune) Transformer {
|
||||
return Transformer{mapper(mapping)}
|
||||
}
|
||||
|
||||
type mapper func(rune) rune
|
||||
|
||||
func (mapper) Reset() {}
|
||||
|
||||
// Transform implements transform.Transformer.
|
||||
func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
var replacement rune
|
||||
var b [utf8.UTFMax]byte
|
||||
|
||||
for r, size := rune(0), 0; nSrc < len(src); {
|
||||
if r = rune(src[nSrc]); r < utf8.RuneSelf {
|
||||
if replacement = t(r); replacement < utf8.RuneSelf {
|
||||
if nDst == len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = byte(replacement)
|
||||
nDst++
|
||||
nSrc++
|
||||
continue
|
||||
}
|
||||
size = 1
|
||||
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
|
||||
// Invalid rune.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
|
||||
if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = runeErrorString[0]
|
||||
dst[nDst+1] = runeErrorString[1]
|
||||
dst[nDst+2] = runeErrorString[2]
|
||||
nDst += 3
|
||||
nSrc++
|
||||
continue
|
||||
}
|
||||
} else if replacement = t(r); replacement == r {
|
||||
if nDst+size > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
for i := 0; i < size; i++ {
|
||||
dst[nDst] = src[nSrc]
|
||||
nDst++
|
||||
nSrc++
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
n := utf8.EncodeRune(b[:], replacement)
|
||||
|
||||
if nDst+n > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
for i := 0; i < n; i++ {
|
||||
dst[nDst] = b[i]
|
||||
nDst++
|
||||
}
|
||||
nSrc += size
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// ReplaceIllFormed returns a transformer that replaces all input bytes that are
|
||||
// not part of a well-formed UTF-8 code sequence with utf8.RuneError.
|
||||
func ReplaceIllFormed() Transformer {
|
||||
return Transformer{&replaceIllFormed{}}
|
||||
}
|
||||
|
||||
type replaceIllFormed struct{ transform.NopResetter }
|
||||
|
||||
func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
for nSrc < len(src) {
|
||||
r, size := utf8.DecodeRune(src[nSrc:])
|
||||
|
||||
// Look for an ASCII rune.
|
||||
if r < utf8.RuneSelf {
|
||||
if nDst == len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = byte(r)
|
||||
nDst++
|
||||
nSrc++
|
||||
continue
|
||||
}
|
||||
|
||||
// Look for a valid non-ASCII rune.
|
||||
if r != utf8.RuneError || size != 1 {
|
||||
if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
nDst += size
|
||||
nSrc += size
|
||||
continue
|
||||
}
|
||||
|
||||
// Look for short source data.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
|
||||
// We have an invalid rune.
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = runeErrorString[0]
|
||||
dst[nDst+1] = runeErrorString[1]
|
||||
dst[nDst+2] = runeErrorString[2]
|
||||
nDst += 3
|
||||
nSrc++
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
290
vendor/golang.org/x/text/secure/bidirule/bidirule.go
generated
vendored
Normal file
290
vendor/golang.org/x/text/secure/bidirule/bidirule.go
generated
vendored
Normal file
|
|
@ -0,0 +1,290 @@
|
|||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package bidirule implements the Bidi Rule defined by RFC 5893.
|
||||
//
|
||||
// This package is under development. The API may change without notice and
|
||||
// without preserving backward compatibility.
|
||||
package bidirule
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/transform"
|
||||
"golang.org/x/text/unicode/bidi"
|
||||
)
|
||||
|
||||
// This file contains an implementation of RFC 5893: Right-to-Left Scripts for
|
||||
// Internationalized Domain Names for Applications (IDNA)
|
||||
//
|
||||
// A label is an individual component of a domain name. Labels are usually
|
||||
// shown separated by dots; for example, the domain name "www.example.com" is
|
||||
// composed of three labels: "www", "example", and "com".
|
||||
//
|
||||
// An RTL label is a label that contains at least one character of class R, AL,
|
||||
// or AN. An LTR label is any label that is not an RTL label.
|
||||
//
|
||||
// A "Bidi domain name" is a domain name that contains at least one RTL label.
|
||||
//
|
||||
// The following guarantees can be made based on the above:
|
||||
//
|
||||
// o In a domain name consisting of only labels that satisfy the rule,
|
||||
// the requirements of Section 3 are satisfied. Note that even LTR
|
||||
// labels and pure ASCII labels have to be tested.
|
||||
//
|
||||
// o In a domain name consisting of only LDH labels (as defined in the
|
||||
// Definitions document [RFC5890]) and labels that satisfy the rule,
|
||||
// the requirements of Section 3 are satisfied as long as a label
|
||||
// that starts with an ASCII digit does not come after a
|
||||
// right-to-left label.
|
||||
//
|
||||
// No guarantee is given for other combinations.
|
||||
|
||||
// ErrInvalid indicates a label is invalid according to the Bidi Rule.
|
||||
var ErrInvalid = errors.New("bidirule: failed Bidi Rule")
|
||||
|
||||
type ruleState uint8
|
||||
|
||||
const (
|
||||
ruleInitial ruleState = iota
|
||||
ruleLTR
|
||||
ruleLTRFinal
|
||||
ruleRTL
|
||||
ruleRTLFinal
|
||||
ruleInvalid
|
||||
)
|
||||
|
||||
type ruleTransition struct {
|
||||
next ruleState
|
||||
mask uint16
|
||||
}
|
||||
|
||||
var transitions = [...][2]ruleTransition{
|
||||
// [2.1] The first character must be a character with Bidi property L, R, or
|
||||
// AL. If it has the R or AL property, it is an RTL label; if it has the L
|
||||
// property, it is an LTR label.
|
||||
ruleInitial: {
|
||||
{ruleLTRFinal, 1 << bidi.L},
|
||||
{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL},
|
||||
},
|
||||
ruleRTL: {
|
||||
// [2.3] In an RTL label, the end of the label must be a character with
|
||||
// Bidi property R, AL, EN, or AN, followed by zero or more characters
|
||||
// with Bidi property NSM.
|
||||
{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN},
|
||||
|
||||
// [2.2] In an RTL label, only characters with the Bidi properties R,
|
||||
// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
|
||||
// We exclude the entries from [2.3]
|
||||
{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
|
||||
},
|
||||
ruleRTLFinal: {
|
||||
// [2.3] In an RTL label, the end of the label must be a character with
|
||||
// Bidi property R, AL, EN, or AN, followed by zero or more characters
|
||||
// with Bidi property NSM.
|
||||
{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM},
|
||||
|
||||
// [2.2] In an RTL label, only characters with the Bidi properties R,
|
||||
// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
|
||||
// We exclude the entries from [2.3] and NSM.
|
||||
{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
|
||||
},
|
||||
ruleLTR: {
|
||||
// [2.6] In an LTR label, the end of the label must be a character with
|
||||
// Bidi property L or EN, followed by zero or more characters with Bidi
|
||||
// property NSM.
|
||||
{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN},
|
||||
|
||||
// [2.5] In an LTR label, only characters with the Bidi properties L,
|
||||
// EN, ES, CS, ET, ON, BN, or NSM are allowed.
|
||||
// We exclude the entries from [2.6].
|
||||
{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
|
||||
},
|
||||
ruleLTRFinal: {
|
||||
// [2.6] In an LTR label, the end of the label must be a character with
|
||||
// Bidi property L or EN, followed by zero or more characters with Bidi
|
||||
// property NSM.
|
||||
{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM},
|
||||
|
||||
// [2.5] In an LTR label, only characters with the Bidi properties L,
|
||||
// EN, ES, CS, ET, ON, BN, or NSM are allowed.
|
||||
// We exclude the entries from [2.6].
|
||||
{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
|
||||
},
|
||||
ruleInvalid: {
|
||||
{ruleInvalid, 0},
|
||||
{ruleInvalid, 0},
|
||||
},
|
||||
}
|
||||
|
||||
// [2.4] In an RTL label, if an EN is present, no AN may be present, and
|
||||
// vice versa.
|
||||
const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN)
|
||||
|
||||
// Direction reports the direction of the given label as defined by RFC 5893 or
|
||||
// an error if b is not a valid label according to the Bidi Rule.
|
||||
func Direction(b []byte) (bidi.Direction, error) {
|
||||
t := Transformer{}
|
||||
if n, ok := t.advance(b); ok && n == len(b) {
|
||||
switch t.state {
|
||||
case ruleLTRFinal, ruleInitial:
|
||||
return bidi.LeftToRight, nil
|
||||
case ruleRTLFinal:
|
||||
return bidi.RightToLeft, nil
|
||||
}
|
||||
}
|
||||
return bidi.Neutral, ErrInvalid
|
||||
}
|
||||
|
||||
// DirectionString reports the direction of the given label as defined by RFC
|
||||
// 5893 or an error if s is not a valid label according to the Bidi Rule.
|
||||
func DirectionString(s string) (bidi.Direction, error) {
|
||||
t := Transformer{}
|
||||
if n, ok := t.advanceString(s); ok && n == len(s) {
|
||||
switch t.state {
|
||||
case ruleLTRFinal, ruleInitial:
|
||||
return bidi.LeftToRight, nil
|
||||
case ruleRTLFinal:
|
||||
return bidi.RightToLeft, nil
|
||||
}
|
||||
}
|
||||
return bidi.Neutral, ErrInvalid
|
||||
}
|
||||
|
||||
// New returns a Transformer that verifies that input adheres to the Bidi Rule.
|
||||
func New() *Transformer {
|
||||
return &Transformer{}
|
||||
}
|
||||
|
||||
// Transformer implements transform.Transform.
|
||||
type Transformer struct {
|
||||
state ruleState
|
||||
seen uint16
|
||||
}
|
||||
|
||||
// Reset implements transform.Transformer.
|
||||
func (t *Transformer) Reset() { *t = Transformer{} }
|
||||
|
||||
// Transform implements transform.Transformer. This Transformer has state and
|
||||
// needs to be reset between uses.
|
||||
func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
if len(dst) < len(src) {
|
||||
src = src[:len(dst)]
|
||||
atEOF = false
|
||||
err = transform.ErrShortDst
|
||||
}
|
||||
n, err1 := t.Span(src, atEOF)
|
||||
copy(dst, src[:n])
|
||||
if err == nil || err1 != nil && err1 != transform.ErrShortSrc {
|
||||
err = err1
|
||||
}
|
||||
return n, n, err
|
||||
}
|
||||
|
||||
// Span returns the first n bytes of src that conform to the Bidi rule.
|
||||
func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) {
|
||||
if t.state == ruleInvalid {
|
||||
return 0, ErrInvalid
|
||||
}
|
||||
n, ok := t.advance(src)
|
||||
switch {
|
||||
case !ok:
|
||||
err = ErrInvalid
|
||||
case n < len(src):
|
||||
if !atEOF {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
err = ErrInvalid
|
||||
case t.state != ruleLTRFinal && t.state != ruleRTLFinal && t.state != ruleInitial:
|
||||
err = ErrInvalid
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
|
||||
// Precomputing the ASCII values decreases running time for the ASCII fast path
|
||||
// by about 30%.
|
||||
var asciiTable [128]bidi.Properties
|
||||
|
||||
func init() {
|
||||
for i := range asciiTable {
|
||||
p, _ := bidi.LookupRune(rune(i))
|
||||
asciiTable[i] = p
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Transformer) advance(s []byte) (n int, ok bool) {
|
||||
var e bidi.Properties
|
||||
var sz int
|
||||
for n < len(s) {
|
||||
if s[n] < utf8.RuneSelf {
|
||||
e, sz = asciiTable[s[n]], 1
|
||||
} else {
|
||||
e, sz = bidi.Lookup(s[n:])
|
||||
if sz <= 1 {
|
||||
if sz == 1 {
|
||||
return n, false // invalid UTF-8
|
||||
}
|
||||
return n, true // incomplete UTF-8 encoding
|
||||
}
|
||||
}
|
||||
// TODO: using CompactClass results in noticeable speedup.
|
||||
// See unicode/bidi/prop.go:Properties.CompactClass.
|
||||
c := uint16(1 << e.Class())
|
||||
t.seen |= c
|
||||
if t.seen&exclusiveRTL == exclusiveRTL {
|
||||
t.state = ruleInvalid
|
||||
return n, false
|
||||
}
|
||||
switch tr := transitions[t.state]; {
|
||||
case tr[0].mask&c != 0:
|
||||
t.state = tr[0].next
|
||||
case tr[1].mask&c != 0:
|
||||
t.state = tr[1].next
|
||||
default:
|
||||
t.state = ruleInvalid
|
||||
return n, false
|
||||
}
|
||||
n += sz
|
||||
}
|
||||
return n, true
|
||||
}
|
||||
|
||||
func (t *Transformer) advanceString(s string) (n int, ok bool) {
|
||||
var e bidi.Properties
|
||||
var sz int
|
||||
for n < len(s) {
|
||||
if s[n] < utf8.RuneSelf {
|
||||
e, sz = asciiTable[s[n]], 1
|
||||
} else {
|
||||
e, sz = bidi.LookupString(s[n:])
|
||||
if sz <= 1 {
|
||||
if sz == 1 {
|
||||
return n, false // invalid UTF-8
|
||||
}
|
||||
return n, true // incomplete UTF-8 encoding
|
||||
}
|
||||
}
|
||||
// TODO: using CompactClass results in noticeable speedup.
|
||||
// See unicode/bidi/prop.go:Properties.CompactClass.
|
||||
c := uint16(1 << e.Class())
|
||||
t.seen |= c
|
||||
if t.seen&exclusiveRTL == exclusiveRTL {
|
||||
t.state = ruleInvalid
|
||||
return n, false
|
||||
}
|
||||
switch tr := transitions[t.state]; {
|
||||
case tr[0].mask&c != 0:
|
||||
t.state = tr[0].next
|
||||
case tr[1].mask&c != 0:
|
||||
t.state = tr[1].next
|
||||
default:
|
||||
t.state = ruleInvalid
|
||||
return n, false
|
||||
}
|
||||
n += sz
|
||||
}
|
||||
return n, true
|
||||
}
|
||||
36
vendor/golang.org/x/text/secure/precis/class.go
generated
vendored
Normal file
36
vendor/golang.org/x/text/secure/precis/class.go
generated
vendored
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package precis
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// TODO: Add contextual character rules from Appendix A of RFC5892.
|
||||
|
||||
// A class is a set of characters that match certain derived properties. The
|
||||
// PRECIS framework defines two classes: The Freeform class and the Identifier
|
||||
// class. The freeform class should be used for profiles where expressiveness is
|
||||
// prioritized over safety such as nicknames or passwords. The identifier class
|
||||
// should be used for profiles where safety is the first priority such as
|
||||
// addressable network labels and usernames.
|
||||
type class struct {
|
||||
validFrom property
|
||||
}
|
||||
|
||||
// Contains satisfies the runes.Set interface and returns whether the given rune
|
||||
// is a member of the class.
|
||||
func (c class) Contains(r rune) bool {
|
||||
b := make([]byte, 4)
|
||||
n := utf8.EncodeRune(b, r)
|
||||
|
||||
trieval, _ := dpTrie.lookup(b[:n])
|
||||
return c.validFrom <= property(trieval)
|
||||
}
|
||||
|
||||
var (
|
||||
identifier = &class{validFrom: pValid}
|
||||
freeform = &class{validFrom: idDisOrFreePVal}
|
||||
)
|
||||
139
vendor/golang.org/x/text/secure/precis/context.go
generated
vendored
Normal file
139
vendor/golang.org/x/text/secure/precis/context.go
generated
vendored
Normal file
|
|
@ -0,0 +1,139 @@
|
|||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package precis
|
||||
|
||||
import "errors"
|
||||
|
||||
// This file contains tables and code related to context rules.
|
||||
|
||||
type catBitmap uint16
|
||||
|
||||
const (
|
||||
// These bits, once set depending on the current value, are never unset.
|
||||
bJapanese catBitmap = 1 << iota
|
||||
bArabicIndicDigit
|
||||
bExtendedArabicIndicDigit
|
||||
|
||||
// These bits are set on each iteration depending on the current value.
|
||||
bJoinStart
|
||||
bJoinMid
|
||||
bJoinEnd
|
||||
bVirama
|
||||
bLatinSmallL
|
||||
bGreek
|
||||
bHebrew
|
||||
|
||||
// These bits indicated which of the permanent bits need to be set at the
|
||||
// end of the checks.
|
||||
bMustHaveJapn
|
||||
|
||||
permanent = bJapanese | bArabicIndicDigit | bExtendedArabicIndicDigit | bMustHaveJapn
|
||||
)
|
||||
|
||||
const finalShift = 10
|
||||
|
||||
var errContext = errors.New("precis: contextual rule violated")
|
||||
|
||||
func init() {
|
||||
// Programmatically set these required bits as, manually setting them seems
|
||||
// too error prone.
|
||||
for i, ct := range categoryTransitions {
|
||||
categoryTransitions[i].keep |= permanent
|
||||
categoryTransitions[i].accept |= ct.term
|
||||
}
|
||||
}
|
||||
|
||||
var categoryTransitions = []struct {
|
||||
keep catBitmap // mask selecting which bits to keep from the previous state
|
||||
set catBitmap // mask for which bits to set for this transition
|
||||
|
||||
// These bitmaps are used for rules that require lookahead.
|
||||
// term&accept == term must be true, which is enforced programmatically.
|
||||
term catBitmap // bits accepted as termination condition
|
||||
accept catBitmap // bits that pass, but not sufficient as termination
|
||||
|
||||
// The rule function cannot take a *context as an argument, as it would
|
||||
// cause the context to escape, adding significant overhead.
|
||||
rule func(beforeBits catBitmap) (doLookahead bool, err error)
|
||||
}{
|
||||
joiningL: {set: bJoinStart},
|
||||
joiningD: {set: bJoinStart | bJoinEnd},
|
||||
joiningT: {keep: bJoinStart, set: bJoinMid},
|
||||
joiningR: {set: bJoinEnd},
|
||||
viramaModifier: {set: bVirama},
|
||||
viramaJoinT: {set: bVirama | bJoinMid},
|
||||
latinSmallL: {set: bLatinSmallL},
|
||||
greek: {set: bGreek},
|
||||
greekJoinT: {set: bGreek | bJoinMid},
|
||||
hebrew: {set: bHebrew},
|
||||
hebrewJoinT: {set: bHebrew | bJoinMid},
|
||||
japanese: {set: bJapanese},
|
||||
katakanaMiddleDot: {set: bMustHaveJapn},
|
||||
|
||||
zeroWidthNonJoiner: {
|
||||
term: bJoinEnd,
|
||||
accept: bJoinMid,
|
||||
rule: func(before catBitmap) (doLookAhead bool, err error) {
|
||||
if before&bVirama != 0 {
|
||||
return false, nil
|
||||
}
|
||||
if before&bJoinStart == 0 {
|
||||
return false, errContext
|
||||
}
|
||||
return true, nil
|
||||
},
|
||||
},
|
||||
zeroWidthJoiner: {
|
||||
rule: func(before catBitmap) (doLookAhead bool, err error) {
|
||||
if before&bVirama == 0 {
|
||||
err = errContext
|
||||
}
|
||||
return false, err
|
||||
},
|
||||
},
|
||||
middleDot: {
|
||||
term: bLatinSmallL,
|
||||
rule: func(before catBitmap) (doLookAhead bool, err error) {
|
||||
if before&bLatinSmallL == 0 {
|
||||
return false, errContext
|
||||
}
|
||||
return true, nil
|
||||
},
|
||||
},
|
||||
greekLowerNumeralSign: {
|
||||
set: bGreek,
|
||||
term: bGreek,
|
||||
rule: func(before catBitmap) (doLookAhead bool, err error) {
|
||||
return true, nil
|
||||
},
|
||||
},
|
||||
hebrewPreceding: {
|
||||
set: bHebrew,
|
||||
rule: func(before catBitmap) (doLookAhead bool, err error) {
|
||||
if before&bHebrew == 0 {
|
||||
err = errContext
|
||||
}
|
||||
return false, err
|
||||
},
|
||||
},
|
||||
arabicIndicDigit: {
|
||||
set: bArabicIndicDigit,
|
||||
rule: func(before catBitmap) (doLookAhead bool, err error) {
|
||||
if before&bExtendedArabicIndicDigit != 0 {
|
||||
err = errContext
|
||||
}
|
||||
return false, err
|
||||
},
|
||||
},
|
||||
extendedArabicIndicDigit: {
|
||||
set: bExtendedArabicIndicDigit,
|
||||
rule: func(before catBitmap) (doLookAhead bool, err error) {
|
||||
if before&bArabicIndicDigit != 0 {
|
||||
err = errContext
|
||||
}
|
||||
return false, err
|
||||
},
|
||||
},
|
||||
}
|
||||
14
vendor/golang.org/x/text/secure/precis/doc.go
generated
vendored
Normal file
14
vendor/golang.org/x/text/secure/precis/doc.go
generated
vendored
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package precis contains types and functions for the preparation,
|
||||
// enforcement, and comparison of internationalized strings ("PRECIS") as
|
||||
// defined in RFC 7564. It also contains several pre-defined profiles for
|
||||
// passwords, nicknames, and usernames as defined in RFC 7613 and RFC 7700.
|
||||
//
|
||||
// BE ADVISED: This package is under construction and the API may change in
|
||||
// backwards incompatible ways and without notice.
|
||||
package precis
|
||||
|
||||
//go:generate go run gen.go gen_trieval.go
|
||||
310
vendor/golang.org/x/text/secure/precis/gen.go
generated
vendored
Normal file
310
vendor/golang.org/x/text/secure/precis/gen.go
generated
vendored
Normal file
|
|
@ -0,0 +1,310 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Unicode table generator.
|
||||
// Data read from the web.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"log"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/internal/gen"
|
||||
"golang.org/x/text/internal/triegen"
|
||||
"golang.org/x/text/internal/ucd"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
"golang.org/x/text/unicode/rangetable"
|
||||
)
|
||||
|
||||
var outputFile = flag.String("output", "tables.go", "output file for generated tables; default tables.go")
|
||||
|
||||
var assigned, disallowedRunes *unicode.RangeTable
|
||||
|
||||
var runeCategory = map[rune]category{}
|
||||
|
||||
var overrides = map[category]category{
|
||||
viramaModifier: viramaJoinT,
|
||||
greek: greekJoinT,
|
||||
hebrew: hebrewJoinT,
|
||||
}
|
||||
|
||||
func setCategory(r rune, cat category) {
|
||||
if c, ok := runeCategory[r]; ok {
|
||||
if override, ok := overrides[c]; cat == joiningT && ok {
|
||||
cat = override
|
||||
} else {
|
||||
log.Fatalf("%U: multiple categories for rune (%v and %v)", r, c, cat)
|
||||
}
|
||||
}
|
||||
runeCategory[r] = cat
|
||||
}
|
||||
|
||||
func init() {
|
||||
if numCategories > 1<<propShift {
|
||||
log.Fatalf("Number of categories is %d; may at most be %d", numCategories, 1<<propShift)
|
||||
}
|
||||
}
|
||||
|
||||
func main() {
|
||||
gen.Init()
|
||||
|
||||
// Load data
|
||||
runes := []rune{}
|
||||
// PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13
|
||||
ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) {
|
||||
if p.String(1) == "Default_Ignorable_Code_Point" {
|
||||
runes = append(runes, p.Rune(0))
|
||||
}
|
||||
})
|
||||
ucd.Parse(gen.OpenUCDFile("PropList.txt"), func(p *ucd.Parser) {
|
||||
switch p.String(1) {
|
||||
case "Noncharacter_Code_Point":
|
||||
runes = append(runes, p.Rune(0))
|
||||
}
|
||||
})
|
||||
// OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9
|
||||
ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) {
|
||||
switch p.String(1) {
|
||||
case "L", "V", "T":
|
||||
runes = append(runes, p.Rune(0))
|
||||
}
|
||||
})
|
||||
|
||||
disallowedRunes = rangetable.New(runes...)
|
||||
assigned = rangetable.Assigned(unicode.Version)
|
||||
|
||||
// Load category data.
|
||||
runeCategory['l'] = latinSmallL
|
||||
ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
|
||||
const cccVirama = 9
|
||||
if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
|
||||
setCategory(p.Rune(0), viramaModifier)
|
||||
}
|
||||
})
|
||||
ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) {
|
||||
switch p.String(1) {
|
||||
case "Greek":
|
||||
setCategory(p.Rune(0), greek)
|
||||
case "Hebrew":
|
||||
setCategory(p.Rune(0), hebrew)
|
||||
case "Hiragana", "Katakana", "Han":
|
||||
setCategory(p.Rune(0), japanese)
|
||||
}
|
||||
})
|
||||
|
||||
// Set the rule categories associated with exceptions. This overrides any
|
||||
// previously set categories. The original categories are manually
|
||||
// reintroduced in the categoryTransitions table.
|
||||
for r, e := range exceptions {
|
||||
if e.cat != 0 {
|
||||
runeCategory[r] = e.cat
|
||||
}
|
||||
}
|
||||
cat := map[string]category{
|
||||
"L": joiningL,
|
||||
"D": joiningD,
|
||||
"T": joiningT,
|
||||
|
||||
"R": joiningR,
|
||||
}
|
||||
ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
|
||||
switch v := p.String(1); v {
|
||||
case "L", "D", "T", "R":
|
||||
setCategory(p.Rune(0), cat[v])
|
||||
}
|
||||
})
|
||||
|
||||
writeTables()
|
||||
gen.Repackage("gen_trieval.go", "trieval.go", "precis")
|
||||
}
|
||||
|
||||
type exception struct {
|
||||
prop property
|
||||
cat category
|
||||
}
|
||||
|
||||
func init() {
|
||||
// Programmatically add the Arabic and Indic digits to the exceptions map.
|
||||
// See comment in the exceptions map below why these are marked disallowed.
|
||||
for i := rune(0); i <= 9; i++ {
|
||||
exceptions[0x0660+i] = exception{
|
||||
prop: disallowed,
|
||||
cat: arabicIndicDigit,
|
||||
}
|
||||
exceptions[0x06F0+i] = exception{
|
||||
prop: disallowed,
|
||||
cat: extendedArabicIndicDigit,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The Exceptions class as defined in RFC 5892
|
||||
// https://tools.ietf.org/html/rfc5892#section-2.6
|
||||
var exceptions = map[rune]exception{
|
||||
0x00DF: {prop: pValid},
|
||||
0x03C2: {prop: pValid},
|
||||
0x06FD: {prop: pValid},
|
||||
0x06FE: {prop: pValid},
|
||||
0x0F0B: {prop: pValid},
|
||||
0x3007: {prop: pValid},
|
||||
|
||||
// ContextO|J rules are marked as disallowed, taking a "guilty until proven
|
||||
// innocent" approach. The main reason for this is that the check for
|
||||
// whether a context rule should be applied can be moved to the logic for
|
||||
// handing disallowed runes, taken it off the common path. The exception to
|
||||
// this rule is for katakanaMiddleDot, as the rule logic is handled without
|
||||
// using a rule function.
|
||||
|
||||
// ContextJ (Join control)
|
||||
0x200C: {prop: disallowed, cat: zeroWidthNonJoiner},
|
||||
0x200D: {prop: disallowed, cat: zeroWidthJoiner},
|
||||
|
||||
// ContextO
|
||||
0x00B7: {prop: disallowed, cat: middleDot},
|
||||
0x0375: {prop: disallowed, cat: greekLowerNumeralSign},
|
||||
0x05F3: {prop: disallowed, cat: hebrewPreceding}, // punctuation Geresh
|
||||
0x05F4: {prop: disallowed, cat: hebrewPreceding}, // punctuation Gershayim
|
||||
0x30FB: {prop: pValid, cat: katakanaMiddleDot},
|
||||
|
||||
// These are officially ContextO, but the implementation does not require
|
||||
// special treatment of these, so we simply mark them as valid.
|
||||
0x0660: {prop: pValid},
|
||||
0x0661: {prop: pValid},
|
||||
0x0662: {prop: pValid},
|
||||
0x0663: {prop: pValid},
|
||||
0x0664: {prop: pValid},
|
||||
0x0665: {prop: pValid},
|
||||
0x0666: {prop: pValid},
|
||||
0x0667: {prop: pValid},
|
||||
0x0668: {prop: pValid},
|
||||
0x0669: {prop: pValid},
|
||||
0x06F0: {prop: pValid},
|
||||
0x06F1: {prop: pValid},
|
||||
0x06F2: {prop: pValid},
|
||||
0x06F3: {prop: pValid},
|
||||
0x06F4: {prop: pValid},
|
||||
0x06F5: {prop: pValid},
|
||||
0x06F6: {prop: pValid},
|
||||
0x06F7: {prop: pValid},
|
||||
0x06F8: {prop: pValid},
|
||||
0x06F9: {prop: pValid},
|
||||
|
||||
0x0640: {prop: disallowed},
|
||||
0x07FA: {prop: disallowed},
|
||||
0x302E: {prop: disallowed},
|
||||
0x302F: {prop: disallowed},
|
||||
0x3031: {prop: disallowed},
|
||||
0x3032: {prop: disallowed},
|
||||
0x3033: {prop: disallowed},
|
||||
0x3034: {prop: disallowed},
|
||||
0x3035: {prop: disallowed},
|
||||
0x303B: {prop: disallowed},
|
||||
}
|
||||
|
||||
// LetterDigits: https://tools.ietf.org/html/rfc5892#section-2.1
|
||||
// r in {Ll, Lu, Lo, Nd, Lm, Mn, Mc}.
|
||||
func isLetterDigits(r rune) bool {
|
||||
return unicode.In(r,
|
||||
unicode.Ll, unicode.Lu, unicode.Lm, unicode.Lo, // Letters
|
||||
unicode.Mn, unicode.Mc, // Modifiers
|
||||
unicode.Nd, // Digits
|
||||
)
|
||||
}
|
||||
|
||||
func isIdDisAndFreePVal(r rune) bool {
|
||||
return unicode.In(r,
|
||||
// OtherLetterDigits: https://tools.ietf.org/html/rfc7564#section-9.18
|
||||
// r in in {Lt, Nl, No, Me}
|
||||
unicode.Lt, unicode.Nl, unicode.No, // Other letters / numbers
|
||||
unicode.Me, // Modifiers
|
||||
|
||||
// Spaces: https://tools.ietf.org/html/rfc7564#section-9.14
|
||||
// r in in {Zs}
|
||||
unicode.Zs,
|
||||
|
||||
// Symbols: https://tools.ietf.org/html/rfc7564#section-9.15
|
||||
// r in {Sm, Sc, Sk, So}
|
||||
unicode.Sm, unicode.Sc, unicode.Sk, unicode.So,
|
||||
|
||||
// Punctuation: https://tools.ietf.org/html/rfc7564#section-9.16
|
||||
// r in {Pc, Pd, Ps, Pe, Pi, Pf, Po}
|
||||
unicode.Pc, unicode.Pd, unicode.Ps, unicode.Pe,
|
||||
unicode.Pi, unicode.Pf, unicode.Po,
|
||||
)
|
||||
}
|
||||
|
||||
// HasCompat: https://tools.ietf.org/html/rfc7564#section-9.17
|
||||
func hasCompat(r rune) bool {
|
||||
return !norm.NFKC.IsNormalString(string(r))
|
||||
}
|
||||
|
||||
// From https://tools.ietf.org/html/rfc5892:
|
||||
//
|
||||
// If .cp. .in. Exceptions Then Exceptions(cp);
|
||||
// Else If .cp. .in. BackwardCompatible Then BackwardCompatible(cp);
|
||||
// Else If .cp. .in. Unassigned Then UNASSIGNED;
|
||||
// Else If .cp. .in. ASCII7 Then PVALID;
|
||||
// Else If .cp. .in. JoinControl Then CONTEXTJ;
|
||||
// Else If .cp. .in. OldHangulJamo Then DISALLOWED;
|
||||
// Else If .cp. .in. PrecisIgnorableProperties Then DISALLOWED;
|
||||
// Else If .cp. .in. Controls Then DISALLOWED;
|
||||
// Else If .cp. .in. HasCompat Then ID_DIS or FREE_PVAL;
|
||||
// Else If .cp. .in. LetterDigits Then PVALID;
|
||||
// Else If .cp. .in. OtherLetterDigits Then ID_DIS or FREE_PVAL;
|
||||
// Else If .cp. .in. Spaces Then ID_DIS or FREE_PVAL;
|
||||
// Else If .cp. .in. Symbols Then ID_DIS or FREE_PVAL;
|
||||
// Else If .cp. .in. Punctuation Then ID_DIS or FREE_PVAL;
|
||||
// Else DISALLOWED;
|
||||
|
||||
func writeTables() {
|
||||
propTrie := triegen.NewTrie("derivedProperties")
|
||||
w := gen.NewCodeWriter()
|
||||
defer w.WriteGoFile(*outputFile, "precis")
|
||||
gen.WriteUnicodeVersion(w)
|
||||
|
||||
// Iterate over all the runes...
|
||||
for i := rune(0); i < unicode.MaxRune; i++ {
|
||||
r := rune(i)
|
||||
|
||||
if !utf8.ValidRune(r) {
|
||||
continue
|
||||
}
|
||||
|
||||
e, ok := exceptions[i]
|
||||
p := e.prop
|
||||
switch {
|
||||
case ok:
|
||||
case !unicode.In(r, assigned):
|
||||
p = unassigned
|
||||
case r >= 0x0021 && r <= 0x007e: // Is ASCII 7
|
||||
p = pValid
|
||||
case unicode.In(r, disallowedRunes, unicode.Cc):
|
||||
p = disallowed
|
||||
case hasCompat(r):
|
||||
p = idDisOrFreePVal
|
||||
case isLetterDigits(r):
|
||||
p = pValid
|
||||
case isIdDisAndFreePVal(r):
|
||||
p = idDisOrFreePVal
|
||||
default:
|
||||
p = disallowed
|
||||
}
|
||||
cat := runeCategory[r]
|
||||
// Don't set category for runes that are disallowed.
|
||||
if p == disallowed {
|
||||
cat = exceptions[r].cat
|
||||
}
|
||||
propTrie.Insert(r, uint64(p)|uint64(cat))
|
||||
}
|
||||
sz, err := propTrie.Gen(w)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
w.Size += sz
|
||||
}
|
||||
68
vendor/golang.org/x/text/secure/precis/gen_trieval.go
generated
vendored
Normal file
68
vendor/golang.org/x/text/secure/precis/gen_trieval.go
generated
vendored
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// entry is the entry of a trie table
|
||||
// 7..6 property (unassigned, disallowed, maybe, valid)
|
||||
// 5..0 category
|
||||
type entry uint8
|
||||
|
||||
const (
|
||||
propShift = 6
|
||||
propMask = 0xc0
|
||||
catMask = 0x3f
|
||||
)
|
||||
|
||||
func (e entry) property() property { return property(e & propMask) }
|
||||
func (e entry) category() category { return category(e & catMask) }
|
||||
|
||||
type property uint8
|
||||
|
||||
// The order of these constants matter. A Profile may consider runes to be
|
||||
// allowed either from pValid or idDisOrFreePVal.
|
||||
const (
|
||||
unassigned property = iota << propShift
|
||||
disallowed
|
||||
idDisOrFreePVal // disallowed for Identifier, pValid for FreeForm
|
||||
pValid
|
||||
)
|
||||
|
||||
// compute permutations of all properties and specialCategories.
|
||||
type category uint8
|
||||
|
||||
const (
|
||||
other category = iota
|
||||
|
||||
// Special rune types
|
||||
joiningL
|
||||
joiningD
|
||||
joiningT
|
||||
joiningR
|
||||
viramaModifier
|
||||
viramaJoinT // Virama + JoiningT
|
||||
latinSmallL // U+006c
|
||||
greek
|
||||
greekJoinT // Greek + JoiningT
|
||||
hebrew
|
||||
hebrewJoinT // Hebrew + JoiningT
|
||||
japanese // hirigana, katakana, han
|
||||
|
||||
// Special rune types associated with contextual rules defined in
|
||||
// https://tools.ietf.org/html/rfc5892#appendix-A.
|
||||
// ContextO
|
||||
zeroWidthNonJoiner // rule 1
|
||||
zeroWidthJoiner // rule 2
|
||||
// ContextJ
|
||||
middleDot // rule 3
|
||||
greekLowerNumeralSign // rule 4
|
||||
hebrewPreceding // rule 5 and 6
|
||||
katakanaMiddleDot // rule 7
|
||||
arabicIndicDigit // rule 8
|
||||
extendedArabicIndicDigit // rule 9
|
||||
|
||||
numCategories
|
||||
)
|
||||
70
vendor/golang.org/x/text/secure/precis/nickname.go
generated
vendored
Normal file
70
vendor/golang.org/x/text/secure/precis/nickname.go
generated
vendored
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package precis
|
||||
|
||||
import (
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
type nickAdditionalMapping struct {
|
||||
// TODO: This transformer needs to be stateless somehow…
|
||||
notStart bool
|
||||
prevSpace bool
|
||||
}
|
||||
|
||||
func (t *nickAdditionalMapping) Reset() {
|
||||
t.prevSpace = false
|
||||
t.notStart = false
|
||||
}
|
||||
|
||||
func (t *nickAdditionalMapping) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
// RFC 7700 §2.1. Rules
|
||||
//
|
||||
// 2. Additional Mapping Rule: The additional mapping rule consists of
|
||||
// the following sub-rules.
|
||||
//
|
||||
// 1. Any instances of non-ASCII space MUST be mapped to ASCII
|
||||
// space (U+0020); a non-ASCII space is any Unicode code point
|
||||
// having a general category of "Zs", naturally with the
|
||||
// exception of U+0020.
|
||||
//
|
||||
// 2. Any instances of the ASCII space character at the beginning
|
||||
// or end of a nickname MUST be removed (e.g., "stpeter " is
|
||||
// mapped to "stpeter").
|
||||
//
|
||||
// 3. Interior sequences of more than one ASCII space character
|
||||
// MUST be mapped to a single ASCII space character (e.g.,
|
||||
// "St Peter" is mapped to "St Peter").
|
||||
|
||||
for nSrc < len(src) {
|
||||
r, size := utf8.DecodeRune(src[nSrc:])
|
||||
if size == 0 { // Incomplete UTF-8 encoding
|
||||
if !atEOF {
|
||||
return nDst, nSrc, transform.ErrShortSrc
|
||||
}
|
||||
size = 1
|
||||
}
|
||||
if unicode.Is(unicode.Zs, r) {
|
||||
t.prevSpace = true
|
||||
} else {
|
||||
if t.prevSpace && t.notStart {
|
||||
dst[nDst] = ' '
|
||||
nDst += 1
|
||||
}
|
||||
if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
|
||||
nDst += size
|
||||
return nDst, nSrc, transform.ErrShortDst
|
||||
}
|
||||
nDst += size
|
||||
t.prevSpace = false
|
||||
t.notStart = true
|
||||
}
|
||||
nSrc += size
|
||||
}
|
||||
return nDst, nSrc, nil
|
||||
}
|
||||
106
vendor/golang.org/x/text/secure/precis/options.go
generated
vendored
Normal file
106
vendor/golang.org/x/text/secure/precis/options.go
generated
vendored
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package precis
|
||||
|
||||
import (
|
||||
"golang.org/x/text/cases"
|
||||
"golang.org/x/text/runes"
|
||||
"golang.org/x/text/transform"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
"golang.org/x/text/width"
|
||||
)
|
||||
|
||||
// An Option is used to define the behavior and rules of a Profile.
|
||||
type Option func(*options)
|
||||
|
||||
type options struct {
|
||||
// Preparation options
|
||||
foldWidth bool
|
||||
|
||||
// Enforcement options
|
||||
cases transform.Transformer
|
||||
disallow runes.Set
|
||||
norm norm.Form
|
||||
additional []func() transform.Transformer
|
||||
width *width.Transformer
|
||||
disallowEmpty bool
|
||||
bidiRule bool
|
||||
|
||||
// Comparison options
|
||||
ignorecase bool
|
||||
}
|
||||
|
||||
func getOpts(o ...Option) (res options) {
|
||||
for _, f := range o {
|
||||
f(&res)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
var (
|
||||
// The IgnoreCase option causes the profile to perform a case insensitive
|
||||
// comparison during the PRECIS comparison step.
|
||||
IgnoreCase Option = ignoreCase
|
||||
|
||||
// The FoldWidth option causes the profile to map non-canonical wide and
|
||||
// narrow variants to their decomposition mapping. This is useful for
|
||||
// profiles that are based on the identifier class which would otherwise
|
||||
// disallow such characters.
|
||||
FoldWidth Option = foldWidth
|
||||
|
||||
// The DisallowEmpty option causes the enforcement step to return an error if
|
||||
// the resulting string would be empty.
|
||||
DisallowEmpty Option = disallowEmpty
|
||||
|
||||
// The BidiRule option causes the Bidi Rule defined in RFC 5893 to be
|
||||
// applied.
|
||||
BidiRule Option = bidiRule
|
||||
)
|
||||
|
||||
var (
|
||||
ignoreCase = func(o *options) {
|
||||
o.ignorecase = true
|
||||
}
|
||||
foldWidth = func(o *options) {
|
||||
o.foldWidth = true
|
||||
}
|
||||
disallowEmpty = func(o *options) {
|
||||
o.disallowEmpty = true
|
||||
}
|
||||
bidiRule = func(o *options) {
|
||||
o.bidiRule = true
|
||||
}
|
||||
)
|
||||
|
||||
// The AdditionalMapping option defines the additional mapping rule for the
|
||||
// Profile by applying Transformer's in sequence.
|
||||
func AdditionalMapping(t ...func() transform.Transformer) Option {
|
||||
return func(o *options) {
|
||||
o.additional = t
|
||||
}
|
||||
}
|
||||
|
||||
// The Norm option defines a Profile's normalization rule. Defaults to NFC.
|
||||
func Norm(f norm.Form) Option {
|
||||
return func(o *options) {
|
||||
o.norm = f
|
||||
}
|
||||
}
|
||||
|
||||
// The FoldCase option defines a Profile's case mapping rule. Options can be
|
||||
// provided to determine the type of case folding used.
|
||||
func FoldCase(opts ...cases.Option) Option {
|
||||
return func(o *options) {
|
||||
o.cases = cases.Fold(opts...)
|
||||
}
|
||||
}
|
||||
|
||||
// The Disallow option further restricts a Profile's allowed characters beyond
|
||||
// what is disallowed by the underlying string class.
|
||||
func Disallow(set runes.Set) Option {
|
||||
return func(o *options) {
|
||||
o.disallow = set
|
||||
}
|
||||
}
|
||||
330
vendor/golang.org/x/text/secure/precis/profile.go
generated
vendored
Normal file
330
vendor/golang.org/x/text/secure/precis/profile.go
generated
vendored
Normal file
|
|
@ -0,0 +1,330 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package precis
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/runes"
|
||||
"golang.org/x/text/secure/bidirule"
|
||||
"golang.org/x/text/transform"
|
||||
"golang.org/x/text/width"
|
||||
)
|
||||
|
||||
var (
|
||||
errDisallowedRune = errors.New("precis: disallowed rune encountered")
|
||||
)
|
||||
|
||||
var dpTrie = newDerivedPropertiesTrie(0)
|
||||
|
||||
// A Profile represents a set of rules for normalizing and validating strings in
|
||||
// the PRECIS framework.
|
||||
type Profile struct {
|
||||
options
|
||||
class *class
|
||||
}
|
||||
|
||||
// NewIdentifier creates a new PRECIS profile based on the Identifier string
|
||||
// class. Profiles created from this class are suitable for use where safety is
|
||||
// prioritized over expressiveness like network identifiers, user accounts, chat
|
||||
// rooms, and file names.
|
||||
func NewIdentifier(opts ...Option) *Profile {
|
||||
return &Profile{
|
||||
options: getOpts(opts...),
|
||||
class: identifier,
|
||||
}
|
||||
}
|
||||
|
||||
// NewFreeform creates a new PRECIS profile based on the Freeform string class.
|
||||
// Profiles created from this class are suitable for use where expressiveness is
|
||||
// prioritized over safety like passwords, and display-elements such as
|
||||
// nicknames in a chat room.
|
||||
func NewFreeform(opts ...Option) *Profile {
|
||||
return &Profile{
|
||||
options: getOpts(opts...),
|
||||
class: freeform,
|
||||
}
|
||||
}
|
||||
|
||||
// NewTransformer creates a new transform.Transformer that performs the PRECIS
|
||||
// preparation and enforcement steps on the given UTF-8 encoded bytes.
|
||||
func (p *Profile) NewTransformer() *Transformer {
|
||||
var ts []transform.Transformer
|
||||
|
||||
// These transforms are applied in the order defined in
|
||||
// https://tools.ietf.org/html/rfc7564#section-7
|
||||
|
||||
if p.options.foldWidth {
|
||||
ts = append(ts, width.Fold)
|
||||
}
|
||||
|
||||
for _, f := range p.options.additional {
|
||||
ts = append(ts, f())
|
||||
}
|
||||
|
||||
if p.options.cases != nil {
|
||||
ts = append(ts, p.options.cases)
|
||||
}
|
||||
|
||||
ts = append(ts, p.options.norm)
|
||||
|
||||
if p.options.bidiRule {
|
||||
ts = append(ts, bidirule.New())
|
||||
}
|
||||
|
||||
ts = append(ts, &checker{p: p, allowed: p.Allowed()})
|
||||
|
||||
// TODO: Add the disallow empty rule with a dummy transformer?
|
||||
|
||||
return &Transformer{transform.Chain(ts...)}
|
||||
}
|
||||
|
||||
var errEmptyString = errors.New("precis: transformation resulted in empty string")
|
||||
|
||||
type buffers struct {
|
||||
src []byte
|
||||
buf [2][]byte
|
||||
next int
|
||||
}
|
||||
|
||||
func (b *buffers) init(n int) {
|
||||
b.buf[0] = make([]byte, 0, n)
|
||||
b.buf[1] = make([]byte, 0, n)
|
||||
}
|
||||
|
||||
func (b *buffers) apply(t transform.Transformer) (err error) {
|
||||
// TODO: use Span, once available.
|
||||
x := b.next & 1
|
||||
b.src, _, err = transform.Append(t, b.buf[x][:0], b.src)
|
||||
b.buf[x] = b.src
|
||||
b.next++
|
||||
return err
|
||||
}
|
||||
|
||||
func (b *buffers) enforce(p *Profile, src []byte) (str []byte, err error) {
|
||||
b.src = src
|
||||
|
||||
// These transforms are applied in the order defined in
|
||||
// https://tools.ietf.org/html/rfc7564#section-7
|
||||
|
||||
// TODO: allow different width transforms options.
|
||||
if p.options.foldWidth {
|
||||
// TODO: use Span, once available.
|
||||
if err = b.apply(width.Fold); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
for _, f := range p.options.additional {
|
||||
if err = b.apply(f()); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
if p.options.cases != nil {
|
||||
if err = b.apply(p.options.cases); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
if n := p.norm.QuickSpan(b.src); n < len(b.src) {
|
||||
x := b.next & 1
|
||||
n = copy(b.buf[x], b.src[:n])
|
||||
b.src, _, err = transform.Append(p.norm, b.buf[x][:n], b.src[n:])
|
||||
b.buf[x] = b.src
|
||||
b.next++
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
if p.options.bidiRule {
|
||||
if err := b.apply(bidirule.New()); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
c := checker{p: p}
|
||||
if _, err := c.span(b.src, true); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if p.disallow != nil {
|
||||
for i := 0; i < len(b.src); {
|
||||
r, size := utf8.DecodeRune(b.src[i:])
|
||||
if p.disallow.Contains(r) {
|
||||
return nil, errDisallowedRune
|
||||
}
|
||||
i += size
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Add the disallow empty rule with a dummy transformer?
|
||||
|
||||
if p.options.disallowEmpty && len(b.src) == 0 {
|
||||
return nil, errEmptyString
|
||||
}
|
||||
return b.src, nil
|
||||
}
|
||||
|
||||
// Append appends the result of applying p to src writing the result to dst.
|
||||
// It returns an error if the input string is invalid.
|
||||
func (p *Profile) Append(dst, src []byte) ([]byte, error) {
|
||||
var buf buffers
|
||||
buf.init(8 + len(src) + len(src)>>2)
|
||||
b, err := buf.enforce(p, src)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return append(dst, b...), nil
|
||||
}
|
||||
|
||||
// Bytes returns a new byte slice with the result of applying the profile to b.
|
||||
func (p *Profile) Bytes(b []byte) ([]byte, error) {
|
||||
var buf buffers
|
||||
buf.init(8 + len(b) + len(b)>>2)
|
||||
b, err := buf.enforce(p, b)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if buf.next == 0 {
|
||||
c := make([]byte, len(b))
|
||||
copy(c, b)
|
||||
return c, nil
|
||||
}
|
||||
return b, nil
|
||||
}
|
||||
|
||||
// String returns a string with the result of applying the profile to s.
|
||||
func (p *Profile) String(s string) (string, error) {
|
||||
var buf buffers
|
||||
buf.init(8 + len(s) + len(s)>>2)
|
||||
b, err := buf.enforce(p, []byte(s))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(b), nil
|
||||
}
|
||||
|
||||
// Compare enforces both strings, and then compares them for bit-string identity
|
||||
// (byte-for-byte equality). If either string cannot be enforced, the comparison
|
||||
// is false.
|
||||
func (p *Profile) Compare(a, b string) bool {
|
||||
a, err := p.String(a)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
b, err = p.String(b)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
// TODO: This is out of order. Need to extract the transformation logic and
|
||||
// put this in where the normal case folding would go (but only for
|
||||
// comparison).
|
||||
if p.options.ignorecase {
|
||||
a = width.Fold.String(a)
|
||||
b = width.Fold.String(a)
|
||||
}
|
||||
|
||||
return a == b
|
||||
}
|
||||
|
||||
// Allowed returns a runes.Set containing every rune that is a member of the
|
||||
// underlying profile's string class and not disallowed by any profile specific
|
||||
// rules.
|
||||
func (p *Profile) Allowed() runes.Set {
|
||||
if p.options.disallow != nil {
|
||||
return runes.Predicate(func(r rune) bool {
|
||||
return p.class.Contains(r) && !p.options.disallow.Contains(r)
|
||||
})
|
||||
}
|
||||
return p.class
|
||||
}
|
||||
|
||||
type checker struct {
|
||||
p *Profile
|
||||
allowed runes.Set
|
||||
|
||||
beforeBits catBitmap
|
||||
termBits catBitmap
|
||||
acceptBits catBitmap
|
||||
}
|
||||
|
||||
func (c *checker) Reset() {
|
||||
c.beforeBits = 0
|
||||
c.termBits = 0
|
||||
c.acceptBits = 0
|
||||
}
|
||||
|
||||
func (c *checker) span(src []byte, atEOF bool) (n int, err error) {
|
||||
for n < len(src) {
|
||||
e, sz := dpTrie.lookup(src[n:])
|
||||
d := categoryTransitions[category(e&catMask)]
|
||||
if sz == 0 {
|
||||
if !atEOF {
|
||||
return n, transform.ErrShortSrc
|
||||
}
|
||||
return n, errDisallowedRune
|
||||
}
|
||||
if property(e) < c.p.class.validFrom {
|
||||
if d.rule == nil {
|
||||
return n, errDisallowedRune
|
||||
}
|
||||
doLookAhead, err := d.rule(c.beforeBits)
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
if doLookAhead {
|
||||
c.beforeBits &= d.keep
|
||||
c.beforeBits |= d.set
|
||||
// We may still have a lookahead rule which we will require to
|
||||
// complete (by checking termBits == 0) before setting the new
|
||||
// bits.
|
||||
if c.termBits != 0 && (!c.checkLookahead() || c.termBits == 0) {
|
||||
return n, err
|
||||
}
|
||||
c.termBits = d.term
|
||||
c.acceptBits = d.accept
|
||||
n += sz
|
||||
continue
|
||||
}
|
||||
}
|
||||
c.beforeBits &= d.keep
|
||||
c.beforeBits |= d.set
|
||||
if c.termBits != 0 && !c.checkLookahead() {
|
||||
return n, errContext
|
||||
}
|
||||
n += sz
|
||||
}
|
||||
if m := c.beforeBits >> finalShift; c.beforeBits&m != m || c.termBits != 0 {
|
||||
err = errContext
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
|
||||
func (c *checker) checkLookahead() bool {
|
||||
switch {
|
||||
case c.beforeBits&c.termBits != 0:
|
||||
c.termBits = 0
|
||||
c.acceptBits = 0
|
||||
case c.beforeBits&c.acceptBits != 0:
|
||||
default:
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// TODO: we may get rid of this transform if transform.Chain understands
|
||||
// something like a Spanner interface.
|
||||
func (c checker) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
short := false
|
||||
if len(dst) < len(src) {
|
||||
src = src[:len(dst)]
|
||||
atEOF = false
|
||||
short = true
|
||||
}
|
||||
nSrc, err = c.span(src, atEOF)
|
||||
nDst = copy(dst, src[:nSrc])
|
||||
if short && (err == transform.ErrShortSrc || err == nil) {
|
||||
err = transform.ErrShortDst
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
56
vendor/golang.org/x/text/secure/precis/profiles.go
generated
vendored
Normal file
56
vendor/golang.org/x/text/secure/precis/profiles.go
generated
vendored
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package precis
|
||||
|
||||
import (
|
||||
"unicode"
|
||||
|
||||
"golang.org/x/text/runes"
|
||||
"golang.org/x/text/transform"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
)
|
||||
|
||||
var (
|
||||
Nickname *Profile = nickname // Implements the Nickname profile specified in RFC 7700.
|
||||
UsernameCaseMapped *Profile = usernameCaseMap // Implements the UsernameCaseMapped profile specified in RFC 7613.
|
||||
UsernameCasePreserved *Profile = usernameNoCaseMap // Implements the UsernameCasePreserved profile specified in RFC 7613.
|
||||
OpaqueString *Profile = opaquestring // Implements the OpaqueString profile defined in RFC 7613 for passwords and other secure labels.
|
||||
)
|
||||
|
||||
// TODO: mvl: "Ultimately, I would manually define the structs for the internal
|
||||
// profiles. This avoid pulling in unneeded tables when they are not used."
|
||||
var (
|
||||
nickname = NewFreeform(
|
||||
AdditionalMapping(func() transform.Transformer {
|
||||
return &nickAdditionalMapping{}
|
||||
}),
|
||||
IgnoreCase,
|
||||
Norm(norm.NFKC),
|
||||
DisallowEmpty,
|
||||
)
|
||||
usernameCaseMap = NewIdentifier(
|
||||
FoldWidth,
|
||||
FoldCase(),
|
||||
Norm(norm.NFC),
|
||||
BidiRule,
|
||||
)
|
||||
usernameNoCaseMap = NewIdentifier(
|
||||
FoldWidth,
|
||||
Norm(norm.NFC),
|
||||
BidiRule,
|
||||
)
|
||||
opaquestring = NewFreeform(
|
||||
AdditionalMapping(func() transform.Transformer {
|
||||
return runes.Map(func(r rune) rune {
|
||||
if unicode.Is(unicode.Zs, r) {
|
||||
return ' '
|
||||
}
|
||||
return r
|
||||
})
|
||||
}),
|
||||
Norm(norm.NFC),
|
||||
DisallowEmpty,
|
||||
)
|
||||
)
|
||||
3788
vendor/golang.org/x/text/secure/precis/tables.go
generated
vendored
Normal file
3788
vendor/golang.org/x/text/secure/precis/tables.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
32
vendor/golang.org/x/text/secure/precis/transformer.go
generated
vendored
Normal file
32
vendor/golang.org/x/text/secure/precis/transformer.go
generated
vendored
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package precis
|
||||
|
||||
import "golang.org/x/text/transform"
|
||||
|
||||
// Transformer implements the transform.Transformer interface.
|
||||
type Transformer struct {
|
||||
t transform.Transformer
|
||||
}
|
||||
|
||||
// Reset implements the transform.Transformer interface.
|
||||
func (t Transformer) Reset() { t.t.Reset() }
|
||||
|
||||
// Transform implements the transform.Transformer interface.
|
||||
func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
return t.t.Transform(dst, src, atEOF)
|
||||
}
|
||||
|
||||
// Bytes returns a new byte slice with the result of applying t to b.
|
||||
func (t Transformer) Bytes(b []byte) []byte {
|
||||
b, _, _ = transform.Bytes(t, b)
|
||||
return b
|
||||
}
|
||||
|
||||
// String returns a string with the result of applying t to s.
|
||||
func (t Transformer) String(s string) string {
|
||||
s, _, _ = transform.String(t, s)
|
||||
return s
|
||||
}
|
||||
64
vendor/golang.org/x/text/secure/precis/trieval.go
generated
vendored
Normal file
64
vendor/golang.org/x/text/secure/precis/trieval.go
generated
vendored
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
// This file was generated by go generate; DO NOT EDIT
|
||||
|
||||
package precis
|
||||
|
||||
// entry is the entry of a trie table
|
||||
// 7..6 property (unassigned, disallowed, maybe, valid)
|
||||
// 5..0 category
|
||||
type entry uint8
|
||||
|
||||
const (
|
||||
propShift = 6
|
||||
propMask = 0xc0
|
||||
catMask = 0x3f
|
||||
)
|
||||
|
||||
func (e entry) property() property { return property(e & propMask) }
|
||||
func (e entry) category() category { return category(e & catMask) }
|
||||
|
||||
type property uint8
|
||||
|
||||
// The order of these constants matter. A Profile may consider runes to be
|
||||
// allowed either from pValid or idDisOrFreePVal.
|
||||
const (
|
||||
unassigned property = iota << propShift
|
||||
disallowed
|
||||
idDisOrFreePVal // disallowed for Identifier, pValid for FreeForm
|
||||
pValid
|
||||
)
|
||||
|
||||
// compute permutations of all properties and specialCategories.
|
||||
type category uint8
|
||||
|
||||
const (
|
||||
other category = iota
|
||||
|
||||
// Special rune types
|
||||
joiningL
|
||||
joiningD
|
||||
joiningT
|
||||
joiningR
|
||||
viramaModifier
|
||||
viramaJoinT // Virama + JoiningT
|
||||
latinSmallL // U+006c
|
||||
greek
|
||||
greekJoinT // Greek + JoiningT
|
||||
hebrew
|
||||
hebrewJoinT // Hebrew + JoiningT
|
||||
japanese // hirigana, katakana, han
|
||||
|
||||
// Special rune types associated with contextual rules defined in
|
||||
// https://tools.ietf.org/html/rfc5892#appendix-A.
|
||||
// ContextO
|
||||
zeroWidthNonJoiner // rule 1
|
||||
zeroWidthJoiner // rule 2
|
||||
// ContextJ
|
||||
middleDot // rule 3
|
||||
greekLowerNumeralSign // rule 4
|
||||
hebrewPreceding // rule 5 and 6
|
||||
katakanaMiddleDot // rule 7
|
||||
arabicIndicDigit // rule 8
|
||||
extendedArabicIndicDigit // rule 9
|
||||
|
||||
numCategories
|
||||
)
|
||||
661
vendor/golang.org/x/text/transform/transform.go
generated
vendored
Normal file
661
vendor/golang.org/x/text/transform/transform.go
generated
vendored
Normal file
|
|
@ -0,0 +1,661 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package transform provides reader and writer wrappers that transform the
|
||||
// bytes passing through as well as various transformations. Example
|
||||
// transformations provided by other packages include normalization and
|
||||
// conversion between character sets.
|
||||
package transform
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"io"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
var (
|
||||
// ErrShortDst means that the destination buffer was too short to
|
||||
// receive all of the transformed bytes.
|
||||
ErrShortDst = errors.New("transform: short destination buffer")
|
||||
|
||||
// ErrShortSrc means that the source buffer has insufficient data to
|
||||
// complete the transformation.
|
||||
ErrShortSrc = errors.New("transform: short source buffer")
|
||||
|
||||
// errInconsistentByteCount means that Transform returned success (nil
|
||||
// error) but also returned nSrc inconsistent with the src argument.
|
||||
errInconsistentByteCount = errors.New("transform: inconsistent byte count returned")
|
||||
|
||||
// errShortInternal means that an internal buffer is not large enough
|
||||
// to make progress and the Transform operation must be aborted.
|
||||
errShortInternal = errors.New("transform: short internal buffer")
|
||||
)
|
||||
|
||||
// Transformer transforms bytes.
|
||||
type Transformer interface {
|
||||
// Transform writes to dst the transformed bytes read from src, and
|
||||
// returns the number of dst bytes written and src bytes read. The
|
||||
// atEOF argument tells whether src represents the last bytes of the
|
||||
// input.
|
||||
//
|
||||
// Callers should always process the nDst bytes produced and account
|
||||
// for the nSrc bytes consumed before considering the error err.
|
||||
//
|
||||
// A nil error means that all of the transformed bytes (whether freshly
|
||||
// transformed from src or left over from previous Transform calls)
|
||||
// were written to dst. A nil error can be returned regardless of
|
||||
// whether atEOF is true. If err is nil then nSrc must equal len(src);
|
||||
// the converse is not necessarily true.
|
||||
//
|
||||
// ErrShortDst means that dst was too short to receive all of the
|
||||
// transformed bytes. ErrShortSrc means that src had insufficient data
|
||||
// to complete the transformation. If both conditions apply, then
|
||||
// either error may be returned. Other than the error conditions listed
|
||||
// here, implementations are free to report other errors that arise.
|
||||
Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error)
|
||||
|
||||
// Reset resets the state and allows a Transformer to be reused.
|
||||
Reset()
|
||||
}
|
||||
|
||||
// NopResetter can be embedded by implementations of Transformer to add a nop
|
||||
// Reset method.
|
||||
type NopResetter struct{}
|
||||
|
||||
// Reset implements the Reset method of the Transformer interface.
|
||||
func (NopResetter) Reset() {}
|
||||
|
||||
// Reader wraps another io.Reader by transforming the bytes read.
|
||||
type Reader struct {
|
||||
r io.Reader
|
||||
t Transformer
|
||||
err error
|
||||
|
||||
// dst[dst0:dst1] contains bytes that have been transformed by t but
|
||||
// not yet copied out via Read.
|
||||
dst []byte
|
||||
dst0, dst1 int
|
||||
|
||||
// src[src0:src1] contains bytes that have been read from r but not
|
||||
// yet transformed through t.
|
||||
src []byte
|
||||
src0, src1 int
|
||||
|
||||
// transformComplete is whether the transformation is complete,
|
||||
// regardless of whether or not it was successful.
|
||||
transformComplete bool
|
||||
}
|
||||
|
||||
const defaultBufSize = 4096
|
||||
|
||||
// NewReader returns a new Reader that wraps r by transforming the bytes read
|
||||
// via t. It calls Reset on t.
|
||||
func NewReader(r io.Reader, t Transformer) *Reader {
|
||||
t.Reset()
|
||||
return &Reader{
|
||||
r: r,
|
||||
t: t,
|
||||
dst: make([]byte, defaultBufSize),
|
||||
src: make([]byte, defaultBufSize),
|
||||
}
|
||||
}
|
||||
|
||||
// Read implements the io.Reader interface.
|
||||
func (r *Reader) Read(p []byte) (int, error) {
|
||||
n, err := 0, error(nil)
|
||||
for {
|
||||
// Copy out any transformed bytes and return the final error if we are done.
|
||||
if r.dst0 != r.dst1 {
|
||||
n = copy(p, r.dst[r.dst0:r.dst1])
|
||||
r.dst0 += n
|
||||
if r.dst0 == r.dst1 && r.transformComplete {
|
||||
return n, r.err
|
||||
}
|
||||
return n, nil
|
||||
} else if r.transformComplete {
|
||||
return 0, r.err
|
||||
}
|
||||
|
||||
// Try to transform some source bytes, or to flush the transformer if we
|
||||
// are out of source bytes. We do this even if r.r.Read returned an error.
|
||||
// As the io.Reader documentation says, "process the n > 0 bytes returned
|
||||
// before considering the error".
|
||||
if r.src0 != r.src1 || r.err != nil {
|
||||
r.dst0 = 0
|
||||
r.dst1, n, err = r.t.Transform(r.dst, r.src[r.src0:r.src1], r.err == io.EOF)
|
||||
r.src0 += n
|
||||
|
||||
switch {
|
||||
case err == nil:
|
||||
if r.src0 != r.src1 {
|
||||
r.err = errInconsistentByteCount
|
||||
}
|
||||
// The Transform call was successful; we are complete if we
|
||||
// cannot read more bytes into src.
|
||||
r.transformComplete = r.err != nil
|
||||
continue
|
||||
case err == ErrShortDst && (r.dst1 != 0 || n != 0):
|
||||
// Make room in dst by copying out, and try again.
|
||||
continue
|
||||
case err == ErrShortSrc && r.src1-r.src0 != len(r.src) && r.err == nil:
|
||||
// Read more bytes into src via the code below, and try again.
|
||||
default:
|
||||
r.transformComplete = true
|
||||
// The reader error (r.err) takes precedence over the
|
||||
// transformer error (err) unless r.err is nil or io.EOF.
|
||||
if r.err == nil || r.err == io.EOF {
|
||||
r.err = err
|
||||
}
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Move any untransformed source bytes to the start of the buffer
|
||||
// and read more bytes.
|
||||
if r.src0 != 0 {
|
||||
r.src0, r.src1 = 0, copy(r.src, r.src[r.src0:r.src1])
|
||||
}
|
||||
n, r.err = r.r.Read(r.src[r.src1:])
|
||||
r.src1 += n
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: implement ReadByte (and ReadRune??).
|
||||
|
||||
// Writer wraps another io.Writer by transforming the bytes read.
|
||||
// The user needs to call Close to flush unwritten bytes that may
|
||||
// be buffered.
|
||||
type Writer struct {
|
||||
w io.Writer
|
||||
t Transformer
|
||||
dst []byte
|
||||
|
||||
// src[:n] contains bytes that have not yet passed through t.
|
||||
src []byte
|
||||
n int
|
||||
}
|
||||
|
||||
// NewWriter returns a new Writer that wraps w by transforming the bytes written
|
||||
// via t. It calls Reset on t.
|
||||
func NewWriter(w io.Writer, t Transformer) *Writer {
|
||||
t.Reset()
|
||||
return &Writer{
|
||||
w: w,
|
||||
t: t,
|
||||
dst: make([]byte, defaultBufSize),
|
||||
src: make([]byte, defaultBufSize),
|
||||
}
|
||||
}
|
||||
|
||||
// Write implements the io.Writer interface. If there are not enough
|
||||
// bytes available to complete a Transform, the bytes will be buffered
|
||||
// for the next write. Call Close to convert the remaining bytes.
|
||||
func (w *Writer) Write(data []byte) (n int, err error) {
|
||||
src := data
|
||||
if w.n > 0 {
|
||||
// Append bytes from data to the last remainder.
|
||||
// TODO: limit the amount copied on first try.
|
||||
n = copy(w.src[w.n:], data)
|
||||
w.n += n
|
||||
src = w.src[:w.n]
|
||||
}
|
||||
for {
|
||||
nDst, nSrc, err := w.t.Transform(w.dst, src, false)
|
||||
if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
|
||||
return n, werr
|
||||
}
|
||||
src = src[nSrc:]
|
||||
if w.n == 0 {
|
||||
n += nSrc
|
||||
} else if len(src) <= n {
|
||||
// Enough bytes from w.src have been consumed. We make src point
|
||||
// to data instead to reduce the copying.
|
||||
w.n = 0
|
||||
n -= len(src)
|
||||
src = data[n:]
|
||||
if n < len(data) && (err == nil || err == ErrShortSrc) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
switch err {
|
||||
case ErrShortDst:
|
||||
// This error is okay as long as we are making progress.
|
||||
if nDst > 0 || nSrc > 0 {
|
||||
continue
|
||||
}
|
||||
case ErrShortSrc:
|
||||
if len(src) < len(w.src) {
|
||||
m := copy(w.src, src)
|
||||
// If w.n > 0, bytes from data were already copied to w.src and n
|
||||
// was already set to the number of bytes consumed.
|
||||
if w.n == 0 {
|
||||
n += m
|
||||
}
|
||||
w.n = m
|
||||
err = nil
|
||||
} else if nDst > 0 || nSrc > 0 {
|
||||
// Not enough buffer to store the remainder. Keep processing as
|
||||
// long as there is progress. Without this case, transforms that
|
||||
// require a lookahead larger than the buffer may result in an
|
||||
// error. This is not something one may expect to be common in
|
||||
// practice, but it may occur when buffers are set to small
|
||||
// sizes during testing.
|
||||
continue
|
||||
}
|
||||
case nil:
|
||||
if w.n > 0 {
|
||||
err = errInconsistentByteCount
|
||||
}
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
}
|
||||
|
||||
// Close implements the io.Closer interface.
|
||||
func (w *Writer) Close() error {
|
||||
src := w.src[:w.n]
|
||||
for {
|
||||
nDst, nSrc, err := w.t.Transform(w.dst, src, true)
|
||||
if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
|
||||
return werr
|
||||
}
|
||||
if err != ErrShortDst {
|
||||
return err
|
||||
}
|
||||
src = src[nSrc:]
|
||||
}
|
||||
}
|
||||
|
||||
type nop struct{ NopResetter }
|
||||
|
||||
func (nop) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
n := copy(dst, src)
|
||||
if n < len(src) {
|
||||
err = ErrShortDst
|
||||
}
|
||||
return n, n, err
|
||||
}
|
||||
|
||||
type discard struct{ NopResetter }
|
||||
|
||||
func (discard) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
return 0, len(src), nil
|
||||
}
|
||||
|
||||
var (
|
||||
// Discard is a Transformer for which all Transform calls succeed
|
||||
// by consuming all bytes and writing nothing.
|
||||
Discard Transformer = discard{}
|
||||
|
||||
// Nop is a Transformer that copies src to dst.
|
||||
Nop Transformer = nop{}
|
||||
)
|
||||
|
||||
// chain is a sequence of links. A chain with N Transformers has N+1 links and
|
||||
// N+1 buffers. Of those N+1 buffers, the first and last are the src and dst
|
||||
// buffers given to chain.Transform and the middle N-1 buffers are intermediate
|
||||
// buffers owned by the chain. The i'th link transforms bytes from the i'th
|
||||
// buffer chain.link[i].b at read offset chain.link[i].p to the i+1'th buffer
|
||||
// chain.link[i+1].b at write offset chain.link[i+1].n, for i in [0, N).
|
||||
type chain struct {
|
||||
link []link
|
||||
err error
|
||||
// errStart is the index at which the error occurred plus 1. Processing
|
||||
// errStart at this level at the next call to Transform. As long as
|
||||
// errStart > 0, chain will not consume any more source bytes.
|
||||
errStart int
|
||||
}
|
||||
|
||||
func (c *chain) fatalError(errIndex int, err error) {
|
||||
if i := errIndex + 1; i > c.errStart {
|
||||
c.errStart = i
|
||||
c.err = err
|
||||
}
|
||||
}
|
||||
|
||||
type link struct {
|
||||
t Transformer
|
||||
// b[p:n] holds the bytes to be transformed by t.
|
||||
b []byte
|
||||
p int
|
||||
n int
|
||||
}
|
||||
|
||||
func (l *link) src() []byte {
|
||||
return l.b[l.p:l.n]
|
||||
}
|
||||
|
||||
func (l *link) dst() []byte {
|
||||
return l.b[l.n:]
|
||||
}
|
||||
|
||||
// Chain returns a Transformer that applies t in sequence.
|
||||
func Chain(t ...Transformer) Transformer {
|
||||
if len(t) == 0 {
|
||||
return nop{}
|
||||
}
|
||||
c := &chain{link: make([]link, len(t)+1)}
|
||||
for i, tt := range t {
|
||||
c.link[i].t = tt
|
||||
}
|
||||
// Allocate intermediate buffers.
|
||||
b := make([][defaultBufSize]byte, len(t)-1)
|
||||
for i := range b {
|
||||
c.link[i+1].b = b[i][:]
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// Reset resets the state of Chain. It calls Reset on all the Transformers.
|
||||
func (c *chain) Reset() {
|
||||
for i, l := range c.link {
|
||||
if l.t != nil {
|
||||
l.t.Reset()
|
||||
}
|
||||
c.link[i].p, c.link[i].n = 0, 0
|
||||
}
|
||||
}
|
||||
|
||||
// Transform applies the transformers of c in sequence.
|
||||
func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
// Set up src and dst in the chain.
|
||||
srcL := &c.link[0]
|
||||
dstL := &c.link[len(c.link)-1]
|
||||
srcL.b, srcL.p, srcL.n = src, 0, len(src)
|
||||
dstL.b, dstL.n = dst, 0
|
||||
var lastFull, needProgress bool // for detecting progress
|
||||
|
||||
// i is the index of the next Transformer to apply, for i in [low, high].
|
||||
// low is the lowest index for which c.link[low] may still produce bytes.
|
||||
// high is the highest index for which c.link[high] has a Transformer.
|
||||
// The error returned by Transform determines whether to increase or
|
||||
// decrease i. We try to completely fill a buffer before converting it.
|
||||
for low, i, high := c.errStart, c.errStart, len(c.link)-2; low <= i && i <= high; {
|
||||
in, out := &c.link[i], &c.link[i+1]
|
||||
nDst, nSrc, err0 := in.t.Transform(out.dst(), in.src(), atEOF && low == i)
|
||||
out.n += nDst
|
||||
in.p += nSrc
|
||||
if i > 0 && in.p == in.n {
|
||||
in.p, in.n = 0, 0
|
||||
}
|
||||
needProgress, lastFull = lastFull, false
|
||||
switch err0 {
|
||||
case ErrShortDst:
|
||||
// Process the destination buffer next. Return if we are already
|
||||
// at the high index.
|
||||
if i == high {
|
||||
return dstL.n, srcL.p, ErrShortDst
|
||||
}
|
||||
if out.n != 0 {
|
||||
i++
|
||||
// If the Transformer at the next index is not able to process any
|
||||
// source bytes there is nothing that can be done to make progress
|
||||
// and the bytes will remain unprocessed. lastFull is used to
|
||||
// detect this and break out of the loop with a fatal error.
|
||||
lastFull = true
|
||||
continue
|
||||
}
|
||||
// The destination buffer was too small, but is completely empty.
|
||||
// Return a fatal error as this transformation can never complete.
|
||||
c.fatalError(i, errShortInternal)
|
||||
case ErrShortSrc:
|
||||
if i == 0 {
|
||||
// Save ErrShortSrc in err. All other errors take precedence.
|
||||
err = ErrShortSrc
|
||||
break
|
||||
}
|
||||
// Source bytes were depleted before filling up the destination buffer.
|
||||
// Verify we made some progress, move the remaining bytes to the errStart
|
||||
// and try to get more source bytes.
|
||||
if needProgress && nSrc == 0 || in.n-in.p == len(in.b) {
|
||||
// There were not enough source bytes to proceed while the source
|
||||
// buffer cannot hold any more bytes. Return a fatal error as this
|
||||
// transformation can never complete.
|
||||
c.fatalError(i, errShortInternal)
|
||||
break
|
||||
}
|
||||
// in.b is an internal buffer and we can make progress.
|
||||
in.p, in.n = 0, copy(in.b, in.src())
|
||||
fallthrough
|
||||
case nil:
|
||||
// if i == low, we have depleted the bytes at index i or any lower levels.
|
||||
// In that case we increase low and i. In all other cases we decrease i to
|
||||
// fetch more bytes before proceeding to the next index.
|
||||
if i > low {
|
||||
i--
|
||||
continue
|
||||
}
|
||||
default:
|
||||
c.fatalError(i, err0)
|
||||
}
|
||||
// Exhausted level low or fatal error: increase low and continue
|
||||
// to process the bytes accepted so far.
|
||||
i++
|
||||
low = i
|
||||
}
|
||||
|
||||
// If c.errStart > 0, this means we found a fatal error. We will clear
|
||||
// all upstream buffers. At this point, no more progress can be made
|
||||
// downstream, as Transform would have bailed while handling ErrShortDst.
|
||||
if c.errStart > 0 {
|
||||
for i := 1; i < c.errStart; i++ {
|
||||
c.link[i].p, c.link[i].n = 0, 0
|
||||
}
|
||||
err, c.errStart, c.err = c.err, 0, nil
|
||||
}
|
||||
return dstL.n, srcL.p, err
|
||||
}
|
||||
|
||||
// RemoveFunc returns a Transformer that removes from the input all runes r for
|
||||
// which f(r) is true. Illegal bytes in the input are replaced by RuneError.
|
||||
func RemoveFunc(f func(r rune) bool) Transformer {
|
||||
return removeF(f)
|
||||
}
|
||||
|
||||
type removeF func(r rune) bool
|
||||
|
||||
func (removeF) Reset() {}
|
||||
|
||||
// Transform implements the Transformer interface.
|
||||
func (t removeF) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] {
|
||||
|
||||
if r = rune(src[0]); r < utf8.RuneSelf {
|
||||
sz = 1
|
||||
} else {
|
||||
r, sz = utf8.DecodeRune(src)
|
||||
|
||||
if sz == 1 {
|
||||
// Invalid rune.
|
||||
if !atEOF && !utf8.FullRune(src) {
|
||||
err = ErrShortSrc
|
||||
break
|
||||
}
|
||||
// We replace illegal bytes with RuneError. Not doing so might
|
||||
// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
|
||||
// The resulting byte sequence may subsequently contain runes
|
||||
// for which t(r) is true that were passed unnoticed.
|
||||
if !t(r) {
|
||||
if nDst+3 > len(dst) {
|
||||
err = ErrShortDst
|
||||
break
|
||||
}
|
||||
nDst += copy(dst[nDst:], "\uFFFD")
|
||||
}
|
||||
nSrc++
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
if !t(r) {
|
||||
if nDst+sz > len(dst) {
|
||||
err = ErrShortDst
|
||||
break
|
||||
}
|
||||
nDst += copy(dst[nDst:], src[:sz])
|
||||
}
|
||||
nSrc += sz
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// grow returns a new []byte that is longer than b, and copies the first n bytes
|
||||
// of b to the start of the new slice.
|
||||
func grow(b []byte, n int) []byte {
|
||||
m := len(b)
|
||||
if m <= 32 {
|
||||
m = 64
|
||||
} else if m <= 256 {
|
||||
m *= 2
|
||||
} else {
|
||||
m += m >> 1
|
||||
}
|
||||
buf := make([]byte, m)
|
||||
copy(buf, b[:n])
|
||||
return buf
|
||||
}
|
||||
|
||||
const initialBufSize = 128
|
||||
|
||||
// String returns a string with the result of converting s[:n] using t, where
|
||||
// n <= len(s). If err == nil, n will be len(s). It calls Reset on t.
|
||||
func String(t Transformer, s string) (result string, n int, err error) {
|
||||
t.Reset()
|
||||
if s == "" {
|
||||
// Fast path for the common case for empty input. Results in about a
|
||||
// 86% reduction of running time for BenchmarkStringLowerEmpty.
|
||||
if _, _, err := t.Transform(nil, nil, true); err == nil {
|
||||
return "", 0, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate only once. Note that both dst and src escape when passed to
|
||||
// Transform.
|
||||
buf := [2 * initialBufSize]byte{}
|
||||
dst := buf[:initialBufSize:initialBufSize]
|
||||
src := buf[initialBufSize : 2*initialBufSize]
|
||||
|
||||
// The input string s is transformed in multiple chunks (starting with a
|
||||
// chunk size of initialBufSize). nDst and nSrc are per-chunk (or
|
||||
// per-Transform-call) indexes, pDst and pSrc are overall indexes.
|
||||
nDst, nSrc := 0, 0
|
||||
pDst, pSrc := 0, 0
|
||||
|
||||
// pPrefix is the length of a common prefix: the first pPrefix bytes of the
|
||||
// result will equal the first pPrefix bytes of s. It is not guaranteed to
|
||||
// be the largest such value, but if pPrefix, len(result) and len(s) are
|
||||
// all equal after the final transform (i.e. calling Transform with atEOF
|
||||
// being true returned nil error) then we don't need to allocate a new
|
||||
// result string.
|
||||
pPrefix := 0
|
||||
for {
|
||||
// Invariant: pDst == pPrefix && pSrc == pPrefix.
|
||||
|
||||
n := copy(src, s[pSrc:])
|
||||
nDst, nSrc, err = t.Transform(dst, src[:n], pSrc+n == len(s))
|
||||
pDst += nDst
|
||||
pSrc += nSrc
|
||||
|
||||
// TODO: let transformers implement an optional Spanner interface, akin
|
||||
// to norm's QuickSpan. This would even allow us to avoid any allocation.
|
||||
if !bytes.Equal(dst[:nDst], src[:nSrc]) {
|
||||
break
|
||||
}
|
||||
pPrefix = pSrc
|
||||
if err == ErrShortDst {
|
||||
// A buffer can only be short if a transformer modifies its input.
|
||||
break
|
||||
} else if err == ErrShortSrc {
|
||||
if nSrc == 0 {
|
||||
// No progress was made.
|
||||
break
|
||||
}
|
||||
// Equal so far and !atEOF, so continue checking.
|
||||
} else if err != nil || pPrefix == len(s) {
|
||||
return string(s[:pPrefix]), pPrefix, err
|
||||
}
|
||||
}
|
||||
// Post-condition: pDst == pPrefix + nDst && pSrc == pPrefix + nSrc.
|
||||
|
||||
// We have transformed the first pSrc bytes of the input s to become pDst
|
||||
// transformed bytes. Those transformed bytes are discontiguous: the first
|
||||
// pPrefix of them equal s[:pPrefix] and the last nDst of them equal
|
||||
// dst[:nDst]. We copy them around, into a new dst buffer if necessary, so
|
||||
// that they become one contiguous slice: dst[:pDst].
|
||||
if pPrefix != 0 {
|
||||
newDst := dst
|
||||
if pDst > len(newDst) {
|
||||
newDst = make([]byte, len(s)+nDst-nSrc)
|
||||
}
|
||||
copy(newDst[pPrefix:pDst], dst[:nDst])
|
||||
copy(newDst[:pPrefix], s[:pPrefix])
|
||||
dst = newDst
|
||||
}
|
||||
|
||||
// Prevent duplicate Transform calls with atEOF being true at the end of
|
||||
// the input. Also return if we have an unrecoverable error.
|
||||
if (err == nil && pSrc == len(s)) ||
|
||||
(err != nil && err != ErrShortDst && err != ErrShortSrc) {
|
||||
return string(dst[:pDst]), pSrc, err
|
||||
}
|
||||
|
||||
// Transform the remaining input, growing dst and src buffers as necessary.
|
||||
for {
|
||||
n := copy(src, s[pSrc:])
|
||||
nDst, nSrc, err := t.Transform(dst[pDst:], src[:n], pSrc+n == len(s))
|
||||
pDst += nDst
|
||||
pSrc += nSrc
|
||||
|
||||
// If we got ErrShortDst or ErrShortSrc, do not grow as long as we can
|
||||
// make progress. This may avoid excessive allocations.
|
||||
if err == ErrShortDst {
|
||||
if nDst == 0 {
|
||||
dst = grow(dst, pDst)
|
||||
}
|
||||
} else if err == ErrShortSrc {
|
||||
if nSrc == 0 {
|
||||
src = grow(src, 0)
|
||||
}
|
||||
} else if err != nil || pSrc == len(s) {
|
||||
return string(dst[:pDst]), pSrc, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Bytes returns a new byte slice with the result of converting b[:n] using t,
|
||||
// where n <= len(b). If err == nil, n will be len(b). It calls Reset on t.
|
||||
func Bytes(t Transformer, b []byte) (result []byte, n int, err error) {
|
||||
return doAppend(t, 0, make([]byte, len(b)), b)
|
||||
}
|
||||
|
||||
// Append appends the result of converting src[:n] using t to dst, where
|
||||
// n <= len(src), If err == nil, n will be len(src). It calls Reset on t.
|
||||
func Append(t Transformer, dst, src []byte) (result []byte, n int, err error) {
|
||||
if len(dst) == cap(dst) {
|
||||
n := len(src) + len(dst) // It is okay for this to be 0.
|
||||
b := make([]byte, n)
|
||||
dst = b[:copy(b, dst)]
|
||||
}
|
||||
return doAppend(t, len(dst), dst[:cap(dst)], src)
|
||||
}
|
||||
|
||||
func doAppend(t Transformer, pDst int, dst, src []byte) (result []byte, n int, err error) {
|
||||
t.Reset()
|
||||
pSrc := 0
|
||||
for {
|
||||
nDst, nSrc, err := t.Transform(dst[pDst:], src[pSrc:], true)
|
||||
pDst += nDst
|
||||
pSrc += nSrc
|
||||
if err != ErrShortDst {
|
||||
return dst[:pDst], pSrc, err
|
||||
}
|
||||
|
||||
// Grow the destination buffer, but do not grow as long as we can make
|
||||
// progress. This may avoid excessive allocations.
|
||||
if nDst == 0 {
|
||||
dst = grow(dst, pDst)
|
||||
}
|
||||
}
|
||||
}
|
||||
198
vendor/golang.org/x/text/unicode/bidi/bidi.go
generated
vendored
Normal file
198
vendor/golang.org/x/text/unicode/bidi/bidi.go
generated
vendored
Normal file
|
|
@ -0,0 +1,198 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run gen.go gen_trieval.go gen_ranges.go
|
||||
|
||||
// Package bidi contains functionality for bidirectional text support.
|
||||
//
|
||||
// See http://www.unicode.org/reports/tr9.
|
||||
//
|
||||
// NOTE: UNDER CONSTRUCTION. This API may change in backwards incompatible ways
|
||||
// and without notice.
|
||||
package bidi
|
||||
|
||||
// TODO:
|
||||
// The following functionality would not be hard to implement, but hinges on
|
||||
// the definition of a Segmenter interface. For now this is up to the user.
|
||||
// - Iterate over paragraphs
|
||||
// - Segmenter to iterate over runs directly from a given text.
|
||||
// Also:
|
||||
// - Transformer for reordering?
|
||||
// - Transformer (validator, really) for Bidi Rule.
|
||||
|
||||
// This API tries to avoid dealing with embedding levels for now. Under the hood
|
||||
// these will be computed, but the question is to which extent the user should
|
||||
// know they exist. We should at some point allow the user to specify an
|
||||
// embedding hierarchy, though.
|
||||
|
||||
// A Direction indicates the overall flow of text.
|
||||
type Direction int
|
||||
|
||||
const (
|
||||
// LeftToRight indicates the text contains no right-to-left characters and
|
||||
// that either there are some left-to-right characters or the option
|
||||
// DefaultDirection(LeftToRight) was passed.
|
||||
LeftToRight Direction = iota
|
||||
|
||||
// RightToLeft indicates the text contains no left-to-right characters and
|
||||
// that either there are some right-to-left characters or the option
|
||||
// DefaultDirection(RightToLeft) was passed.
|
||||
RightToLeft
|
||||
|
||||
// Mixed indicates text contains both left-to-right and right-to-left
|
||||
// characters.
|
||||
Mixed
|
||||
|
||||
// Neutral means that text contains no left-to-right and right-to-left
|
||||
// characters and that no default direction has been set.
|
||||
Neutral
|
||||
)
|
||||
|
||||
type options struct{}
|
||||
|
||||
// An Option is an option for Bidi processing.
|
||||
type Option func(*options)
|
||||
|
||||
// ICU allows the user to define embedding levels. This may be used, for example,
|
||||
// to use hierarchical structure of markup languages to define embeddings.
|
||||
// The following option may be a way to expose this functionality in this API.
|
||||
// // LevelFunc sets a function that associates nesting levels with the given text.
|
||||
// // The levels function will be called with monotonically increasing values for p.
|
||||
// func LevelFunc(levels func(p int) int) Option {
|
||||
// panic("unimplemented")
|
||||
// }
|
||||
|
||||
// DefaultDirection sets the default direction for a Paragraph. The direction is
|
||||
// overridden if the text contains directional characters.
|
||||
func DefaultDirection(d Direction) Option {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// A Paragraph holds a single Paragraph for Bidi processing.
|
||||
type Paragraph struct {
|
||||
// buffers
|
||||
}
|
||||
|
||||
// SetBytes configures p for the given paragraph text. It replaces text
|
||||
// previously set by SetBytes or SetString. If b contains a paragraph separator
|
||||
// it will only process the first paragraph and report the number of bytes
|
||||
// consumed from b including this separator. Error may be non-nil if options are
|
||||
// given.
|
||||
func (p *Paragraph) SetBytes(b []byte, opts ...Option) (n int, err error) {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// SetString configures p for the given paragraph text. It replaces text
|
||||
// previously set by SetBytes or SetString. If b contains a paragraph separator
|
||||
// it will only process the first paragraph and report the number of bytes
|
||||
// consumed from b including this separator. Error may be non-nil if options are
|
||||
// given.
|
||||
func (p *Paragraph) SetString(s string, opts ...Option) (n int, err error) {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// IsLeftToRight reports whether the principle direction of rendering for this
|
||||
// paragraphs is left-to-right. If this returns false, the principle direction
|
||||
// of rendering is right-to-left.
|
||||
func (p *Paragraph) IsLeftToRight() bool {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// Direction returns the direction of the text of this paragraph.
|
||||
//
|
||||
// The direction may be LeftToRight, RightToLeft, Mixed, or Neutral.
|
||||
func (p *Paragraph) Direction() Direction {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// RunAt reports the Run at the given position of the input text.
|
||||
//
|
||||
// This method can be used for computing line breaks on paragraphs.
|
||||
func (p *Paragraph) RunAt(pos int) Run {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// Order computes the visual ordering of all the runs in a Paragraph.
|
||||
func (p *Paragraph) Order() (Ordering, error) {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// Line computes the visual ordering of runs for a single line starting and
|
||||
// ending at the given positions in the original text.
|
||||
func (p *Paragraph) Line(start, end int) (Ordering, error) {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// An Ordering holds the computed visual order of runs of a Paragraph. Calling
|
||||
// SetBytes or SetString on the originating Paragraph invalidates an Ordering.
|
||||
// The methods of an Ordering should only be called by one goroutine at a time.
|
||||
type Ordering struct{}
|
||||
|
||||
// Direction reports the directionality of the runs.
|
||||
//
|
||||
// The direction may be LeftToRight, RightToLeft, Mixed, or Neutral.
|
||||
func (o *Ordering) Direction() Direction {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// NumRuns returns the number of runs.
|
||||
func (o *Ordering) NumRuns() int {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// Run returns the ith run within the ordering.
|
||||
func (o *Ordering) Run(i int) Run {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// TODO: perhaps with options.
|
||||
// // Reorder creates a reader that reads the runes in visual order per character.
|
||||
// // Modifiers remain after the runes they modify.
|
||||
// func (l *Runs) Reorder() io.Reader {
|
||||
// panic("unimplemented")
|
||||
// }
|
||||
|
||||
// A Run is a continuous sequence of characters of a single direction.
|
||||
type Run struct {
|
||||
}
|
||||
|
||||
// String returns the text of the run in its original order.
|
||||
func (r *Run) String() string {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// Bytes returns the text of the run in its original order.
|
||||
func (r *Run) Bytes() []byte {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// TODO: methods for
|
||||
// - Display order
|
||||
// - headers and footers
|
||||
// - bracket replacement.
|
||||
|
||||
// Direction reports the direction of the run.
|
||||
func (r *Run) Direction() Direction {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// Position of the Run within the text passed to SetBytes or SetString of the
|
||||
// originating Paragraph value.
|
||||
func (r *Run) Pos() (start, end int) {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// AppendReverse reverses the order of characters of in, appends them to out,
|
||||
// and returns the result. Modifiers will still follow the runes they modify.
|
||||
// Brackets are replaced with their counterparts.
|
||||
func AppendReverse(out, in []byte) []byte {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// ReverseString reverses the order of characters in s and returns a new string.
|
||||
// Modifiers will still follow the runes they modify. Brackets are replaced with
|
||||
// their counterparts.
|
||||
func ReverseString(s string) string {
|
||||
panic("unimplemented")
|
||||
}
|
||||
307
vendor/golang.org/x/text/unicode/bidi/bracket.go
generated
vendored
Normal file
307
vendor/golang.org/x/text/unicode/bidi/bracket.go
generated
vendored
Normal file
|
|
@ -0,0 +1,307 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package bidi
|
||||
|
||||
import (
|
||||
"container/list"
|
||||
"fmt"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// This file contains a port of the reference implementation of the
|
||||
// Bidi Parentheses Algorithm:
|
||||
// http://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java
|
||||
//
|
||||
// The implementation in this file covers definitions BD14-BD16 and rule N0
|
||||
// of UAX#9.
|
||||
//
|
||||
// Some preprocessing is done for each rune before data is passed to this
|
||||
// algorithm:
|
||||
// - opening and closing brackets are identified
|
||||
// - a bracket pair type, like '(' and ')' is assigned a unique identifier that
|
||||
// is identical for the opening and closing bracket. It is left to do these
|
||||
// mappings.
|
||||
// - The BPA algorithm requires that bracket characters that are canonical
|
||||
// equivalents of each other be able to be substituted for each other.
|
||||
// It is the responsibility of the caller to do this canonicalization.
|
||||
//
|
||||
// In implementing BD16, this implementation departs slightly from the "logical"
|
||||
// algorithm defined in UAX#9. In particular, the stack referenced there
|
||||
// supports operations that go beyond a "basic" stack. An equivalent
|
||||
// implementation based on a linked list is used here.
|
||||
|
||||
// Bidi_Paired_Bracket_Type
|
||||
// BD14. An opening paired bracket is a character whose
|
||||
// Bidi_Paired_Bracket_Type property value is Open.
|
||||
//
|
||||
// BD15. A closing paired bracket is a character whose
|
||||
// Bidi_Paired_Bracket_Type property value is Close.
|
||||
type bracketType byte
|
||||
|
||||
const (
|
||||
bpNone bracketType = iota
|
||||
bpOpen
|
||||
bpClose
|
||||
)
|
||||
|
||||
// bracketPair holds a pair of index values for opening and closing bracket
|
||||
// location of a bracket pair.
|
||||
type bracketPair struct {
|
||||
opener int
|
||||
closer int
|
||||
}
|
||||
|
||||
func (b *bracketPair) String() string {
|
||||
return fmt.Sprintf("(%v, %v)", b.opener, b.closer)
|
||||
}
|
||||
|
||||
// bracketPairs is a slice of bracketPairs with a sort.Interface implementation.
|
||||
type bracketPairs []bracketPair
|
||||
|
||||
func (b bracketPairs) Len() int { return len(b) }
|
||||
func (b bracketPairs) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
||||
func (b bracketPairs) Less(i, j int) bool { return b[i].opener < b[j].opener }
|
||||
|
||||
// resolvePairedBrackets runs the paired bracket part of the UBA algorithm.
|
||||
//
|
||||
// For each rune, it takes the indexes into the original string, the class the
|
||||
// bracket type (in pairTypes) and the bracket identifier (pairValues). It also
|
||||
// takes the direction type for the start-of-sentence and the embedding level.
|
||||
//
|
||||
// The identifiers for bracket types are the rune of the canonicalized opening
|
||||
// bracket for brackets (open or close) or 0 for runes that are not brackets.
|
||||
func resolvePairedBrackets(s *isolatingRunSequence) {
|
||||
p := bracketPairer{
|
||||
sos: s.sos,
|
||||
openers: list.New(),
|
||||
codesIsolatedRun: s.types,
|
||||
indexes: s.indexes,
|
||||
}
|
||||
dirEmbed := L
|
||||
if s.level&1 != 0 {
|
||||
dirEmbed = R
|
||||
}
|
||||
p.locateBrackets(s.p.pairTypes, s.p.pairValues)
|
||||
p.resolveBrackets(dirEmbed)
|
||||
}
|
||||
|
||||
type bracketPairer struct {
|
||||
sos Class // direction corresponding to start of sequence
|
||||
|
||||
// The following is a restatement of BD 16 using non-algorithmic language.
|
||||
//
|
||||
// A bracket pair is a pair of characters consisting of an opening
|
||||
// paired bracket and a closing paired bracket such that the
|
||||
// Bidi_Paired_Bracket property value of the former equals the latter,
|
||||
// subject to the following constraints.
|
||||
// - both characters of a pair occur in the same isolating run sequence
|
||||
// - the closing character of a pair follows the opening character
|
||||
// - any bracket character can belong at most to one pair, the earliest possible one
|
||||
// - any bracket character not part of a pair is treated like an ordinary character
|
||||
// - pairs may nest properly, but their spans may not overlap otherwise
|
||||
|
||||
// Bracket characters with canonical decompositions are supposed to be
|
||||
// treated as if they had been normalized, to allow normalized and non-
|
||||
// normalized text to give the same result. In this implementation that step
|
||||
// is pushed out to the caller. The caller has to ensure that the pairValue
|
||||
// slices contain the rune of the opening bracket after normalization for
|
||||
// any opening or closing bracket.
|
||||
|
||||
openers *list.List // list of positions for opening brackets
|
||||
|
||||
// bracket pair positions sorted by location of opening bracket
|
||||
pairPositions bracketPairs
|
||||
|
||||
codesIsolatedRun []Class // directional bidi codes for an isolated run
|
||||
indexes []int // array of index values into the original string
|
||||
|
||||
}
|
||||
|
||||
// matchOpener reports whether characters at given positions form a matching
|
||||
// bracket pair.
|
||||
func (p *bracketPairer) matchOpener(pairValues []rune, opener, closer int) bool {
|
||||
return pairValues[p.indexes[opener]] == pairValues[p.indexes[closer]]
|
||||
}
|
||||
|
||||
// locateBrackets locates matching bracket pairs according to BD16.
|
||||
//
|
||||
// This implementation uses a linked list instead of a stack, because, while
|
||||
// elements are added at the front (like a push) they are not generally removed
|
||||
// in atomic 'pop' operations, reducing the benefit of the stack archetype.
|
||||
func (p *bracketPairer) locateBrackets(pairTypes []bracketType, pairValues []rune) {
|
||||
// traverse the run
|
||||
// do that explicitly (not in a for-each) so we can record position
|
||||
for i, index := range p.indexes {
|
||||
|
||||
// look at the bracket type for each character
|
||||
switch pairTypes[index] {
|
||||
case bpNone:
|
||||
// continue scanning
|
||||
|
||||
case bpOpen:
|
||||
// remember opener location, most recent first
|
||||
p.openers.PushFront(i)
|
||||
|
||||
case bpClose:
|
||||
// see if there is a match
|
||||
count := 0
|
||||
for elem := p.openers.Front(); elem != nil; elem = elem.Next() {
|
||||
count++
|
||||
opener := elem.Value.(int)
|
||||
if p.matchOpener(pairValues, opener, i) {
|
||||
// if the opener matches, add nested pair to the ordered list
|
||||
p.pairPositions = append(p.pairPositions, bracketPair{opener, i})
|
||||
// remove up to and including matched opener
|
||||
for ; count > 0; count-- {
|
||||
p.openers.Remove(p.openers.Front())
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
sort.Sort(p.pairPositions)
|
||||
// if we get here, the closing bracket matched no openers
|
||||
// and gets ignored
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Bracket pairs within an isolating run sequence are processed as units so
|
||||
// that both the opening and the closing paired bracket in a pair resolve to
|
||||
// the same direction.
|
||||
//
|
||||
// N0. Process bracket pairs in an isolating run sequence sequentially in
|
||||
// the logical order of the text positions of the opening paired brackets
|
||||
// using the logic given below. Within this scope, bidirectional types EN
|
||||
// and AN are treated as R.
|
||||
//
|
||||
// Identify the bracket pairs in the current isolating run sequence
|
||||
// according to BD16. For each bracket-pair element in the list of pairs of
|
||||
// text positions:
|
||||
//
|
||||
// a Inspect the bidirectional types of the characters enclosed within the
|
||||
// bracket pair.
|
||||
//
|
||||
// b If any strong type (either L or R) matching the embedding direction is
|
||||
// found, set the type for both brackets in the pair to match the embedding
|
||||
// direction.
|
||||
//
|
||||
// o [ e ] o -> o e e e o
|
||||
//
|
||||
// o [ o e ] -> o e o e e
|
||||
//
|
||||
// o [ NI e ] -> o e NI e e
|
||||
//
|
||||
// c Otherwise, if a strong type (opposite the embedding direction) is
|
||||
// found, test for adjacent strong types as follows: 1 First, check
|
||||
// backwards before the opening paired bracket until the first strong type
|
||||
// (L, R, or sos) is found. If that first preceding strong type is opposite
|
||||
// the embedding direction, then set the type for both brackets in the pair
|
||||
// to that type. 2 Otherwise, set the type for both brackets in the pair to
|
||||
// the embedding direction.
|
||||
//
|
||||
// o [ o ] e -> o o o o e
|
||||
//
|
||||
// o [ o NI ] o -> o o o NI o o
|
||||
//
|
||||
// e [ o ] o -> e e o e o
|
||||
//
|
||||
// e [ o ] e -> e e o e e
|
||||
//
|
||||
// e ( o [ o ] NI ) e -> e e o o o o NI e e
|
||||
//
|
||||
// d Otherwise, do not set the type for the current bracket pair. Note that
|
||||
// if the enclosed text contains no strong types the paired brackets will
|
||||
// both resolve to the same level when resolved individually using rules N1
|
||||
// and N2.
|
||||
//
|
||||
// e ( NI ) o -> e ( NI ) o
|
||||
|
||||
// getStrongTypeN0 maps character's directional code to strong type as required
|
||||
// by rule N0.
|
||||
//
|
||||
// TODO: have separate type for "strong" directionality.
|
||||
func (p *bracketPairer) getStrongTypeN0(index int) Class {
|
||||
switch p.codesIsolatedRun[index] {
|
||||
// in the scope of N0, number types are treated as R
|
||||
case EN, AN, AL, R:
|
||||
return R
|
||||
case L:
|
||||
return L
|
||||
default:
|
||||
return ON
|
||||
}
|
||||
}
|
||||
|
||||
// classifyPairContent reports the strong types contained inside a Bracket Pair,
|
||||
// assuming the given embedding direction.
|
||||
//
|
||||
// It returns ON if no strong type is found. If a single strong type is found,
|
||||
// it returns this this type. Otherwise it returns the embedding direction.
|
||||
//
|
||||
// TODO: use separate type for "strong" directionality.
|
||||
func (p *bracketPairer) classifyPairContent(loc bracketPair, dirEmbed Class) Class {
|
||||
dirOpposite := ON
|
||||
for i := loc.opener + 1; i < loc.closer; i++ {
|
||||
dir := p.getStrongTypeN0(i)
|
||||
if dir == ON {
|
||||
continue
|
||||
}
|
||||
if dir == dirEmbed {
|
||||
return dir // type matching embedding direction found
|
||||
}
|
||||
dirOpposite = dir
|
||||
}
|
||||
// return ON if no strong type found, or class opposite to dirEmbed
|
||||
return dirOpposite
|
||||
}
|
||||
|
||||
// classBeforePair determines which strong types are present before a Bracket
|
||||
// Pair. Return R or L if strong type found, otherwise ON.
|
||||
func (p *bracketPairer) classBeforePair(loc bracketPair) Class {
|
||||
for i := loc.opener - 1; i >= 0; i-- {
|
||||
if dir := p.getStrongTypeN0(i); dir != ON {
|
||||
return dir
|
||||
}
|
||||
}
|
||||
// no strong types found, return sos
|
||||
return p.sos
|
||||
}
|
||||
|
||||
// assignBracketType implements rule N0 for a single bracket pair.
|
||||
func (p *bracketPairer) assignBracketType(loc bracketPair, dirEmbed Class) {
|
||||
// rule "N0, a", inspect contents of pair
|
||||
dirPair := p.classifyPairContent(loc, dirEmbed)
|
||||
|
||||
// dirPair is now L, R, or N (no strong type found)
|
||||
|
||||
// the following logical tests are performed out of order compared to
|
||||
// the statement of the rules but yield the same results
|
||||
if dirPair == ON {
|
||||
return // case "d" - nothing to do
|
||||
}
|
||||
|
||||
if dirPair != dirEmbed {
|
||||
// case "c": strong type found, opposite - check before (c.1)
|
||||
dirPair = p.classBeforePair(loc)
|
||||
if dirPair == dirEmbed || dirPair == ON {
|
||||
// no strong opposite type found before - use embedding (c.2)
|
||||
dirPair = dirEmbed
|
||||
}
|
||||
}
|
||||
// else: case "b", strong type found matching embedding,
|
||||
// no explicit action needed, as dirPair is already set to embedding
|
||||
// direction
|
||||
|
||||
// set the bracket types to the type found
|
||||
p.codesIsolatedRun[loc.opener] = dirPair
|
||||
p.codesIsolatedRun[loc.closer] = dirPair
|
||||
}
|
||||
|
||||
// resolveBrackets implements rule N0 for a list of pairs.
|
||||
func (p *bracketPairer) resolveBrackets(dirEmbed Class) {
|
||||
for _, loc := range p.pairPositions {
|
||||
p.assignBracketType(loc, dirEmbed)
|
||||
}
|
||||
}
|
||||
1055
vendor/golang.org/x/text/unicode/bidi/core.go
generated
vendored
Normal file
1055
vendor/golang.org/x/text/unicode/bidi/core.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
133
vendor/golang.org/x/text/unicode/bidi/gen.go
generated
vendored
Normal file
133
vendor/golang.org/x/text/unicode/bidi/gen.go
generated
vendored
Normal file
|
|
@ -0,0 +1,133 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"log"
|
||||
|
||||
"golang.org/x/text/internal/gen"
|
||||
"golang.org/x/text/internal/triegen"
|
||||
"golang.org/x/text/internal/ucd"
|
||||
)
|
||||
|
||||
var outputFile = flag.String("out", "tables.go", "output file")
|
||||
|
||||
func main() {
|
||||
gen.Init()
|
||||
gen.Repackage("gen_trieval.go", "trieval.go", "bidi")
|
||||
gen.Repackage("gen_ranges.go", "ranges_test.go", "bidi")
|
||||
|
||||
genTables()
|
||||
}
|
||||
|
||||
// bidiClass names and codes taken from class "bc" in
|
||||
// http://www.unicode.org/Public/8.0.0/ucd/PropertyValueAliases.txt
|
||||
var bidiClass = map[string]Class{
|
||||
"AL": AL, // ArabicLetter
|
||||
"AN": AN, // ArabicNumber
|
||||
"B": B, // ParagraphSeparator
|
||||
"BN": BN, // BoundaryNeutral
|
||||
"CS": CS, // CommonSeparator
|
||||
"EN": EN, // EuropeanNumber
|
||||
"ES": ES, // EuropeanSeparator
|
||||
"ET": ET, // EuropeanTerminator
|
||||
"L": L, // LeftToRight
|
||||
"NSM": NSM, // NonspacingMark
|
||||
"ON": ON, // OtherNeutral
|
||||
"R": R, // RightToLeft
|
||||
"S": S, // SegmentSeparator
|
||||
"WS": WS, // WhiteSpace
|
||||
|
||||
"FSI": Control,
|
||||
"PDF": Control,
|
||||
"PDI": Control,
|
||||
"LRE": Control,
|
||||
"LRI": Control,
|
||||
"LRO": Control,
|
||||
"RLE": Control,
|
||||
"RLI": Control,
|
||||
"RLO": Control,
|
||||
}
|
||||
|
||||
func genTables() {
|
||||
if numClass > 0x0F {
|
||||
log.Fatalf("Too many Class constants (%#x > 0x0F).", numClass)
|
||||
}
|
||||
w := gen.NewCodeWriter()
|
||||
defer w.WriteGoFile(*outputFile, "bidi")
|
||||
|
||||
gen.WriteUnicodeVersion(w)
|
||||
|
||||
t := triegen.NewTrie("bidi")
|
||||
|
||||
// Build data about bracket mapping. These bits need to be or-ed with
|
||||
// any other bits.
|
||||
orMask := map[rune]uint64{}
|
||||
|
||||
xorMap := map[rune]int{}
|
||||
xorMasks := []rune{0} // First value is no-op.
|
||||
|
||||
ucd.Parse(gen.OpenUCDFile("BidiBrackets.txt"), func(p *ucd.Parser) {
|
||||
r1 := p.Rune(0)
|
||||
r2 := p.Rune(1)
|
||||
xor := r1 ^ r2
|
||||
if _, ok := xorMap[xor]; !ok {
|
||||
xorMap[xor] = len(xorMasks)
|
||||
xorMasks = append(xorMasks, xor)
|
||||
}
|
||||
entry := uint64(xorMap[xor]) << xorMaskShift
|
||||
switch p.String(2) {
|
||||
case "o":
|
||||
entry |= openMask
|
||||
case "c", "n":
|
||||
default:
|
||||
log.Fatalf("Unknown bracket class %q.", p.String(2))
|
||||
}
|
||||
orMask[r1] = entry
|
||||
})
|
||||
|
||||
w.WriteComment(`
|
||||
xorMasks contains masks to be xor-ed with brackets to get the reverse
|
||||
version.`)
|
||||
w.WriteVar("xorMasks", xorMasks)
|
||||
|
||||
done := map[rune]bool{}
|
||||
|
||||
insert := func(r rune, c Class) {
|
||||
if !done[r] {
|
||||
t.Insert(r, orMask[r]|uint64(c))
|
||||
done[r] = true
|
||||
}
|
||||
}
|
||||
|
||||
// Insert the derived BiDi properties.
|
||||
ucd.Parse(gen.OpenUCDFile("extracted/DerivedBidiClass.txt"), func(p *ucd.Parser) {
|
||||
r := p.Rune(0)
|
||||
class, ok := bidiClass[p.String(1)]
|
||||
if !ok {
|
||||
log.Fatalf("%U: Unknown BiDi class %q", r, p.String(1))
|
||||
}
|
||||
insert(r, class)
|
||||
})
|
||||
visitDefaults(insert)
|
||||
|
||||
// TODO: use sparse blocks. This would reduce table size considerably
|
||||
// from the looks of it.
|
||||
|
||||
sz, err := t.Gen(w)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
w.Size += sz
|
||||
}
|
||||
|
||||
// dummy values to make methods in gen_common compile. The real versions
|
||||
// will be generated by this file to tables.go.
|
||||
var (
|
||||
xorMasks []rune
|
||||
)
|
||||
57
vendor/golang.org/x/text/unicode/bidi/gen_ranges.go
generated
vendored
Normal file
57
vendor/golang.org/x/text/unicode/bidi/gen_ranges.go
generated
vendored
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"unicode"
|
||||
|
||||
"golang.org/x/text/internal/gen"
|
||||
"golang.org/x/text/internal/ucd"
|
||||
"golang.org/x/text/unicode/rangetable"
|
||||
)
|
||||
|
||||
// These tables are hand-extracted from:
|
||||
// http://www.unicode.org/Public/8.0.0/ucd/extracted/DerivedBidiClass.txt
|
||||
func visitDefaults(fn func(r rune, c Class)) {
|
||||
// first write default values for ranges listed above.
|
||||
visitRunes(fn, AL, []rune{
|
||||
0x0600, 0x07BF, // Arabic
|
||||
0x08A0, 0x08FF, // Arabic Extended-A
|
||||
0xFB50, 0xFDCF, // Arabic Presentation Forms
|
||||
0xFDF0, 0xFDFF,
|
||||
0xFE70, 0xFEFF,
|
||||
0x0001EE00, 0x0001EEFF, // Arabic Mathematical Alpha Symbols
|
||||
})
|
||||
visitRunes(fn, R, []rune{
|
||||
0x0590, 0x05FF, // Hebrew
|
||||
0x07C0, 0x089F, // Nko et al.
|
||||
0xFB1D, 0xFB4F,
|
||||
0x00010800, 0x00010FFF, // Cypriot Syllabary et. al.
|
||||
0x0001E800, 0x0001EDFF,
|
||||
0x0001EF00, 0x0001EFFF,
|
||||
})
|
||||
visitRunes(fn, ET, []rune{ // European Terminator
|
||||
0x20A0, 0x20Cf, // Currency symbols
|
||||
})
|
||||
rangetable.Visit(unicode.Noncharacter_Code_Point, func(r rune) {
|
||||
fn(r, BN) // Boundary Neutral
|
||||
})
|
||||
ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) {
|
||||
if p.String(1) == "Default_Ignorable_Code_Point" {
|
||||
fn(p.Rune(0), BN) // Boundary Neutral
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func visitRunes(fn func(r rune, c Class), c Class, runes []rune) {
|
||||
for i := 0; i < len(runes); i += 2 {
|
||||
lo, hi := runes[i], runes[i+1]
|
||||
for j := lo; j <= hi; j++ {
|
||||
fn(j, c)
|
||||
}
|
||||
}
|
||||
}
|
||||
64
vendor/golang.org/x/text/unicode/bidi/gen_trieval.go
generated
vendored
Normal file
64
vendor/golang.org/x/text/unicode/bidi/gen_trieval.go
generated
vendored
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// Class is the Unicode BiDi class. Each rune has a single class.
|
||||
type Class uint
|
||||
|
||||
const (
|
||||
L Class = iota // LeftToRight
|
||||
R // RightToLeft
|
||||
EN // EuropeanNumber
|
||||
ES // EuropeanSeparator
|
||||
ET // EuropeanTerminator
|
||||
AN // ArabicNumber
|
||||
CS // CommonSeparator
|
||||
B // ParagraphSeparator
|
||||
S // SegmentSeparator
|
||||
WS // WhiteSpace
|
||||
ON // OtherNeutral
|
||||
BN // BoundaryNeutral
|
||||
NSM // NonspacingMark
|
||||
AL // ArabicLetter
|
||||
Control // Control LRO - PDI
|
||||
|
||||
numClass
|
||||
|
||||
LRO // LeftToRightOverride
|
||||
RLO // RightToLeftOverride
|
||||
LRE // LeftToRightEmbedding
|
||||
RLE // RightToLeftEmbedding
|
||||
PDF // PopDirectionalFormat
|
||||
LRI // LeftToRightIsolate
|
||||
RLI // RightToLeftIsolate
|
||||
FSI // FirstStrongIsolate
|
||||
PDI // PopDirectionalIsolate
|
||||
|
||||
unknownClass = ^Class(0)
|
||||
)
|
||||
|
||||
var controlToClass = map[rune]Class{
|
||||
0x202D: LRO, // LeftToRightOverride,
|
||||
0x202E: RLO, // RightToLeftOverride,
|
||||
0x202A: LRE, // LeftToRightEmbedding,
|
||||
0x202B: RLE, // RightToLeftEmbedding,
|
||||
0x202C: PDF, // PopDirectionalFormat,
|
||||
0x2066: LRI, // LeftToRightIsolate,
|
||||
0x2067: RLI, // RightToLeftIsolate,
|
||||
0x2068: FSI, // FirstStrongIsolate,
|
||||
0x2069: PDI, // PopDirectionalIsolate,
|
||||
}
|
||||
|
||||
// A trie entry has the following bits:
|
||||
// 7..5 XOR mask for brackets
|
||||
// 4 1: Bracket open, 0: Bracket close
|
||||
// 3..0 Class type
|
||||
|
||||
const (
|
||||
openMask = 0x10
|
||||
xorMaskShift = 5
|
||||
)
|
||||
206
vendor/golang.org/x/text/unicode/bidi/prop.go
generated
vendored
Normal file
206
vendor/golang.org/x/text/unicode/bidi/prop.go
generated
vendored
Normal file
|
|
@ -0,0 +1,206 @@
|
|||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package bidi
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// Properties provides access to BiDi properties of runes.
|
||||
type Properties struct {
|
||||
entry uint8
|
||||
last uint8
|
||||
}
|
||||
|
||||
var trie = newBidiTrie(0)
|
||||
|
||||
// TODO: using this for bidirule reduces the running time by about 5%. Consider
|
||||
// if this is worth exposing or if we can find a way to speed up the Class
|
||||
// method.
|
||||
//
|
||||
// // CompactClass is like Class, but maps all of the BiDi control classes
|
||||
// // (LRO, RLO, LRE, RLE, PDF, LRI, RLI, FSI, PDI) to the class Control.
|
||||
// func (p Properties) CompactClass() Class {
|
||||
// return Class(p.entry & 0x0F)
|
||||
// }
|
||||
|
||||
// Class returns the Bidi class for p.
|
||||
func (p Properties) Class() Class {
|
||||
c := Class(p.entry & 0x0F)
|
||||
if c == Control {
|
||||
c = controlByteToClass[p.last&0xF]
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// IsBracket reports whether the rune is a bracket.
|
||||
func (p Properties) IsBracket() bool { return p.entry&0xF0 != 0 }
|
||||
|
||||
// IsOpeningBracket reports whether the rune is an opening bracket.
|
||||
// IsBracket must return true.
|
||||
func (p Properties) IsOpeningBracket() bool { return p.entry&openMask != 0 }
|
||||
|
||||
// TODO: find a better API and expose.
|
||||
func (p Properties) reverseBracket(r rune) rune {
|
||||
return xorMasks[p.entry>>xorMaskShift] ^ r
|
||||
}
|
||||
|
||||
var controlByteToClass = [16]Class{
|
||||
0xD: LRO, // U+202D LeftToRightOverride,
|
||||
0xE: RLO, // U+202E RightToLeftOverride,
|
||||
0xA: LRE, // U+202A LeftToRightEmbedding,
|
||||
0xB: RLE, // U+202B RightToLeftEmbedding,
|
||||
0xC: PDF, // U+202C PopDirectionalFormat,
|
||||
0x6: LRI, // U+2066 LeftToRightIsolate,
|
||||
0x7: RLI, // U+2067 RightToLeftIsolate,
|
||||
0x8: FSI, // U+2068 FirstStrongIsolate,
|
||||
0x9: PDI, // U+2069 PopDirectionalIsolate,
|
||||
}
|
||||
|
||||
// LookupRune returns properties for r.
|
||||
func LookupRune(r rune) (p Properties, size int) {
|
||||
var buf [4]byte
|
||||
n := utf8.EncodeRune(buf[:], r)
|
||||
return Lookup(buf[:n])
|
||||
}
|
||||
|
||||
// TODO: these lookup methods are based on the generated trie code. The returned
|
||||
// sizes have slightly different semantics from the generated code, in that it
|
||||
// always returns size==1 for an illegal UTF-8 byte (instead of the length
|
||||
// of the maximum invalid subsequence). Most Transformers, like unicode/norm,
|
||||
// leave invalid UTF-8 untouched, in which case it has performance benefits to
|
||||
// do so (without changing the semantics). Bidi requires the semantics used here
|
||||
// for the bidirule implementation to be compatible with the Go semantics.
|
||||
// They ultimately should perhaps be adopted by all trie implementations, for
|
||||
// convenience sake.
|
||||
// This unrolled code also boosts performance of the secure/bidirule package by
|
||||
// about 30%.
|
||||
// So, to remove this code:
|
||||
// - add option to trie generator to define return type.
|
||||
// - always return 1 byte size for ill-formed UTF-8 runes.
|
||||
|
||||
// Lookup returns properties for the first rune in s and the width in bytes of
|
||||
// its encoding. The size will be 0 if s does not hold enough bytes to complete
|
||||
// the encoding.
|
||||
func Lookup(s []byte) (p Properties, sz int) {
|
||||
c0 := s[0]
|
||||
switch {
|
||||
case c0 < 0x80: // is ASCII
|
||||
return Properties{entry: bidiValues[c0]}, 1
|
||||
case c0 < 0xC2:
|
||||
return Properties{}, 1
|
||||
case c0 < 0xE0: // 2-byte UTF-8
|
||||
if len(s) < 2 {
|
||||
return Properties{}, 0
|
||||
}
|
||||
i := bidiIndex[c0]
|
||||
c1 := s[1]
|
||||
if c1 < 0x80 || 0xC0 <= c1 {
|
||||
return Properties{}, 1
|
||||
}
|
||||
return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2
|
||||
case c0 < 0xF0: // 3-byte UTF-8
|
||||
if len(s) < 3 {
|
||||
return Properties{}, 0
|
||||
}
|
||||
i := bidiIndex[c0]
|
||||
c1 := s[1]
|
||||
if c1 < 0x80 || 0xC0 <= c1 {
|
||||
return Properties{}, 1
|
||||
}
|
||||
o := uint32(i)<<6 + uint32(c1)
|
||||
i = bidiIndex[o]
|
||||
c2 := s[2]
|
||||
if c2 < 0x80 || 0xC0 <= c2 {
|
||||
return Properties{}, 1
|
||||
}
|
||||
return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3
|
||||
case c0 < 0xF8: // 4-byte UTF-8
|
||||
if len(s) < 4 {
|
||||
return Properties{}, 0
|
||||
}
|
||||
i := bidiIndex[c0]
|
||||
c1 := s[1]
|
||||
if c1 < 0x80 || 0xC0 <= c1 {
|
||||
return Properties{}, 1
|
||||
}
|
||||
o := uint32(i)<<6 + uint32(c1)
|
||||
i = bidiIndex[o]
|
||||
c2 := s[2]
|
||||
if c2 < 0x80 || 0xC0 <= c2 {
|
||||
return Properties{}, 1
|
||||
}
|
||||
o = uint32(i)<<6 + uint32(c2)
|
||||
i = bidiIndex[o]
|
||||
c3 := s[3]
|
||||
if c3 < 0x80 || 0xC0 <= c3 {
|
||||
return Properties{}, 1
|
||||
}
|
||||
return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4
|
||||
}
|
||||
// Illegal rune
|
||||
return Properties{}, 1
|
||||
}
|
||||
|
||||
// LookupString returns properties for the first rune in s and the width in
|
||||
// bytes of its encoding. The size will be 0 if s does not hold enough bytes to
|
||||
// complete the encoding.
|
||||
func LookupString(s string) (p Properties, sz int) {
|
||||
c0 := s[0]
|
||||
switch {
|
||||
case c0 < 0x80: // is ASCII
|
||||
return Properties{entry: bidiValues[c0]}, 1
|
||||
case c0 < 0xC2:
|
||||
return Properties{}, 1
|
||||
case c0 < 0xE0: // 2-byte UTF-8
|
||||
if len(s) < 2 {
|
||||
return Properties{}, 0
|
||||
}
|
||||
i := bidiIndex[c0]
|
||||
c1 := s[1]
|
||||
if c1 < 0x80 || 0xC0 <= c1 {
|
||||
return Properties{}, 1
|
||||
}
|
||||
return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2
|
||||
case c0 < 0xF0: // 3-byte UTF-8
|
||||
if len(s) < 3 {
|
||||
return Properties{}, 0
|
||||
}
|
||||
i := bidiIndex[c0]
|
||||
c1 := s[1]
|
||||
if c1 < 0x80 || 0xC0 <= c1 {
|
||||
return Properties{}, 1
|
||||
}
|
||||
o := uint32(i)<<6 + uint32(c1)
|
||||
i = bidiIndex[o]
|
||||
c2 := s[2]
|
||||
if c2 < 0x80 || 0xC0 <= c2 {
|
||||
return Properties{}, 1
|
||||
}
|
||||
return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3
|
||||
case c0 < 0xF8: // 4-byte UTF-8
|
||||
if len(s) < 4 {
|
||||
return Properties{}, 0
|
||||
}
|
||||
i := bidiIndex[c0]
|
||||
c1 := s[1]
|
||||
if c1 < 0x80 || 0xC0 <= c1 {
|
||||
return Properties{}, 1
|
||||
}
|
||||
o := uint32(i)<<6 + uint32(c1)
|
||||
i = bidiIndex[o]
|
||||
c2 := s[2]
|
||||
if c2 < 0x80 || 0xC0 <= c2 {
|
||||
return Properties{}, 1
|
||||
}
|
||||
o = uint32(i)<<6 + uint32(c2)
|
||||
i = bidiIndex[o]
|
||||
c3 := s[3]
|
||||
if c3 < 0x80 || 0xC0 <= c3 {
|
||||
return Properties{}, 1
|
||||
}
|
||||
return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4
|
||||
}
|
||||
// Illegal rune
|
||||
return Properties{}, 1
|
||||
}
|
||||
1779
vendor/golang.org/x/text/unicode/bidi/tables.go
generated
vendored
Normal file
1779
vendor/golang.org/x/text/unicode/bidi/tables.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
60
vendor/golang.org/x/text/unicode/bidi/trieval.go
generated
vendored
Normal file
60
vendor/golang.org/x/text/unicode/bidi/trieval.go
generated
vendored
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
// This file was generated by go generate; DO NOT EDIT
|
||||
|
||||
package bidi
|
||||
|
||||
// Class is the Unicode BiDi class. Each rune has a single class.
|
||||
type Class uint
|
||||
|
||||
const (
|
||||
L Class = iota // LeftToRight
|
||||
R // RightToLeft
|
||||
EN // EuropeanNumber
|
||||
ES // EuropeanSeparator
|
||||
ET // EuropeanTerminator
|
||||
AN // ArabicNumber
|
||||
CS // CommonSeparator
|
||||
B // ParagraphSeparator
|
||||
S // SegmentSeparator
|
||||
WS // WhiteSpace
|
||||
ON // OtherNeutral
|
||||
BN // BoundaryNeutral
|
||||
NSM // NonspacingMark
|
||||
AL // ArabicLetter
|
||||
Control // Control LRO - PDI
|
||||
|
||||
numClass
|
||||
|
||||
LRO // LeftToRightOverride
|
||||
RLO // RightToLeftOverride
|
||||
LRE // LeftToRightEmbedding
|
||||
RLE // RightToLeftEmbedding
|
||||
PDF // PopDirectionalFormat
|
||||
LRI // LeftToRightIsolate
|
||||
RLI // RightToLeftIsolate
|
||||
FSI // FirstStrongIsolate
|
||||
PDI // PopDirectionalIsolate
|
||||
|
||||
unknownClass = ^Class(0)
|
||||
)
|
||||
|
||||
var controlToClass = map[rune]Class{
|
||||
0x202D: LRO, // LeftToRightOverride,
|
||||
0x202E: RLO, // RightToLeftOverride,
|
||||
0x202A: LRE, // LeftToRightEmbedding,
|
||||
0x202B: RLE, // RightToLeftEmbedding,
|
||||
0x202C: PDF, // PopDirectionalFormat,
|
||||
0x2066: LRI, // LeftToRightIsolate,
|
||||
0x2067: RLI, // RightToLeftIsolate,
|
||||
0x2068: FSI, // FirstStrongIsolate,
|
||||
0x2069: PDI, // PopDirectionalIsolate,
|
||||
}
|
||||
|
||||
// A trie entry has the following bits:
|
||||
// 7..5 XOR mask for brackets
|
||||
// 4 1: Bracket open, 0: Bracket close
|
||||
// 3..0 Class type
|
||||
|
||||
const (
|
||||
openMask = 0x10
|
||||
xorMaskShift = 5
|
||||
)
|
||||
514
vendor/golang.org/x/text/unicode/norm/composition.go
generated
vendored
Normal file
514
vendor/golang.org/x/text/unicode/norm/composition.go
generated
vendored
Normal file
|
|
@ -0,0 +1,514 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
const (
|
||||
maxNonStarters = 30
|
||||
// The maximum number of characters needed for a buffer is
|
||||
// maxNonStarters + 1 for the starter + 1 for the GCJ
|
||||
maxBufferSize = maxNonStarters + 2
|
||||
maxNFCExpansion = 3 // NFC(0x1D160)
|
||||
maxNFKCExpansion = 18 // NFKC(0xFDFA)
|
||||
|
||||
maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128
|
||||
)
|
||||
|
||||
// ssState is used for reporting the segment state after inserting a rune.
|
||||
// It is returned by streamSafe.next.
|
||||
type ssState int
|
||||
|
||||
const (
|
||||
// Indicates a rune was successfully added to the segment.
|
||||
ssSuccess ssState = iota
|
||||
// Indicates a rune starts a new segment and should not be added.
|
||||
ssStarter
|
||||
// Indicates a rune caused a segment overflow and a CGJ should be inserted.
|
||||
ssOverflow
|
||||
)
|
||||
|
||||
// streamSafe implements the policy of when a CGJ should be inserted.
|
||||
type streamSafe uint8
|
||||
|
||||
// mkStreamSafe is a shorthand for declaring a streamSafe var and calling
|
||||
// first on it.
|
||||
func mkStreamSafe(p Properties) streamSafe {
|
||||
return streamSafe(p.nTrailingNonStarters())
|
||||
}
|
||||
|
||||
// first inserts the first rune of a segment.
|
||||
func (ss *streamSafe) first(p Properties) {
|
||||
if *ss != 0 {
|
||||
panic("!= 0")
|
||||
}
|
||||
*ss = streamSafe(p.nTrailingNonStarters())
|
||||
}
|
||||
|
||||
// insert returns a ssState value to indicate whether a rune represented by p
|
||||
// can be inserted.
|
||||
func (ss *streamSafe) next(p Properties) ssState {
|
||||
if *ss > maxNonStarters {
|
||||
panic("streamSafe was not reset")
|
||||
}
|
||||
n := p.nLeadingNonStarters()
|
||||
if *ss += streamSafe(n); *ss > maxNonStarters {
|
||||
*ss = 0
|
||||
return ssOverflow
|
||||
}
|
||||
// The Stream-Safe Text Processing prescribes that the counting can stop
|
||||
// as soon as a starter is encountered. However, there are some starters,
|
||||
// like Jamo V and T, that can combine with other runes, leaving their
|
||||
// successive non-starters appended to the previous, possibly causing an
|
||||
// overflow. We will therefore consider any rune with a non-zero nLead to
|
||||
// be a non-starter. Note that it always hold that if nLead > 0 then
|
||||
// nLead == nTrail.
|
||||
if n == 0 {
|
||||
*ss = 0
|
||||
return ssStarter
|
||||
}
|
||||
return ssSuccess
|
||||
}
|
||||
|
||||
// backwards is used for checking for overflow and segment starts
|
||||
// when traversing a string backwards. Users do not need to call first
|
||||
// for the first rune. The state of the streamSafe retains the count of
|
||||
// the non-starters loaded.
|
||||
func (ss *streamSafe) backwards(p Properties) ssState {
|
||||
if *ss > maxNonStarters {
|
||||
panic("streamSafe was not reset")
|
||||
}
|
||||
c := *ss + streamSafe(p.nTrailingNonStarters())
|
||||
if c > maxNonStarters {
|
||||
return ssOverflow
|
||||
}
|
||||
*ss = c
|
||||
if p.nLeadingNonStarters() == 0 {
|
||||
return ssStarter
|
||||
}
|
||||
return ssSuccess
|
||||
}
|
||||
|
||||
func (ss streamSafe) isMax() bool {
|
||||
return ss == maxNonStarters
|
||||
}
|
||||
|
||||
// GraphemeJoiner is inserted after maxNonStarters non-starter runes.
|
||||
const GraphemeJoiner = "\u034F"
|
||||
|
||||
// reorderBuffer is used to normalize a single segment. Characters inserted with
|
||||
// insert are decomposed and reordered based on CCC. The compose method can
|
||||
// be used to recombine characters. Note that the byte buffer does not hold
|
||||
// the UTF-8 characters in order. Only the rune array is maintained in sorted
|
||||
// order. flush writes the resulting segment to a byte array.
|
||||
type reorderBuffer struct {
|
||||
rune [maxBufferSize]Properties // Per character info.
|
||||
byte [maxByteBufferSize]byte // UTF-8 buffer. Referenced by runeInfo.pos.
|
||||
nbyte uint8 // Number or bytes.
|
||||
ss streamSafe // For limiting length of non-starter sequence.
|
||||
nrune int // Number of runeInfos.
|
||||
f formInfo
|
||||
|
||||
src input
|
||||
nsrc int
|
||||
tmpBytes input
|
||||
|
||||
out []byte
|
||||
flushF func(*reorderBuffer) bool
|
||||
}
|
||||
|
||||
func (rb *reorderBuffer) init(f Form, src []byte) {
|
||||
rb.f = *formTable[f]
|
||||
rb.src.setBytes(src)
|
||||
rb.nsrc = len(src)
|
||||
rb.ss = 0
|
||||
}
|
||||
|
||||
func (rb *reorderBuffer) initString(f Form, src string) {
|
||||
rb.f = *formTable[f]
|
||||
rb.src.setString(src)
|
||||
rb.nsrc = len(src)
|
||||
rb.ss = 0
|
||||
}
|
||||
|
||||
func (rb *reorderBuffer) setFlusher(out []byte, f func(*reorderBuffer) bool) {
|
||||
rb.out = out
|
||||
rb.flushF = f
|
||||
}
|
||||
|
||||
// reset discards all characters from the buffer.
|
||||
func (rb *reorderBuffer) reset() {
|
||||
rb.nrune = 0
|
||||
rb.nbyte = 0
|
||||
rb.ss = 0
|
||||
}
|
||||
|
||||
func (rb *reorderBuffer) doFlush() bool {
|
||||
if rb.f.composing {
|
||||
rb.compose()
|
||||
}
|
||||
res := rb.flushF(rb)
|
||||
rb.reset()
|
||||
return res
|
||||
}
|
||||
|
||||
// appendFlush appends the normalized segment to rb.out.
|
||||
func appendFlush(rb *reorderBuffer) bool {
|
||||
for i := 0; i < rb.nrune; i++ {
|
||||
start := rb.rune[i].pos
|
||||
end := start + rb.rune[i].size
|
||||
rb.out = append(rb.out, rb.byte[start:end]...)
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// flush appends the normalized segment to out and resets rb.
|
||||
func (rb *reorderBuffer) flush(out []byte) []byte {
|
||||
for i := 0; i < rb.nrune; i++ {
|
||||
start := rb.rune[i].pos
|
||||
end := start + rb.rune[i].size
|
||||
out = append(out, rb.byte[start:end]...)
|
||||
}
|
||||
rb.reset()
|
||||
return out
|
||||
}
|
||||
|
||||
// flushCopy copies the normalized segment to buf and resets rb.
|
||||
// It returns the number of bytes written to buf.
|
||||
func (rb *reorderBuffer) flushCopy(buf []byte) int {
|
||||
p := 0
|
||||
for i := 0; i < rb.nrune; i++ {
|
||||
runep := rb.rune[i]
|
||||
p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size])
|
||||
}
|
||||
rb.reset()
|
||||
return p
|
||||
}
|
||||
|
||||
// insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
|
||||
// It returns false if the buffer is not large enough to hold the rune.
|
||||
// It is used internally by insert and insertString only.
|
||||
func (rb *reorderBuffer) insertOrdered(info Properties) {
|
||||
n := rb.nrune
|
||||
b := rb.rune[:]
|
||||
cc := info.ccc
|
||||
if cc > 0 {
|
||||
// Find insertion position + move elements to make room.
|
||||
for ; n > 0; n-- {
|
||||
if b[n-1].ccc <= cc {
|
||||
break
|
||||
}
|
||||
b[n] = b[n-1]
|
||||
}
|
||||
}
|
||||
rb.nrune += 1
|
||||
pos := uint8(rb.nbyte)
|
||||
rb.nbyte += utf8.UTFMax
|
||||
info.pos = pos
|
||||
b[n] = info
|
||||
}
|
||||
|
||||
// insertErr is an error code returned by insert. Using this type instead
|
||||
// of error improves performance up to 20% for many of the benchmarks.
|
||||
type insertErr int
|
||||
|
||||
const (
|
||||
iSuccess insertErr = -iota
|
||||
iShortDst
|
||||
iShortSrc
|
||||
)
|
||||
|
||||
// insertFlush inserts the given rune in the buffer ordered by CCC.
|
||||
// If a decomposition with multiple segments are encountered, they leading
|
||||
// ones are flushed.
|
||||
// It returns a non-zero error code if the rune was not inserted.
|
||||
func (rb *reorderBuffer) insertFlush(src input, i int, info Properties) insertErr {
|
||||
if rune := src.hangul(i); rune != 0 {
|
||||
rb.decomposeHangul(rune)
|
||||
return iSuccess
|
||||
}
|
||||
if info.hasDecomposition() {
|
||||
return rb.insertDecomposed(info.Decomposition())
|
||||
}
|
||||
rb.insertSingle(src, i, info)
|
||||
return iSuccess
|
||||
}
|
||||
|
||||
// insertUnsafe inserts the given rune in the buffer ordered by CCC.
|
||||
// It is assumed there is sufficient space to hold the runes. It is the
|
||||
// responsibility of the caller to ensure this. This can be done by checking
|
||||
// the state returned by the streamSafe type.
|
||||
func (rb *reorderBuffer) insertUnsafe(src input, i int, info Properties) {
|
||||
if rune := src.hangul(i); rune != 0 {
|
||||
rb.decomposeHangul(rune)
|
||||
}
|
||||
if info.hasDecomposition() {
|
||||
// TODO: inline.
|
||||
rb.insertDecomposed(info.Decomposition())
|
||||
} else {
|
||||
rb.insertSingle(src, i, info)
|
||||
}
|
||||
}
|
||||
|
||||
// insertDecomposed inserts an entry in to the reorderBuffer for each rune
|
||||
// in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
|
||||
// It flushes the buffer on each new segment start.
|
||||
func (rb *reorderBuffer) insertDecomposed(dcomp []byte) insertErr {
|
||||
rb.tmpBytes.setBytes(dcomp)
|
||||
for i := 0; i < len(dcomp); {
|
||||
info := rb.f.info(rb.tmpBytes, i)
|
||||
if info.BoundaryBefore() && rb.nrune > 0 && !rb.doFlush() {
|
||||
return iShortDst
|
||||
}
|
||||
i += copy(rb.byte[rb.nbyte:], dcomp[i:i+int(info.size)])
|
||||
rb.insertOrdered(info)
|
||||
}
|
||||
return iSuccess
|
||||
}
|
||||
|
||||
// insertSingle inserts an entry in the reorderBuffer for the rune at
|
||||
// position i. info is the runeInfo for the rune at position i.
|
||||
func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) {
|
||||
src.copySlice(rb.byte[rb.nbyte:], i, i+int(info.size))
|
||||
rb.insertOrdered(info)
|
||||
}
|
||||
|
||||
// insertCGJ inserts a Combining Grapheme Joiner (0x034f) into rb.
|
||||
func (rb *reorderBuffer) insertCGJ() {
|
||||
rb.insertSingle(input{str: GraphemeJoiner}, 0, Properties{size: uint8(len(GraphemeJoiner))})
|
||||
}
|
||||
|
||||
// appendRune inserts a rune at the end of the buffer. It is used for Hangul.
|
||||
func (rb *reorderBuffer) appendRune(r rune) {
|
||||
bn := rb.nbyte
|
||||
sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
|
||||
rb.nbyte += utf8.UTFMax
|
||||
rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)}
|
||||
rb.nrune++
|
||||
}
|
||||
|
||||
// assignRune sets a rune at position pos. It is used for Hangul and recomposition.
|
||||
func (rb *reorderBuffer) assignRune(pos int, r rune) {
|
||||
bn := rb.rune[pos].pos
|
||||
sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
|
||||
rb.rune[pos] = Properties{pos: bn, size: uint8(sz)}
|
||||
}
|
||||
|
||||
// runeAt returns the rune at position n. It is used for Hangul and recomposition.
|
||||
func (rb *reorderBuffer) runeAt(n int) rune {
|
||||
inf := rb.rune[n]
|
||||
r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size])
|
||||
return r
|
||||
}
|
||||
|
||||
// bytesAt returns the UTF-8 encoding of the rune at position n.
|
||||
// It is used for Hangul and recomposition.
|
||||
func (rb *reorderBuffer) bytesAt(n int) []byte {
|
||||
inf := rb.rune[n]
|
||||
return rb.byte[inf.pos : int(inf.pos)+int(inf.size)]
|
||||
}
|
||||
|
||||
// For Hangul we combine algorithmically, instead of using tables.
|
||||
const (
|
||||
hangulBase = 0xAC00 // UTF-8(hangulBase) -> EA B0 80
|
||||
hangulBase0 = 0xEA
|
||||
hangulBase1 = 0xB0
|
||||
hangulBase2 = 0x80
|
||||
|
||||
hangulEnd = hangulBase + jamoLVTCount // UTF-8(0xD7A4) -> ED 9E A4
|
||||
hangulEnd0 = 0xED
|
||||
hangulEnd1 = 0x9E
|
||||
hangulEnd2 = 0xA4
|
||||
|
||||
jamoLBase = 0x1100 // UTF-8(jamoLBase) -> E1 84 00
|
||||
jamoLBase0 = 0xE1
|
||||
jamoLBase1 = 0x84
|
||||
jamoLEnd = 0x1113
|
||||
jamoVBase = 0x1161
|
||||
jamoVEnd = 0x1176
|
||||
jamoTBase = 0x11A7
|
||||
jamoTEnd = 0x11C3
|
||||
|
||||
jamoTCount = 28
|
||||
jamoVCount = 21
|
||||
jamoVTCount = 21 * 28
|
||||
jamoLVTCount = 19 * 21 * 28
|
||||
)
|
||||
|
||||
const hangulUTF8Size = 3
|
||||
|
||||
func isHangul(b []byte) bool {
|
||||
if len(b) < hangulUTF8Size {
|
||||
return false
|
||||
}
|
||||
b0 := b[0]
|
||||
if b0 < hangulBase0 {
|
||||
return false
|
||||
}
|
||||
b1 := b[1]
|
||||
switch {
|
||||
case b0 == hangulBase0:
|
||||
return b1 >= hangulBase1
|
||||
case b0 < hangulEnd0:
|
||||
return true
|
||||
case b0 > hangulEnd0:
|
||||
return false
|
||||
case b1 < hangulEnd1:
|
||||
return true
|
||||
}
|
||||
return b1 == hangulEnd1 && b[2] < hangulEnd2
|
||||
}
|
||||
|
||||
func isHangulString(b string) bool {
|
||||
if len(b) < hangulUTF8Size {
|
||||
return false
|
||||
}
|
||||
b0 := b[0]
|
||||
if b0 < hangulBase0 {
|
||||
return false
|
||||
}
|
||||
b1 := b[1]
|
||||
switch {
|
||||
case b0 == hangulBase0:
|
||||
return b1 >= hangulBase1
|
||||
case b0 < hangulEnd0:
|
||||
return true
|
||||
case b0 > hangulEnd0:
|
||||
return false
|
||||
case b1 < hangulEnd1:
|
||||
return true
|
||||
}
|
||||
return b1 == hangulEnd1 && b[2] < hangulEnd2
|
||||
}
|
||||
|
||||
// Caller must ensure len(b) >= 2.
|
||||
func isJamoVT(b []byte) bool {
|
||||
// True if (rune & 0xff00) == jamoLBase
|
||||
return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1
|
||||
}
|
||||
|
||||
func isHangulWithoutJamoT(b []byte) bool {
|
||||
c, _ := utf8.DecodeRune(b)
|
||||
c -= hangulBase
|
||||
return c < jamoLVTCount && c%jamoTCount == 0
|
||||
}
|
||||
|
||||
// decomposeHangul writes the decomposed Hangul to buf and returns the number
|
||||
// of bytes written. len(buf) should be at least 9.
|
||||
func decomposeHangul(buf []byte, r rune) int {
|
||||
const JamoUTF8Len = 3
|
||||
r -= hangulBase
|
||||
x := r % jamoTCount
|
||||
r /= jamoTCount
|
||||
utf8.EncodeRune(buf, jamoLBase+r/jamoVCount)
|
||||
utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount)
|
||||
if x != 0 {
|
||||
utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x)
|
||||
return 3 * JamoUTF8Len
|
||||
}
|
||||
return 2 * JamoUTF8Len
|
||||
}
|
||||
|
||||
// decomposeHangul algorithmically decomposes a Hangul rune into
|
||||
// its Jamo components.
|
||||
// See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
|
||||
func (rb *reorderBuffer) decomposeHangul(r rune) {
|
||||
r -= hangulBase
|
||||
x := r % jamoTCount
|
||||
r /= jamoTCount
|
||||
rb.appendRune(jamoLBase + r/jamoVCount)
|
||||
rb.appendRune(jamoVBase + r%jamoVCount)
|
||||
if x != 0 {
|
||||
rb.appendRune(jamoTBase + x)
|
||||
}
|
||||
}
|
||||
|
||||
// combineHangul algorithmically combines Jamo character components into Hangul.
|
||||
// See http://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
|
||||
func (rb *reorderBuffer) combineHangul(s, i, k int) {
|
||||
b := rb.rune[:]
|
||||
bn := rb.nrune
|
||||
for ; i < bn; i++ {
|
||||
cccB := b[k-1].ccc
|
||||
cccC := b[i].ccc
|
||||
if cccB == 0 {
|
||||
s = k - 1
|
||||
}
|
||||
if s != k-1 && cccB >= cccC {
|
||||
// b[i] is blocked by greater-equal cccX below it
|
||||
b[k] = b[i]
|
||||
k++
|
||||
} else {
|
||||
l := rb.runeAt(s) // also used to compare to hangulBase
|
||||
v := rb.runeAt(i) // also used to compare to jamoT
|
||||
switch {
|
||||
case jamoLBase <= l && l < jamoLEnd &&
|
||||
jamoVBase <= v && v < jamoVEnd:
|
||||
// 11xx plus 116x to LV
|
||||
rb.assignRune(s, hangulBase+
|
||||
(l-jamoLBase)*jamoVTCount+(v-jamoVBase)*jamoTCount)
|
||||
case hangulBase <= l && l < hangulEnd &&
|
||||
jamoTBase < v && v < jamoTEnd &&
|
||||
((l-hangulBase)%jamoTCount) == 0:
|
||||
// ACxx plus 11Ax to LVT
|
||||
rb.assignRune(s, l+v-jamoTBase)
|
||||
default:
|
||||
b[k] = b[i]
|
||||
k++
|
||||
}
|
||||
}
|
||||
}
|
||||
rb.nrune = k
|
||||
}
|
||||
|
||||
// compose recombines the runes in the buffer.
|
||||
// It should only be used to recompose a single segment, as it will not
|
||||
// handle alternations between Hangul and non-Hangul characters correctly.
|
||||
func (rb *reorderBuffer) compose() {
|
||||
// UAX #15, section X5 , including Corrigendum #5
|
||||
// "In any character sequence beginning with starter S, a character C is
|
||||
// blocked from S if and only if there is some character B between S
|
||||
// and C, and either B is a starter or it has the same or higher
|
||||
// combining class as C."
|
||||
bn := rb.nrune
|
||||
if bn == 0 {
|
||||
return
|
||||
}
|
||||
k := 1
|
||||
b := rb.rune[:]
|
||||
for s, i := 0, 1; i < bn; i++ {
|
||||
if isJamoVT(rb.bytesAt(i)) {
|
||||
// Redo from start in Hangul mode. Necessary to support
|
||||
// U+320E..U+321E in NFKC mode.
|
||||
rb.combineHangul(s, i, k)
|
||||
return
|
||||
}
|
||||
ii := b[i]
|
||||
// We can only use combineForward as a filter if we later
|
||||
// get the info for the combined character. This is more
|
||||
// expensive than using the filter. Using combinesBackward()
|
||||
// is safe.
|
||||
if ii.combinesBackward() {
|
||||
cccB := b[k-1].ccc
|
||||
cccC := ii.ccc
|
||||
blocked := false // b[i] blocked by starter or greater or equal CCC?
|
||||
if cccB == 0 {
|
||||
s = k - 1
|
||||
} else {
|
||||
blocked = s != k-1 && cccB >= cccC
|
||||
}
|
||||
if !blocked {
|
||||
combined := combine(rb.runeAt(s), rb.runeAt(i))
|
||||
if combined != 0 {
|
||||
rb.assignRune(s, combined)
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
b[k] = b[i]
|
||||
k++
|
||||
}
|
||||
rb.nrune = k
|
||||
}
|
||||
256
vendor/golang.org/x/text/unicode/norm/forminfo.go
generated
vendored
Normal file
256
vendor/golang.org/x/text/unicode/norm/forminfo.go
generated
vendored
Normal file
|
|
@ -0,0 +1,256 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
// This file contains Form-specific logic and wrappers for data in tables.go.
|
||||
|
||||
// Rune info is stored in a separate trie per composing form. A composing form
|
||||
// and its corresponding decomposing form share the same trie. Each trie maps
|
||||
// a rune to a uint16. The values take two forms. For v >= 0x8000:
|
||||
// bits
|
||||
// 15: 1 (inverse of NFD_QD bit of qcInfo)
|
||||
// 13..7: qcInfo (see below). isYesD is always true (no decompostion).
|
||||
// 6..0: ccc (compressed CCC value).
|
||||
// For v < 0x8000, the respective rune has a decomposition and v is an index
|
||||
// into a byte array of UTF-8 decomposition sequences and additional info and
|
||||
// has the form:
|
||||
// <header> <decomp_byte>* [<tccc> [<lccc>]]
|
||||
// The header contains the number of bytes in the decomposition (excluding this
|
||||
// length byte). The two most significant bits of this length byte correspond
|
||||
// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.
|
||||
// The byte sequence is followed by a trailing and leading CCC if the values
|
||||
// for these are not zero. The value of v determines which ccc are appended
|
||||
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
|
||||
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCC
|
||||
// there is an additional leading ccc. The value of tccc itself is the
|
||||
// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
|
||||
// are the number of trailing non-starters.
|
||||
|
||||
const (
|
||||
qcInfoMask = 0x3F // to clear all but the relevant bits in a qcInfo
|
||||
headerLenMask = 0x3F // extract the length value from the header byte
|
||||
headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
|
||||
)
|
||||
|
||||
// Properties provides access to normalization properties of a rune.
|
||||
type Properties struct {
|
||||
pos uint8 // start position in reorderBuffer; used in composition.go
|
||||
size uint8 // length of UTF-8 encoding of this rune
|
||||
ccc uint8 // leading canonical combining class (ccc if not decomposition)
|
||||
tccc uint8 // trailing canonical combining class (ccc if not decomposition)
|
||||
nLead uint8 // number of leading non-starters.
|
||||
flags qcInfo // quick check flags
|
||||
index uint16
|
||||
}
|
||||
|
||||
// functions dispatchable per form
|
||||
type lookupFunc func(b input, i int) Properties
|
||||
|
||||
// formInfo holds Form-specific functions and tables.
|
||||
type formInfo struct {
|
||||
form Form
|
||||
composing, compatibility bool // form type
|
||||
info lookupFunc
|
||||
nextMain iterFunc
|
||||
}
|
||||
|
||||
var formTable []*formInfo
|
||||
|
||||
func init() {
|
||||
formTable = make([]*formInfo, 4)
|
||||
|
||||
for i := range formTable {
|
||||
f := &formInfo{}
|
||||
formTable[i] = f
|
||||
f.form = Form(i)
|
||||
if Form(i) == NFKD || Form(i) == NFKC {
|
||||
f.compatibility = true
|
||||
f.info = lookupInfoNFKC
|
||||
} else {
|
||||
f.info = lookupInfoNFC
|
||||
}
|
||||
f.nextMain = nextDecomposed
|
||||
if Form(i) == NFC || Form(i) == NFKC {
|
||||
f.nextMain = nextComposed
|
||||
f.composing = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
|
||||
// unexpected behavior for the user. For example, in NFD, there is a boundary
|
||||
// after 'a'. However, 'a' might combine with modifiers, so from the application's
|
||||
// perspective it is not a good boundary. We will therefore always use the
|
||||
// boundaries for the combining variants.
|
||||
|
||||
// BoundaryBefore returns true if this rune starts a new segment and
|
||||
// cannot combine with any rune on the left.
|
||||
func (p Properties) BoundaryBefore() bool {
|
||||
if p.ccc == 0 && !p.combinesBackward() {
|
||||
return true
|
||||
}
|
||||
// We assume that the CCC of the first character in a decomposition
|
||||
// is always non-zero if different from info.ccc and that we can return
|
||||
// false at this point. This is verified by maketables.
|
||||
return false
|
||||
}
|
||||
|
||||
// BoundaryAfter returns true if runes cannot combine with or otherwise
|
||||
// interact with this or previous runes.
|
||||
func (p Properties) BoundaryAfter() bool {
|
||||
// TODO: loosen these conditions.
|
||||
return p.isInert()
|
||||
}
|
||||
|
||||
// We pack quick check data in 4 bits:
|
||||
// 5: Combines forward (0 == false, 1 == true)
|
||||
// 4..3: NFC_QC Yes(00), No (10), or Maybe (11)
|
||||
// 2: NFD_QC Yes (0) or No (1). No also means there is a decomposition.
|
||||
// 1..0: Number of trailing non-starters.
|
||||
//
|
||||
// When all 4 bits are zero, the character is inert, meaning it is never
|
||||
// influenced by normalization.
|
||||
type qcInfo uint8
|
||||
|
||||
func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }
|
||||
func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }
|
||||
|
||||
func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 }
|
||||
func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
|
||||
func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
|
||||
|
||||
func (p Properties) isInert() bool {
|
||||
return p.flags&qcInfoMask == 0 && p.ccc == 0
|
||||
}
|
||||
|
||||
func (p Properties) multiSegment() bool {
|
||||
return p.index >= firstMulti && p.index < endMulti
|
||||
}
|
||||
|
||||
func (p Properties) nLeadingNonStarters() uint8 {
|
||||
return p.nLead
|
||||
}
|
||||
|
||||
func (p Properties) nTrailingNonStarters() uint8 {
|
||||
return uint8(p.flags & 0x03)
|
||||
}
|
||||
|
||||
// Decomposition returns the decomposition for the underlying rune
|
||||
// or nil if there is none.
|
||||
func (p Properties) Decomposition() []byte {
|
||||
// TODO: create the decomposition for Hangul?
|
||||
if p.index == 0 {
|
||||
return nil
|
||||
}
|
||||
i := p.index
|
||||
n := decomps[i] & headerLenMask
|
||||
i++
|
||||
return decomps[i : i+uint16(n)]
|
||||
}
|
||||
|
||||
// Size returns the length of UTF-8 encoding of the rune.
|
||||
func (p Properties) Size() int {
|
||||
return int(p.size)
|
||||
}
|
||||
|
||||
// CCC returns the canonical combining class of the underlying rune.
|
||||
func (p Properties) CCC() uint8 {
|
||||
if p.index >= firstCCCZeroExcept {
|
||||
return 0
|
||||
}
|
||||
return ccc[p.ccc]
|
||||
}
|
||||
|
||||
// LeadCCC returns the CCC of the first rune in the decomposition.
|
||||
// If there is no decomposition, LeadCCC equals CCC.
|
||||
func (p Properties) LeadCCC() uint8 {
|
||||
return ccc[p.ccc]
|
||||
}
|
||||
|
||||
// TrailCCC returns the CCC of the last rune in the decomposition.
|
||||
// If there is no decomposition, TrailCCC equals CCC.
|
||||
func (p Properties) TrailCCC() uint8 {
|
||||
return ccc[p.tccc]
|
||||
}
|
||||
|
||||
// Recomposition
|
||||
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
|
||||
// This clips off the bits of three entries, but we know this will not
|
||||
// result in a collision. In the unlikely event that changes to
|
||||
// UnicodeData.txt introduce collisions, the compiler will catch it.
|
||||
// Note that the recomposition map for NFC and NFKC are identical.
|
||||
|
||||
// combine returns the combined rune or 0 if it doesn't exist.
|
||||
func combine(a, b rune) rune {
|
||||
key := uint32(uint16(a))<<16 + uint32(uint16(b))
|
||||
return recompMap[key]
|
||||
}
|
||||
|
||||
func lookupInfoNFC(b input, i int) Properties {
|
||||
v, sz := b.charinfoNFC(i)
|
||||
return compInfo(v, sz)
|
||||
}
|
||||
|
||||
func lookupInfoNFKC(b input, i int) Properties {
|
||||
v, sz := b.charinfoNFKC(i)
|
||||
return compInfo(v, sz)
|
||||
}
|
||||
|
||||
// Properties returns properties for the first rune in s.
|
||||
func (f Form) Properties(s []byte) Properties {
|
||||
if f == NFC || f == NFD {
|
||||
return compInfo(nfcData.lookup(s))
|
||||
}
|
||||
return compInfo(nfkcData.lookup(s))
|
||||
}
|
||||
|
||||
// PropertiesString returns properties for the first rune in s.
|
||||
func (f Form) PropertiesString(s string) Properties {
|
||||
if f == NFC || f == NFD {
|
||||
return compInfo(nfcData.lookupString(s))
|
||||
}
|
||||
return compInfo(nfkcData.lookupString(s))
|
||||
}
|
||||
|
||||
// compInfo converts the information contained in v and sz
|
||||
// to a Properties. See the comment at the top of the file
|
||||
// for more information on the format.
|
||||
func compInfo(v uint16, sz int) Properties {
|
||||
if v == 0 {
|
||||
return Properties{size: uint8(sz)}
|
||||
} else if v >= 0x8000 {
|
||||
p := Properties{
|
||||
size: uint8(sz),
|
||||
ccc: uint8(v),
|
||||
tccc: uint8(v),
|
||||
flags: qcInfo(v >> 8),
|
||||
}
|
||||
if p.ccc > 0 || p.combinesBackward() {
|
||||
p.nLead = uint8(p.flags & 0x3)
|
||||
}
|
||||
return p
|
||||
}
|
||||
// has decomposition
|
||||
h := decomps[v]
|
||||
f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
|
||||
p := Properties{size: uint8(sz), flags: f, index: v}
|
||||
if v >= firstCCC {
|
||||
v += uint16(h&headerLenMask) + 1
|
||||
c := decomps[v]
|
||||
p.tccc = c >> 2
|
||||
p.flags |= qcInfo(c & 0x3)
|
||||
if v >= firstLeadingCCC {
|
||||
p.nLead = c & 0x3
|
||||
if v >= firstStarterWithNLead {
|
||||
// We were tricked. Remove the decomposition.
|
||||
p.flags &= 0x03
|
||||
p.index = 0
|
||||
return p
|
||||
}
|
||||
p.ccc = decomps[v+1]
|
||||
}
|
||||
}
|
||||
return p
|
||||
}
|
||||
105
vendor/golang.org/x/text/unicode/norm/input.go
generated
vendored
Normal file
105
vendor/golang.org/x/text/unicode/norm/input.go
generated
vendored
Normal file
|
|
@ -0,0 +1,105 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
type input struct {
|
||||
str string
|
||||
bytes []byte
|
||||
}
|
||||
|
||||
func inputBytes(str []byte) input {
|
||||
return input{bytes: str}
|
||||
}
|
||||
|
||||
func inputString(str string) input {
|
||||
return input{str: str}
|
||||
}
|
||||
|
||||
func (in *input) setBytes(str []byte) {
|
||||
in.str = ""
|
||||
in.bytes = str
|
||||
}
|
||||
|
||||
func (in *input) setString(str string) {
|
||||
in.str = str
|
||||
in.bytes = nil
|
||||
}
|
||||
|
||||
func (in *input) _byte(p int) byte {
|
||||
if in.bytes == nil {
|
||||
return in.str[p]
|
||||
}
|
||||
return in.bytes[p]
|
||||
}
|
||||
|
||||
func (in *input) skipASCII(p, max int) int {
|
||||
if in.bytes == nil {
|
||||
for ; p < max && in.str[p] < utf8.RuneSelf; p++ {
|
||||
}
|
||||
} else {
|
||||
for ; p < max && in.bytes[p] < utf8.RuneSelf; p++ {
|
||||
}
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
func (in *input) skipContinuationBytes(p int) int {
|
||||
if in.bytes == nil {
|
||||
for ; p < len(in.str) && !utf8.RuneStart(in.str[p]); p++ {
|
||||
}
|
||||
} else {
|
||||
for ; p < len(in.bytes) && !utf8.RuneStart(in.bytes[p]); p++ {
|
||||
}
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
func (in *input) appendSlice(buf []byte, b, e int) []byte {
|
||||
if in.bytes != nil {
|
||||
return append(buf, in.bytes[b:e]...)
|
||||
}
|
||||
for i := b; i < e; i++ {
|
||||
buf = append(buf, in.str[i])
|
||||
}
|
||||
return buf
|
||||
}
|
||||
|
||||
func (in *input) copySlice(buf []byte, b, e int) int {
|
||||
if in.bytes == nil {
|
||||
return copy(buf, in.str[b:e])
|
||||
}
|
||||
return copy(buf, in.bytes[b:e])
|
||||
}
|
||||
|
||||
func (in *input) charinfoNFC(p int) (uint16, int) {
|
||||
if in.bytes == nil {
|
||||
return nfcData.lookupString(in.str[p:])
|
||||
}
|
||||
return nfcData.lookup(in.bytes[p:])
|
||||
}
|
||||
|
||||
func (in *input) charinfoNFKC(p int) (uint16, int) {
|
||||
if in.bytes == nil {
|
||||
return nfkcData.lookupString(in.str[p:])
|
||||
}
|
||||
return nfkcData.lookup(in.bytes[p:])
|
||||
}
|
||||
|
||||
func (in *input) hangul(p int) (r rune) {
|
||||
if in.bytes == nil {
|
||||
if !isHangulString(in.str[p:]) {
|
||||
return 0
|
||||
}
|
||||
r, _ = utf8.DecodeRuneInString(in.str[p:])
|
||||
} else {
|
||||
if !isHangul(in.bytes[p:]) {
|
||||
return 0
|
||||
}
|
||||
r, _ = utf8.DecodeRune(in.bytes[p:])
|
||||
}
|
||||
return r
|
||||
}
|
||||
450
vendor/golang.org/x/text/unicode/norm/iter.go
generated
vendored
Normal file
450
vendor/golang.org/x/text/unicode/norm/iter.go
generated
vendored
Normal file
|
|
@ -0,0 +1,450 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
|
||||
// sequence of starter and non-starter runes for the purpose of normalization.
|
||||
const MaxSegmentSize = maxByteBufferSize
|
||||
|
||||
// An Iter iterates over a string or byte slice, while normalizing it
|
||||
// to a given Form.
|
||||
type Iter struct {
|
||||
rb reorderBuffer
|
||||
buf [maxByteBufferSize]byte
|
||||
info Properties // first character saved from previous iteration
|
||||
next iterFunc // implementation of next depends on form
|
||||
asciiF iterFunc
|
||||
|
||||
p int // current position in input source
|
||||
multiSeg []byte // remainder of multi-segment decomposition
|
||||
}
|
||||
|
||||
type iterFunc func(*Iter) []byte
|
||||
|
||||
// Init initializes i to iterate over src after normalizing it to Form f.
|
||||
func (i *Iter) Init(f Form, src []byte) {
|
||||
i.p = 0
|
||||
if len(src) == 0 {
|
||||
i.setDone()
|
||||
i.rb.nsrc = 0
|
||||
return
|
||||
}
|
||||
i.multiSeg = nil
|
||||
i.rb.init(f, src)
|
||||
i.next = i.rb.f.nextMain
|
||||
i.asciiF = nextASCIIBytes
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
}
|
||||
|
||||
// InitString initializes i to iterate over src after normalizing it to Form f.
|
||||
func (i *Iter) InitString(f Form, src string) {
|
||||
i.p = 0
|
||||
if len(src) == 0 {
|
||||
i.setDone()
|
||||
i.rb.nsrc = 0
|
||||
return
|
||||
}
|
||||
i.multiSeg = nil
|
||||
i.rb.initString(f, src)
|
||||
i.next = i.rb.f.nextMain
|
||||
i.asciiF = nextASCIIString
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
}
|
||||
|
||||
// Seek sets the segment to be returned by the next call to Next to start
|
||||
// at position p. It is the responsibility of the caller to set p to the
|
||||
// start of a UTF8 rune.
|
||||
func (i *Iter) Seek(offset int64, whence int) (int64, error) {
|
||||
var abs int64
|
||||
switch whence {
|
||||
case 0:
|
||||
abs = offset
|
||||
case 1:
|
||||
abs = int64(i.p) + offset
|
||||
case 2:
|
||||
abs = int64(i.rb.nsrc) + offset
|
||||
default:
|
||||
return 0, fmt.Errorf("norm: invalid whence")
|
||||
}
|
||||
if abs < 0 {
|
||||
return 0, fmt.Errorf("norm: negative position")
|
||||
}
|
||||
if int(abs) >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
return int64(i.p), nil
|
||||
}
|
||||
i.p = int(abs)
|
||||
i.multiSeg = nil
|
||||
i.next = i.rb.f.nextMain
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
return abs, nil
|
||||
}
|
||||
|
||||
// returnSlice returns a slice of the underlying input type as a byte slice.
|
||||
// If the underlying is of type []byte, it will simply return a slice.
|
||||
// If the underlying is of type string, it will copy the slice to the buffer
|
||||
// and return that.
|
||||
func (i *Iter) returnSlice(a, b int) []byte {
|
||||
if i.rb.src.bytes == nil {
|
||||
return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
|
||||
}
|
||||
return i.rb.src.bytes[a:b]
|
||||
}
|
||||
|
||||
// Pos returns the byte position at which the next call to Next will commence processing.
|
||||
func (i *Iter) Pos() int {
|
||||
return i.p
|
||||
}
|
||||
|
||||
func (i *Iter) setDone() {
|
||||
i.next = nextDone
|
||||
i.p = i.rb.nsrc
|
||||
}
|
||||
|
||||
// Done returns true if there is no more input to process.
|
||||
func (i *Iter) Done() bool {
|
||||
return i.p >= i.rb.nsrc
|
||||
}
|
||||
|
||||
// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
|
||||
// For any input a and b for which f(a) == f(b), subsequent calls
|
||||
// to Next will return the same segments.
|
||||
// Modifying runes are grouped together with the preceding starter, if such a starter exists.
|
||||
// Although not guaranteed, n will typically be the smallest possible n.
|
||||
func (i *Iter) Next() []byte {
|
||||
return i.next(i)
|
||||
}
|
||||
|
||||
func nextASCIIBytes(i *Iter) []byte {
|
||||
p := i.p + 1
|
||||
if p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
return i.rb.src.bytes[i.p:p]
|
||||
}
|
||||
if i.rb.src.bytes[p] < utf8.RuneSelf {
|
||||
p0 := i.p
|
||||
i.p = p
|
||||
return i.rb.src.bytes[p0:p]
|
||||
}
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
i.next = i.rb.f.nextMain
|
||||
return i.next(i)
|
||||
}
|
||||
|
||||
func nextASCIIString(i *Iter) []byte {
|
||||
p := i.p + 1
|
||||
if p >= i.rb.nsrc {
|
||||
i.buf[0] = i.rb.src.str[i.p]
|
||||
i.setDone()
|
||||
return i.buf[:1]
|
||||
}
|
||||
if i.rb.src.str[p] < utf8.RuneSelf {
|
||||
i.buf[0] = i.rb.src.str[i.p]
|
||||
i.p = p
|
||||
return i.buf[:1]
|
||||
}
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
i.next = i.rb.f.nextMain
|
||||
return i.next(i)
|
||||
}
|
||||
|
||||
func nextHangul(i *Iter) []byte {
|
||||
p := i.p
|
||||
next := p + hangulUTF8Size
|
||||
if next >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
} else if i.rb.src.hangul(next) == 0 {
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
i.next = i.rb.f.nextMain
|
||||
return i.next(i)
|
||||
}
|
||||
i.p = next
|
||||
return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
|
||||
}
|
||||
|
||||
func nextDone(i *Iter) []byte {
|
||||
return nil
|
||||
}
|
||||
|
||||
// nextMulti is used for iterating over multi-segment decompositions
|
||||
// for decomposing normal forms.
|
||||
func nextMulti(i *Iter) []byte {
|
||||
j := 0
|
||||
d := i.multiSeg
|
||||
// skip first rune
|
||||
for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
|
||||
}
|
||||
for j < len(d) {
|
||||
info := i.rb.f.info(input{bytes: d}, j)
|
||||
if info.BoundaryBefore() {
|
||||
i.multiSeg = d[j:]
|
||||
return d[:j]
|
||||
}
|
||||
j += int(info.size)
|
||||
}
|
||||
// treat last segment as normal decomposition
|
||||
i.next = i.rb.f.nextMain
|
||||
return i.next(i)
|
||||
}
|
||||
|
||||
// nextMultiNorm is used for iterating over multi-segment decompositions
|
||||
// for composing normal forms.
|
||||
func nextMultiNorm(i *Iter) []byte {
|
||||
j := 0
|
||||
d := i.multiSeg
|
||||
for j < len(d) {
|
||||
info := i.rb.f.info(input{bytes: d}, j)
|
||||
if info.BoundaryBefore() {
|
||||
i.rb.compose()
|
||||
seg := i.buf[:i.rb.flushCopy(i.buf[:])]
|
||||
i.rb.ss.first(info)
|
||||
i.rb.insertUnsafe(input{bytes: d}, j, info)
|
||||
i.multiSeg = d[j+int(info.size):]
|
||||
return seg
|
||||
}
|
||||
i.rb.ss.next(info)
|
||||
i.rb.insertUnsafe(input{bytes: d}, j, info)
|
||||
j += int(info.size)
|
||||
}
|
||||
i.multiSeg = nil
|
||||
i.next = nextComposed
|
||||
return doNormComposed(i)
|
||||
}
|
||||
|
||||
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
|
||||
func nextDecomposed(i *Iter) (next []byte) {
|
||||
outp := 0
|
||||
inCopyStart, outCopyStart := i.p, 0
|
||||
ss := mkStreamSafe(i.info)
|
||||
for {
|
||||
if sz := int(i.info.size); sz <= 1 {
|
||||
p := i.p
|
||||
i.p++ // ASCII or illegal byte. Either way, advance by 1.
|
||||
if i.p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
return i.returnSlice(p, i.p)
|
||||
} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
|
||||
i.next = i.asciiF
|
||||
return i.returnSlice(p, i.p)
|
||||
}
|
||||
outp++
|
||||
} else if d := i.info.Decomposition(); d != nil {
|
||||
// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
|
||||
// Case 1: there is a leftover to copy. In this case the decomposition
|
||||
// must begin with a modifier and should always be appended.
|
||||
// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
|
||||
p := outp + len(d)
|
||||
if outp > 0 {
|
||||
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
|
||||
if p > len(i.buf) {
|
||||
return i.buf[:outp]
|
||||
}
|
||||
} else if i.info.multiSegment() {
|
||||
// outp must be 0 as multi-segment decompositions always
|
||||
// start a new segment.
|
||||
if i.multiSeg == nil {
|
||||
i.multiSeg = d
|
||||
i.next = nextMulti
|
||||
return nextMulti(i)
|
||||
}
|
||||
// We are in the last segment. Treat as normal decomposition.
|
||||
d = i.multiSeg
|
||||
i.multiSeg = nil
|
||||
p = len(d)
|
||||
}
|
||||
prevCC := i.info.tccc
|
||||
if i.p += sz; i.p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
i.info = Properties{} // Force BoundaryBefore to succeed.
|
||||
} else {
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
}
|
||||
switch ss.next(i.info) {
|
||||
case ssOverflow:
|
||||
i.next = nextCGJDecompose
|
||||
fallthrough
|
||||
case ssStarter:
|
||||
if outp > 0 {
|
||||
copy(i.buf[outp:], d)
|
||||
return i.buf[:p]
|
||||
}
|
||||
return d
|
||||
}
|
||||
copy(i.buf[outp:], d)
|
||||
outp = p
|
||||
inCopyStart, outCopyStart = i.p, outp
|
||||
if i.info.ccc < prevCC {
|
||||
goto doNorm
|
||||
}
|
||||
continue
|
||||
} else if r := i.rb.src.hangul(i.p); r != 0 {
|
||||
outp = decomposeHangul(i.buf[:], r)
|
||||
i.p += hangulUTF8Size
|
||||
inCopyStart, outCopyStart = i.p, outp
|
||||
if i.p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
break
|
||||
} else if i.rb.src.hangul(i.p) != 0 {
|
||||
i.next = nextHangul
|
||||
return i.buf[:outp]
|
||||
}
|
||||
} else {
|
||||
p := outp + sz
|
||||
if p > len(i.buf) {
|
||||
break
|
||||
}
|
||||
outp = p
|
||||
i.p += sz
|
||||
}
|
||||
if i.p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
break
|
||||
}
|
||||
prevCC := i.info.tccc
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
if v := ss.next(i.info); v == ssStarter {
|
||||
break
|
||||
} else if v == ssOverflow {
|
||||
i.next = nextCGJDecompose
|
||||
break
|
||||
}
|
||||
if i.info.ccc < prevCC {
|
||||
goto doNorm
|
||||
}
|
||||
}
|
||||
if outCopyStart == 0 {
|
||||
return i.returnSlice(inCopyStart, i.p)
|
||||
} else if inCopyStart < i.p {
|
||||
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
|
||||
}
|
||||
return i.buf[:outp]
|
||||
doNorm:
|
||||
// Insert what we have decomposed so far in the reorderBuffer.
|
||||
// As we will only reorder, there will always be enough room.
|
||||
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
|
||||
i.rb.insertDecomposed(i.buf[0:outp])
|
||||
return doNormDecomposed(i)
|
||||
}
|
||||
|
||||
func doNormDecomposed(i *Iter) []byte {
|
||||
for {
|
||||
if s := i.rb.ss.next(i.info); s == ssOverflow {
|
||||
i.next = nextCGJDecompose
|
||||
break
|
||||
}
|
||||
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
|
||||
if i.p += int(i.info.size); i.p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
break
|
||||
}
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
if i.info.ccc == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
// new segment or too many combining characters: exit normalization
|
||||
return i.buf[:i.rb.flushCopy(i.buf[:])]
|
||||
}
|
||||
|
||||
func nextCGJDecompose(i *Iter) []byte {
|
||||
i.rb.ss = 0
|
||||
i.rb.insertCGJ()
|
||||
i.next = nextDecomposed
|
||||
buf := doNormDecomposed(i)
|
||||
return buf
|
||||
}
|
||||
|
||||
// nextComposed is the implementation of Next for forms NFC and NFKC.
|
||||
func nextComposed(i *Iter) []byte {
|
||||
outp, startp := 0, i.p
|
||||
var prevCC uint8
|
||||
ss := mkStreamSafe(i.info)
|
||||
for {
|
||||
if !i.info.isYesC() {
|
||||
goto doNorm
|
||||
}
|
||||
prevCC = i.info.tccc
|
||||
sz := int(i.info.size)
|
||||
if sz == 0 {
|
||||
sz = 1 // illegal rune: copy byte-by-byte
|
||||
}
|
||||
p := outp + sz
|
||||
if p > len(i.buf) {
|
||||
break
|
||||
}
|
||||
outp = p
|
||||
i.p += sz
|
||||
if i.p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
break
|
||||
} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
|
||||
i.next = i.asciiF
|
||||
break
|
||||
}
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
if v := ss.next(i.info); v == ssStarter {
|
||||
break
|
||||
} else if v == ssOverflow {
|
||||
i.next = nextCGJCompose
|
||||
break
|
||||
}
|
||||
if i.info.ccc < prevCC {
|
||||
goto doNorm
|
||||
}
|
||||
}
|
||||
return i.returnSlice(startp, i.p)
|
||||
doNorm:
|
||||
i.p = startp
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
if i.info.multiSegment() {
|
||||
d := i.info.Decomposition()
|
||||
info := i.rb.f.info(input{bytes: d}, 0)
|
||||
i.rb.insertUnsafe(input{bytes: d}, 0, info)
|
||||
i.multiSeg = d[int(info.size):]
|
||||
i.next = nextMultiNorm
|
||||
return nextMultiNorm(i)
|
||||
}
|
||||
i.rb.ss.first(i.info)
|
||||
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
|
||||
return doNormComposed(i)
|
||||
}
|
||||
|
||||
func doNormComposed(i *Iter) []byte {
|
||||
// First rune should already be inserted.
|
||||
for {
|
||||
if i.p += int(i.info.size); i.p >= i.rb.nsrc {
|
||||
i.setDone()
|
||||
break
|
||||
}
|
||||
i.info = i.rb.f.info(i.rb.src, i.p)
|
||||
if s := i.rb.ss.next(i.info); s == ssStarter {
|
||||
break
|
||||
} else if s == ssOverflow {
|
||||
i.next = nextCGJCompose
|
||||
break
|
||||
}
|
||||
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
|
||||
}
|
||||
i.rb.compose()
|
||||
seg := i.buf[:i.rb.flushCopy(i.buf[:])]
|
||||
return seg
|
||||
}
|
||||
|
||||
func nextCGJCompose(i *Iter) []byte {
|
||||
i.rb.ss = 0 // instead of first
|
||||
i.rb.insertCGJ()
|
||||
i.next = nextComposed
|
||||
// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
|
||||
// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
|
||||
// If we ever change that, insert a check here.
|
||||
i.rb.ss.first(i.info)
|
||||
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
|
||||
return doNormComposed(i)
|
||||
}
|
||||
978
vendor/golang.org/x/text/unicode/norm/maketables.go
generated
vendored
Normal file
978
vendor/golang.org/x/text/unicode/norm/maketables.go
generated
vendored
Normal file
|
|
@ -0,0 +1,978 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
// Normalization table generator.
|
||||
// Data read from the web.
|
||||
// See forminfo.go for a description of the trie values associated with each rune.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/text/internal/gen"
|
||||
"golang.org/x/text/internal/triegen"
|
||||
"golang.org/x/text/internal/ucd"
|
||||
)
|
||||
|
||||
func main() {
|
||||
gen.Init()
|
||||
loadUnicodeData()
|
||||
compactCCC()
|
||||
loadCompositionExclusions()
|
||||
completeCharFields(FCanonical)
|
||||
completeCharFields(FCompatibility)
|
||||
computeNonStarterCounts()
|
||||
verifyComputed()
|
||||
printChars()
|
||||
if *test {
|
||||
testDerived()
|
||||
printTestdata()
|
||||
} else {
|
||||
makeTables()
|
||||
}
|
||||
}
|
||||
|
||||
var (
|
||||
tablelist = flag.String("tables",
|
||||
"all",
|
||||
"comma-separated list of which tables to generate; "+
|
||||
"can be 'decomp', 'recomp', 'info' and 'all'")
|
||||
test = flag.Bool("test",
|
||||
false,
|
||||
"test existing tables against DerivedNormalizationProps and generate test data for regression testing")
|
||||
verbose = flag.Bool("verbose",
|
||||
false,
|
||||
"write data to stdout as it is parsed")
|
||||
)
|
||||
|
||||
const MaxChar = 0x10FFFF // anything above this shouldn't exist
|
||||
|
||||
// Quick Check properties of runes allow us to quickly
|
||||
// determine whether a rune may occur in a normal form.
|
||||
// For a given normal form, a rune may be guaranteed to occur
|
||||
// verbatim (QC=Yes), may or may not combine with another
|
||||
// rune (QC=Maybe), or may not occur (QC=No).
|
||||
type QCResult int
|
||||
|
||||
const (
|
||||
QCUnknown QCResult = iota
|
||||
QCYes
|
||||
QCNo
|
||||
QCMaybe
|
||||
)
|
||||
|
||||
func (r QCResult) String() string {
|
||||
switch r {
|
||||
case QCYes:
|
||||
return "Yes"
|
||||
case QCNo:
|
||||
return "No"
|
||||
case QCMaybe:
|
||||
return "Maybe"
|
||||
}
|
||||
return "***UNKNOWN***"
|
||||
}
|
||||
|
||||
const (
|
||||
FCanonical = iota // NFC or NFD
|
||||
FCompatibility // NFKC or NFKD
|
||||
FNumberOfFormTypes
|
||||
)
|
||||
|
||||
const (
|
||||
MComposed = iota // NFC or NFKC
|
||||
MDecomposed // NFD or NFKD
|
||||
MNumberOfModes
|
||||
)
|
||||
|
||||
// This contains only the properties we're interested in.
|
||||
type Char struct {
|
||||
name string
|
||||
codePoint rune // if zero, this index is not a valid code point.
|
||||
ccc uint8 // canonical combining class
|
||||
origCCC uint8
|
||||
excludeInComp bool // from CompositionExclusions.txt
|
||||
compatDecomp bool // it has a compatibility expansion
|
||||
|
||||
nTrailingNonStarters uint8
|
||||
nLeadingNonStarters uint8 // must be equal to trailing if non-zero
|
||||
|
||||
forms [FNumberOfFormTypes]FormInfo // For FCanonical and FCompatibility
|
||||
|
||||
state State
|
||||
}
|
||||
|
||||
var chars = make([]Char, MaxChar+1)
|
||||
var cccMap = make(map[uint8]uint8)
|
||||
|
||||
func (c Char) String() string {
|
||||
buf := new(bytes.Buffer)
|
||||
|
||||
fmt.Fprintf(buf, "%U [%s]:\n", c.codePoint, c.name)
|
||||
fmt.Fprintf(buf, " ccc: %v\n", c.ccc)
|
||||
fmt.Fprintf(buf, " excludeInComp: %v\n", c.excludeInComp)
|
||||
fmt.Fprintf(buf, " compatDecomp: %v\n", c.compatDecomp)
|
||||
fmt.Fprintf(buf, " state: %v\n", c.state)
|
||||
fmt.Fprintf(buf, " NFC:\n")
|
||||
fmt.Fprint(buf, c.forms[FCanonical])
|
||||
fmt.Fprintf(buf, " NFKC:\n")
|
||||
fmt.Fprint(buf, c.forms[FCompatibility])
|
||||
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
// In UnicodeData.txt, some ranges are marked like this:
|
||||
// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
|
||||
// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
|
||||
// parseCharacter keeps a state variable indicating the weirdness.
|
||||
type State int
|
||||
|
||||
const (
|
||||
SNormal State = iota // known to be zero for the type
|
||||
SFirst
|
||||
SLast
|
||||
SMissing
|
||||
)
|
||||
|
||||
var lastChar = rune('\u0000')
|
||||
|
||||
func (c Char) isValid() bool {
|
||||
return c.codePoint != 0 && c.state != SMissing
|
||||
}
|
||||
|
||||
type FormInfo struct {
|
||||
quickCheck [MNumberOfModes]QCResult // index: MComposed or MDecomposed
|
||||
verified [MNumberOfModes]bool // index: MComposed or MDecomposed
|
||||
|
||||
combinesForward bool // May combine with rune on the right
|
||||
combinesBackward bool // May combine with rune on the left
|
||||
isOneWay bool // Never appears in result
|
||||
inDecomp bool // Some decompositions result in this char.
|
||||
decomp Decomposition
|
||||
expandedDecomp Decomposition
|
||||
}
|
||||
|
||||
func (f FormInfo) String() string {
|
||||
buf := bytes.NewBuffer(make([]byte, 0))
|
||||
|
||||
fmt.Fprintf(buf, " quickCheck[C]: %v\n", f.quickCheck[MComposed])
|
||||
fmt.Fprintf(buf, " quickCheck[D]: %v\n", f.quickCheck[MDecomposed])
|
||||
fmt.Fprintf(buf, " cmbForward: %v\n", f.combinesForward)
|
||||
fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward)
|
||||
fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay)
|
||||
fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp)
|
||||
fmt.Fprintf(buf, " decomposition: %X\n", f.decomp)
|
||||
fmt.Fprintf(buf, " expandedDecomp: %X\n", f.expandedDecomp)
|
||||
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
type Decomposition []rune
|
||||
|
||||
func parseDecomposition(s string, skipfirst bool) (a []rune, err error) {
|
||||
decomp := strings.Split(s, " ")
|
||||
if len(decomp) > 0 && skipfirst {
|
||||
decomp = decomp[1:]
|
||||
}
|
||||
for _, d := range decomp {
|
||||
point, err := strconv.ParseUint(d, 16, 64)
|
||||
if err != nil {
|
||||
return a, err
|
||||
}
|
||||
a = append(a, rune(point))
|
||||
}
|
||||
return a, nil
|
||||
}
|
||||
|
||||
func loadUnicodeData() {
|
||||
f := gen.OpenUCDFile("UnicodeData.txt")
|
||||
defer f.Close()
|
||||
p := ucd.New(f)
|
||||
for p.Next() {
|
||||
r := p.Rune(ucd.CodePoint)
|
||||
char := &chars[r]
|
||||
|
||||
char.ccc = uint8(p.Uint(ucd.CanonicalCombiningClass))
|
||||
decmap := p.String(ucd.DecompMapping)
|
||||
|
||||
exp, err := parseDecomposition(decmap, false)
|
||||
isCompat := false
|
||||
if err != nil {
|
||||
if len(decmap) > 0 {
|
||||
exp, err = parseDecomposition(decmap, true)
|
||||
if err != nil {
|
||||
log.Fatalf(`%U: bad decomp |%v|: "%s"`, r, decmap, err)
|
||||
}
|
||||
isCompat = true
|
||||
}
|
||||
}
|
||||
|
||||
char.name = p.String(ucd.Name)
|
||||
char.codePoint = r
|
||||
char.forms[FCompatibility].decomp = exp
|
||||
if !isCompat {
|
||||
char.forms[FCanonical].decomp = exp
|
||||
} else {
|
||||
char.compatDecomp = true
|
||||
}
|
||||
if len(decmap) > 0 {
|
||||
char.forms[FCompatibility].decomp = exp
|
||||
}
|
||||
}
|
||||
if err := p.Err(); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
// compactCCC converts the sparse set of CCC values to a continguous one,
|
||||
// reducing the number of bits needed from 8 to 6.
|
||||
func compactCCC() {
|
||||
m := make(map[uint8]uint8)
|
||||
for i := range chars {
|
||||
c := &chars[i]
|
||||
m[c.ccc] = 0
|
||||
}
|
||||
cccs := []int{}
|
||||
for v, _ := range m {
|
||||
cccs = append(cccs, int(v))
|
||||
}
|
||||
sort.Ints(cccs)
|
||||
for i, c := range cccs {
|
||||
cccMap[uint8(i)] = uint8(c)
|
||||
m[uint8(c)] = uint8(i)
|
||||
}
|
||||
for i := range chars {
|
||||
c := &chars[i]
|
||||
c.origCCC = c.ccc
|
||||
c.ccc = m[c.ccc]
|
||||
}
|
||||
if len(m) >= 1<<6 {
|
||||
log.Fatalf("too many difference CCC values: %d >= 64", len(m))
|
||||
}
|
||||
}
|
||||
|
||||
// CompositionExclusions.txt has form:
|
||||
// 0958 # ...
|
||||
// See http://unicode.org/reports/tr44/ for full explanation
|
||||
func loadCompositionExclusions() {
|
||||
f := gen.OpenUCDFile("CompositionExclusions.txt")
|
||||
defer f.Close()
|
||||
p := ucd.New(f)
|
||||
for p.Next() {
|
||||
c := &chars[p.Rune(0)]
|
||||
if c.excludeInComp {
|
||||
log.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint)
|
||||
}
|
||||
c.excludeInComp = true
|
||||
}
|
||||
if e := p.Err(); e != nil {
|
||||
log.Fatal(e)
|
||||
}
|
||||
}
|
||||
|
||||
// hasCompatDecomp returns true if any of the recursive
|
||||
// decompositions contains a compatibility expansion.
|
||||
// In this case, the character may not occur in NFK*.
|
||||
func hasCompatDecomp(r rune) bool {
|
||||
c := &chars[r]
|
||||
if c.compatDecomp {
|
||||
return true
|
||||
}
|
||||
for _, d := range c.forms[FCompatibility].decomp {
|
||||
if hasCompatDecomp(d) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// Hangul related constants.
|
||||
const (
|
||||
HangulBase = 0xAC00
|
||||
HangulEnd = 0xD7A4 // hangulBase + Jamo combinations (19 * 21 * 28)
|
||||
|
||||
JamoLBase = 0x1100
|
||||
JamoLEnd = 0x1113
|
||||
JamoVBase = 0x1161
|
||||
JamoVEnd = 0x1176
|
||||
JamoTBase = 0x11A8
|
||||
JamoTEnd = 0x11C3
|
||||
|
||||
JamoLVTCount = 19 * 21 * 28
|
||||
JamoTCount = 28
|
||||
)
|
||||
|
||||
func isHangul(r rune) bool {
|
||||
return HangulBase <= r && r < HangulEnd
|
||||
}
|
||||
|
||||
func isHangulWithoutJamoT(r rune) bool {
|
||||
if !isHangul(r) {
|
||||
return false
|
||||
}
|
||||
r -= HangulBase
|
||||
return r < JamoLVTCount && r%JamoTCount == 0
|
||||
}
|
||||
|
||||
func ccc(r rune) uint8 {
|
||||
return chars[r].ccc
|
||||
}
|
||||
|
||||
// Insert a rune in a buffer, ordered by Canonical Combining Class.
|
||||
func insertOrdered(b Decomposition, r rune) Decomposition {
|
||||
n := len(b)
|
||||
b = append(b, 0)
|
||||
cc := ccc(r)
|
||||
if cc > 0 {
|
||||
// Use bubble sort.
|
||||
for ; n > 0; n-- {
|
||||
if ccc(b[n-1]) <= cc {
|
||||
break
|
||||
}
|
||||
b[n] = b[n-1]
|
||||
}
|
||||
}
|
||||
b[n] = r
|
||||
return b
|
||||
}
|
||||
|
||||
// Recursively decompose.
|
||||
func decomposeRecursive(form int, r rune, d Decomposition) Decomposition {
|
||||
dcomp := chars[r].forms[form].decomp
|
||||
if len(dcomp) == 0 {
|
||||
return insertOrdered(d, r)
|
||||
}
|
||||
for _, c := range dcomp {
|
||||
d = decomposeRecursive(form, c, d)
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
func completeCharFields(form int) {
|
||||
// Phase 0: pre-expand decomposition.
|
||||
for i := range chars {
|
||||
f := &chars[i].forms[form]
|
||||
if len(f.decomp) == 0 {
|
||||
continue
|
||||
}
|
||||
exp := make(Decomposition, 0)
|
||||
for _, c := range f.decomp {
|
||||
exp = decomposeRecursive(form, c, exp)
|
||||
}
|
||||
f.expandedDecomp = exp
|
||||
}
|
||||
|
||||
// Phase 1: composition exclusion, mark decomposition.
|
||||
for i := range chars {
|
||||
c := &chars[i]
|
||||
f := &c.forms[form]
|
||||
|
||||
// Marks script-specific exclusions and version restricted.
|
||||
f.isOneWay = c.excludeInComp
|
||||
|
||||
// Singletons
|
||||
f.isOneWay = f.isOneWay || len(f.decomp) == 1
|
||||
|
||||
// Non-starter decompositions
|
||||
if len(f.decomp) > 1 {
|
||||
chk := c.ccc != 0 || chars[f.decomp[0]].ccc != 0
|
||||
f.isOneWay = f.isOneWay || chk
|
||||
}
|
||||
|
||||
// Runes that decompose into more than two runes.
|
||||
f.isOneWay = f.isOneWay || len(f.decomp) > 2
|
||||
|
||||
if form == FCompatibility {
|
||||
f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint)
|
||||
}
|
||||
|
||||
for _, r := range f.decomp {
|
||||
chars[r].forms[form].inDecomp = true
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: forward and backward combining.
|
||||
for i := range chars {
|
||||
c := &chars[i]
|
||||
f := &c.forms[form]
|
||||
|
||||
if !f.isOneWay && len(f.decomp) == 2 {
|
||||
f0 := &chars[f.decomp[0]].forms[form]
|
||||
f1 := &chars[f.decomp[1]].forms[form]
|
||||
if !f0.isOneWay {
|
||||
f0.combinesForward = true
|
||||
}
|
||||
if !f1.isOneWay {
|
||||
f1.combinesBackward = true
|
||||
}
|
||||
}
|
||||
if isHangulWithoutJamoT(rune(i)) {
|
||||
f.combinesForward = true
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 3: quick check values.
|
||||
for i := range chars {
|
||||
c := &chars[i]
|
||||
f := &c.forms[form]
|
||||
|
||||
switch {
|
||||
case len(f.decomp) > 0:
|
||||
f.quickCheck[MDecomposed] = QCNo
|
||||
case isHangul(rune(i)):
|
||||
f.quickCheck[MDecomposed] = QCNo
|
||||
default:
|
||||
f.quickCheck[MDecomposed] = QCYes
|
||||
}
|
||||
switch {
|
||||
case f.isOneWay:
|
||||
f.quickCheck[MComposed] = QCNo
|
||||
case (i & 0xffff00) == JamoLBase:
|
||||
f.quickCheck[MComposed] = QCYes
|
||||
if JamoLBase <= i && i < JamoLEnd {
|
||||
f.combinesForward = true
|
||||
}
|
||||
if JamoVBase <= i && i < JamoVEnd {
|
||||
f.quickCheck[MComposed] = QCMaybe
|
||||
f.combinesBackward = true
|
||||
f.combinesForward = true
|
||||
}
|
||||
if JamoTBase <= i && i < JamoTEnd {
|
||||
f.quickCheck[MComposed] = QCMaybe
|
||||
f.combinesBackward = true
|
||||
}
|
||||
case !f.combinesBackward:
|
||||
f.quickCheck[MComposed] = QCYes
|
||||
default:
|
||||
f.quickCheck[MComposed] = QCMaybe
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func computeNonStarterCounts() {
|
||||
// Phase 4: leading and trailing non-starter count
|
||||
for i := range chars {
|
||||
c := &chars[i]
|
||||
|
||||
runes := []rune{rune(i)}
|
||||
// We always use FCompatibility so that the CGJ insertion points do not
|
||||
// change for repeated normalizations with different forms.
|
||||
if exp := c.forms[FCompatibility].expandedDecomp; len(exp) > 0 {
|
||||
runes = exp
|
||||
}
|
||||
// We consider runes that combine backwards to be non-starters for the
|
||||
// purpose of Stream-Safe Text Processing.
|
||||
for _, r := range runes {
|
||||
if cr := &chars[r]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward {
|
||||
break
|
||||
}
|
||||
c.nLeadingNonStarters++
|
||||
}
|
||||
for i := len(runes) - 1; i >= 0; i-- {
|
||||
if cr := &chars[runes[i]]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward {
|
||||
break
|
||||
}
|
||||
c.nTrailingNonStarters++
|
||||
}
|
||||
if c.nTrailingNonStarters > 3 {
|
||||
log.Fatalf("%U: Decomposition with more than 3 (%d) trailing modifiers (%U)", i, c.nTrailingNonStarters, runes)
|
||||
}
|
||||
|
||||
if isHangul(rune(i)) {
|
||||
c.nTrailingNonStarters = 2
|
||||
if isHangulWithoutJamoT(rune(i)) {
|
||||
c.nTrailingNonStarters = 1
|
||||
}
|
||||
}
|
||||
|
||||
if l, t := c.nLeadingNonStarters, c.nTrailingNonStarters; l > 0 && l != t {
|
||||
log.Fatalf("%U: number of leading and trailing non-starters should be equal (%d vs %d)", i, l, t)
|
||||
}
|
||||
if t := c.nTrailingNonStarters; t > 3 {
|
||||
log.Fatalf("%U: number of trailing non-starters is %d > 3", t)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func printBytes(w io.Writer, b []byte, name string) {
|
||||
fmt.Fprintf(w, "// %s: %d bytes\n", name, len(b))
|
||||
fmt.Fprintf(w, "var %s = [...]byte {", name)
|
||||
for i, c := range b {
|
||||
switch {
|
||||
case i%64 == 0:
|
||||
fmt.Fprintf(w, "\n// Bytes %x - %x\n", i, i+63)
|
||||
case i%8 == 0:
|
||||
fmt.Fprintf(w, "\n")
|
||||
}
|
||||
fmt.Fprintf(w, "0x%.2X, ", c)
|
||||
}
|
||||
fmt.Fprint(w, "\n}\n\n")
|
||||
}
|
||||
|
||||
// See forminfo.go for format.
|
||||
func makeEntry(f *FormInfo, c *Char) uint16 {
|
||||
e := uint16(0)
|
||||
if r := c.codePoint; HangulBase <= r && r < HangulEnd {
|
||||
e |= 0x40
|
||||
}
|
||||
if f.combinesForward {
|
||||
e |= 0x20
|
||||
}
|
||||
if f.quickCheck[MDecomposed] == QCNo {
|
||||
e |= 0x4
|
||||
}
|
||||
switch f.quickCheck[MComposed] {
|
||||
case QCYes:
|
||||
case QCNo:
|
||||
e |= 0x10
|
||||
case QCMaybe:
|
||||
e |= 0x18
|
||||
default:
|
||||
log.Fatalf("Illegal quickcheck value %v.", f.quickCheck[MComposed])
|
||||
}
|
||||
e |= uint16(c.nTrailingNonStarters)
|
||||
return e
|
||||
}
|
||||
|
||||
// decompSet keeps track of unique decompositions, grouped by whether
|
||||
// the decomposition is followed by a trailing and/or leading CCC.
|
||||
type decompSet [7]map[string]bool
|
||||
|
||||
const (
|
||||
normalDecomp = iota
|
||||
firstMulti
|
||||
firstCCC
|
||||
endMulti
|
||||
firstLeadingCCC
|
||||
firstCCCZeroExcept
|
||||
firstStarterWithNLead
|
||||
lastDecomp
|
||||
)
|
||||
|
||||
var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "firstStarterWithNLead", "lastDecomp"}
|
||||
|
||||
func makeDecompSet() decompSet {
|
||||
m := decompSet{}
|
||||
for i := range m {
|
||||
m[i] = make(map[string]bool)
|
||||
}
|
||||
return m
|
||||
}
|
||||
func (m *decompSet) insert(key int, s string) {
|
||||
m[key][s] = true
|
||||
}
|
||||
|
||||
func printCharInfoTables(w io.Writer) int {
|
||||
mkstr := func(r rune, f *FormInfo) (int, string) {
|
||||
d := f.expandedDecomp
|
||||
s := string([]rune(d))
|
||||
if max := 1 << 6; len(s) >= max {
|
||||
const msg = "%U: too many bytes in decomposition: %d >= %d"
|
||||
log.Fatalf(msg, r, len(s), max)
|
||||
}
|
||||
head := uint8(len(s))
|
||||
if f.quickCheck[MComposed] != QCYes {
|
||||
head |= 0x40
|
||||
}
|
||||
if f.combinesForward {
|
||||
head |= 0x80
|
||||
}
|
||||
s = string([]byte{head}) + s
|
||||
|
||||
lccc := ccc(d[0])
|
||||
tccc := ccc(d[len(d)-1])
|
||||
cc := ccc(r)
|
||||
if cc != 0 && lccc == 0 && tccc == 0 {
|
||||
log.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc)
|
||||
}
|
||||
if tccc < lccc && lccc != 0 {
|
||||
const msg = "%U: lccc (%d) must be <= tcc (%d)"
|
||||
log.Fatalf(msg, r, lccc, tccc)
|
||||
}
|
||||
index := normalDecomp
|
||||
nTrail := chars[r].nTrailingNonStarters
|
||||
if tccc > 0 || lccc > 0 || nTrail > 0 {
|
||||
tccc <<= 2
|
||||
tccc |= nTrail
|
||||
s += string([]byte{tccc})
|
||||
index = endMulti
|
||||
for _, r := range d[1:] {
|
||||
if ccc(r) == 0 {
|
||||
index = firstCCC
|
||||
}
|
||||
}
|
||||
if lccc > 0 {
|
||||
s += string([]byte{lccc})
|
||||
if index == firstCCC {
|
||||
log.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r)
|
||||
}
|
||||
index = firstLeadingCCC
|
||||
}
|
||||
if cc != lccc {
|
||||
if cc != 0 {
|
||||
log.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc)
|
||||
}
|
||||
index = firstCCCZeroExcept
|
||||
}
|
||||
} else if len(d) > 1 {
|
||||
index = firstMulti
|
||||
}
|
||||
return index, s
|
||||
}
|
||||
|
||||
decompSet := makeDecompSet()
|
||||
const nLeadStr = "\x00\x01" // 0-byte length and tccc with nTrail.
|
||||
decompSet.insert(firstStarterWithNLead, nLeadStr)
|
||||
|
||||
// Store the uniqued decompositions in a byte buffer,
|
||||
// preceded by their byte length.
|
||||
for _, c := range chars {
|
||||
for _, f := range c.forms {
|
||||
if len(f.expandedDecomp) == 0 {
|
||||
continue
|
||||
}
|
||||
if f.combinesBackward {
|
||||
log.Fatalf("%U: combinesBackward and decompose", c.codePoint)
|
||||
}
|
||||
index, s := mkstr(c.codePoint, &f)
|
||||
decompSet.insert(index, s)
|
||||
}
|
||||
}
|
||||
|
||||
decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
|
||||
size := 0
|
||||
positionMap := make(map[string]uint16)
|
||||
decompositions.WriteString("\000")
|
||||
fmt.Fprintln(w, "const (")
|
||||
for i, m := range decompSet {
|
||||
sa := []string{}
|
||||
for s := range m {
|
||||
sa = append(sa, s)
|
||||
}
|
||||
sort.Strings(sa)
|
||||
for _, s := range sa {
|
||||
p := decompositions.Len()
|
||||
decompositions.WriteString(s)
|
||||
positionMap[s] = uint16(p)
|
||||
}
|
||||
if cname[i] != "" {
|
||||
fmt.Fprintf(w, "%s = 0x%X\n", cname[i], decompositions.Len())
|
||||
}
|
||||
}
|
||||
fmt.Fprintln(w, "maxDecomp = 0x8000")
|
||||
fmt.Fprintln(w, ")")
|
||||
b := decompositions.Bytes()
|
||||
printBytes(w, b, "decomps")
|
||||
size += len(b)
|
||||
|
||||
varnames := []string{"nfc", "nfkc"}
|
||||
for i := 0; i < FNumberOfFormTypes; i++ {
|
||||
trie := triegen.NewTrie(varnames[i])
|
||||
|
||||
for r, c := range chars {
|
||||
f := c.forms[i]
|
||||
d := f.expandedDecomp
|
||||
if len(d) != 0 {
|
||||
_, key := mkstr(c.codePoint, &f)
|
||||
trie.Insert(rune(r), uint64(positionMap[key]))
|
||||
if c.ccc != ccc(d[0]) {
|
||||
// We assume the lead ccc of a decomposition !=0 in this case.
|
||||
if ccc(d[0]) == 0 {
|
||||
log.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc)
|
||||
}
|
||||
}
|
||||
} else if c.nLeadingNonStarters > 0 && len(f.expandedDecomp) == 0 && c.ccc == 0 && !f.combinesBackward {
|
||||
// Handle cases where it can't be detected that the nLead should be equal
|
||||
// to nTrail.
|
||||
trie.Insert(c.codePoint, uint64(positionMap[nLeadStr]))
|
||||
} else if v := makeEntry(&f, &c)<<8 | uint16(c.ccc); v != 0 {
|
||||
trie.Insert(c.codePoint, uint64(0x8000|v))
|
||||
}
|
||||
}
|
||||
sz, err := trie.Gen(w, triegen.Compact(&normCompacter{name: varnames[i]}))
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
size += sz
|
||||
}
|
||||
return size
|
||||
}
|
||||
|
||||
func contains(sa []string, s string) bool {
|
||||
for _, a := range sa {
|
||||
if a == s {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func makeTables() {
|
||||
w := &bytes.Buffer{}
|
||||
|
||||
size := 0
|
||||
if *tablelist == "" {
|
||||
return
|
||||
}
|
||||
list := strings.Split(*tablelist, ",")
|
||||
if *tablelist == "all" {
|
||||
list = []string{"recomp", "info"}
|
||||
}
|
||||
|
||||
// Compute maximum decomposition size.
|
||||
max := 0
|
||||
for _, c := range chars {
|
||||
if n := len(string(c.forms[FCompatibility].expandedDecomp)); n > max {
|
||||
max = n
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Fprintln(w, "const (")
|
||||
fmt.Fprintln(w, "\t// Version is the Unicode edition from which the tables are derived.")
|
||||
fmt.Fprintf(w, "\tVersion = %q\n", gen.UnicodeVersion())
|
||||
fmt.Fprintln(w)
|
||||
fmt.Fprintln(w, "\t// MaxTransformChunkSize indicates the maximum number of bytes that Transform")
|
||||
fmt.Fprintln(w, "\t// may need to write atomically for any Form. Making a destination buffer at")
|
||||
fmt.Fprintln(w, "\t// least this size ensures that Transform can always make progress and that")
|
||||
fmt.Fprintln(w, "\t// the user does not need to grow the buffer on an ErrShortDst.")
|
||||
fmt.Fprintf(w, "\tMaxTransformChunkSize = %d+maxNonStarters*4\n", len(string(0x034F))+max)
|
||||
fmt.Fprintln(w, ")\n")
|
||||
|
||||
// Print the CCC remap table.
|
||||
size += len(cccMap)
|
||||
fmt.Fprintf(w, "var ccc = [%d]uint8{", len(cccMap))
|
||||
for i := 0; i < len(cccMap); i++ {
|
||||
if i%8 == 0 {
|
||||
fmt.Fprintln(w)
|
||||
}
|
||||
fmt.Fprintf(w, "%3d, ", cccMap[uint8(i)])
|
||||
}
|
||||
fmt.Fprintln(w, "\n}\n")
|
||||
|
||||
if contains(list, "info") {
|
||||
size += printCharInfoTables(w)
|
||||
}
|
||||
|
||||
if contains(list, "recomp") {
|
||||
// Note that we use 32 bit keys, instead of 64 bit.
|
||||
// This clips the bits of three entries, but we know
|
||||
// this won't cause a collision. The compiler will catch
|
||||
// any changes made to UnicodeData.txt that introduces
|
||||
// a collision.
|
||||
// Note that the recomposition map for NFC and NFKC
|
||||
// are identical.
|
||||
|
||||
// Recomposition map
|
||||
nrentries := 0
|
||||
for _, c := range chars {
|
||||
f := c.forms[FCanonical]
|
||||
if !f.isOneWay && len(f.decomp) > 0 {
|
||||
nrentries++
|
||||
}
|
||||
}
|
||||
sz := nrentries * 8
|
||||
size += sz
|
||||
fmt.Fprintf(w, "// recompMap: %d bytes (entries only)\n", sz)
|
||||
fmt.Fprintln(w, "var recompMap = map[uint32]rune{")
|
||||
for i, c := range chars {
|
||||
f := c.forms[FCanonical]
|
||||
d := f.decomp
|
||||
if !f.isOneWay && len(d) > 0 {
|
||||
key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1]))
|
||||
fmt.Fprintf(w, "0x%.8X: 0x%.4X,\n", key, i)
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(w, "}\n\n")
|
||||
}
|
||||
|
||||
fmt.Fprintf(w, "// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size)
|
||||
gen.WriteGoFile("tables.go", "norm", w.Bytes())
|
||||
}
|
||||
|
||||
func printChars() {
|
||||
if *verbose {
|
||||
for _, c := range chars {
|
||||
if !c.isValid() || c.state == SMissing {
|
||||
continue
|
||||
}
|
||||
fmt.Println(c)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// verifyComputed does various consistency tests.
|
||||
func verifyComputed() {
|
||||
for i, c := range chars {
|
||||
for _, f := range c.forms {
|
||||
isNo := (f.quickCheck[MDecomposed] == QCNo)
|
||||
if (len(f.decomp) > 0) != isNo && !isHangul(rune(i)) {
|
||||
log.Fatalf("%U: NF*D QC must be No if rune decomposes", i)
|
||||
}
|
||||
|
||||
isMaybe := f.quickCheck[MComposed] == QCMaybe
|
||||
if f.combinesBackward != isMaybe {
|
||||
log.Fatalf("%U: NF*C QC must be Maybe if combinesBackward", i)
|
||||
}
|
||||
if len(f.decomp) > 0 && f.combinesForward && isMaybe {
|
||||
log.Fatalf("%U: NF*C QC must be Yes or No if combinesForward and decomposes", i)
|
||||
}
|
||||
|
||||
if len(f.expandedDecomp) != 0 {
|
||||
continue
|
||||
}
|
||||
if a, b := c.nLeadingNonStarters > 0, (c.ccc > 0 || f.combinesBackward); a != b {
|
||||
// We accept these runes to be treated differently (it only affects
|
||||
// segment breaking in iteration, most likely on improper use), but
|
||||
// reconsider if more characters are added.
|
||||
// U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK;Lm;0;L;<narrow> 3099;;;;N;;;;;
|
||||
// U+FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm;0;L;<narrow> 309A;;;;N;;;;;
|
||||
// U+3133 HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<compat> 11AA;;;;N;HANGUL LETTER GIYEOG SIOS;;;;
|
||||
// U+318E HANGUL LETTER ARAEAE;Lo;0;L;<compat> 11A1;;;;N;HANGUL LETTER ALAE AE;;;;
|
||||
// U+FFA3 HALFWIDTH HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<narrow> 3133;;;;N;HALFWIDTH HANGUL LETTER GIYEOG SIOS;;;;
|
||||
// U+FFDC HALFWIDTH HANGUL LETTER I;Lo;0;L;<narrow> 3163;;;;N;;;;;
|
||||
if i != 0xFF9E && i != 0xFF9F && !(0x3133 <= i && i <= 0x318E) && !(0xFFA3 <= i && i <= 0xFFDC) {
|
||||
log.Fatalf("%U: nLead was %v; want %v", i, a, b)
|
||||
}
|
||||
}
|
||||
}
|
||||
nfc := c.forms[FCanonical]
|
||||
nfkc := c.forms[FCompatibility]
|
||||
if nfc.combinesBackward != nfkc.combinesBackward {
|
||||
log.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Use values in DerivedNormalizationProps.txt to compare against the
|
||||
// values we computed.
|
||||
// DerivedNormalizationProps.txt has form:
|
||||
// 00C0..00C5 ; NFD_QC; N # ...
|
||||
// 0374 ; NFD_QC; N # ...
|
||||
// See http://unicode.org/reports/tr44/ for full explanation
|
||||
func testDerived() {
|
||||
f := gen.OpenUCDFile("DerivedNormalizationProps.txt")
|
||||
defer f.Close()
|
||||
p := ucd.New(f)
|
||||
for p.Next() {
|
||||
r := p.Rune(0)
|
||||
c := &chars[r]
|
||||
|
||||
var ftype, mode int
|
||||
qt := p.String(1)
|
||||
switch qt {
|
||||
case "NFC_QC":
|
||||
ftype, mode = FCanonical, MComposed
|
||||
case "NFD_QC":
|
||||
ftype, mode = FCanonical, MDecomposed
|
||||
case "NFKC_QC":
|
||||
ftype, mode = FCompatibility, MComposed
|
||||
case "NFKD_QC":
|
||||
ftype, mode = FCompatibility, MDecomposed
|
||||
default:
|
||||
continue
|
||||
}
|
||||
var qr QCResult
|
||||
switch p.String(2) {
|
||||
case "Y":
|
||||
qr = QCYes
|
||||
case "N":
|
||||
qr = QCNo
|
||||
case "M":
|
||||
qr = QCMaybe
|
||||
default:
|
||||
log.Fatalf(`Unexpected quick check value "%s"`, p.String(2))
|
||||
}
|
||||
if got := c.forms[ftype].quickCheck[mode]; got != qr {
|
||||
log.Printf("%U: FAILED %s (was %v need %v)\n", r, qt, got, qr)
|
||||
}
|
||||
c.forms[ftype].verified[mode] = true
|
||||
}
|
||||
if err := p.Err(); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
// Any unspecified value must be QCYes. Verify this.
|
||||
for i, c := range chars {
|
||||
for j, fd := range c.forms {
|
||||
for k, qr := range fd.quickCheck {
|
||||
if !fd.verified[k] && qr != QCYes {
|
||||
m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n"
|
||||
log.Printf(m, i, j, k, qr, c.name)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var testHeader = `const (
|
||||
Yes = iota
|
||||
No
|
||||
Maybe
|
||||
)
|
||||
|
||||
type formData struct {
|
||||
qc uint8
|
||||
combinesForward bool
|
||||
decomposition string
|
||||
}
|
||||
|
||||
type runeData struct {
|
||||
r rune
|
||||
ccc uint8
|
||||
nLead uint8
|
||||
nTrail uint8
|
||||
f [2]formData // 0: canonical; 1: compatibility
|
||||
}
|
||||
|
||||
func f(qc uint8, cf bool, dec string) [2]formData {
|
||||
return [2]formData{{qc, cf, dec}, {qc, cf, dec}}
|
||||
}
|
||||
|
||||
func g(qc, qck uint8, cf, cfk bool, d, dk string) [2]formData {
|
||||
return [2]formData{{qc, cf, d}, {qck, cfk, dk}}
|
||||
}
|
||||
|
||||
var testData = []runeData{
|
||||
`
|
||||
|
||||
func printTestdata() {
|
||||
type lastInfo struct {
|
||||
ccc uint8
|
||||
nLead uint8
|
||||
nTrail uint8
|
||||
f string
|
||||
}
|
||||
|
||||
last := lastInfo{}
|
||||
w := &bytes.Buffer{}
|
||||
fmt.Fprintf(w, testHeader)
|
||||
for r, c := range chars {
|
||||
f := c.forms[FCanonical]
|
||||
qc, cf, d := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp)
|
||||
f = c.forms[FCompatibility]
|
||||
qck, cfk, dk := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp)
|
||||
s := ""
|
||||
if d == dk && qc == qck && cf == cfk {
|
||||
s = fmt.Sprintf("f(%s, %v, %q)", qc, cf, d)
|
||||
} else {
|
||||
s = fmt.Sprintf("g(%s, %s, %v, %v, %q, %q)", qc, qck, cf, cfk, d, dk)
|
||||
}
|
||||
current := lastInfo{c.ccc, c.nLeadingNonStarters, c.nTrailingNonStarters, s}
|
||||
if last != current {
|
||||
fmt.Fprintf(w, "\t{0x%x, %d, %d, %d, %s},\n", r, c.origCCC, c.nLeadingNonStarters, c.nTrailingNonStarters, s)
|
||||
last = current
|
||||
}
|
||||
}
|
||||
fmt.Fprintln(w, "}")
|
||||
gen.WriteGoFile("data_test.go", "norm", w.Bytes())
|
||||
}
|
||||
576
vendor/golang.org/x/text/unicode/norm/normalize.go
generated
vendored
Normal file
576
vendor/golang.org/x/text/unicode/norm/normalize.go
generated
vendored
Normal file
|
|
@ -0,0 +1,576 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run maketables.go triegen.go
|
||||
//go:generate go run maketables.go triegen.go -test
|
||||
|
||||
// Package norm contains types and functions for normalizing Unicode strings.
|
||||
package norm
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// A Form denotes a canonical representation of Unicode code points.
|
||||
// The Unicode-defined normalization and equivalence forms are:
|
||||
//
|
||||
// NFC Unicode Normalization Form C
|
||||
// NFD Unicode Normalization Form D
|
||||
// NFKC Unicode Normalization Form KC
|
||||
// NFKD Unicode Normalization Form KD
|
||||
//
|
||||
// For a Form f, this documentation uses the notation f(x) to mean
|
||||
// the bytes or string x converted to the given form.
|
||||
// A position n in x is called a boundary if conversion to the form can
|
||||
// proceed independently on both sides:
|
||||
// f(x) == append(f(x[0:n]), f(x[n:])...)
|
||||
//
|
||||
// References: http://unicode.org/reports/tr15/ and
|
||||
// http://unicode.org/notes/tn5/.
|
||||
type Form int
|
||||
|
||||
const (
|
||||
NFC Form = iota
|
||||
NFD
|
||||
NFKC
|
||||
NFKD
|
||||
)
|
||||
|
||||
// Bytes returns f(b). May return b if f(b) = b.
|
||||
func (f Form) Bytes(b []byte) []byte {
|
||||
src := inputBytes(b)
|
||||
ft := formTable[f]
|
||||
n, ok := ft.quickSpan(src, 0, len(b), true)
|
||||
if ok {
|
||||
return b
|
||||
}
|
||||
out := make([]byte, n, len(b))
|
||||
copy(out, b[0:n])
|
||||
rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
|
||||
return doAppendInner(&rb, n)
|
||||
}
|
||||
|
||||
// String returns f(s).
|
||||
func (f Form) String(s string) string {
|
||||
src := inputString(s)
|
||||
ft := formTable[f]
|
||||
n, ok := ft.quickSpan(src, 0, len(s), true)
|
||||
if ok {
|
||||
return s
|
||||
}
|
||||
out := make([]byte, n, len(s))
|
||||
copy(out, s[0:n])
|
||||
rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
|
||||
return string(doAppendInner(&rb, n))
|
||||
}
|
||||
|
||||
// IsNormal returns true if b == f(b).
|
||||
func (f Form) IsNormal(b []byte) bool {
|
||||
src := inputBytes(b)
|
||||
ft := formTable[f]
|
||||
bp, ok := ft.quickSpan(src, 0, len(b), true)
|
||||
if ok {
|
||||
return true
|
||||
}
|
||||
rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
|
||||
rb.setFlusher(nil, cmpNormalBytes)
|
||||
for bp < len(b) {
|
||||
rb.out = b[bp:]
|
||||
if bp = decomposeSegment(&rb, bp, true); bp < 0 {
|
||||
return false
|
||||
}
|
||||
bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func cmpNormalBytes(rb *reorderBuffer) bool {
|
||||
b := rb.out
|
||||
for i := 0; i < rb.nrune; i++ {
|
||||
info := rb.rune[i]
|
||||
if int(info.size) > len(b) {
|
||||
return false
|
||||
}
|
||||
p := info.pos
|
||||
pe := p + info.size
|
||||
for ; p < pe; p++ {
|
||||
if b[0] != rb.byte[p] {
|
||||
return false
|
||||
}
|
||||
b = b[1:]
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// IsNormalString returns true if s == f(s).
|
||||
func (f Form) IsNormalString(s string) bool {
|
||||
src := inputString(s)
|
||||
ft := formTable[f]
|
||||
bp, ok := ft.quickSpan(src, 0, len(s), true)
|
||||
if ok {
|
||||
return true
|
||||
}
|
||||
rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
|
||||
rb.setFlusher(nil, func(rb *reorderBuffer) bool {
|
||||
for i := 0; i < rb.nrune; i++ {
|
||||
info := rb.rune[i]
|
||||
if bp+int(info.size) > len(s) {
|
||||
return false
|
||||
}
|
||||
p := info.pos
|
||||
pe := p + info.size
|
||||
for ; p < pe; p++ {
|
||||
if s[bp] != rb.byte[p] {
|
||||
return false
|
||||
}
|
||||
bp++
|
||||
}
|
||||
}
|
||||
return true
|
||||
})
|
||||
for bp < len(s) {
|
||||
if bp = decomposeSegment(&rb, bp, true); bp < 0 {
|
||||
return false
|
||||
}
|
||||
bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// patchTail fixes a case where a rune may be incorrectly normalized
|
||||
// if it is followed by illegal continuation bytes. It returns the
|
||||
// patched buffer and whether the decomposition is still in progress.
|
||||
func patchTail(rb *reorderBuffer) bool {
|
||||
info, p := lastRuneStart(&rb.f, rb.out)
|
||||
if p == -1 || info.size == 0 {
|
||||
return true
|
||||
}
|
||||
end := p + int(info.size)
|
||||
extra := len(rb.out) - end
|
||||
if extra > 0 {
|
||||
// Potentially allocating memory. However, this only
|
||||
// happens with ill-formed UTF-8.
|
||||
x := make([]byte, 0)
|
||||
x = append(x, rb.out[len(rb.out)-extra:]...)
|
||||
rb.out = rb.out[:end]
|
||||
decomposeToLastBoundary(rb)
|
||||
rb.doFlush()
|
||||
rb.out = append(rb.out, x...)
|
||||
return false
|
||||
}
|
||||
buf := rb.out[p:]
|
||||
rb.out = rb.out[:p]
|
||||
decomposeToLastBoundary(rb)
|
||||
if s := rb.ss.next(info); s == ssStarter {
|
||||
rb.doFlush()
|
||||
rb.ss.first(info)
|
||||
} else if s == ssOverflow {
|
||||
rb.doFlush()
|
||||
rb.insertCGJ()
|
||||
rb.ss = 0
|
||||
}
|
||||
rb.insertUnsafe(inputBytes(buf), 0, info)
|
||||
return true
|
||||
}
|
||||
|
||||
func appendQuick(rb *reorderBuffer, i int) int {
|
||||
if rb.nsrc == i {
|
||||
return i
|
||||
}
|
||||
end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
|
||||
rb.out = rb.src.appendSlice(rb.out, i, end)
|
||||
return end
|
||||
}
|
||||
|
||||
// Append returns f(append(out, b...)).
|
||||
// The buffer out must be nil, empty, or equal to f(out).
|
||||
func (f Form) Append(out []byte, src ...byte) []byte {
|
||||
return f.doAppend(out, inputBytes(src), len(src))
|
||||
}
|
||||
|
||||
func (f Form) doAppend(out []byte, src input, n int) []byte {
|
||||
if n == 0 {
|
||||
return out
|
||||
}
|
||||
ft := formTable[f]
|
||||
// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
|
||||
if len(out) == 0 {
|
||||
p, _ := ft.quickSpan(src, 0, n, true)
|
||||
out = src.appendSlice(out, 0, p)
|
||||
if p == n {
|
||||
return out
|
||||
}
|
||||
rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
|
||||
return doAppendInner(&rb, p)
|
||||
}
|
||||
rb := reorderBuffer{f: *ft, src: src, nsrc: n}
|
||||
return doAppend(&rb, out, 0)
|
||||
}
|
||||
|
||||
func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
|
||||
rb.setFlusher(out, appendFlush)
|
||||
src, n := rb.src, rb.nsrc
|
||||
doMerge := len(out) > 0
|
||||
if q := src.skipContinuationBytes(p); q > p {
|
||||
// Move leading non-starters to destination.
|
||||
rb.out = src.appendSlice(rb.out, p, q)
|
||||
p = q
|
||||
doMerge = patchTail(rb)
|
||||
}
|
||||
fd := &rb.f
|
||||
if doMerge {
|
||||
var info Properties
|
||||
if p < n {
|
||||
info = fd.info(src, p)
|
||||
if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
|
||||
if p == 0 {
|
||||
decomposeToLastBoundary(rb)
|
||||
}
|
||||
p = decomposeSegment(rb, p, true)
|
||||
}
|
||||
}
|
||||
if info.size == 0 {
|
||||
rb.doFlush()
|
||||
// Append incomplete UTF-8 encoding.
|
||||
return src.appendSlice(rb.out, p, n)
|
||||
}
|
||||
if rb.nrune > 0 {
|
||||
return doAppendInner(rb, p)
|
||||
}
|
||||
}
|
||||
p = appendQuick(rb, p)
|
||||
return doAppendInner(rb, p)
|
||||
}
|
||||
|
||||
func doAppendInner(rb *reorderBuffer, p int) []byte {
|
||||
for n := rb.nsrc; p < n; {
|
||||
p = decomposeSegment(rb, p, true)
|
||||
p = appendQuick(rb, p)
|
||||
}
|
||||
return rb.out
|
||||
}
|
||||
|
||||
// AppendString returns f(append(out, []byte(s))).
|
||||
// The buffer out must be nil, empty, or equal to f(out).
|
||||
func (f Form) AppendString(out []byte, src string) []byte {
|
||||
return f.doAppend(out, inputString(src), len(src))
|
||||
}
|
||||
|
||||
// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
|
||||
// It is not guaranteed to return the largest such n.
|
||||
func (f Form) QuickSpan(b []byte) int {
|
||||
n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
|
||||
return n
|
||||
}
|
||||
|
||||
// quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
|
||||
// whether any non-normalized parts were found. If atEOF is false, n will
|
||||
// not point past the last segment if this segment might be become
|
||||
// non-normalized by appending other runes.
|
||||
func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
|
||||
var lastCC uint8
|
||||
ss := streamSafe(0)
|
||||
lastSegStart := i
|
||||
for n = end; i < n; {
|
||||
if j := src.skipASCII(i, n); i != j {
|
||||
i = j
|
||||
lastSegStart = i - 1
|
||||
lastCC = 0
|
||||
ss = 0
|
||||
continue
|
||||
}
|
||||
info := f.info(src, i)
|
||||
if info.size == 0 {
|
||||
if atEOF {
|
||||
// include incomplete runes
|
||||
return n, true
|
||||
}
|
||||
return lastSegStart, true
|
||||
}
|
||||
// This block needs to be before the next, because it is possible to
|
||||
// have an overflow for runes that are starters (e.g. with U+FF9E).
|
||||
switch ss.next(info) {
|
||||
case ssStarter:
|
||||
ss.first(info)
|
||||
lastSegStart = i
|
||||
case ssOverflow:
|
||||
return lastSegStart, false
|
||||
case ssSuccess:
|
||||
if lastCC > info.ccc {
|
||||
return lastSegStart, false
|
||||
}
|
||||
}
|
||||
if f.composing {
|
||||
if !info.isYesC() {
|
||||
break
|
||||
}
|
||||
} else {
|
||||
if !info.isYesD() {
|
||||
break
|
||||
}
|
||||
}
|
||||
lastCC = info.ccc
|
||||
i += int(info.size)
|
||||
}
|
||||
if i == n {
|
||||
if !atEOF {
|
||||
n = lastSegStart
|
||||
}
|
||||
return n, true
|
||||
}
|
||||
return lastSegStart, false
|
||||
}
|
||||
|
||||
// QuickSpanString returns a boundary n such that b[0:n] == f(s[0:n]).
|
||||
// It is not guaranteed to return the largest such n.
|
||||
func (f Form) QuickSpanString(s string) int {
|
||||
n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
|
||||
return n
|
||||
}
|
||||
|
||||
// FirstBoundary returns the position i of the first boundary in b
|
||||
// or -1 if b contains no boundary.
|
||||
func (f Form) FirstBoundary(b []byte) int {
|
||||
return f.firstBoundary(inputBytes(b), len(b))
|
||||
}
|
||||
|
||||
func (f Form) firstBoundary(src input, nsrc int) int {
|
||||
i := src.skipContinuationBytes(0)
|
||||
if i >= nsrc {
|
||||
return -1
|
||||
}
|
||||
fd := formTable[f]
|
||||
ss := streamSafe(0)
|
||||
// We should call ss.first here, but we can't as the first rune is
|
||||
// skipped already. This means FirstBoundary can't really determine
|
||||
// CGJ insertion points correctly. Luckily it doesn't have to.
|
||||
for {
|
||||
info := fd.info(src, i)
|
||||
if info.size == 0 {
|
||||
return -1
|
||||
}
|
||||
if s := ss.next(info); s != ssSuccess {
|
||||
return i
|
||||
}
|
||||
i += int(info.size)
|
||||
if i >= nsrc {
|
||||
if !info.BoundaryAfter() && !ss.isMax() {
|
||||
return -1
|
||||
}
|
||||
return nsrc
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FirstBoundaryInString returns the position i of the first boundary in s
|
||||
// or -1 if s contains no boundary.
|
||||
func (f Form) FirstBoundaryInString(s string) int {
|
||||
return f.firstBoundary(inputString(s), len(s))
|
||||
}
|
||||
|
||||
// NextBoundary reports the index of the boundary between the first and next
|
||||
// segment in b or -1 if atEOF is false and there are not enough bytes to
|
||||
// determine this boundary.
|
||||
func (f Form) NextBoundary(b []byte, atEOF bool) int {
|
||||
return f.nextBoundary(inputBytes(b), len(b), atEOF)
|
||||
}
|
||||
|
||||
// NextBoundaryInString reports the index of the boundary between the first and
|
||||
// next segment in b or -1 if atEOF is false and there are not enough bytes to
|
||||
// determine this boundary.
|
||||
func (f Form) NextBoundaryInString(s string, atEOF bool) int {
|
||||
return f.nextBoundary(inputString(s), len(s), atEOF)
|
||||
}
|
||||
|
||||
func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
|
||||
if nsrc == 0 {
|
||||
if atEOF {
|
||||
return 0
|
||||
}
|
||||
return -1
|
||||
}
|
||||
fd := formTable[f]
|
||||
info := fd.info(src, 0)
|
||||
if info.size == 0 {
|
||||
if atEOF {
|
||||
return 1
|
||||
}
|
||||
return -1
|
||||
}
|
||||
ss := streamSafe(0)
|
||||
ss.first(info)
|
||||
|
||||
for i := int(info.size); i < nsrc; i += int(info.size) {
|
||||
info = fd.info(src, i)
|
||||
if info.size == 0 {
|
||||
if atEOF {
|
||||
return i
|
||||
}
|
||||
return -1
|
||||
}
|
||||
if s := ss.next(info); s != ssSuccess {
|
||||
return i
|
||||
}
|
||||
}
|
||||
if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
|
||||
return -1
|
||||
}
|
||||
return nsrc
|
||||
}
|
||||
|
||||
// LastBoundary returns the position i of the last boundary in b
|
||||
// or -1 if b contains no boundary.
|
||||
func (f Form) LastBoundary(b []byte) int {
|
||||
return lastBoundary(formTable[f], b)
|
||||
}
|
||||
|
||||
func lastBoundary(fd *formInfo, b []byte) int {
|
||||
i := len(b)
|
||||
info, p := lastRuneStart(fd, b)
|
||||
if p == -1 {
|
||||
return -1
|
||||
}
|
||||
if info.size == 0 { // ends with incomplete rune
|
||||
if p == 0 { // starts with incomplete rune
|
||||
return -1
|
||||
}
|
||||
i = p
|
||||
info, p = lastRuneStart(fd, b[:i])
|
||||
if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
|
||||
return i
|
||||
}
|
||||
}
|
||||
if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
|
||||
return i
|
||||
}
|
||||
if info.BoundaryAfter() {
|
||||
return i
|
||||
}
|
||||
ss := streamSafe(0)
|
||||
v := ss.backwards(info)
|
||||
for i = p; i >= 0 && v != ssStarter; i = p {
|
||||
info, p = lastRuneStart(fd, b[:i])
|
||||
if v = ss.backwards(info); v == ssOverflow {
|
||||
break
|
||||
}
|
||||
if p+int(info.size) != i {
|
||||
if p == -1 { // no boundary found
|
||||
return -1
|
||||
}
|
||||
return i // boundary after an illegal UTF-8 encoding
|
||||
}
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
// decomposeSegment scans the first segment in src into rb. It inserts 0x034f
|
||||
// (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
|
||||
// and returns the number of bytes consumed from src or iShortDst or iShortSrc.
|
||||
func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
|
||||
// Force one character to be consumed.
|
||||
info := rb.f.info(rb.src, sp)
|
||||
if info.size == 0 {
|
||||
return 0
|
||||
}
|
||||
if rb.nrune > 0 {
|
||||
if s := rb.ss.next(info); s == ssStarter {
|
||||
goto end
|
||||
} else if s == ssOverflow {
|
||||
rb.insertCGJ()
|
||||
goto end
|
||||
}
|
||||
} else {
|
||||
rb.ss.first(info)
|
||||
}
|
||||
if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
|
||||
return int(err)
|
||||
}
|
||||
for {
|
||||
sp += int(info.size)
|
||||
if sp >= rb.nsrc {
|
||||
if !atEOF && !info.BoundaryAfter() {
|
||||
return int(iShortSrc)
|
||||
}
|
||||
break
|
||||
}
|
||||
info = rb.f.info(rb.src, sp)
|
||||
if info.size == 0 {
|
||||
if !atEOF {
|
||||
return int(iShortSrc)
|
||||
}
|
||||
break
|
||||
}
|
||||
if s := rb.ss.next(info); s == ssStarter {
|
||||
break
|
||||
} else if s == ssOverflow {
|
||||
rb.insertCGJ()
|
||||
break
|
||||
}
|
||||
if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
|
||||
return int(err)
|
||||
}
|
||||
}
|
||||
end:
|
||||
if !rb.doFlush() {
|
||||
return int(iShortDst)
|
||||
}
|
||||
return sp
|
||||
}
|
||||
|
||||
// lastRuneStart returns the runeInfo and position of the last
|
||||
// rune in buf or the zero runeInfo and -1 if no rune was found.
|
||||
func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
|
||||
p := len(buf) - 1
|
||||
for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
|
||||
}
|
||||
if p < 0 {
|
||||
return Properties{}, -1
|
||||
}
|
||||
return fd.info(inputBytes(buf), p), p
|
||||
}
|
||||
|
||||
// decomposeToLastBoundary finds an open segment at the end of the buffer
|
||||
// and scans it into rb. Returns the buffer minus the last segment.
|
||||
func decomposeToLastBoundary(rb *reorderBuffer) {
|
||||
fd := &rb.f
|
||||
info, i := lastRuneStart(fd, rb.out)
|
||||
if int(info.size) != len(rb.out)-i {
|
||||
// illegal trailing continuation bytes
|
||||
return
|
||||
}
|
||||
if info.BoundaryAfter() {
|
||||
return
|
||||
}
|
||||
var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
|
||||
padd := 0
|
||||
ss := streamSafe(0)
|
||||
p := len(rb.out)
|
||||
for {
|
||||
add[padd] = info
|
||||
v := ss.backwards(info)
|
||||
if v == ssOverflow {
|
||||
// Note that if we have an overflow, it the string we are appending to
|
||||
// is not correctly normalized. In this case the behavior is undefined.
|
||||
break
|
||||
}
|
||||
padd++
|
||||
p -= int(info.size)
|
||||
if v == ssStarter || p < 0 {
|
||||
break
|
||||
}
|
||||
info, i = lastRuneStart(fd, rb.out[:p])
|
||||
if int(info.size) != p-i {
|
||||
break
|
||||
}
|
||||
}
|
||||
rb.ss = ss
|
||||
// Copy bytes for insertion as we may need to overwrite rb.out.
|
||||
var buf [maxBufferSize * utf8.UTFMax]byte
|
||||
cp := buf[:copy(buf[:], rb.out[p:])]
|
||||
rb.out = rb.out[:p]
|
||||
for padd--; padd >= 0; padd-- {
|
||||
info = add[padd]
|
||||
rb.insertUnsafe(inputBytes(cp), 0, info)
|
||||
cp = cp[info.size:]
|
||||
}
|
||||
}
|
||||
126
vendor/golang.org/x/text/unicode/norm/readwriter.go
generated
vendored
Normal file
126
vendor/golang.org/x/text/unicode/norm/readwriter.go
generated
vendored
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
import "io"
|
||||
|
||||
type normWriter struct {
|
||||
rb reorderBuffer
|
||||
w io.Writer
|
||||
buf []byte
|
||||
}
|
||||
|
||||
// Write implements the standard write interface. If the last characters are
|
||||
// not at a normalization boundary, the bytes will be buffered for the next
|
||||
// write. The remaining bytes will be written on close.
|
||||
func (w *normWriter) Write(data []byte) (n int, err error) {
|
||||
// Process data in pieces to keep w.buf size bounded.
|
||||
const chunk = 4000
|
||||
|
||||
for len(data) > 0 {
|
||||
// Normalize into w.buf.
|
||||
m := len(data)
|
||||
if m > chunk {
|
||||
m = chunk
|
||||
}
|
||||
w.rb.src = inputBytes(data[:m])
|
||||
w.rb.nsrc = m
|
||||
w.buf = doAppend(&w.rb, w.buf, 0)
|
||||
data = data[m:]
|
||||
n += m
|
||||
|
||||
// Write out complete prefix, save remainder.
|
||||
// Note that lastBoundary looks back at most 31 runes.
|
||||
i := lastBoundary(&w.rb.f, w.buf)
|
||||
if i == -1 {
|
||||
i = 0
|
||||
}
|
||||
if i > 0 {
|
||||
if _, err = w.w.Write(w.buf[:i]); err != nil {
|
||||
break
|
||||
}
|
||||
bn := copy(w.buf, w.buf[i:])
|
||||
w.buf = w.buf[:bn]
|
||||
}
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
|
||||
// Close forces data that remains in the buffer to be written.
|
||||
func (w *normWriter) Close() error {
|
||||
if len(w.buf) > 0 {
|
||||
_, err := w.w.Write(w.buf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Writer returns a new writer that implements Write(b)
|
||||
// by writing f(b) to w. The returned writer may use an
|
||||
// an internal buffer to maintain state across Write calls.
|
||||
// Calling its Close method writes any buffered data to w.
|
||||
func (f Form) Writer(w io.Writer) io.WriteCloser {
|
||||
wr := &normWriter{rb: reorderBuffer{}, w: w}
|
||||
wr.rb.init(f, nil)
|
||||
return wr
|
||||
}
|
||||
|
||||
type normReader struct {
|
||||
rb reorderBuffer
|
||||
r io.Reader
|
||||
inbuf []byte
|
||||
outbuf []byte
|
||||
bufStart int
|
||||
lastBoundary int
|
||||
err error
|
||||
}
|
||||
|
||||
// Read implements the standard read interface.
|
||||
func (r *normReader) Read(p []byte) (int, error) {
|
||||
for {
|
||||
if r.lastBoundary-r.bufStart > 0 {
|
||||
n := copy(p, r.outbuf[r.bufStart:r.lastBoundary])
|
||||
r.bufStart += n
|
||||
if r.lastBoundary-r.bufStart > 0 {
|
||||
return n, nil
|
||||
}
|
||||
return n, r.err
|
||||
}
|
||||
if r.err != nil {
|
||||
return 0, r.err
|
||||
}
|
||||
outn := copy(r.outbuf, r.outbuf[r.lastBoundary:])
|
||||
r.outbuf = r.outbuf[0:outn]
|
||||
r.bufStart = 0
|
||||
|
||||
n, err := r.r.Read(r.inbuf)
|
||||
r.rb.src = inputBytes(r.inbuf[0:n])
|
||||
r.rb.nsrc, r.err = n, err
|
||||
if n > 0 {
|
||||
r.outbuf = doAppend(&r.rb, r.outbuf, 0)
|
||||
}
|
||||
if err == io.EOF {
|
||||
r.lastBoundary = len(r.outbuf)
|
||||
} else {
|
||||
r.lastBoundary = lastBoundary(&r.rb.f, r.outbuf)
|
||||
if r.lastBoundary == -1 {
|
||||
r.lastBoundary = 0
|
||||
}
|
||||
}
|
||||
}
|
||||
panic("should not reach here")
|
||||
}
|
||||
|
||||
// Reader returns a new reader that implements Read
|
||||
// by reading data from r and returning f(data).
|
||||
func (f Form) Reader(r io.Reader) io.Reader {
|
||||
const chunk = 4000
|
||||
buf := make([]byte, chunk)
|
||||
rr := &normReader{rb: reorderBuffer{}, r: r, inbuf: buf}
|
||||
rr.rb.init(f, buf)
|
||||
return rr
|
||||
}
|
||||
7627
vendor/golang.org/x/text/unicode/norm/tables.go
generated
vendored
Normal file
7627
vendor/golang.org/x/text/unicode/norm/tables.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
88
vendor/golang.org/x/text/unicode/norm/transform.go
generated
vendored
Normal file
88
vendor/golang.org/x/text/unicode/norm/transform.go
generated
vendored
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// Reset implements the Reset method of the transform.Transformer interface.
|
||||
func (Form) Reset() {}
|
||||
|
||||
// Transform implements the Transform method of the transform.Transformer
|
||||
// interface. It may need to write segments of up to MaxSegmentSize at once.
|
||||
// Users should either catch ErrShortDst and allow dst to grow or have dst be at
|
||||
// least of size MaxTransformChunkSize to be guaranteed of progress.
|
||||
func (f Form) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
n := 0
|
||||
// Cap the maximum number of src bytes to check.
|
||||
b := src
|
||||
eof := atEOF
|
||||
if ns := len(dst); ns < len(b) {
|
||||
err = transform.ErrShortDst
|
||||
eof = false
|
||||
b = b[:ns]
|
||||
}
|
||||
i, ok := formTable[f].quickSpan(inputBytes(b), n, len(b), eof)
|
||||
n += copy(dst[n:], b[n:i])
|
||||
if !ok {
|
||||
nDst, nSrc, err = f.transform(dst[n:], src[n:], atEOF)
|
||||
return nDst + n, nSrc + n, err
|
||||
}
|
||||
if n < len(src) && !atEOF {
|
||||
err = transform.ErrShortSrc
|
||||
}
|
||||
return n, n, err
|
||||
}
|
||||
|
||||
func flushTransform(rb *reorderBuffer) bool {
|
||||
// Write out (must fully fit in dst, or else it is a ErrShortDst).
|
||||
if len(rb.out) < rb.nrune*utf8.UTFMax {
|
||||
return false
|
||||
}
|
||||
rb.out = rb.out[rb.flushCopy(rb.out):]
|
||||
return true
|
||||
}
|
||||
|
||||
var errs = []error{nil, transform.ErrShortDst, transform.ErrShortSrc}
|
||||
|
||||
// transform implements the transform.Transformer interface. It is only called
|
||||
// when quickSpan does not pass for a given string.
|
||||
func (f Form) transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
// TODO: get rid of reorderBuffer. See CL 23460044.
|
||||
rb := reorderBuffer{}
|
||||
rb.init(f, src)
|
||||
for {
|
||||
// Load segment into reorder buffer.
|
||||
rb.setFlusher(dst[nDst:], flushTransform)
|
||||
end := decomposeSegment(&rb, nSrc, atEOF)
|
||||
if end < 0 {
|
||||
return nDst, nSrc, errs[-end]
|
||||
}
|
||||
nDst = len(dst) - len(rb.out)
|
||||
nSrc = end
|
||||
|
||||
// Next quickSpan.
|
||||
end = rb.nsrc
|
||||
eof := atEOF
|
||||
if n := nSrc + len(dst) - nDst; n < end {
|
||||
err = transform.ErrShortDst
|
||||
end = n
|
||||
eof = false
|
||||
}
|
||||
end, ok := rb.f.quickSpan(rb.src, nSrc, end, eof)
|
||||
n := copy(dst[nDst:], rb.src.bytes[nSrc:end])
|
||||
nSrc += n
|
||||
nDst += n
|
||||
if ok {
|
||||
if n < rb.nsrc && !atEOF {
|
||||
err = transform.ErrShortSrc
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
}
|
||||
}
|
||||
54
vendor/golang.org/x/text/unicode/norm/trie.go
generated
vendored
Normal file
54
vendor/golang.org/x/text/unicode/norm/trie.go
generated
vendored
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm
|
||||
|
||||
type valueRange struct {
|
||||
value uint16 // header: value:stride
|
||||
lo, hi byte // header: lo:n
|
||||
}
|
||||
|
||||
type sparseBlocks struct {
|
||||
values []valueRange
|
||||
offset []uint16
|
||||
}
|
||||
|
||||
var nfcSparse = sparseBlocks{
|
||||
values: nfcSparseValues[:],
|
||||
offset: nfcSparseOffset[:],
|
||||
}
|
||||
|
||||
var nfkcSparse = sparseBlocks{
|
||||
values: nfkcSparseValues[:],
|
||||
offset: nfkcSparseOffset[:],
|
||||
}
|
||||
|
||||
var (
|
||||
nfcData = newNfcTrie(0)
|
||||
nfkcData = newNfkcTrie(0)
|
||||
)
|
||||
|
||||
// lookupValue determines the type of block n and looks up the value for b.
|
||||
// For n < t.cutoff, the block is a simple lookup table. Otherwise, the block
|
||||
// is a list of ranges with an accompanying value. Given a matching range r,
|
||||
// the value for b is by r.value + (b - r.lo) * stride.
|
||||
func (t *sparseBlocks) lookup(n uint32, b byte) uint16 {
|
||||
offset := t.offset[n]
|
||||
header := t.values[offset]
|
||||
lo := offset + 1
|
||||
hi := lo + uint16(header.lo)
|
||||
for lo < hi {
|
||||
m := lo + (hi-lo)/2
|
||||
r := t.values[m]
|
||||
if r.lo <= b && b <= r.hi {
|
||||
return r.value + uint16(b-r.lo)*header.value
|
||||
}
|
||||
if b < r.lo {
|
||||
hi = m
|
||||
} else {
|
||||
lo = m + 1
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
117
vendor/golang.org/x/text/unicode/norm/triegen.go
generated
vendored
Normal file
117
vendor/golang.org/x/text/unicode/norm/triegen.go
generated
vendored
Normal file
|
|
@ -0,0 +1,117 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
// Trie table generator.
|
||||
// Used by make*tables tools to generate a go file with trie data structures
|
||||
// for mapping UTF-8 to a 16-bit value. All but the last byte in a UTF-8 byte
|
||||
// sequence are used to lookup offsets in the index table to be used for the
|
||||
// next byte. The last byte is used to index into a table with 16-bit values.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
)
|
||||
|
||||
const maxSparseEntries = 16
|
||||
|
||||
type normCompacter struct {
|
||||
sparseBlocks [][]uint64
|
||||
sparseOffset []uint16
|
||||
sparseCount int
|
||||
name string
|
||||
}
|
||||
|
||||
func mostFrequentStride(a []uint64) int {
|
||||
counts := make(map[int]int)
|
||||
var v int
|
||||
for _, x := range a {
|
||||
if stride := int(x) - v; v != 0 && stride >= 0 {
|
||||
counts[stride]++
|
||||
}
|
||||
v = int(x)
|
||||
}
|
||||
var maxs, maxc int
|
||||
for stride, cnt := range counts {
|
||||
if cnt > maxc || (cnt == maxc && stride < maxs) {
|
||||
maxs, maxc = stride, cnt
|
||||
}
|
||||
}
|
||||
return maxs
|
||||
}
|
||||
|
||||
func countSparseEntries(a []uint64) int {
|
||||
stride := mostFrequentStride(a)
|
||||
var v, count int
|
||||
for _, tv := range a {
|
||||
if int(tv)-v != stride {
|
||||
if tv != 0 {
|
||||
count++
|
||||
}
|
||||
}
|
||||
v = int(tv)
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
func (c *normCompacter) Size(v []uint64) (sz int, ok bool) {
|
||||
if n := countSparseEntries(v); n <= maxSparseEntries {
|
||||
return (n+1)*4 + 2, true
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func (c *normCompacter) Store(v []uint64) uint32 {
|
||||
h := uint32(len(c.sparseOffset))
|
||||
c.sparseBlocks = append(c.sparseBlocks, v)
|
||||
c.sparseOffset = append(c.sparseOffset, uint16(c.sparseCount))
|
||||
c.sparseCount += countSparseEntries(v) + 1
|
||||
return h
|
||||
}
|
||||
|
||||
func (c *normCompacter) Handler() string {
|
||||
return c.name + "Sparse.lookup"
|
||||
}
|
||||
|
||||
func (c *normCompacter) Print(w io.Writer) (retErr error) {
|
||||
p := func(f string, x ...interface{}) {
|
||||
if _, err := fmt.Fprintf(w, f, x...); retErr == nil && err != nil {
|
||||
retErr = err
|
||||
}
|
||||
}
|
||||
|
||||
ls := len(c.sparseBlocks)
|
||||
p("// %sSparseOffset: %d entries, %d bytes\n", c.name, ls, ls*2)
|
||||
p("var %sSparseOffset = %#v\n\n", c.name, c.sparseOffset)
|
||||
|
||||
ns := c.sparseCount
|
||||
p("// %sSparseValues: %d entries, %d bytes\n", c.name, ns, ns*4)
|
||||
p("var %sSparseValues = [%d]valueRange {", c.name, ns)
|
||||
for i, b := range c.sparseBlocks {
|
||||
p("\n// Block %#x, offset %#x", i, c.sparseOffset[i])
|
||||
var v int
|
||||
stride := mostFrequentStride(b)
|
||||
n := countSparseEntries(b)
|
||||
p("\n{value:%#04x,lo:%#02x},", stride, uint8(n))
|
||||
for i, nv := range b {
|
||||
if int(nv)-v != stride {
|
||||
if v != 0 {
|
||||
p(",hi:%#02x},", 0x80+i-1)
|
||||
}
|
||||
if nv != 0 {
|
||||
p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
|
||||
}
|
||||
}
|
||||
v = int(nv)
|
||||
}
|
||||
if v != 0 {
|
||||
p(",hi:%#02x},", 0x80+len(b)-1)
|
||||
}
|
||||
}
|
||||
p("\n}\n\n")
|
||||
return
|
||||
}
|
||||
115
vendor/golang.org/x/text/width/gen.go
generated
vendored
Normal file
115
vendor/golang.org/x/text/width/gen.go
generated
vendored
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
// This program generates the trie for width operations. The generated table
|
||||
// includes width category information as well as the normalization mappings.
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"math"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/internal/gen"
|
||||
"golang.org/x/text/internal/triegen"
|
||||
)
|
||||
|
||||
// See gen_common.go for flags.
|
||||
|
||||
func main() {
|
||||
gen.Init()
|
||||
genTables()
|
||||
genTests()
|
||||
gen.Repackage("gen_trieval.go", "trieval.go", "width")
|
||||
gen.Repackage("gen_common.go", "common_test.go", "width")
|
||||
}
|
||||
|
||||
func genTables() {
|
||||
t := triegen.NewTrie("width")
|
||||
// fold and inverse mappings. See mapComment for a description of the format
|
||||
// of each entry. Add dummy value to make an index of 0 mean no mapping.
|
||||
inverse := [][4]byte{{}}
|
||||
mapping := map[[4]byte]int{[4]byte{}: 0}
|
||||
|
||||
getWidthData(func(r rune, tag elem, alt rune) {
|
||||
idx := 0
|
||||
if alt != 0 {
|
||||
var buf [4]byte
|
||||
buf[0] = byte(utf8.EncodeRune(buf[1:], alt))
|
||||
s := string(r)
|
||||
buf[buf[0]] ^= s[len(s)-1]
|
||||
var ok bool
|
||||
if idx, ok = mapping[buf]; !ok {
|
||||
idx = len(mapping)
|
||||
if idx > math.MaxUint8 {
|
||||
log.Fatalf("Index %d does not fit in a byte.", idx)
|
||||
}
|
||||
mapping[buf] = idx
|
||||
inverse = append(inverse, buf)
|
||||
}
|
||||
}
|
||||
t.Insert(r, uint64(tag|elem(idx)))
|
||||
})
|
||||
|
||||
w := &bytes.Buffer{}
|
||||
gen.WriteUnicodeVersion(w)
|
||||
|
||||
sz, err := t.Gen(w)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
sz += writeMappings(w, inverse)
|
||||
|
||||
fmt.Fprintf(w, "// Total table size %d bytes (%dKiB)\n", sz, sz/1024)
|
||||
|
||||
gen.WriteGoFile(*outputFile, "width", w.Bytes())
|
||||
}
|
||||
|
||||
const inverseDataComment = `
|
||||
// inverseData contains 4-byte entries of the following format:
|
||||
// <length> <modified UTF-8-encoded rune> <0 padding>
|
||||
// The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the
|
||||
// UTF-8 encoding of the original rune. Mappings often have the following
|
||||
// pattern:
|
||||
// A -> A (U+FF21 -> U+0041)
|
||||
// B -> B (U+FF22 -> U+0042)
|
||||
// ...
|
||||
// By xor-ing the last byte the same entry can be shared by many mappings. This
|
||||
// reduces the total number of distinct entries by about two thirds.
|
||||
// The resulting entry for the aforementioned mappings is
|
||||
// { 0x01, 0xE0, 0x00, 0x00 }
|
||||
// Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get
|
||||
// E0 ^ A1 = 41.
|
||||
// Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get
|
||||
// E0 ^ A2 = 42.
|
||||
// Note that because of the xor-ing, the byte sequence stored in the entry is
|
||||
// not valid UTF-8.`
|
||||
|
||||
func writeMappings(w io.Writer, data [][4]byte) int {
|
||||
fmt.Fprintln(w, inverseDataComment)
|
||||
fmt.Fprintf(w, "var inverseData = [%d][4]byte{\n", len(data))
|
||||
for _, x := range data {
|
||||
fmt.Fprintf(w, "{ 0x%02x, 0x%02x, 0x%02x, 0x%02x },\n", x[0], x[1], x[2], x[3])
|
||||
}
|
||||
fmt.Fprintln(w, "}")
|
||||
return len(data) * 4
|
||||
}
|
||||
|
||||
func genTests() {
|
||||
w := &bytes.Buffer{}
|
||||
fmt.Fprintf(w, "\nvar mapRunes = map[rune]struct{r rune; e elem}{\n")
|
||||
getWidthData(func(r rune, tag elem, alt rune) {
|
||||
if alt != 0 {
|
||||
fmt.Fprintf(w, "\t0x%X: {0x%X, 0x%X},\n", r, alt, tag)
|
||||
}
|
||||
})
|
||||
fmt.Fprintln(w, "}")
|
||||
gen.WriteGoFile("runes_test.go", "width", w.Bytes())
|
||||
}
|
||||
96
vendor/golang.org/x/text/width/gen_common.go
generated
vendored
Normal file
96
vendor/golang.org/x/text/width/gen_common.go
generated
vendored
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// This code is shared between the main code generator and the test code.
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"log"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/text/internal/gen"
|
||||
"golang.org/x/text/internal/ucd"
|
||||
)
|
||||
|
||||
var (
|
||||
outputFile = flag.String("out", "tables.go", "output file")
|
||||
)
|
||||
|
||||
var typeMap = map[string]elem{
|
||||
"A": tagAmbiguous,
|
||||
"N": tagNeutral,
|
||||
"Na": tagNarrow,
|
||||
"W": tagWide,
|
||||
"F": tagFullwidth,
|
||||
"H": tagHalfwidth,
|
||||
}
|
||||
|
||||
// getWidthData calls f for every entry for which it is defined.
|
||||
//
|
||||
// f may be called multiple times for the same rune. The last call to f is the
|
||||
// correct value. f is not called for all runes. The default tag type is
|
||||
// Neutral.
|
||||
func getWidthData(f func(r rune, tag elem, alt rune)) {
|
||||
// Set the default values for Unified Ideographs. In line with Annex 11,
|
||||
// we encode full ranges instead of the defined runes in Unified_Ideograph.
|
||||
for _, b := range []struct{ lo, hi rune }{
|
||||
{0x4E00, 0x9FFF}, // the CJK Unified Ideographs block,
|
||||
{0x3400, 0x4DBF}, // the CJK Unified Ideographs Externsion A block,
|
||||
{0xF900, 0xFAFF}, // the CJK Compatibility Ideographs block,
|
||||
{0x20000, 0x2FFFF}, // the Supplementary Ideographic Plane,
|
||||
{0x30000, 0x3FFFF}, // the Tertiary Ideographic Plane,
|
||||
} {
|
||||
for r := b.lo; r <= b.hi; r++ {
|
||||
f(r, tagWide, 0)
|
||||
}
|
||||
}
|
||||
|
||||
inverse := map[rune]rune{}
|
||||
maps := map[string]bool{
|
||||
"<wide>": true,
|
||||
"<narrow>": true,
|
||||
}
|
||||
|
||||
// We cannot reuse package norm's decomposition, as we need an unexpanded
|
||||
// decomposition. We make use of the opportunity to verify that the
|
||||
// decomposition type is as expected.
|
||||
ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
|
||||
r := p.Rune(0)
|
||||
s := strings.SplitN(p.String(ucd.DecompMapping), " ", 2)
|
||||
if !maps[s[0]] {
|
||||
return
|
||||
}
|
||||
x, err := strconv.ParseUint(s[1], 16, 32)
|
||||
if err != nil {
|
||||
log.Fatalf("Error parsing rune %q", s[1])
|
||||
}
|
||||
if inverse[r] != 0 || inverse[rune(x)] != 0 {
|
||||
log.Fatalf("Circular dependency in mapping between %U and %U", r, x)
|
||||
}
|
||||
inverse[r] = rune(x)
|
||||
inverse[rune(x)] = r
|
||||
})
|
||||
|
||||
// <rune range>;<type>
|
||||
ucd.Parse(gen.OpenUCDFile("EastAsianWidth.txt"), func(p *ucd.Parser) {
|
||||
tag, ok := typeMap[p.String(1)]
|
||||
if !ok {
|
||||
log.Fatalf("Unknown width type %q", p.String(1))
|
||||
}
|
||||
r := p.Rune(0)
|
||||
alt, ok := inverse[r]
|
||||
if tag == tagFullwidth || tag == tagHalfwidth && r != wonSign {
|
||||
tag |= tagNeedsFold
|
||||
if !ok {
|
||||
log.Fatalf("Narrow or wide rune %U has no decomposition", r)
|
||||
}
|
||||
}
|
||||
f(r, tag, alt)
|
||||
})
|
||||
}
|
||||
34
vendor/golang.org/x/text/width/gen_trieval.go
generated
vendored
Normal file
34
vendor/golang.org/x/text/width/gen_trieval.go
generated
vendored
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// elem is an entry of the width trie. The high byte is used to encode the type
|
||||
// of the rune. The low byte is used to store the index to a mapping entry in
|
||||
// the inverseData array.
|
||||
type elem uint16
|
||||
|
||||
const (
|
||||
tagNeutral elem = iota << typeShift
|
||||
tagAmbiguous
|
||||
tagWide
|
||||
tagNarrow
|
||||
tagFullwidth
|
||||
tagHalfwidth
|
||||
)
|
||||
|
||||
const (
|
||||
numTypeBits = 3
|
||||
typeShift = 16 - numTypeBits
|
||||
|
||||
// tagNeedsFold is true for all fullwidth and halfwidth runes except for
|
||||
// the Won sign U+20A9.
|
||||
tagNeedsFold = 0x1000
|
||||
|
||||
// The Korean Won sign is halfwidth, but SHOULD NOT be mapped to a wide
|
||||
// variant.
|
||||
wonSign rune = 0x20A9
|
||||
)
|
||||
16
vendor/golang.org/x/text/width/kind_string.go
generated
vendored
Normal file
16
vendor/golang.org/x/text/width/kind_string.go
generated
vendored
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
// Code generated by "stringer -type=Kind"; DO NOT EDIT
|
||||
|
||||
package width
|
||||
|
||||
import "fmt"
|
||||
|
||||
const _Kind_name = "NeutralEastAsianAmbiguousEastAsianWideEastAsianNarrowEastAsianFullwidthEastAsianHalfwidth"
|
||||
|
||||
var _Kind_index = [...]uint8{0, 7, 25, 38, 53, 71, 89}
|
||||
|
||||
func (i Kind) String() string {
|
||||
if i < 0 || i >= Kind(len(_Kind_index)-1) {
|
||||
return fmt.Sprintf("Kind(%d)", i)
|
||||
}
|
||||
return _Kind_name[_Kind_index[i]:_Kind_index[i+1]]
|
||||
}
|
||||
1284
vendor/golang.org/x/text/width/tables.go
generated
vendored
Normal file
1284
vendor/golang.org/x/text/width/tables.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
162
vendor/golang.org/x/text/width/transform.go
generated
vendored
Normal file
162
vendor/golang.org/x/text/width/transform.go
generated
vendored
Normal file
|
|
@ -0,0 +1,162 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package width
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
type foldTransform struct {
|
||||
transform.NopResetter
|
||||
}
|
||||
|
||||
func (foldTransform) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
for nSrc < len(src) {
|
||||
if src[nSrc] < utf8.RuneSelf {
|
||||
// ASCII fast path.
|
||||
start, end := nSrc, len(src)
|
||||
if d := len(dst) - nDst; d < end-start {
|
||||
end = nSrc + d
|
||||
}
|
||||
for nSrc++; nSrc < end && src[nSrc] < utf8.RuneSelf; nSrc++ {
|
||||
}
|
||||
n := copy(dst[nDst:], src[start:nSrc])
|
||||
if nDst += n; nDst == len(dst) {
|
||||
nSrc = start + n
|
||||
if nSrc == len(src) {
|
||||
return nDst, nSrc, nil
|
||||
}
|
||||
if src[nSrc] < utf8.RuneSelf {
|
||||
return nDst, nSrc, transform.ErrShortDst
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
v, size := trie.lookup(src[nSrc:])
|
||||
if size == 0 { // incomplete UTF-8 encoding
|
||||
if !atEOF {
|
||||
return nDst, nSrc, transform.ErrShortSrc
|
||||
}
|
||||
size = 1 // gobble 1 byte
|
||||
}
|
||||
if elem(v)&tagNeedsFold == 0 {
|
||||
if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
|
||||
return nDst, nSrc, transform.ErrShortDst
|
||||
}
|
||||
nDst += size
|
||||
} else {
|
||||
data := inverseData[byte(v)]
|
||||
if len(dst)-nDst < int(data[0]) {
|
||||
return nDst, nSrc, transform.ErrShortDst
|
||||
}
|
||||
i := 1
|
||||
for end := int(data[0]); i < end; i++ {
|
||||
dst[nDst] = data[i]
|
||||
nDst++
|
||||
}
|
||||
dst[nDst] = data[i] ^ src[nSrc+size-1]
|
||||
nDst++
|
||||
}
|
||||
nSrc += size
|
||||
}
|
||||
return nDst, nSrc, nil
|
||||
}
|
||||
|
||||
type narrowTransform struct {
|
||||
transform.NopResetter
|
||||
}
|
||||
|
||||
func (narrowTransform) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
for nSrc < len(src) {
|
||||
if src[nSrc] < utf8.RuneSelf {
|
||||
// ASCII fast path.
|
||||
start, end := nSrc, len(src)
|
||||
if d := len(dst) - nDst; d < end-start {
|
||||
end = nSrc + d
|
||||
}
|
||||
for nSrc++; nSrc < end && src[nSrc] < utf8.RuneSelf; nSrc++ {
|
||||
}
|
||||
n := copy(dst[nDst:], src[start:nSrc])
|
||||
if nDst += n; nDst == len(dst) {
|
||||
nSrc = start + n
|
||||
if nSrc == len(src) {
|
||||
return nDst, nSrc, nil
|
||||
}
|
||||
if src[nSrc] < utf8.RuneSelf {
|
||||
return nDst, nSrc, transform.ErrShortDst
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
v, size := trie.lookup(src[nSrc:])
|
||||
if size == 0 { // incomplete UTF-8 encoding
|
||||
if !atEOF {
|
||||
return nDst, nSrc, transform.ErrShortSrc
|
||||
}
|
||||
size = 1 // gobble 1 byte
|
||||
}
|
||||
if k := elem(v).kind(); byte(v) == 0 || k != EastAsianFullwidth && k != EastAsianWide && k != EastAsianAmbiguous {
|
||||
if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
|
||||
return nDst, nSrc, transform.ErrShortDst
|
||||
}
|
||||
nDst += size
|
||||
} else {
|
||||
data := inverseData[byte(v)]
|
||||
if len(dst)-nDst < int(data[0]) {
|
||||
return nDst, nSrc, transform.ErrShortDst
|
||||
}
|
||||
i := 1
|
||||
for end := int(data[0]); i < end; i++ {
|
||||
dst[nDst] = data[i]
|
||||
nDst++
|
||||
}
|
||||
dst[nDst] = data[i] ^ src[nSrc+size-1]
|
||||
nDst++
|
||||
}
|
||||
nSrc += size
|
||||
}
|
||||
return nDst, nSrc, nil
|
||||
}
|
||||
|
||||
type wideTransform struct {
|
||||
transform.NopResetter
|
||||
}
|
||||
|
||||
func (wideTransform) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
for nSrc < len(src) {
|
||||
// TODO: Consider ASCII fast path. Special-casing ASCII handling can
|
||||
// reduce the ns/op of BenchmarkWideASCII by about 30%. This is probably
|
||||
// not enough to warrant the extra code and complexity.
|
||||
v, size := trie.lookup(src[nSrc:])
|
||||
if size == 0 { // incomplete UTF-8 encoding
|
||||
if !atEOF {
|
||||
return nDst, nSrc, transform.ErrShortSrc
|
||||
}
|
||||
size = 1 // gobble 1 byte
|
||||
}
|
||||
if k := elem(v).kind(); byte(v) == 0 || k != EastAsianHalfwidth && k != EastAsianNarrow {
|
||||
if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
|
||||
return nDst, nSrc, transform.ErrShortDst
|
||||
}
|
||||
nDst += size
|
||||
} else {
|
||||
data := inverseData[byte(v)]
|
||||
if len(dst)-nDst < int(data[0]) {
|
||||
return nDst, nSrc, transform.ErrShortDst
|
||||
}
|
||||
i := 1
|
||||
for end := int(data[0]); i < end; i++ {
|
||||
dst[nDst] = data[i]
|
||||
nDst++
|
||||
}
|
||||
dst[nDst] = data[i] ^ src[nSrc+size-1]
|
||||
nDst++
|
||||
}
|
||||
nSrc += size
|
||||
}
|
||||
return nDst, nSrc, nil
|
||||
}
|
||||
30
vendor/golang.org/x/text/width/trieval.go
generated
vendored
Normal file
30
vendor/golang.org/x/text/width/trieval.go
generated
vendored
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
// This file was generated by go generate; DO NOT EDIT
|
||||
|
||||
package width
|
||||
|
||||
// elem is an entry of the width trie. The high byte is used to encode the type
|
||||
// of the rune. The low byte is used to store the index to a mapping entry in
|
||||
// the inverseData array.
|
||||
type elem uint16
|
||||
|
||||
const (
|
||||
tagNeutral elem = iota << typeShift
|
||||
tagAmbiguous
|
||||
tagWide
|
||||
tagNarrow
|
||||
tagFullwidth
|
||||
tagHalfwidth
|
||||
)
|
||||
|
||||
const (
|
||||
numTypeBits = 3
|
||||
typeShift = 16 - numTypeBits
|
||||
|
||||
// tagNeedsFold is true for all fullwidth and halfwidth runes except for
|
||||
// the Won sign U+20A9.
|
||||
tagNeedsFold = 0x1000
|
||||
|
||||
// The Korean Won sign is halfwidth, but SHOULD NOT be mapped to a wide
|
||||
// variant.
|
||||
wonSign rune = 0x20A9
|
||||
)
|
||||
201
vendor/golang.org/x/text/width/width.go
generated
vendored
Normal file
201
vendor/golang.org/x/text/width/width.go
generated
vendored
Normal file
|
|
@ -0,0 +1,201 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate stringer -type=Kind
|
||||
//go:generate go run gen.go gen_common.go gen_trieval.go
|
||||
|
||||
// Package width provides functionality for handling different widths in text.
|
||||
//
|
||||
// Wide characters behave like ideographs; they tend to allow line breaks after
|
||||
// each character and remain upright in vertical text layout. Narrow characters
|
||||
// are kept together in words or runs that are rotated sideways in vertical text
|
||||
// layout.
|
||||
//
|
||||
// For more information, see http://unicode.org/reports/tr11/.
|
||||
package width
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// TODO
|
||||
// 1) Reduce table size by compressing blocks.
|
||||
// 2) API proposition for computing display length
|
||||
// (approximation, fixed pitch only).
|
||||
// 3) Implement display length.
|
||||
|
||||
// Kind indicates the type of width property as defined in http://unicode.org/reports/tr11/.
|
||||
type Kind int
|
||||
|
||||
const (
|
||||
// Neutral characters do not occur in legacy East Asian character sets.
|
||||
Neutral Kind = iota
|
||||
|
||||
// EastAsianAmbiguous characters that can be sometimes wide and sometimes
|
||||
// narrow and require additional information not contained in the character
|
||||
// code to further resolve their width.
|
||||
EastAsianAmbiguous
|
||||
|
||||
// EastAsianWide characters are wide in its usual form. They occur only in
|
||||
// the context of East Asian typography. These runes may have explicit
|
||||
// halfwidth counterparts.
|
||||
EastAsianWide
|
||||
|
||||
// EastAsianNarrow characters are narrow in its usual form. They often have
|
||||
// fullwidth counterparts.
|
||||
EastAsianNarrow
|
||||
|
||||
// Note: there exist Narrow runes that do not have fullwidth or wide
|
||||
// counterparts, despite what the definition says (e.g. U+27E6).
|
||||
|
||||
// EastAsianFullwidth characters have a compatibility decompositions of type
|
||||
// wide that map to a narrow counterpart.
|
||||
EastAsianFullwidth
|
||||
|
||||
// EastAsianHalfwidth characters have a compatibility decomposition of type
|
||||
// narrow that map to a wide or ambiguous counterpart, plus U+20A9 ₩ WON
|
||||
// SIGN.
|
||||
EastAsianHalfwidth
|
||||
|
||||
// Note: there exist runes that have a halfwidth counterparts but that are
|
||||
// classified as Ambiguous, rather than wide (e.g. U+2190).
|
||||
)
|
||||
|
||||
// TODO: the generated tries need to return size 1 for invalid runes for the
|
||||
// width to be computed correctly (each byte should render width 1)
|
||||
|
||||
var trie = newWidthTrie(0)
|
||||
|
||||
// Lookup reports the Properties of the first rune in b and the number of bytes
|
||||
// of its UTF-8 encoding.
|
||||
func Lookup(b []byte) (p Properties, size int) {
|
||||
v, sz := trie.lookup(b)
|
||||
return Properties{elem(v), b[sz-1]}, sz
|
||||
}
|
||||
|
||||
// LookupString reports the Properties of the first rune in s and the number of
|
||||
// bytes of its UTF-8 encoding.
|
||||
func LookupString(s string) (p Properties, size int) {
|
||||
v, sz := trie.lookupString(s)
|
||||
return Properties{elem(v), s[sz-1]}, sz
|
||||
}
|
||||
|
||||
// LookupRune reports the Properties of rune r.
|
||||
func LookupRune(r rune) Properties {
|
||||
var buf [4]byte
|
||||
n := utf8.EncodeRune(buf[:], r)
|
||||
v, _ := trie.lookup(buf[:n])
|
||||
last := byte(r)
|
||||
if r >= utf8.RuneSelf {
|
||||
last = 0x80 + byte(r&0x3f)
|
||||
}
|
||||
return Properties{elem(v), last}
|
||||
}
|
||||
|
||||
// Properties provides access to width properties of a rune.
|
||||
type Properties struct {
|
||||
elem elem
|
||||
last byte
|
||||
}
|
||||
|
||||
func (e elem) kind() Kind {
|
||||
return Kind(e >> typeShift)
|
||||
}
|
||||
|
||||
// Kind returns the Kind of a rune as defined in Unicode TR #11.
|
||||
// See http://unicode.org/reports/tr11/ for more details.
|
||||
func (p Properties) Kind() Kind {
|
||||
return p.elem.kind()
|
||||
}
|
||||
|
||||
// Folded returns the folded variant of a rune or 0 if the rune is canonical.
|
||||
func (p Properties) Folded() rune {
|
||||
if p.elem&tagNeedsFold != 0 {
|
||||
buf := inverseData[byte(p.elem)]
|
||||
buf[buf[0]] ^= p.last
|
||||
r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]])
|
||||
return r
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// Narrow returns the narrow variant of a rune or 0 if the rune is already
|
||||
// narrow or doesn't have a narrow variant.
|
||||
func (p Properties) Narrow() rune {
|
||||
if k := p.elem.kind(); byte(p.elem) != 0 && (k == EastAsianFullwidth || k == EastAsianWide || k == EastAsianAmbiguous) {
|
||||
buf := inverseData[byte(p.elem)]
|
||||
buf[buf[0]] ^= p.last
|
||||
r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]])
|
||||
return r
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// Wide returns the wide variant of a rune or 0 if the rune is already
|
||||
// wide or doesn't have a wide variant.
|
||||
func (p Properties) Wide() rune {
|
||||
if k := p.elem.kind(); byte(p.elem) != 0 && (k == EastAsianHalfwidth || k == EastAsianNarrow) {
|
||||
buf := inverseData[byte(p.elem)]
|
||||
buf[buf[0]] ^= p.last
|
||||
r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]])
|
||||
return r
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// TODO for Properties:
|
||||
// - Add Fullwidth/Halfwidth or Inverted methods for computing variants
|
||||
// mapping.
|
||||
// - Add width information (including information on non-spacing runes).
|
||||
|
||||
// Transformer implements the transform.Transformer interface.
|
||||
type Transformer struct {
|
||||
t transform.Transformer
|
||||
}
|
||||
|
||||
// Reset implements the transform.Transformer interface.
|
||||
func (t Transformer) Reset() { t.t.Reset() }
|
||||
|
||||
// Transform implements the Transformer interface.
|
||||
func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
return t.t.Transform(dst, src, atEOF)
|
||||
}
|
||||
|
||||
// Bytes returns a new byte slice with the result of applying t to b.
|
||||
func (t Transformer) Bytes(b []byte) []byte {
|
||||
b, _, _ = transform.Bytes(t, b)
|
||||
return b
|
||||
}
|
||||
|
||||
// String returns a string with the result of applying t to s.
|
||||
func (t Transformer) String(s string) string {
|
||||
s, _, _ = transform.String(t, s)
|
||||
return s
|
||||
}
|
||||
|
||||
var (
|
||||
// Fold is a transform that maps all runes to their canonical width.
|
||||
//
|
||||
// Note that the NFKC and NFKD transforms in golang.org/x/text/unicode/norm
|
||||
// provide a more generic folding mechanism.
|
||||
Fold Transformer = Transformer{foldTransform{}}
|
||||
|
||||
// Widen is a transform that maps runes to their wide variant, if
|
||||
// available.
|
||||
Widen Transformer = Transformer{wideTransform{}}
|
||||
|
||||
// Narrow is a transform that maps runes to their narrow variant, if
|
||||
// available.
|
||||
Narrow Transformer = Transformer{narrowTransform{}}
|
||||
)
|
||||
|
||||
// TODO: Consider the following options:
|
||||
// - Treat Ambiguous runes that have a halfwidth counterpart as wide, or some
|
||||
// generalized variant of this.
|
||||
// - Consider a wide Won character to be the default width (or some generalized
|
||||
// variant of this).
|
||||
// - Filter the set of characters that gets converted (the preferred approach is
|
||||
// to allow applying filters to transforms).
|
||||
Loading…
Add table
Add a link
Reference in a new issue