138 lines
2.7 KiB
Go
138 lines
2.7 KiB
Go
|
// Package unistring contains an implementation of a hybrid ASCII/UTF-16 string.
|
||
|
// For ASCII strings the underlying representation is equivalent to a normal Go string.
|
||
|
// For unicode strings the underlying representation is UTF-16 as []uint16 with 0th element set to 0xFEFF.
|
||
|
// unistring.String allows representing malformed UTF-16 values (e.g. stand-alone parts of surrogate pairs)
|
||
|
// which cannot be represented in UTF-8.
|
||
|
// At the same time it is possible to use unistring.String as property keys just as efficiently as simple strings,
|
||
|
// (the leading 0xFEFF ensures there is no clash with ASCII string), and it is possible to convert it
|
||
|
// to valueString without extra allocations.
|
||
|
package unistring
|
||
|
|
||
|
import (
|
||
|
"reflect"
|
||
|
"unicode/utf16"
|
||
|
"unicode/utf8"
|
||
|
"unsafe"
|
||
|
)
|
||
|
|
||
|
// BOM is the UTF-16 byte order mark. It is stored as element 0 of every
// UTF-16 backed String so that such a value can be told apart from a plain
// ASCII one.
const BOM = 0xFEFF
|
||
|
|
||
|
// String is a hybrid ASCII/UTF-16 string held in an ordinary Go string value.
// Its bytes are either plain ASCII text or the in-memory image of a []uint16
// whose first element is BOM.
type String string
|
||
|
|
||
|
// Scan checks if the string contains any unicode characters. If it does, converts to an array suitable for creating
|
||
|
// a String using FromUtf16, otherwise returns nil.
|
||
|
func Scan(s string) []uint16 {
|
||
|
utf16Size := 0
|
||
|
for ; utf16Size < len(s); utf16Size++ {
|
||
|
if s[utf16Size] >= utf8.RuneSelf {
|
||
|
goto unicode
|
||
|
}
|
||
|
}
|
||
|
return nil
|
||
|
unicode:
|
||
|
for _, chr := range s[utf16Size:] {
|
||
|
utf16Size++
|
||
|
if chr > 0xFFFF {
|
||
|
utf16Size++
|
||
|
}
|
||
|
}
|
||
|
|
||
|
buf := make([]uint16, utf16Size+1)
|
||
|
buf[0] = BOM
|
||
|
c := 1
|
||
|
for _, chr := range s {
|
||
|
if chr <= 0xFFFF {
|
||
|
buf[c] = uint16(chr)
|
||
|
} else {
|
||
|
first, second := utf16.EncodeRune(chr)
|
||
|
buf[c] = uint16(first)
|
||
|
c++
|
||
|
buf[c] = uint16(second)
|
||
|
}
|
||
|
c++
|
||
|
}
|
||
|
|
||
|
return buf
|
||
|
}
|
||
|
|
||
|
func NewFromString(s string) String {
|
||
|
if buf := Scan(s); buf != nil {
|
||
|
return FromUtf16(buf)
|
||
|
}
|
||
|
return String(s)
|
||
|
}
|
||
|
|
||
|
func NewFromRunes(s []rune) String {
|
||
|
ascii := true
|
||
|
size := 0
|
||
|
for _, c := range s {
|
||
|
if c >= utf8.RuneSelf {
|
||
|
ascii = false
|
||
|
if c > 0xFFFF {
|
||
|
size++
|
||
|
}
|
||
|
}
|
||
|
size++
|
||
|
}
|
||
|
if ascii {
|
||
|
return String(s)
|
||
|
}
|
||
|
b := make([]uint16, size+1)
|
||
|
b[0] = BOM
|
||
|
i := 1
|
||
|
for _, c := range s {
|
||
|
if c <= 0xFFFF {
|
||
|
b[i] = uint16(c)
|
||
|
} else {
|
||
|
first, second := utf16.EncodeRune(c)
|
||
|
b[i] = uint16(first)
|
||
|
i++
|
||
|
b[i] = uint16(second)
|
||
|
}
|
||
|
i++
|
||
|
}
|
||
|
return FromUtf16(b)
|
||
|
}
|
||
|
|
||
|
func FromUtf16(b []uint16) String {
|
||
|
var str string
|
||
|
hdr := (*reflect.StringHeader)(unsafe.Pointer(&str))
|
||
|
hdr.Data = uintptr(unsafe.Pointer(&b[0]))
|
||
|
hdr.Len = len(b) * 2
|
||
|
|
||
|
return String(str)
|
||
|
}
|
||
|
|
||
|
func (s String) String() string {
|
||
|
if b := s.AsUtf16(); b != nil {
|
||
|
return string(utf16.Decode(b[1:]))
|
||
|
}
|
||
|
|
||
|
return string(s)
|
||
|
}
|
||
|
|
||
|
func (s String) AsUtf16() []uint16 {
|
||
|
if len(s) < 4 || len(s)&1 != 0 {
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
var a []uint16
|
||
|
raw := string(s)
|
||
|
|
||
|
sliceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&a))
|
||
|
sliceHeader.Data = (*reflect.StringHeader)(unsafe.Pointer(&raw)).Data
|
||
|
|
||
|
l := len(raw) / 2
|
||
|
|
||
|
sliceHeader.Len = l
|
||
|
sliceHeader.Cap = l
|
||
|
|
||
|
if a[0] == BOM {
|
||
|
return a
|
||
|
}
|
||
|
|
||
|
return nil
|
||
|
}
|