Add str:from-utf8-bytes, str:to-utf8-bytes and move builtin ord and chr to str module (#1081)

* pkg/eval/str: move builtin ord and chr to str

Move the builtin string functions ord and chr to the str module and rename
them to str:to-codepoints and str:from-codepoints respectively, as suggested
in #851.

* pkg/eval/str: add from-utf8-bytes & to-utf8-bytes

Add from-utf8-bytes and to-utf8-bytes functions to the str module. These
functions differ from their *-codepoints counterparts in that they handle
UTF-8 bytes instead of whole codepoints. Closes #851

* pkg/eval/str: range check for codepoint and bytes

str:from-codepoints
Add a check that the codepoint arguments are within the valid Unicode range;
return an OutOfRange error otherwise. Return a BadValue error if the codepoint
isn't valid. Add/change testcases.

str:from-utf8-bytes
Add a check that the byte arguments are within the valid range; return an
OutOfRange error otherwise. Return a BadValue error if the byte sequence isn't
a valid UTF-8 sequence. Add/change testcases.

Add an additional test checking that piping from str:to-codepoints/str:to-utf8-bytes
to str:from-codepoints/str:from-utf8-bytes returns the original input.
This commit is contained in:
Gabriel Rauter 2020-08-13 22:56:28 +02:00 committed by GitHub
parent c42755892e
commit 1124c10b56
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 202 additions and 8 deletions

View File

@ -1,2 +1,8 @@
This is the draft release notes for 0.15.0, scheduled to be released on
2021-01-01.
# Deprecated features
- The `chr` command is now deprecated. Use `str:from-codepoints` instead.
- The `ord` command is now deprecated. Use `str:to-codepoints` instead.

View File

@ -62,6 +62,8 @@ var ErrInputOfEawkMustBeString = errors.New("input of eawk must be string")
// ord $string
// ```
//
// This function is deprecated; use [str:to-codepoints](str.html#strto-codepoints) instead.
//
// Output value of each codepoint in `$string`, in hexadecimal. Examples:
//
// ```elvish-transcript
@ -84,6 +86,8 @@ var ErrInputOfEawkMustBeString = errors.New("input of eawk must be string")
// chr $number...
// ```
//
// This function is deprecated; use [str:from-codepoints](str.html#strfrom-codepoints) instead.
//
// Outputs a string consisting of the given Unicode codepoints. Example:
//
// ```elvish-transcript

View File

@ -164,6 +164,10 @@ func (cp *compiler) checkDeprecatedBuiltin(name string, r diag.Ranger) {
msg = `the "-time" command is deprecated; use "time" instead`
case "^~":
msg = `the "^" command is deprecated; use "math:pow" or "math:pow10" instead`
case "ord":
msg = `the "ord" command is deprecated; use "str:to-codepoints" instead`
case "chr":
msg = `the "chr" command is deprecated; use "str:from-codepoints" instead`
default:
return
}

View File

@ -4,7 +4,11 @@ package str
import (
"bytes"
"fmt"
"strconv"
"strings"
"unicode"
"unicode/utf8"
"github.com/elves/elvish/pkg/eval"
"github.com/elves/elvish/pkg/eval/errs"
@ -92,6 +96,80 @@ import (
// ▶ $false
// ```
//elvdoc:fn from-codepoints
//
// ```elvish
// str:from-codepoints $number...
// ```
//
// Outputs a string consisting of the given Unicode codepoints. Example:
//
// ```elvish-transcript
// ~> str:from-codepoints 0x61
// ▶ a
// ~> str:from-codepoints 0x4f60 0x597d
// ▶ 你好
// ```
//
// @cf str:to-codepoints
func fromCodepoints(nums ...int) (string, error) {
var b bytes.Buffer
for _, num := range nums {
if num < 0 || num > unicode.MaxRune {
return "", errs.OutOfRange{
What: "codepoint",
ValidLow: 0, ValidHigh: unicode.MaxRune,
Actual: strconv.Itoa(num)}
}
if !utf8.ValidRune(rune(num)) {
return "", errs.BadValue{
What: "argument to str:from-codepoints",
Valid: "valid Unicode codepoint",
Actual: "0x" + strconv.FormatInt(int64(num), 16)}
}
b.WriteRune(rune(num))
}
return b.String(), nil
}
//elvdoc:fn from-utf8-bytes
//
// ```elvish
// str:from-utf8-bytes $number...
// ```
//
// Outputs a string consisting of the given UTF-8 bytes. Example:
//
// ```elvish-transcript
// ~> str:from-utf8-bytes 0x61
// ▶ a
// ~> str:from-utf8-bytes 0xe4 0xbd 0xa0 0xe5 0xa5 0xbd
// ▶ 你好
// ```
//
// @cf str:to-utf8-bytes
func fromUtf8Bytes(nums ...int) (string, error) {
var b bytes.Buffer
for _, num := range nums {
if num < 0 || num > 255 {
return "", errs.OutOfRange{
What: "byte",
ValidLow: 0, ValidHigh: 255,
Actual: strconv.Itoa(num)}
}
b.WriteByte(byte(num))
}
if !utf8.Valid(b.Bytes()) {
return "", errs.BadValue{
What: "arguments to str:from-utf8-bytes",
Valid: "valid UTF-8 sequence",
Actual: fmt.Sprint(b.Bytes())}
}
return b.String(), nil
}
//elvdoc:fn has-prefix
//
// ```elvish
@ -288,6 +366,33 @@ func split(fm *eval.Frame, opts maxOpt, sep, s string) {
// ▶ Her Royal Highness
// ```
//elvdoc:fn to-codepoints
//
// ```elvish
// str:to-codepoints $string
// ```
//
// Output value of each codepoint in `$string`, in hexadecimal. Examples:
//
// ```elvish-transcript
// ~> str:to-codepoints a
// ▶ 0x61
// ~> str:to-codepoints 你好
// ▶ 0x4f60
// ▶ 0x597d
// ```
//
// The output format is subject to change.
//
// @cf from-codepoints
func toCodepoints(fm *eval.Frame, s string) {
	// Writes one value per codepoint of s to the frame's output channel,
	// each formatted as lowercase hexadecimal with a "0x" prefix.
	ch := fm.OutputChan()
	for rest := s; rest != ""; {
		// Decode the leading codepoint and advance past its encoding.
		r, width := utf8.DecodeRuneInString(rest)
		rest = rest[width:]
		ch <- "0x" + strconv.FormatInt(int64(r), 16)
	}
}
//elvdoc:fn to-lower
//
// ```elvish
@ -302,6 +407,37 @@ func split(fm *eval.Frame, opts maxOpt, sep, s string) {
// ▶ abc!123
// ```
//elvdoc:fn to-utf8-bytes
//
// ```elvish
// str:to-utf8-bytes $string
// ```
//
// Output value of each byte in `$string`, in hexadecimal. Examples:
//
// ```elvish-transcript
// ~> str:to-utf8-bytes a
// ▶ 0x61
// ~> str:to-utf8-bytes 你好
// ▶ 0xe4
// ▶ 0xbd
// ▶ 0xa0
// ▶ 0xe5
// ▶ 0xa5
// ▶ 0xbd
// ```
//
// The output format is subject to change.
//
// @cf from-utf8-bytes
func toUtf8Bytes(fm *eval.Frame, s string) {
	// Writes one value per byte of s to the frame's output channel, each
	// formatted as lowercase hexadecimal with a "0x" prefix.
	ch := fm.OutputChan()
	for i := 0; i < len(s); i++ {
		// Indexing a string yields its raw bytes, not decoded codepoints.
		ch <- "0x" + strconv.FormatUint(uint64(s[i]), 16)
	}
}
//elvdoc:fn to-title
//
// ```elvish
@ -428,10 +564,12 @@ var fns = map[string]interface{}{
"count": strings.Count,
"equal-fold": strings.EqualFold,
// TODO: Fields, FieldsFunc
"has-prefix": strings.HasPrefix,
"has-suffix": strings.HasSuffix,
"index": strings.Index,
"index-any": strings.IndexAny,
"from-codepoints": fromCodepoints,
"from-utf8-bytes": fromUtf8Bytes,
"has-prefix": strings.HasPrefix,
"has-suffix": strings.HasSuffix,
"index": strings.Index,
"index-any": strings.IndexAny,
// TODO: IndexFunc
"join": join,
"last-index": strings.LastIndex,
@ -439,10 +577,12 @@ var fns = map[string]interface{}{
"replace": replace,
"split": split,
// TODO: SplitAfter
"title": strings.Title,
"to-lower": strings.ToLower,
"to-title": strings.ToTitle,
"to-upper": strings.ToUpper,
"title": strings.Title,
"to-codepoints": toCodepoints,
"to-lower": strings.ToLower,
"to-title": strings.ToTitle,
"to-upper": strings.ToUpper,
"to-utf8-bytes": toUtf8Bytes,
// TODO: ToLowerSpecial, ToTitleSpecial, ToUpperSpecial
"trim": strings.Trim,
"trim-left": strings.TrimLeft,

View File

@ -1,7 +1,10 @@
package str
import (
"fmt"
"strconv"
"testing"
"unicode"
"github.com/elves/elvish/pkg/eval"
"github.com/elves/elvish/pkg/eval/errs"
@ -31,6 +34,35 @@ func TestStr(t *testing.T) {
That(`str:equal-fold abc ABC`).Puts(true),
That(`str:equal-fold abc A`).Puts(false),
That(`str:from-codepoints 0x61`).Puts("a"),
That(`str:from-codepoints 0x4f60 0x597d`).Puts("你好"),
That(`str:from-codepoints -0x1`).ThrowsCause(errs.OutOfRange{
What: "codepoint",
ValidLow: 0, ValidHigh: unicode.MaxRune,
Actual: strconv.Itoa(-1)}),
That(fmt.Sprintf(`str:from-codepoints 0x%x`, unicode.MaxRune+1)).ThrowsCause(errs.OutOfRange{
What: "codepoint",
ValidLow: 0, ValidHigh: unicode.MaxRune,
Actual: strconv.Itoa(unicode.MaxRune + 1)}),
That(`str:from-codepoints 0xd800`).ThrowsCause(errs.BadValue{
What: "argument to str:from-codepoints",
Valid: "valid Unicode codepoint",
Actual: "0xd800"}),
That(`str:from-utf8-bytes 0x61`).Puts("a"),
That(`str:from-utf8-bytes 0xe4 0xbd 0xa0 0xe5 0xa5 0xbd`).Puts("你好"),
That(`str:from-utf8-bytes -1`).ThrowsCause(errs.OutOfRange{
What: "byte",
ValidLow: 0, ValidHigh: 255,
Actual: strconv.Itoa(-1)}),
That(`str:from-utf8-bytes 256`).ThrowsCause(errs.OutOfRange{
What: "byte",
ValidLow: 0, ValidHigh: 255,
Actual: strconv.Itoa(256)}),
That(`str:from-utf8-bytes 0xff 0x3 0xaa`).ThrowsCause(errs.BadValue{
What: "arguments to str:from-utf8-bytes",
Valid: "valid UTF-8 sequence",
Actual: "[255 3 170]"}),
That(`str:has-prefix abc`).ThrowsAny(),
That(`str:has-prefix abcd ab`).Puts(true),
That(`str:has-prefix abcd cd`).Puts(false),
@ -62,6 +94,14 @@ func TestStr(t *testing.T) {
That(`str:split : /usr:/bin:/tmp`).Puts("/usr", "/bin", "/tmp"),
That(`str:split : /usr:/bin:/tmp &max=2`).Puts("/usr", "/bin:/tmp"),
That(`str:to-codepoints a`).Puts("0x61"),
That(`str:to-codepoints 你好`).Puts("0x4f60", "0x597d"),
That(`str:to-codepoints 你好 | str:from-codepoints (all)`).Puts("你好"),
That(`str:to-utf8-bytes a`).Puts("0x61"),
That(`str:to-utf8-bytes 你好`).Puts("0xe4", "0xbd", "0xa0", "0xe5", "0xa5", "0xbd"),
That(`str:to-utf8-bytes 你好 | str:from-utf8-bytes (all)`).Puts("你好"),
That(`str:title abc`).Puts("Abc"),
That(`str:title "abc def"`).Puts("Abc Def"),
That(`str:to-lower abc def`).ThrowsAny(),