Add str:from-utf8-bytes, str:to-utf8-bytes and move builtin ord and chr to str module (#1081)

* pkg/eval/str: move builtin ord and chr to str Move the builtin string functions ord and chr to the str module and rename them to str:to-codepoints and str:from-codepoints respectively, as suggested in #851. * pkg/eval/str: add from-utf8-bytes & to-utf8-bytes Add from-utf8-bytes and to-utf8-bytes functions to the str module. These functions differ from their *-codepoints counterparts in that they handle UTF-8 bytes instead of whole codepoints. Closes #851 * pkg/eval/str: range check for codepoint and bytes str:from-codepoints Add a check that the codepoint arguments are within the valid Unicode range; return an OutOfRange error otherwise. Return a BadValue error if the codepoint isn't valid. Add/change test cases. str:from-utf8-bytes Add a check that the byte arguments are within the valid range; return an OutOfRange error otherwise. Return a BadValue error if the byte sequence isn't a valid UTF-8 sequence. Add/change test cases. Add an additional test that piping from str:to-codepoints/str:to-utf8-bytes to str:from-codepoints/str:from-utf8-bytes returns the original input.
2024-12-05 03:17:50 +08:00 · 2020-08-13 22:56:28 +02:00 · 2020-08-13 22:56:28 +02:00 · 1124c10b56
commit 1124c10b56
parent c42755892e
5 changed files with 202 additions and 8 deletions
--- a/NEXT-RELEASE.md
+++ b/NEXT-RELEASE.md
@ -1,2 +1,8 @@
 This is the draft release notes for 0.15.0, scheduled to be released on
 2021-01-01.
+
+# Deprecated features
+
+-   The `chr` command is now deprecated. Use `str:from-codepoints` instead.
+
+-   The `ord` command is now deprecated. Use `str:to-codepoints` instead.
--- a/pkg/eval/builtin_fn_str.go
+++ b/pkg/eval/builtin_fn_str.go
@ -62,6 +62,8 @@ var ErrInputOfEawkMustBeString = errors.New("input of eawk must be string")
 // ord $string
 // ```
 //
+// This function is deprecated; use [str:to-codepoints](str.html#strto-codepoints) instead.
+//
 // Output value of each codepoint in `$string`, in hexadecimal. Examples:
 //
 // ```elvish-transcript
@ -84,6 +86,8 @@ var ErrInputOfEawkMustBeString = errors.New("input of eawk must be string")
 // chr $number...
 // ```
 //
+// This function is deprecated; use [str:from-codepoints](str.html#strfrom-codepoints) instead.
+//
 // Outputs a string consisting of the given Unicode codepoints. Example:
 //
 // ```elvish-transcript
--- a/pkg/eval/compiler.go
+++ b/pkg/eval/compiler.go
@ -164,6 +164,10 @@ func (cp *compiler) checkDeprecatedBuiltin(name string, r diag.Ranger) {
 		msg = `the "-time" command is deprecated; use "time" instead`
 	case "^~":
 		msg = `the "^" command is deprecated; use "math:pow" or "math:pow10" instead`
+	case "ord":
+		msg = `the "ord" command is deprecated; use "str:to-codepoints" instead`
+	case "chr":
+		msg = `the "chr" command is deprecated; use "str:from-codepoints" instead`
 	default:
 		return
 	}
--- a/pkg/eval/str/str.go
+++ b/pkg/eval/str/str.go
@ -4,7 +4,11 @@ package str

 import (
 	"bytes"
+	"fmt"
+	"strconv"
 	"strings"
+	"unicode"
+	"unicode/utf8"

 	"github.com/elves/elvish/pkg/eval"
 	"github.com/elves/elvish/pkg/eval/errs"
@ -92,6 +96,80 @@ import (
 // ▶ $false
 // ```

+//elvdoc:fn from-codepoints
+//
+// ```elvish
+// str:from-codepoints $number...
+// ```
+//
+// Outputs a string consisting of the given Unicode codepoints. Example:
+//
+// ```elvish-transcript
+// ~> str:from-codepoints 0x61
+// ▶ a
+// ~> str:from-codepoints 0x4f60 0x597d
+// ▶ 你好
+// ```
+//
+// @cf str:to-codepoints
+
+func fromCodepoints(nums ...int) (string, error) { // builds a string from the given codepoints; stops at the first invalid one and returns an error
+	var b bytes.Buffer
+	for _, num := range nums {
+		if num < 0 || num > unicode.MaxRune { // range check comes first, so out-of-range values yield OutOfRange rather than BadValue
+			return "", errs.OutOfRange{
+				What:     "codepoint",
+				ValidLow: 0, ValidHigh: unicode.MaxRune,
+				Actual: strconv.Itoa(num)} // offending value is reported in decimal
+		}
+		if !utf8.ValidRune(rune(num)) { // in range but not encodable as UTF-8, e.g. the surrogate half 0xd800
+			return "", errs.BadValue{
+				What:   "argument to str:from-codepoints",
+				Valid:  "valid Unicode codepoint",
+				Actual: "0x" + strconv.FormatInt(int64(num), 16)} // offending value is reported in hexadecimal
+		}
+		b.WriteRune(rune(num))
+	}
+	return b.String(), nil
+}
+
+//elvdoc:fn from-utf8-bytes
+//
+// ```elvish
+// str:from-utf8-bytes $number...
+// ```
+//
+// Outputs a string consisting of the given UTF-8 bytes. Example:
+//
+// ```elvish-transcript
+// ~> str:from-utf8-bytes 0x61
+// ▶ a
+// ~> str:from-utf8-bytes 0xe4 0xbd 0xa0 0xe5 0xa5 0xbd
+// ▶ 你好
+// ```
+//
+// @cf str:to-utf8-bytes
+
+func fromUtf8Bytes(nums ...int) (string, error) { // builds a string from the given byte values; returns an error if any value or the whole sequence is invalid
+	var b bytes.Buffer
+	for _, num := range nums {
+		if num < 0 || num > 255 { // each argument must fit in a single byte
+			return "", errs.OutOfRange{
+				What:     "byte",
+				ValidLow: 0, ValidHigh: 255,
+				Actual: strconv.Itoa(num)} // offending value is reported in decimal
+		}
+		b.WriteByte(byte(num))
+	}
+	if !utf8.Valid(b.Bytes()) { // validity is checked once on the complete sequence, after all bytes are collected
+		return "", errs.BadValue{
+			What:   "arguments to str:from-utf8-bytes",
+			Valid:  "valid UTF-8 sequence",
+			Actual: fmt.Sprint(b.Bytes())} // formats the byte slice as a decimal list, e.g. "[255 3 170]"
+	}
+	return b.String(), nil
+}
+
 //elvdoc:fn has-prefix
 //
 // ```elvish
@ -288,6 +366,33 @@ func split(fm *eval.Frame, opts maxOpt, sep, s string) {
 // ▶ Her Royal Highness
 // ```

+//elvdoc:fn to-codepoints
+//
+// ```elvish
+// str:to-codepoints $string
+// ```
+//
+// Outputs the value of each codepoint in `$string`, in hexadecimal. Examples:
+//
+// ```elvish-transcript
+// ~> str:to-codepoints a
+// ▶ 0x61
+// ~> str:to-codepoints 你好
+// ▶ 0x4f60
+// ▶ 0x597d
+// ```
+//
+// The output format is subject to change.
+//
+// @cf from-codepoints
+
+func toCodepoints(fm *eval.Frame, s string) { // sends each codepoint of s to the channel from fm.OutputChan() as a "0x"-prefixed hex string
+	out := fm.OutputChan()
+	for _, r := range s { // ranging over a string iterates by rune, not by byte
+		out <- "0x" + strconv.FormatInt(int64(r), 16) // lowercase hex without zero padding, e.g. "0x4f60"
+	}
+}
+
 //elvdoc:fn to-lower
 //
 // ```elvish
@ -302,6 +407,37 @@ func split(fm *eval.Frame, opts maxOpt, sep, s string) {
 // ▶ abc!123
 // ```

+//elvdoc:fn to-utf8-bytes
+//
+// ```elvish
+// str:to-utf8-bytes $string
+// ```
+//
+// Outputs the value of each byte in `$string`, in hexadecimal. Examples:
+//
+// ```elvish-transcript
+// ~> str:to-utf8-bytes a
+// ▶ 0x61
+// ~> str:to-utf8-bytes 你好
+// ▶ 0xe4
+// ▶ 0xbd
+// ▶ 0xa0
+// ▶ 0xe5
+// ▶ 0xa5
+// ▶ 0xbd
+// ```
+//
+// The output format is subject to change.
+//
+// @cf from-utf8-bytes
+
+func toUtf8Bytes(fm *eval.Frame, s string) { // sends each byte of s to the channel from fm.OutputChan() as a "0x"-prefixed hex string
+	out := fm.OutputChan()
+	for _, r := range []byte(s) { // r is a single byte here, despite the rune-like name
+		out <- "0x" + strconv.FormatInt(int64(r), 16) // lowercase hex without zero padding, e.g. "0xe4"
+	}
+}
+
 //elvdoc:fn to-title
 //
 // ```elvish
@ -428,10 +564,12 @@ var fns = map[string]interface{}{
 	"count":        strings.Count,
 	"equal-fold":   strings.EqualFold,
 	// TODO: Fields, FieldsFunc
-	"has-prefix": strings.HasPrefix,
-	"has-suffix": strings.HasSuffix,
-	"index":      strings.Index,
-	"index-any":  strings.IndexAny,
+	"from-codepoints": fromCodepoints,
+	"from-utf8-bytes": fromUtf8Bytes,
+	"has-prefix":      strings.HasPrefix,
+	"has-suffix":      strings.HasSuffix,
+	"index":           strings.Index,
+	"index-any":       strings.IndexAny,
 	// TODO: IndexFunc
 	"join":       join,
 	"last-index": strings.LastIndex,
@ -439,10 +577,12 @@ var fns = map[string]interface{}{
 	"replace": replace,
 	"split":   split,
 	// TODO: SplitAfter
-	"title":    strings.Title,
-	"to-lower": strings.ToLower,
-	"to-title": strings.ToTitle,
-	"to-upper": strings.ToUpper,
+	"title":         strings.Title,
+	"to-codepoints": toCodepoints,
+	"to-lower":      strings.ToLower,
+	"to-title":      strings.ToTitle,
+	"to-upper":      strings.ToUpper,
+	"to-utf8-bytes": toUtf8Bytes,
 	// TODO: ToLowerSpecial, ToTitleSpecial, ToUpperSpecial
 	"trim":       strings.Trim,
 	"trim-left":  strings.TrimLeft,
--- a/pkg/eval/str/str_test.go
+++ b/pkg/eval/str/str_test.go
@ -1,7 +1,10 @@
 package str

 import (
+	"fmt"
+	"strconv"
 	"testing"
+	"unicode"

 	"github.com/elves/elvish/pkg/eval"
 	"github.com/elves/elvish/pkg/eval/errs"
@ -31,6 +34,35 @@ func TestStr(t *testing.T) {
 		That(`str:equal-fold abc ABC`).Puts(true),
 		That(`str:equal-fold abc A`).Puts(false),

+		That(`str:from-codepoints 0x61`).Puts("a"),
+		That(`str:from-codepoints 0x4f60 0x597d`).Puts("你好"),
+		That(`str:from-codepoints -0x1`).ThrowsCause(errs.OutOfRange{
+			What:     "codepoint",
+			ValidLow: 0, ValidHigh: unicode.MaxRune,
+			Actual: strconv.Itoa(-1)}),
+		That(fmt.Sprintf(`str:from-codepoints 0x%x`, unicode.MaxRune+1)).ThrowsCause(errs.OutOfRange{
+			What:     "codepoint",
+			ValidLow: 0, ValidHigh: unicode.MaxRune,
+			Actual: strconv.Itoa(unicode.MaxRune + 1)}),
+		That(`str:from-codepoints 0xd800`).ThrowsCause(errs.BadValue{
+			What:   "argument to str:from-codepoints",
+			Valid:  "valid Unicode codepoint",
+			Actual: "0xd800"}),
+		That(`str:from-utf8-bytes 0x61`).Puts("a"),
+		That(`str:from-utf8-bytes 0xe4 0xbd 0xa0 0xe5 0xa5 0xbd`).Puts("你好"),
+		That(`str:from-utf8-bytes -1`).ThrowsCause(errs.OutOfRange{
+			What:     "byte",
+			ValidLow: 0, ValidHigh: 255,
+			Actual: strconv.Itoa(-1)}),
+		That(`str:from-utf8-bytes 256`).ThrowsCause(errs.OutOfRange{
+			What:     "byte",
+			ValidLow: 0, ValidHigh: 255,
+			Actual: strconv.Itoa(256)}),
+		That(`str:from-utf8-bytes 0xff 0x3 0xaa`).ThrowsCause(errs.BadValue{
+			What:   "arguments to str:from-utf8-bytes",
+			Valid:  "valid UTF-8 sequence",
+			Actual: "[255 3 170]"}),
+
 		That(`str:has-prefix abc`).ThrowsAny(),
 		That(`str:has-prefix abcd ab`).Puts(true),
 		That(`str:has-prefix abcd cd`).Puts(false),
@ -62,6 +94,14 @@ func TestStr(t *testing.T) {
 		That(`str:split : /usr:/bin:/tmp`).Puts("/usr", "/bin", "/tmp"),
 		That(`str:split : /usr:/bin:/tmp &max=2`).Puts("/usr", "/bin:/tmp"),

+		That(`str:to-codepoints a`).Puts("0x61"),
+		That(`str:to-codepoints 你好`).Puts("0x4f60", "0x597d"),
+		That(`str:to-codepoints 你好 | str:from-codepoints (all)`).Puts("你好"),
+
+		That(`str:to-utf8-bytes a`).Puts("0x61"),
+		That(`str:to-utf8-bytes 你好`).Puts("0xe4", "0xbd", "0xa0", "0xe5", "0xa5", "0xbd"),
+		That(`str:to-utf8-bytes 你好 | str:from-utf8-bytes (all)`).Puts("你好"),
+
 		That(`str:title abc`).Puts("Abc"),
 		That(`str:title "abc def"`).Puts("Abc Def"),
 		That(`str:to-lower abc def`).ThrowsAny(),