字符串拼接测试

编写基准测试讨论 Golang 字符串拼接效率,基准测试如下

package play

import (
	"bytes"
	"fmt"
	"strings"

	"testing"
)

// Inputs shared by the benchmarks in this file.
const (
	// hello and world are the two short operands that the
	// BenchmarkStringWith* benchmarks join with a "," separator.
	hello = "hello"
	world = "world"

	// num is how many times longHello is appended in the
	// BenchmarkLongStringWith* benchmarks.
	num = 100

	// longHello is the chunk ("hello" repeated ten times) that the
	// long-string benchmarks append num times.
	longHello = "hellohellohellohellohellohellohellohellohellohello"
)

// BenchmarkStringWithPlus times joining hello, "," and world with the +
// operator. The result is discarded.
func BenchmarkStringWithPlus(b *testing.B) {
	for n := b.N; n > 0; n-- {
		_ = hello + "," + world
	}
}

// BenchmarkStringWithSprintf times joining hello and world through
// fmt.Sprintf with a "%s,%s" format. The result is discarded.
func BenchmarkStringWithSprintf(b *testing.B) {
	const layout = "%s,%s"
	for n := b.N; n > 0; n-- {
		_ = fmt.Sprintf(layout, hello, world)
	}
}

// BenchmarkStringWithJoin times joining hello and world with strings.Join.
// The two-element slice is built inside the loop, so its construction is
// part of every measured iteration.
func BenchmarkStringWithJoin(b *testing.B) {
	const sep = ","
	for n := b.N; n > 0; n-- {
		parts := []string{hello, world}
		_ = strings.Join(parts, sep)
	}
}

// BenchmarkStringWithBuffer times joining hello, "," and world by writing
// them into a fresh bytes.Buffer on every iteration and reading the result
// back with String.
func BenchmarkStringWithBuffer(b *testing.B) {
	for n := b.N; n > 0; n-- {
		var buf bytes.Buffer
		buf.WriteString(hello)
		buf.WriteString(",")
		buf.WriteString(world)
		_ = buf.String()
	}
}

// BenchmarkStringWithBuilder times joining hello, "," and world by writing
// them into a fresh strings.Builder on every iteration and reading the
// result back with String.
func BenchmarkStringWithBuilder(b *testing.B) {
	for n := b.N; n > 0; n-- {
		var sb strings.Builder
		sb.WriteString(hello)
		sb.WriteString(",")
		sb.WriteString(world)
		_ = sb.String()
	}
}

// BenchmarkLongStringWithPlus times building a long string by appending
// longHello to an initially empty string num times with the + operator.
func BenchmarkLongStringWithPlus(b *testing.B) {
	for n := b.N; n > 0; n-- {
		acc := ""
		for k := num; k > 0; k-- {
			acc = acc + longHello
		}
	}
}

// BenchmarkLongStringWithSprintf times building a long string by
// re-formatting the accumulated string plus longHello through fmt.Sprintf,
// num times per iteration.
func BenchmarkLongStringWithSprintf(b *testing.B) {
	const layout = "%s%s"
	for n := b.N; n > 0; n-- {
		acc := ""
		for k := num; k > 0; k-- {
			acc = fmt.Sprintf(layout, acc, longHello)
		}
	}
}

// joinList is the prebuilt input for BenchmarkLongStringWithJoin: num
// copies of longHello.
var joinList []string

// init fills joinList once at package start-up, so building the slice is
// not part of any benchmark loop.
func init() {
	joinList = make([]string, num)
	for idx := range joinList {
		joinList[idx] = longHello
	}
}

// BenchmarkLongStringWithJoin times concatenating the prebuilt joinList
// with strings.Join and an empty separator.
func BenchmarkLongStringWithJoin(b *testing.B) {
	const sep = ""
	for n := b.N; n > 0; n-- {
		_ = strings.Join(joinList, sep)
	}
}

// BenchmarkLongStringWithBuffer times building a long string by writing
// longHello num times into a fresh bytes.Buffer and reading the result
// back with String.
func BenchmarkLongStringWithBuffer(b *testing.B) {
	for n := b.N; n > 0; n-- {
		var buf bytes.Buffer
		for k := num; k > 0; k-- {
			buf.WriteString(longHello)
		}
		_ = buf.String()
	}
}

// BenchmarkLongStringWithBuilder times building a long string by writing
// longHello num times into a fresh strings.Builder, without reserving
// capacity first.
//
// Requires Go 1.10+ (strings.Builder).
func BenchmarkLongStringWithBuilder(b *testing.B) {
	for n := b.N; n > 0; n-- {
		var sb strings.Builder
		for k := num; k > 0; k-- {
			sb.WriteString(longHello)
		}
		_ = sb.String()
	}
}

// BenchmarkLongStringWithBuilderGrow times building a long string by
// writing longHello num times into a fresh strings.Builder whose capacity
// is reserved up front with Grow.
func BenchmarkLongStringWithBuilderGrow(b *testing.B) {
	// Exact size of the final string, known at compile time.
	const total = len(longHello) * num

	for n := b.N; n > 0; n-- {
		var sb strings.Builder
		sb.Grow(total)
		for k := num; k > 0; k-- {
			sb.WriteString(longHello)
		}
		_ = sb.String()
	}
}

测试结果

$ go test -benchmem  -bench=. -v
goos: darwin
goarch: amd64
pkg: github.com/haozibi/play
BenchmarkStringWithPlus-4              	2000000000	         0.34 ns/op	       0 B/op	       0 allocs/op
BenchmarkStringWithSprintf-4           	10000000	       126 ns/op	      16 B/op	       1 allocs/op
BenchmarkStringWithJoin-4              	30000000	        58.6 ns/op	      16 B/op	       1 allocs/op
BenchmarkStringWithBuffer-4            	20000000	        67.6 ns/op	      64 B/op	       1 allocs/op
BenchmarkStringWithBuilder-4           	20000000	        67.6 ns/op	      24 B/op	       2 allocs/op
BenchmarkLongStringWithPlus-4          	   20000	     60128 ns/op	  270784 B/op	      99 allocs/op
BenchmarkLongStringWithSprintf-4       	   20000	     75893 ns/op	  272558 B/op	     199 allocs/op
BenchmarkLongStringWithJoin-4          	 1000000	      2118 ns/op	    5376 B/op	       1 allocs/op
BenchmarkLongStringWithBuffer-4        	  200000	      5768 ns/op	   21024 B/op	       8 allocs/op
BenchmarkLongStringWithBuilder-4       	  200000	      5605 ns/op	   21184 B/op	      10 allocs/op
BenchmarkLongStringWithBuilderGrow-4   	 1000000	      1567 ns/op	    5376 B/op	       1 allocs/op
PASS
ok  	github.com/haozibi/play	17.105s

-benchmem可以提供每次操作分配内存的次数,以及每次操作分配的字节数。

测试发现小文本用 “+” 拼接还是非常适合的。不过需要注意:本例中 hello、"," 和 world 都是常量,编译器会在编译期直接完成拼接(常量折叠),所以 0.34 ns/op、0 allocs/op 并不代表运行期 “+” 拼接的真实开销。“fmt” 方式性能不是很好,不推荐使用。

对于大文本推荐使用 “strings.Builder” + “Builder.Grow”

Go 1.10 推出了一个新的结构 strings.Builder,用于在构建字符串的场景中替代 bytes.Buffer。

// A Builder is used to efficiently build a string using Write methods.
// It minimizes memory copying. The zero value is ready to use.
// Do not copy a non-zero Builder.
type Builder struct {
	addr *Builder // of receiver, to detect copies by value
	// buf holds the bytes written so far; WriteString appends to it.
	buf  []byte
}

// WriteString appends the contents of s to b's buffer.
// It returns the length of s and a nil error.
func (b *Builder) WriteString(s string) (int, error) {
	b.copyCheck()
	// append allocates a larger backing array and copies b.buf into it
	// when the remaining capacity cannot hold s.
	b.buf = append(b.buf, s...)
	return len(s), nil
}

由于 append 函数的特性,如果字符串过多则会重新申请新的内存,导致拖慢速度,则可以使用 Builder.Grow() 方法进行预设大小,避免重新申请内存。设置过 Grow 的效果显著。

ps: 看了 func Join(a []string, sep string) string 发现其内部也是用 strings.Builder 进行处理。

// Join concatenates the elements of a to create a single string. The separator string
// sep is placed between elements in the resulting string.
func Join(a []string, sep string) string {
	// Zero or one element: no separator is needed, so return directly.
	switch len(a) {
	case 0:
		return ""
	case 1:
		return a[0]
	}
	// n is the exact byte length of the result: len(a)-1 separators
	// plus every element.
	n := len(sep) * (len(a) - 1)
	for i := 0; i < len(a); i++ {
		n += len(a[i])
	}

	var b Builder
	// Reserve all n bytes up front so the writes below append into
	// already-allocated capacity.
	b.Grow(n)
	b.WriteString(a[0])
	for _, s := range a[1:] {
		b.WriteString(sep)
		b.WriteString(s)
	}
	return b.String()
}

strings.Builder 源码

查看 strings.Builder 的源码

Builder 支持 4 种方法将数据写入 builder 中

func (b *Builder) Write(p []byte) (int, error)
func (b *Builder) WriteByte(c byte) error
func (b *Builder) WriteRune(r rune) (int, error)
func (b *Builder) WriteString(s string) (int, error)

Builder 底层通过 []byte 来存储数据

// A Builder is used to efficiently build a string using Write methods.
// It minimizes memory copying. The zero value is ready to use.
// Do not copy a non-zero Builder.
type Builder struct {
	addr *Builder // of receiver, to detect copies by value
	// buf holds the bytes written so far; WriteString appends to it.
	buf  []byte
}

WriteString

以 func (b *Builder) WriteString(s string) (int, error) 为例,当开发者调用 WriteString 方法时,数据会被追加到其内部的 slice(具体为 []byte)中。根据 append 函数的特性(容量不足时大致按倍数增长),如果达到了 slice 的容量(capacity)限制,就会分配一个新的 slice,然后老的 slice 上的内容会被拷贝到新的 slice 上。当 slice 长度很大时,这个操作就会很消耗资源,甚至引起内存问题。

// WriteString appends the contents of s to b's buffer.
// It returns the length of s and a nil error.
func (b *Builder) WriteString(s string) (int, error) {
	b.copyCheck()
	// append allocates a larger backing array and copies b.buf into it
	// when the remaining capacity cannot hold s.
	b.buf = append(b.buf, s...)
	return len(s), nil
}

Grow

为了解决 append 的问题,Builder 提供了 Grow 方法预设足够大的容量。只有 slice 剩余空间不足以写入 n 个字节时扩容才会发生,而且扩容后的容量是 2*cap(b.buf)+n。func (b *Builder) grow(n int) 会先创建一个新的 slice,然后通过内置函数 copy 把旧 slice 的内容拷贝过去。

由于 UTF-8 的原因,WriteString,WriteRune 写入的字符可能不止一个字节。

// Grow grows b's capacity, if necessary, to guarantee space for
// another n bytes. After Grow(n), at least n bytes can be written to b
// without another allocation. If n is negative, Grow panics.
func (b *Builder) Grow(n int) {
	b.copyCheck()
	if n < 0 {
		panic("strings.Builder.Grow: negative count")
	}
	// Reallocate only when the unused capacity is smaller than n.
	if cap(b.buf)-len(b.buf) < n {
		b.grow(n)
	}
}

// grow copies the buffer to a new, larger buffer so that there are at least n
// bytes of capacity beyond len(b.buf).
func (b *Builder) grow(n int) {
	// Same length as before, capacity 2*cap(b.buf)+n: since
	// len(b.buf) <= cap(b.buf), at least n bytes remain unused.
	buf := make([]byte, len(b.buf), 2*cap(b.buf)+n)
	copy(buf, b.buf)
	b.buf = buf
}

String

为了节省内存分配,String 方法通过 unsafe.Pointer 的指针转换操作,直接将 buf []byte 转换为 string 类型,避免了重新分配内存和拷贝数据的问题。

// String returns the accumulated string.
func (b *Builder) String() string {
	// Reinterpret the slice header of b.buf as a string header through
	// unsafe.Pointer: no bytes are copied, the result shares b.buf's memory.
	return *(*string)(unsafe.Pointer(&b.buf))
}

copyCheck

当你试图拷贝 strings.Builder 并写入的时候,你的程序就会崩溃。

var b1 strings.Builder
b1.WriteString("ABC")
b2 := b1
b2.WriteString("DEF") 
// panic: strings: illegal use of non-zero Builder copied by value

strings.Builder 内部通过 slice 来保存和管理内容。slice 内部则是通过一个指针指向实际保存内容的数组。当我们拷贝了 builder 以后,同样也拷贝了其 slice 的指针。但是它仍然指向同一个旧的数组。

当你对源 builder 或者拷贝后的 builder 写入的时候,问题就产生了。另一个 builder 指向的数组内容也被改变了。这就是为什么 strings.Builder 不允许拷贝的原因。

// copyCheck records the receiver's address in b.addr on first use and
// panics if a later call finds that b.addr differs from b, which is the
// case when a non-zero Builder has been copied by value.
func (b *Builder) copyCheck() {
	if b.addr == nil {
		// This hack works around a failing of Go's escape analysis
		// that was causing b to escape and be heap allocated.
		// See issue 23382.
		// TODO: once issue 7921 is fixed, this should be reverted to
		// just "b.addr = b".
		b.addr = (*Builder)(noescape(unsafe.Pointer(b)))
	} else if b.addr != b {
		panic("strings: illegal use of non-zero Builder copied by value")
	}
}

通过 copyCheck 记录当前 Builder 的地址,如果当前 Builder 被拷贝,则 b.addr 与 b 的地址不相同,从而触发 panic;其中 noescape 相关的细节可以查阅 Go's escape analysis 的资料。只在下面 4 种方法中进行检测。

  • Grow(n int)
  • Write(p []byte)
  • WriteRune(r rune)
  • WriteString(s string)

所以下面的代码是可行的

// Reset()
// Len()
// String()

var b1 strings.Builder
b1.WriteString("ABC")
b2 := b1
fmt.Println(b2.Len())    // 3
fmt.Println(b2.String()) // ABC
b2.Reset()
b2.WriteString("DEF")
fmt.Println(b2.String()) // DEF

并发

和 bytes.Buffer 一样,strings.Builder 也不支持并发的读或者写。

io.Writer

strings.Builder 通过 Write(p []byte) (n int, err error) 方法实现了 io.Writer 接口。所以,我们多了很多使用它的情形:

  • io.Copy(dst Writer, src Reader) (written int64, err error)
  • bufio.NewWriter(w io.Writer) *Writer
  • fmt.Fprint(w io.Writer, a ...interface{}) (n int, err error)
  • func (r *http.Request) Write(w io.Writer) error

其他使用 io.Writer 的库

参考文档