Measuring your system’s performance using software (Go edition)

https://lemire.me/blog/2024/03/17/measuring-your-systems-performance-using-software-go-edition/?continueFlag=6e2109ebedef4e4aef083a29820ed2a3

cache line的测试我花了好一阵子才理解清楚：

如果stride size小于cache line size的话，那么每次访问都会加载cache line
如果是cache line size两倍的话，那么每次访问其实是跳过一个cache line
假设整个数组是N的话，如果每个cche line都加载的话，实际上内存读取就花费了N字节
但是如果是跳着读取的话，那么实际上内存读取只花费了N/2字节

package main

import (
    "fmt"
    "time"
)

const size = 33554432 // 32 MB
func Cpy(arr1 []uint8, arr2 []uint8, slice int) {
    for i := 0; i < len(arr1); i += slice {
        arr2[i] = arr1[i]
    }
}

func AverageMinMax(f func() float64) (float64, float64, float64) {
    var sum float64
    var minimum float64
    var maximum float64

    for i := 0; i < 10; i++ {
        arr1 = make([]uint8, size)
        arr2 = make([]uint8, size)

        v := f()
        sum += v
        if i == 0 || v < minimum {
            minimum = v
        }
        if i == 0 || v > maximum {
            maximum = v
        }
    }
    return sum / 10, minimum, maximum
}

var arr1 []uint8
var arr2 []uint8

func run(size int, slice int) float64 {
    start := time.Now()
    times := 10
    for i := 0; i < times*slice; i++ {
        Cpy(arr1, arr2, slice)
    }
    end := time.Now()
    dur := float64(end.Sub(start)) / float64(times*slice)
    return dur
}

func main() {
    for slice := 16; slice <= 4096; slice *= 2 {
        a, m, M := AverageMinMax(func() float64 { return run(size, slice-1) })
        fmt.Printf("%10d: %10.1f GB/s [%4.1f - %4.1f]\n", slice-1, float64(size)/a, float64(size)/M, float64(size)/m)
    }
}

比如下面这个输出结果的话，在255这个地方出现了一次2x. 所以cache line size可能是在128字节。

$ go run cacheline.go
        15:       23.6 GB/s [21.3 - 24.4]
        31:       24.3 GB/s [23.8 - 24.5]
        63:       24.2 GB/s [23.6 - 24.6]
       127:       26.9 GB/s [23.8 - 27.9]
       255:       40.8 GB/s [37.8 - 43.6]
       511:      162.0 GB/s [130.4 - 203.4]
      1023:      710.0 GB/s [652.0 - 744.4]
      2047:      976.1 GB/s [967.1 - 983.8]
      4095:     1247.4 GB/s [1147.7 - 1267.0]

memory latency & parallelism 可以通过访问链表的方式来做测量内存延迟。但是如果同时访问两个内存链表的话，延迟实际上差不多的，这是因为多个内存访问是可以并行的，延迟可以只被计算成为一次。

这个实验设计比较巧妙，感觉可以好好学习一下：

首先创建 0..n-1 的数组array，并且shuffle. 注意这里必须swap.
然后根据这个array来产生index. 这个index有个特点是可以循环访问的。
然后将这个index拆分成为 `mlp` 个group. 按照每个group里面下表去访问array的时候，就是随机的访问
这样相当于模拟出 `mlp` 个memory request. 然后观察延迟的抖动情况

package main

import (
    "fmt"
    "math/rand"
    "time"
)

// makeCycle creates a cycle of a specified length starting at element 0
func makeCycle(length int) ([]uint64, []uint64) {
    array := make([]uint64, length)
    index := make([]uint64, length)
    // Create a cycle of maximum length within the big array
    for i := 0; i < length; i++ {
        array[i] = uint64(i)
    }

    // Sattolo shuffle
    for i := 0; i+1 < length; i++ {
        swapIdx := rand.Intn(length-i-1) + i + 1
        array[i], array[swapIdx] = array[swapIdx], array[i]
    }

    total := 0
    cur := uint64(0)
    for cur != 0 {
        index[total] = cur
        total++
        cur = array[cur]
    }
    return array, index
}

// setupPointers sets up pointers based on the given index
func setupPointers(index []uint64, length, mlp int) []uint64 {
    sp := make([]uint64, mlp)
    sp[0] = 0

    totalInc := 0
    for m := 1; m < mlp; m++ {
        totalInc += length / mlp
        sp[m] = index[totalInc]
    }
    return sp
}

func runBench(array []uint64, index []uint64, mlp int) time.Duration {
    length := len(array)
    sp := setupPointers(index, length, mlp)
    hits := length / mlp
    before := time.Now()
    for i := 0; i < hits; i++ {
        for m := 0; m < mlp; m++ {
            sp[m] = array[sp[m]]
        }
    }
    after := time.Now()
    return after.Sub(before)
}

func main() {
    const length = 100000000
    array, index := makeCycle(length)
    fmt.Println("Length:", length*8/1024/1024, "MB")
    base := runBench(array, index, 1)
    fmt.Println("Lanes:", 1, "Time:", base)

    for mlp := 2; mlp <= 40; mlp *= 2 {
        t := runBench(array, index, mlp)
        fmt.Println("Lanes:", mlp, "Speedup:", fmt.Sprintf("%.1f", float64(base)/float64(t)))
    }
}

下面是我的M3上运行结果，看上去大约是在128这个水平上。

(base) (StarRocks)(Warehouse) m3-book :: ~/Downloads » go run bench_mlp.go
Length: 1024 MB
Lanes: 1 Time: 14.852118916s
Lanes: 2 Speedup: 2.0
Lanes: 4 Speedup: 4.0
Lanes: 8 Speedup: 8.1
Lanes: 16 Speedup: 15.8
Lanes: 32 Speedup: 30.6
Lanes: 64 Speedup: 57.8
Lanes: 128 Speedup: 98.6
Lanes: 256 Speedup: 147.9
Lanes: 512 Speedup: 209.4
Lanes: 1024 Speedup: 270.2