有效地进行bit-unpacking

ORC文件使用了bit-packing技术来对整数进行编码 https://orc.apache.org/specification/ORCv1/

  1. 假设所有的数范围都在[0,31]之间的话,那么就可以使用5bit表示每个int
  2. 在packing的过程中,可以尽量先塞入低bit(impala的做法),也可以尽量先塞入高bit(ORC的做法)
  3. 最后可能会多出来一个字节需要flush下去。

关于这个packing实现,我之前用 python 写过一个版本,效率就不说了。

如果使用c++来编写unpacking的话,大致是这样的:

// in: input
// fb: fixed bit
// data: output
// nums: how many values to be unpacked.
void bit_unpack_tail(const uint8_t* in, int fb, int64_t* data, int nums) {
    if (nums == 0) return;
    int64_t t = 0;
    uint8_t c = 0;
    int cb = 0;
    for (int i = 0; i < nums; i++) {
        int bits = fb;
        t = 0;
        while (bits) {
            if (cb == 0) {
                c = (*in++);
                cb = 8;
            }
            int lb = std::min(cb, bits);
            t = (t << lb) | ((c >> (cb - lb)) & ((1 << lb) - 1));
            bits -= lb;
            cb -= lb;
        }
        *data = t;
        data++;
    }
}

但是这个实现效率不高,原因就是里面有太多变量计算了。如果我们一旦知道fb的具体值的话,许多流程其实是可以固定的,唯一一个变量就是 nums. 如果我们可以把nums也固定住的话,那么完全可以使用codegen的方式来生成efficient code. 最开始我一直纠结如何可以固定这个nums, 觉得这个情况太多了, 直到看到了这篇文章: How fast is bit packing? – Daniel Lemire's blog. 这篇文章是将nums固定成为32,而ORC的fixed bit最大可以到64, 所以我们也可以 将nums固定成为64. 每64个元素作为一个chunk进行解压,然后不断地循环,对于最后剩余的元素用 bit_unpack_tail 这个函数解压。作者也有个实现 lemire/FastPFor: The FastPFOR C++ library: Fast integer compression 后面有时间可以看看,我按照作者的思路自己走了个实现。

我编写了脚本用来生成各种fixed bit下64个元素的解压函数,这个过程不难,其实就是 bit_unpack_tail 函数的动态执行版本。具体带可以看 gen-bit-pack.py

fbs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
       12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
       23, 24, 26, 28, 30, 32, 40, 48, 56, 64]

def gen_unpack_bit(fb):
    oss = MyStringIO()
    oss.emit('const uint8_t* _unpack64_{fb}(const uint8_t *__restrict__ in, int64_t *__restrict__ out)'.format(
        fb=fb))
    oss.emit('{')
    oss.emit('int64_t t = 0;')
    oss.emit('uint8_t c = 0;')
    # how many bits in cb
    cb = 0

    for i in range(64):
        oss.emit('t = 0;')
        exp = fb
        while exp:
            if cb == 0:
                oss.emit('c = (*in++);')
                cb = 8

            lb = min(exp, cb)
            oss.emit('t = (t << {lb}) | ((c >> {x}) & ((1 << {lb}) - 1));'.format(lb=lb, x=cb - lb))
            exp -= lb
            cb -= lb

        oss.emit('*out = t;')
        oss.emit('out++;')
    oss.emit('return in;')
    oss.emit('}')
    return oss.get()


def gen_unpack_driver():
    oss = MyStringIO()
    for fb in fbs:
        oss.emit('void bit_unpack64_{fb}(const uint8_t* __restrict in, int64_t* __restrict__ out, int nums)'.format(
            fb=fb))
        oss.emit("""{{
        int run = nums / 64;
        for(int i=0;i<run;i++) {{
        in = _unpack64_{fb}(in, out);
        out += 64;
        }}
        bit_unpack_tail(in, {fb}, out, nums % 64);
        }}""".format(fb=fb))

    oss.emit('void bit_unpack(const uint8_t* in, int fb, int64_t* data, int nums) {')
    oss.emit('switch (fb) {')
    for fb in fbs:
        oss.emit('case {fb}:'.format(fb=fb))
        oss.emit('return bit_unpack64_{fb}(in, data, nums);'.format(fb=fb))
    oss.emit('}')
    oss.emit('}')
    return oss.get()


为了对比这个版本和原始 bit_unpack_tail 版本的性能,我做了测试对1MB个元素在各种fixed bit下面的性能表现, 这个测试甚者几乎不用准备任何数据,具体代码可以看 TestBitUnpacking.cpp. 测试下来发现最低可以提升5倍,最高可以提升10倍左右, 当然某些case可以继续手工优化。

unpack fb: 1. fast = 63 ms, avg = 0.600815 ns. tail = 329 ms, avg = 3.13759 ns. speedup = 5.22222
unpack fb: 2. fast = 63 ms, avg = 0.600815 ns. tail = 329 ms, avg = 3.13759 ns. speedup = 5.22222
unpack fb: 3. fast = 55 ms, avg = 0.524521 ns. tail = 403 ms, avg = 3.84331 ns. speedup = 7.32727
unpack fb: 4. fast = 53 ms, avg = 0.505447 ns. tail = 329 ms, avg = 3.13759 ns. speedup = 6.20755
unpack fb: 5. fast = 54 ms, avg = 0.514984 ns. tail = 479 ms, avg = 4.5681 ns. speedup = 8.87037
unpack fb: 6. fast = 55 ms, avg = 0.524521 ns. tail = 477 ms, avg = 4.54903 ns. speedup = 8.67273
unpack fb: 7. fast = 60 ms, avg = 0.572205 ns. tail = 551 ms, avg = 5.25475 ns. speedup = 9.18333
unpack fb: 8. fast = 46 ms, avg = 0.43869 ns. tail = 329 ms, avg = 3.13759 ns. speedup = 7.15217
unpack fb: 9. fast = 65 ms, avg = 0.619888 ns. tail = 626 ms, avg = 5.97 ns. speedup = 9.63077
unpack fb: 10. fast = 62 ms, avg = 0.591278 ns. tail = 626 ms, avg = 5.97 ns. speedup = 10.0968
unpack fb: 11. fast = 71 ms, avg = 0.677109 ns. tail = 701 ms, avg = 6.68526 ns. speedup = 9.87324
unpack fb: 12. fast = 58 ms, avg = 0.553131 ns. tail = 625 ms, avg = 5.96046 ns. speedup = 10.7759
unpack fb: 13. fast = 77 ms, avg = 0.734329 ns. tail = 772 ms, avg = 7.36237 ns. speedup = 10.026
unpack fb: 14. fast = 74 ms, avg = 0.705719 ns. tail = 774 ms, avg = 7.38144 ns. speedup = 10.4595
unpack fb: 15. fast = 83 ms, avg = 0.79155 ns. tail = 848 ms, avg = 8.08716 ns. speedup = 10.2169
unpack fb: 16. fast = 50 ms, avg = 0.476837 ns. tail = 625 ms, avg = 5.96046 ns. speedup = 12.5
unpack fb: 17. fast = 89 ms, avg = 0.84877 ns. tail = 917 ms, avg = 8.74519 ns. speedup = 10.3034
unpack fb: 18. fast = 86 ms, avg = 0.82016 ns. tail = 923 ms, avg = 8.80241 ns. speedup = 10.7326
unpack fb: 19. fast = 95 ms, avg = 0.905991 ns. tail = 992 ms, avg = 9.46045 ns. speedup = 10.4421
unpack fb: 20. fast = 80 ms, avg = 0.762939 ns. tail = 920 ms, avg = 8.7738 ns. speedup = 11.5
unpack fb: 21. fast = 102 ms, avg = 0.972748 ns. tail = 1069 ms, avg = 10.1948 ns. speedup = 10.4804
unpack fb: 22. fast = 99 ms, avg = 0.944138 ns. tail = 1069 ms, avg = 10.1948 ns. speedup = 10.798
unpack fb: 23. fast = 115 ms, avg = 1.09673 ns. tail = 1146 ms, avg = 10.9291 ns. speedup = 9.96522
unpack fb: 24. fast = 69 ms, avg = 0.658035 ns. tail = 918 ms, avg = 8.75473 ns. speedup = 13.3043
unpack fb: 26. fast = 111 ms, avg = 1.05858 ns. tail = 1215 ms, avg = 11.5871 ns. speedup = 10.9459
unpack fb: 28. fast = 106 ms, avg = 1.01089 ns. tail = 1219 ms, avg = 11.6253 ns. speedup = 11.5
unpack fb: 30. fast = 134 ms, avg = 1.27792 ns. tail = 1370 ms, avg = 13.0653 ns. speedup = 10.2239
unpack fb: 32. fast = 96 ms, avg = 0.915527 ns. tail = 1219 ms, avg = 11.6253 ns. speedup = 12.6979
unpack fb: 40. fast = 130 ms, avg = 1.23978 ns. tail = 1496 ms, avg = 14.267 ns. speedup = 11.5077
unpack fb: 48. fast = 161 ms, avg = 1.53542 ns. tail = 1814 ms, avg = 17.2997 ns. speedup = 11.2671
unpack fb: 56. fast = 187 ms, avg = 1.78337 ns. tail = 2010 ms, avg = 19.1689 ns. speedup = 10.7487
unpack fb: 64. fast = 212 ms, avg = 2.02179 ns. tail = 2407 ms, avg = 22.9549 ns. speedup = 11.353