I am an extreme moderate

June 10, 2012

xxhash256 update

Filed under: Uncategorized — niezmierniespokojny @ 11:41 am

One of the drawbacks of the previous version of the hash was limited avalanching, each bit of the input could affect at most 64 bits of the output.
I decided to fix it by adding slight interactions between internal state variables. The process is slow and has high latency, but the function was designed for large inputs, so it doesn’t matter much. Also, I changed the main loop so that multiplications occur less often, so it’s something between the strong and the fast hash with a better avalanching.
To make the interactions really small, I unrolled the main loop and the code got faster in the process, clocking at 3024 MB/s (previously the fast version did 3008 MB/s and the strong one 2751 MB/s), so I obsoleted the fast hash. Well, I could unroll it too, but I don’t think it’s worthwhile.

/*
   xxHash256 - A fast checksum algorithm
   Copyright (C) 2012, Yann Collet & Maciej Adamczyk.
   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:

       * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following disclaimer
   in the documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

************************************************************************
   This file contains a super-fast hash function for checksumming
   purposes, designed for large (1 KB++) inputs.
   Known limitations:
    * it uses 64-bit math and will not be so fast on 32-bit machines
    * on architectures that disallow unaligned memory access, the input
      must be 8-byte aligned. Aligning it on other architectures
      is not needed, but will improve performance.
    * it produces different results on big- and little-endian machines.
  
   Changelog:
    v0: initial release
    v1: the strong hash is faster and stronger, it obsoletes the fast one
*/

//**************************************
// Includes
//**************************************
#include "stddef.h"
#include "stdint.h"
#include "string.h"


//**************************************
// Macros
//**************************************
// 64-bit rotate-left. r must satisfy 0 < r < 64, otherwise the shift is UB.
#define _rotl(x,r) ((x << r) | (x >> (64 - r)))

//**************************************
// Constants
//**************************************
// 64-bit multiplicative constant (a large prime) used for mixing.
#define PRIME 11400714819323198393ULL

//******************************
// Hash functions
//******************************
/*
 * XXH_256() - hash `len` bytes at `input` into a 256-bit checksum.
 *
 * `out` must point to an array of 4 uint64_t. It is used both as the seed
 * (its incoming contents are mixed into the result) and as the output:
 * the final tail bytes are copied over it, then the four accumulators are
 * added in place.
 *
 * Note: the input is read via `*(uint64_t*)p` type-punned loads, so the
 * alignment/endianness caveats from the file header apply.
 */
void XXH_256(const void* input, size_t len, uint64_t* out)
{
    const uint8_t* p = (const uint8_t*)input;
    const uint8_t* const bEnd = p + len;
    uint64_t v1 = len * PRIME;
    uint64_t v2 = v1;
    uint64_t v3 = v1;
    uint64_t v4 = v1;

    const size_t big_loop_step = 4 * 4 * sizeof(uint64_t);
    const size_t small_loop_step = 4 * sizeof(uint64_t);
    // Set the big loop limit early enough, so the well-mixing small loop can
    // be executed twice after it.
    // Guard the subtractions: for short inputs, bEnd - N with N > len would
    // form a pointer before the start of the buffer, which is undefined
    // behavior (the fast/strong variants already guard this; this function
    // previously did not). A limit equal to `p` simply skips the loop.
    const uint8_t* const big_loop_limit =
        len > big_loop_step + 2 * small_loop_step
            ? bEnd - big_loop_step - 2 * small_loop_step
            : p;
    const uint8_t* const small_loop_limit =
        len > small_loop_step ? bEnd - small_loop_step : p;

    // Unrolled main loop: 16 word loads per iteration, with one cheap
    // cross-lane interaction (add + multiply) after every group of 4,
    // rotating which pair of lanes interacts.
    while (p < big_loop_limit)
    {
        v1 = _rotl(v1, 29) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v2 = _rotl(v2, 31) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v3 = _rotl(v3, 33) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v4 = _rotl(v4, 35) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v1 += v2 *= PRIME;
        v1 = _rotl(v1, 29) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v2 = _rotl(v2, 31) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v3 = _rotl(v3, 33) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v4 = _rotl(v4, 35) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v2 += v3 *= PRIME;
        v1 = _rotl(v1, 29) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v2 = _rotl(v2, 31) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v3 = _rotl(v3, 33) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v4 = _rotl(v4, 35) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v3 += v4 *= PRIME;
        v1 = _rotl(v1, 29) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v2 = _rotl(v2, 31) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v3 = _rotl(v3, 33) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v4 = _rotl(v4, 35) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v4 += v1 *= PRIME;
    }

    // Well-mixing tail loop: every lane feeds the next one each iteration.
    while (p < small_loop_limit)
    {
        v1 = _rotl(v1, 29) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v2 += v1 *= PRIME;
        v2 = _rotl(v2, 31) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v3 += v2 *= PRIME;
        v3 = _rotl(v3, 33) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v4 += v3 *= PRIME;
        v4 = _rotl(v4, 35) + (*(uint64_t*)p); p+=sizeof(uint64_t);
        v1 += v4 *= PRIME;
    }
    // Copy the remaining tail (at most 32 bytes, possibly 0) straight over
    // the seed words, then fold the accumulators in additively.
    memcpy(out, p, (size_t)(bEnd - p));

    out[0] += v1;
    out[1] += v2;
    out[2] += v3;
    out[3] += v4;
}

June 9, 2012

A fast checksum

Filed under: Uncategorized — niezmierniespokojny @ 6:22 am

Inspired by xxhash and fletcher4 I decided to write a checksum meant to be stronger than fletcher4 (not an ambitious goal) and as fast as possible. I started off with xxhash (both strong and fast versions of it) and turned it into a 256-bit (like fletcher4) checksum optimised for 64-bit machines.
I was surprised how well it went. There are several algorithms on the edge of my memory bandwidth limits and I didn’t expect to be able to beat them by a noteworthy margin, but I got almost 10%.

The 2 hashes:
xxhash-fast256 3008 MB/s
xxhash-strong256 2751 MB/s

Competition:
xxhash-fast 2777 MB/s
fnv1-tesla 2716 MB/s
CityHash128 2666 MB/s
SpookyHash 2280 MB/s
xxhash-strong 2015 MB/s
fletcher4 1855 MB/s
fnv1-meiyan 1533 MB/s
CrapWow 1409 MB/s
murmur3_x64_128 1272 MB/s

All benchmarked on Pentium D 805, compiled with mingw64 4.6.2. CrapWow comes with C and ASM versions, what you see above is C. ASM is faster, but nothing to brag about.

BTW, I am surprised also by how badly Murmur3 worked. I guess it doesn’t like my CPU or compiler or something.

The code:

/*
   xxHash - Fast Hash algorithm
   Copyright (C) 2012, Yann Collet & Maciej Adamczyk.
   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:

       * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following disclaimer
   in the documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

************************************************************************
   This file contains a pair of super-fast hashes for checksumming
   purposes, designed for large (1 KB+) inputs.
   Known limitations:
    * they use 64-bit math and will not be so fast on 32-bit machines
    * on architectures that disallow unaligned memory access, the input
      must be 8-byte aligned. Aligning it on other architectures
      is not needed, but will improve performance.
*/

//**************************************
// Includes
//**************************************
#include "stddef.h"
#include "stdint.h"
#include "string.h"


//**************************************
// Macros
//**************************************
// 64-bit rotate-left. r must satisfy 0 < r < 64, otherwise the shift is UB.
#define _rotl(x,r) ((x << r) | (x >> (64 - r)))

//**************************************
// Constants
//**************************************
// 64-bit multiplicative constant (a large prime) used for mixing.
#define PRIME 11400714819323198393ULL

//******************************
// Hash functions
//******************************
/*
 * XXH_fast256() - fast 256-bit checksum of `len` bytes at `input`.
 *
 * `seed_out` must point to 4 uint64_t: on entry it supplies the seed, on
 * exit it holds the result (the trailing partial block, if any, is copied
 * over it before the four lane accumulators are added in place).
 */
void XXH_fast256(const void* input, size_t len, uint64_t* seed_out)
{
    const uint8_t* cursor = (const uint8_t*)input;
    const uint8_t* const end = cursor + len;
    // Process whole 32-byte blocks only; anything shorter skips the loop.
    const uint8_t* const last_full =
        len > 4 * sizeof(uint64_t) ? end - 4 * sizeof(uint64_t) : NULL;
    uint64_t lane1 = len * PRIME;
    uint64_t lane2 = lane1;
    uint64_t lane3 = lane1;
    uint64_t lane4 = lane1;

    for (; cursor < last_full; cursor += 4 * sizeof(uint64_t))
    {
        // Four independent rotate-and-add lanes, one 64-bit word each.
        const uint64_t* words = (const uint64_t*)cursor;
        lane1 = _rotl(lane1, 29) + words[0];
        lane2 = _rotl(lane2, 31) + words[1];
        lane3 = _rotl(lane3, 33) + words[2];
        lane4 = _rotl(lane4, 35) + words[3];
    }
    // Mix the remaining tail bytes (< 32) directly into the seed words.
    memcpy(seed_out, cursor, (size_t)(end - cursor));

    seed_out[0] += lane1;
    seed_out[1] += lane2;
    seed_out[2] += lane3;
    seed_out[3] += lane4;
}
/*
 * XXH_strong256() - stronger (more mixing per word) 256-bit checksum of
 * `len` bytes at `input`.
 *
 * `seed_out` must point to 4 uint64_t: on entry it supplies the seed, on
 * exit it holds the result (the trailing partial block, if any, is copied
 * over it before the four lane accumulators are added in place).
 */
void XXH_strong256(const void* input, size_t len, uint64_t* seed_out)
{
    const uint8_t* cursor = (const uint8_t*)input;
    const uint8_t* const end = cursor + len;
    // Process whole 32-byte blocks only; anything shorter skips the loop.
    const uint8_t* const last_full =
        len > 4 * sizeof(uint64_t) ? end - 4 * sizeof(uint64_t) : NULL;
    uint64_t lane1 = len * PRIME;
    uint64_t lane2 = lane1;
    uint64_t lane3 = lane1;
    uint64_t lane4 = lane1;

    while (cursor < last_full)
    {
        // Per lane: fold in a rotated copy, multiply, then add the input
        // word — same sequence as `v += _rotl(v,r); v *= PRIME; v += w;`.
        const uint64_t* words = (const uint64_t*)cursor;
        lane1 = (lane1 + _rotl(lane1, 29)) * PRIME + words[0];
        lane2 = (lane2 + _rotl(lane2, 31)) * PRIME + words[1];
        lane3 = (lane3 + _rotl(lane3, 33)) * PRIME + words[2];
        lane4 = (lane4 + _rotl(lane4, 35)) * PRIME + words[3];
        cursor += 4 * sizeof(uint64_t);
    }
    // Mix the remaining tail bytes (< 32) directly into the seed words.
    memcpy(seed_out, cursor, (size_t)(end - cursor));

    seed_out[0] += lane1;
    seed_out[1] += lane2;
    seed_out[2] += lane3;
    seed_out[3] += lane4;
}

Final note:
The code is very young and I’m a checksumming amateur, there might be bugs.

Create a free website or blog at WordPress.com.