/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8                                :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
│                                                                              │
│ Permission to use, copy, modify, and/or distribute this software for         │
│ any purpose with or without fee is hereby granted, provided that the         │
│ above copyright notice and this permission notice appear in all copies.      │
│                                                                              │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
│ PERFORMANCE OF THIS SOFTWARE.                                                │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/str/internal.h"

/**
 * Hashes data with hardware acceleration at 10GBps.
 *
 * Computes CRC-32C using the SSE4.2 `crc32` instruction. The running
 * value is inverted on entry and again on exit, so the result of one
 * call may be passed as `init` to the next call in order to checksum
 * a buffer in pieces.
 *
 * @param init is 0 for a fresh checksum, or a previous return value
 * @param data points to the bytes to hash (any alignment)
 * @param n is the number of bytes at data
 * @return updated checksum
 * @note needs Nehalem+ c. 2008 or Bulldozer+ c. 2011
 */
optimizespeed uint32_t crc32c$sse42(uint32_t init, const void *data, size_t n) {
  /* may_alias lets us read caller memory eight bytes at a time without
     violating the strict aliasing rule */
  typedef uint64_t aliased_u64 __attribute__((__may_alias__));
  const unsigned char *p = (const unsigned char *)data;
  const unsigned char *pe = p + n;
  uint32_t h = init ^ 0xffffffff;
  if (n >= 16 + 8) {
    /* n >= 24 guarantees the (at most seven) alignment bytes exist and
       that at least one 16-byte stride follows them */
    while ((uintptr_t)p & 7) {
      __asm__("crc32b\t%1,%0" : "+r"(h) : "rm"(*p++));
    }
    uint64_t hl = h;
    /* two quad words per iteration while a full 16 bytes remain */
    while ((size_t)(pe - p) >= 16) {
      __asm__("crc32q\t%1,%0" : "+r"(hl) : "rm"(*(const aliased_u64 *)p));
      p += 8;
      __asm__("crc32q\t%1,%0" : "+r"(hl) : "rm"(*(const aliased_u64 *)p));
      p += 8;
    }
    /* one more quad word if possible, leaving at most seven tail bytes */
    if ((size_t)(pe - p) >= 8) {
      __asm__("crc32q\t%1,%0" : "+r"(hl) : "rm"(*(const aliased_u64 *)p));
      p += 8;
    }
    h = (uint32_t)hl; /* crc32q zero-extends its 32-bit result */
  }
  /* short inputs and leftover tail bytes */
  while (p < pe) {
    __asm__("crc32b\t%1,%0" : "+r"(h) : "rm"(*p++));
  }
  return h ^ 0xffffffff;
}

/*
  bench_crc32c$sse42 for #c per n where c ≈ 0.293ns
  N                     x1            x8           x64	  mBps
  ------------------------------------------------------------
  1                877.000        43.375        40.359      81
  1                 45.000        39.625        40.484      80
  2                 34.500        27.562        20.461     159
  3                 23.000        16.708        14.245     228
  4                 18.250        13.094        11.449     284
  7                 10.429         8.339         8.185     397
  8                 42.125         8.734         6.850     475
  15                 9.400         5.375         4.884     665
  16                 7.312         5.070         4.882     666
  31                 5.258         2.923         2.680    1213
  32                 3.969         2.676         2.562    1269
  63                 3.095         1.581         1.428    2276
  64                 2.234         1.623         1.478    2199
  127                1.205         0.901         0.900    3610
  128                1.164         0.960         0.915    3552
  255                0.922         0.651         0.618    5260
  256                0.715         0.650         0.609    5341
  511                0.558         0.482         0.477    6819
  512                0.529         0.475         0.469    6932
  1023               0.425         0.400         0.396    8204
  1024               0.417         0.392         0.388    8383
  2047               0.367         0.355         0.353    9199
  2048               0.374         0.366         0.364    8929
  4095               0.351         0.338         0.337    9644
  4096               0.353         0.338         0.338    9624
  8191               0.335         0.338         0.337    9641
  8192               0.335         0.329         0.329    9870
  16383              0.336         0.325         0.325   10011
  16384              0.336         0.326         0.375    8666
  32767              0.329         0.323         0.323   10070
  32768              0.327         0.324         0.323   10062
  65535              0.322         0.322         0.322   10103
  65536              0.321         0.322         0.322   10102
  131071             0.322         0.321         0.321   10125
  131072             0.321         0.321         0.321   10124
  262143             0.322         0.321         0.335    9699
  262144             0.321         0.321         0.321   10134
  524287             0.321         0.321         0.499    6516
  524288             0.321         0.321         0.339    9575
  1048575            0.322         0.321         0.322   10095
  1048576            0.320         1.001         0.323   10048
  2097151            0.325         0.321         0.322   10086
  2097152            0.330         0.320         0.323   10076
  4194303            0.331         0.322         0.321   10128
  4194304            0.332         0.321         0.325   10004
  8388607            0.334         0.332         0.331    9829
  8388608            0.334         0.329         0.327    9934
*/