quic: implement rapidhash for hashing improvements

jasnell · jasnell · commit b5c4277e3d47 · 2026-04-09T07:10:30.000-07:00
Signed-off-by: James M Snell <jasnell@gmail.com> Assisted-by: Opencode:Opus 4.6 PR-URL: nodejs/node#62620 Reviewed-By: Robert Nagy <ronagy@icloud.com> Reviewed-By: Tim Perry <pimterry@gmail.com>
diff --git a/node.gyp b/node.gyp
@@ -269,6 +269,7 @@
       'src/node_mem.h',
       'src/node_mem-inl.h',
       'src/node_messaging.h',
+      'src/node_hash.h',
       'src/node_metadata.h',
       'src/node_mutex.h',
       'src/node_diagnostics_channel.h',
diff --git a/src/node_hash.h b/src/node_hash.h
@@ -0,0 +1,212 @@
+#pragma once
+
+#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
+
+// Fast, high-quality hash function for byte sequences.
+//
+// Provides HashBytes() for use in hash tables. Uses native-width integer
+// loads and a 128-bit multiply-and-fold mixer for excellent avalanche
+// properties on short byte sequences (network identifiers, addresses,
+// tokens).
+//
+// Based on rapidhash V3 by Nicolas De Carli, which evolved from wyhash
+// by Wang Yi. Both use the same core mixing primitive (MUM: multiply,
+// then XOR the high and low halves of the 128-bit result).
+//
+//   rapidhash: https://github.com/Nicoshev/rapidhash
+//     Copyright (C) 2025 Nicolas De Carli — MIT License
+//   wyhash:    https://github.com/wangyi-fudan/wyhash
+//     Wang Yi — public domain (The Unlicense)
+//
+// The implementation here uses rapidhash's read strategy (native-width
+// overlapping reads, optimized for short inputs) and secret constants.
+// The core mixing function (rapid_mum/rapid_mix) is identical to
+// wyhash's wymum/wymix.
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#if defined(_M_X64) && !defined(_M_ARM64EC)
+#pragma intrinsic(_umul128)
+#endif
+#endif
+
+namespace node {
+
+namespace hash_detail {
+
+// Returns lo64(a * b) ^ hi64(a * b), where a * b is the full 128-bit
+// unsigned product. This is rapidhash's "rapid_mix" (wyhash "wymix").
+// Three implementations are selected at compile time: __uint128_t,
+// the MSVC _umul128/__umulh intrinsics, or a portable 32-bit split.
+inline uint64_t RapidMix(uint64_t a, uint64_t b) {
+#ifdef __SIZEOF_INT128__
+  __uint128_t r = static_cast<__uint128_t>(a) * b;
+  a = static_cast<uint64_t>(r);  // low 64 bits of the product
+  b = static_cast<uint64_t>(r >> 64);  // high 64 bits of the product
+#elif defined(_MSC_VER) && (defined(_WIN64) || defined(_M_HYBRID_CHPE_ARM64))
+#if defined(_M_X64)
+  a = _umul128(a, b, &b);  // returns low half; high half goes to b
+#else
+  uint64_t hi = __umulh(a, b);  // high 64 bits of the product
+  a = a * b;  // low 64 bits (wraps modulo 2^64)
+  b = hi;
+#endif
+#else
+  // Portable 64x64 -> 128-bit multiply via four 32x32 partial products.
+  uint64_t ha = a >> 32, hb = b >> 32;
+  uint64_t la = static_cast<uint32_t>(a), lb = static_cast<uint32_t>(b);
+  uint64_t rh = ha * hb, rm0 = ha * lb, rm1 = hb * la, rl = la * lb;
+  uint64_t t = rl + (rm0 << 32);  // (t < rl) below is the carry out
+  uint64_t lo = t + (rm1 << 32);  // (lo < t) below is the carry out
+  uint64_t hi = rh + (rm0 >> 32) + (rm1 >> 32) + (t < rl) + (lo < t);
+  a = lo;
+  b = hi;
+#endif
+  return a ^ b;
+}
+
+// Full 128-bit multiply WITHOUT the final XOR: on return *a holds the
+// low 64 bits of the product and *b the high 64 bits ("rapid_mum").
+inline void RapidMum(uint64_t* a, uint64_t* b) {
+#ifdef __SIZEOF_INT128__
+  __uint128_t r = static_cast<__uint128_t>(*a) * (*b);
+  *a = static_cast<uint64_t>(r);  // low 64 bits of the product
+  *b = static_cast<uint64_t>(r >> 64);  // high 64 bits of the product
+#elif defined(_MSC_VER) && (defined(_WIN64) || defined(_M_HYBRID_CHPE_ARM64))
+#if defined(_M_X64)
+  *a = _umul128(*a, *b, b);  // returns low half; high half goes to *b
+#else
+  uint64_t hi = __umulh(*a, *b);  // read before *a is overwritten
+  *a = (*a) * (*b);  // low 64 bits (wraps modulo 2^64)
+  *b = hi;
+#endif
+#else
+  uint64_t ha = *a >> 32, hb = *b >> 32;
+  uint64_t la = static_cast<uint32_t>(*a), lb = static_cast<uint32_t>(*b);
+  uint64_t rh = ha * hb, rm0 = ha * lb, rm1 = hb * la, rl = la * lb;
+  uint64_t t = rl + (rm0 << 32);  // (t < rl) below is the carry out
+  *a = t + (rm1 << 32);  // low half; (*a < t) below is the carry out
+  *b = rh + (rm0 >> 32) + (rm1 >> 32) + (t < rl) + (*a < t);
+#endif
+}
+
+// Read helpers. memcpy is used so that unaligned reads are well defined;
+// values are read in native byte order (no endianness conversion).
+inline uint64_t RapidRead64(const uint8_t* p) {
+  uint64_t v;
+  memcpy(&v, p, sizeof(uint64_t));  // reads exactly 8 bytes from p
+  return v;
+}
+
+inline uint64_t RapidRead32(const uint8_t* p) {
+  uint32_t v;
+  memcpy(&v, p, sizeof(uint32_t));  // reads exactly 4 bytes from p
+  return v;  // zero-extended to 64 bits by the return type
+}
+
+// Default rapidhash secret parameters; HashBytes uses [0], [1], [2], [7].
+constexpr uint64_t kSecret[8] = {0x2d358dccaa6c78a5ULL,
+                                 0x8bb84b93962eacc9ULL,
+                                 0x4b33a62ed433d4a3ULL,
+                                 0x4d5a2da51de1aa47ULL,
+                                 0xa0761d6478bd642fULL,
+                                 0xe7037ed1a0b428dbULL,
+                                 0x90ed1765281c388cULL,
+                                 0xaaaaaaaaaaaaaaaaULL};
+
+}  // namespace hash_detail
+
+// Hash the `len` bytes at `data` (not dereferenced when len == 0). Inputs
+// of <=16 and 17-48 bytes take dedicated paths; longer inputs first loop
+// over 48-byte chunks. The seed is a fixed constant (no randomization).
+inline size_t HashBytes(const void* data, size_t len) {
+  const uint8_t* p = static_cast<const uint8_t*>(data);
+
+  // Seed: always the same value, because both operands are constants.
+  uint64_t seed = hash_detail::RapidMix(0 ^ hash_detail::kSecret[2],
+                                        hash_detail::kSecret[1]);
+  uint64_t a = 0;
+  uint64_t b = 0;
+  size_t i = len;  // bytes still unconsumed; only the >48 path changes it
+
+  if (len <= 16) {
+    if (len >= 4) {
+      // Fold the length into the seed: inputs of different lengths can
+      // yield identical a/b reads (e.g. when all bytes are equal).
+      seed ^= len;
+      if (len >= 8) {
+        // 8-16 bytes: first 8 and last 8 bytes (they overlap if len < 16).
+        a = hash_detail::RapidRead64(p);
+        b = hash_detail::RapidRead64(p + len - 8);
+      } else {
+        // 4-7 bytes: first 4 and last 4 bytes (they overlap if len < 8).
+        a = hash_detail::RapidRead32(p);
+        b = hash_detail::RapidRead32(p + len - 4);
+      }
+    } else if (len > 0) {
+      // 1-3 bytes: a = first byte << 45 | last byte, b = p[len / 2].
+      a = (static_cast<uint64_t>(p[0]) << 45) | p[len - 1];
+      b = p[len >> 1];
+    } else {
+      a = b = 0;  // empty input: nothing is read from p
+    }
+  } else if (len <= 48) {
+    // 17-48 bytes: mix 1-2 leading 16-byte chunks; tail = last 16 bytes.
+    seed = hash_detail::RapidMix(
+        hash_detail::RapidRead64(p) ^ hash_detail::kSecret[2],
+        hash_detail::RapidRead64(p + 8) ^ seed);
+    if (len > 32) {
+      seed = hash_detail::RapidMix(
+          hash_detail::RapidRead64(p + 16) ^ hash_detail::kSecret[2],
+          hash_detail::RapidRead64(p + 24) ^ seed);
+    }
+    a = hash_detail::RapidRead64(p + len - 16) ^ len;
+    b = hash_detail::RapidRead64(p + len - 8);
+  } else {
+    // >48 bytes: consume 48-byte chunks using three independent lanes.
+    uint64_t see1 = seed;
+    uint64_t see2 = seed;
+    do {
+      seed = hash_detail::RapidMix(
+          hash_detail::RapidRead64(p) ^ hash_detail::kSecret[0],
+          hash_detail::RapidRead64(p + 8) ^ seed);
+      see1 = hash_detail::RapidMix(
+          hash_detail::RapidRead64(p + 16) ^ hash_detail::kSecret[1],
+          hash_detail::RapidRead64(p + 24) ^ see1);
+      see2 = hash_detail::RapidMix(
+          hash_detail::RapidRead64(p + 32) ^ hash_detail::kSecret[2],
+          hash_detail::RapidRead64(p + 40) ^ see2);
+      p += 48;
+      i -= 48;
+    } while (i > 48);
+    seed ^= see1 ^ see2;
+    // 1-48 bytes remain: mix up to two more leading 16-byte chunks.
+    if (i > 16) {
+      seed = hash_detail::RapidMix(
+          hash_detail::RapidRead64(p) ^ hash_detail::kSecret[2],
+          hash_detail::RapidRead64(p + 8) ^ seed);
+      if (i > 32) {
+        seed = hash_detail::RapidMix(
+            hash_detail::RapidRead64(p + 16) ^ hash_detail::kSecret[2],
+            hash_detail::RapidRead64(p + 24) ^ seed);
+      }
+    }
+    a = hash_detail::RapidRead64(p + i - 16) ^ i;  // may start before p
+    b = hash_detail::RapidRead64(p + i - 8);
+  }
+
+  // Final mix: fold in the secrets, the seed and the total length.
+  a ^= hash_detail::kSecret[1];
+  b ^= seed;
+  hash_detail::RapidMum(&a, &b);
+  return static_cast<size_t>(hash_detail::RapidMix(
+      a ^ hash_detail::kSecret[7], b ^ hash_detail::kSecret[1] ^ len));
+}
+
+}  // namespace node
+
+#endif  // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
diff --git a/src/node_sockaddr-inl.h b/src/node_sockaddr-inl.h
@@ -16,14 +16,6 @@ namespace node {
 
 static constexpr uint32_t kLabelMask = 0xFFFFF;
 
-inline void hash_combine(size_t* seed) { }
-
-template <typename T, typename... Args>
-inline void hash_combine(size_t* seed, const T& value, Args... rest) {
-    *seed ^= std::hash<T>{}(value) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
-    hash_combine(seed, rest...);
-}
-
 bool SocketAddress::is_numeric_host(const char* hostname) {
   return is_numeric_host(hostname, AF_INET) ||
          is_numeric_host(hostname, AF_INET6);
diff --git a/src/node_sockaddr.cc b/src/node_sockaddr.cc
@@ -4,6 +4,7 @@
 #include "memory_tracker-inl.h"
 #include "nbytes.h"
 #include "node_errors.h"
+#include "node_hash.h"
 #include "node_sockaddr-inl.h"  // NOLINT(build/include_inline)
 #include "uv.h"
 
@@ -77,26 +78,28 @@ bool SocketAddress::New(
 }
 
 size_t SocketAddress::Hash::operator()(const SocketAddress& addr) const {
-  size_t hash = 0;
+  // Hash only port + address (6 bytes for IPv4, 18 for IPv6), not the
+  // full 128-byte sockaddr_storage. The family itself is not hashed.
   switch (addr.family()) {
     case AF_INET: {
       const sockaddr_in* ipv4 =
           reinterpret_cast<const sockaddr_in*>(addr.raw());
-      hash_combine(&hash, ipv4->sin_port, ipv4->sin_addr.s_addr);
-      break;
+      uint8_t buf[6];  // port (2 bytes) + IPv4 address (4 bytes)
+      memcpy(buf, &ipv4->sin_port, 2);
+      memcpy(buf + 2, &ipv4->sin_addr, 4);
+      return HashBytes(buf, sizeof(buf));
     }
     case AF_INET6: {
       const sockaddr_in6* ipv6 =
           reinterpret_cast<const sockaddr_in6*>(addr.raw());
-      const uint64_t* a =
-          reinterpret_cast<const uint64_t*>(&ipv6->sin6_addr);
-      hash_combine(&hash, ipv6->sin6_port, a[0], a[1]);
-      break;
+      uint8_t buf[18];  // port (2 bytes) + IPv6 address (16 bytes)
+      memcpy(buf, &ipv6->sin6_port, 2);
+      memcpy(buf + 2, &ipv6->sin6_addr, 16);
+      return HashBytes(buf, sizeof(buf));
     }
     default:
       UNREACHABLE();
   }
-  return hash;
 }
 
 SocketAddress SocketAddress::FromSockName(const uv_tcp_t& handle) {
diff --git a/src/quic/cid.cc b/src/quic/cid.cc
@@ -3,6 +3,7 @@
 #ifndef OPENSSL_NO_QUIC
 #include <crypto/crypto_util.h>
 #include <memory_tracker-inl.h>
+#include <node_hash.h>
 #include <node_mutex.h>
 #include <string_bytes.h>
 #include "cid.h"
@@ -85,16 +86,7 @@ const CID CID::kInvalid{};
 // CID::Hash
 
 size_t CID::Hash::operator()(const CID& cid) const {
-  // Uses the Boost hash_combine strategy: XOR each byte with the golden
-  // ratio constant 0x9e3779b9 (derived from the fractional part of the
-  // golden ratio, (sqrt(5)-1)/2 * 2^32) plus bit-shifted accumulator
-  // state. This provides good avalanche properties for short byte
-  // sequences like connection IDs (1-20 bytes).
-  size_t hash = 0;
-  for (size_t n = 0; n < cid.length(); n++) {
-    hash ^= cid.ptr_->data[n] + 0x9e3779b9 + (hash << 6) + (hash >> 2);
-  }
-  return hash;
+  return HashBytes(cid.ptr_->data, cid.length());  // assumes ptr_ is non-null
 }
 
 // ============================================================================
diff --git a/src/quic/tokens.cc b/src/quic/tokens.cc
@@ -3,6 +3,7 @@
 #ifndef OPENSSL_NO_QUIC
 #include <crypto/crypto_util.h>
 #include <ngtcp2/ngtcp2_crypto.h>
+#include <node_hash.h>
 #include <node_sockaddr-inl.h>
 #include <string_bytes.h>
 #include <util-inl.h>
@@ -126,13 +127,8 @@ std::string StatelessResetToken::ToString() const {
 
 size_t StatelessResetToken::Hash::operator()(
     const StatelessResetToken& token) const {
-  // See CID::Hash for details on this hash combine strategy.
-  size_t hash = 0;
-  if (token.ptr_ == nullptr) return hash;
-  for (size_t n = 0; n < kStatelessTokenLen; n++) {
-    hash ^= token.ptr_[n] + 0x9e3779b9 + (hash << 6) + (hash >> 2);
-  }
-  return hash;
+  if (token.ptr_ == nullptr) return 0;  // no token bytes: hash is 0
+  return HashBytes(token.ptr_, kStatelessTokenLen);
 }
 
 StatelessResetToken StatelessResetToken::kInvalid;