哪种散列函数更适合在小散列表中表示128位随机ID

Synoon 发表于 Dev

Synoon

在我的课堂上，我进行了以下练习：

我有具有128bit的GUID（全局唯一标识符）。

哪种哈希函数更好地表示哈希ID为000到899的存储桶中的值，每个存储桶有100个空闲位置来存储哈希冲突？

我想比较以下散列函数：

a) h(a) = a mod 900
b) h(a) = a mod 887
c) h(a) = a^2 mod 887
d) there are not enough information to answer this question

我所拥有的：

我认为使用a ^ 2并不是更好，因为它只会在前几千个id中给我们带来好处，它们应该更好地分布，但之后，我可能必须进行更多的碰撞探测才能将这些值存储在其他值中桶。

我已经尝试完成上述行为：在下面的代码段中，我生成了90000个“随机”唯一数字，这些数字存储在地图中，并使用mod 900之后的哈希函数。我知道出于某些原因，首选使用质数用于散列函数。

随机性最多只能实现32位。但是我认为这不应该太重要，因为我没有使用最大128位。

m = null;
uniqueMap = new Map();
hash = (z, p) => z % p ;

function getRandomInt(max) {
  guid = Math.floor(Math.random() * Math.floor(max));
  if (uniqueMap.has(guid)) return getRandomInt(max);
  return guid;
}


map = new Map();
for (var i = 1; i <= 90000; i++) {
  h = hash(getRandomInt(2147483647), 900);
  map.has(h) ? map.set(h, map.get(h) + 1) : map.set(h, 1);
}

map.forEach((a) => m = Math.max(a, m))

console.log(m);

具有相同功能但带有mod 887的下一个代码段：

m = null;
uniqueMap = new Map();
hash = (z, p) => z % p ;

function getRandomInt(max) {
  guid = Math.floor(Math.random() * Math.floor(max));
  if (uniqueMap.has(guid)) return getRandomInt(max);
  return guid;
}


map = new Map();
for (var i = 1; i <= 90000; i++) {
  h = hash(getRandomInt(2147483647), 887);
  map.has(h) ? map.set(h, map.get(h) + 1) : map.set(h, 1);
}

map.forEach((a) => m = Math.max(a, m))

console.log(m);

并带有a ^ 2：

m = null;
uniqueMap = new Map();
hash = (z, p) => z % p ;

function getRandomInt(max) {
  guid = Math.floor(Math.random() * Math.floor(max));
  if (uniqueMap.has(guid)) return getRandomInt(max);
  return guid;
}


map = new Map();
for (var i = 1; i <= 90000; i++) {
  h = hash(Math.pow(getRandomInt(2147483647),2), 887);
  map.has(h) ? map.set(h, map.get(h) + 1) : map.set(h, 1);
}

map.forEach((a) => m = Math.max(a, m))

console.log(m);

全部内在：

m = null;
uniqueMap = new Map();
hash = (z, p) => z % p ;

function getRandomInt(max) {
  guid = Math.floor(Math.random() * Math.floor(max));
  if (uniqueMap.has(guid)) return getRandomInt(max);
  return guid;
}


map = new Map();
for (var i = 1; i <= 90000; i++) {
  h = hash(getRandomInt(2147483647), 900);
  map.has(h) ? map.set(h, map.get(h) + 1) : map.set(h, 1);
}

map.forEach((a) => m = Math.max(a, m))

console.log(m);

m = null;
uniqueMap = new Map();
map = new Map();
for (var i = 1; i <= 90000; i++) {
  h = hash(getRandomInt(2147483647), 887);
  map.has(h) ? map.set(h, map.get(h) + 1) : map.set(h, 1);
}

map.forEach((a) => m = Math.max(a, m))

console.log(m);

m = null;
uniqueMap = new Map();
map = new Map();
for (var i = 1; i <= 90000; i++) {
  h = hash(Math.pow(getRandomInt(2147483647),2), 887);
  map.has(h) ? map.set(h, map.get(h) + 1) : map.set(h, 1);
}

map.forEach((a) => m = Math.max(a, m))

console.log(m);

如果我正在比较这3种方法，它们会告诉我，使用mod a ^ 2的最高碰撞次数比不使用guid的887和900都高。因此，我认为这不是正确的答案。

但是我应该如何比较其他两个呢？它们显示出相似的峰，但差异很小。

卡尔提克

您可以通过简单地检查哪些因素数量较少来比较其他两个，因为素数具有较少的因素用于散列。

两者之间的差异可以忽略不计的原因主要是由于您使用的哈希函数。您的哈希函数已经给出了分布良好的值。但是由于问题在于直接比较。最好的方法是选择一个具有质数的mod 887

在cs.stackexchange中对此有很好的解释

这是有关模块化哈希的更多详细信息https://algs4.cs.princeton.edu/34hash/

本文收集自互联网，转载请注明来源。

如有侵权，请联系 [email protected] 删除。