HashSet源码解读
作者:互联网
# HashSet使用散列表实现,其内部实现和Dictionary类似,可以看作是一个只有key没有value的Dictionary
【散列映射】
# 就是通过hashCode获得散列桶(buckets)的索引
# 使用了除留余数法(实现起来简单),以及散列桶数组的长度使用了素数的大小(素数使得索引分布更均匀)
var hashCode = _comparer.GetHashCode(item) & 0x7FFFFFFF var targetBucket = hashCode % _buckets.Length;
【散列冲突】
# 不同的元素hashCode相同时,就发生了散列冲突
# 使用了拉链法,当不同元素hashCode相同时,就会被分配到相同的散列桶上,散列桶维护了一个单向链表,发生冲突有新元素分配过来时,就成为新的链表头
然后散列桶会指向这个新的链表头
# 所以,假设10个元素的hashCode都一样,那其基本就相当于退化成一个链表了。所以选一个好的hashCode生成函数是很重要的
【扩容】
# 使用素数大小的桶(buckets)容量,就是3, 7, 11, 17, 23, 29, 37...这样的大小递增
# 至于为啥使用素数,就是素数可以使得散列分布更均匀,不会集中在第1个桶或者某个桶。至于分布更均匀的证明可以自己去查阅相关资料。
【HashSet源码】
# 删掉了参数检测,断言,迭代器,以及一些非重要的函数
# 代码也做了部分调整,_buckets值存放那块和Dictionary保持一致
# 加上了部分自己理解的注释(语言比较随意,易于自己理解)
public class HashSetSrc<T> { // store lower 31 bits of hash code private const int Lower31BitMask = 0x7FFFFFFF; // cutoff point, above which we won't do stackallocs. This corresponds to 100 integers. private const int StackAllocThreshold = 100; private int[] _buckets; private Slot[] _slots; private int _count; private int _slotsTail; //slots数组的尾游标, 指向后一个位置: 先操作再加减 private int _freeList; //空闲slots也是一个单向链表, 其值总是为链表头, 取元素总是从链表头取 private IEqualityComparer<T> _comparer; private int _version; public HashSet(int capacity, IEqualityComparer<T> comparer) { if (comparer == null) comparer = EqualityComparer<T>.Default; _comparer = comparer; _slotsTail = 0; _count = 0; _freeList = -1; _version = 0; if (capacity > 0) Initialize(capacity); } public void Clear() { if (_slotsTail > 0) { // clear the elements so that the gc can reclaim the references. // clear only up to m_lastIndex for m_slots Array.Clear(_slots, 0, _slotsTail); for (int i = 0; i < _buckets.Length; i++) _buckets[i] = -1; //Array.Clear(_buckets, 0, _buckets.Length); _slotsTail = 0; _count = 0; _freeList = -1; } _version++; } public bool Contains(T item) { if (_buckets != null) { int hashCode = InternalGetHashCode(item); var targetBucket = hashCode % _buckets.Length; var listHeadIndex = _buckets[targetBucket]; //链表头元素在slots中的索引 //每个桶对应一个单向链表, 遍历链表 for (int i = listHeadIndex; i >= 0; i = _slots[i].next) { if (_slots[i].hashCode == hashCode && _comparer.Equals(_slots[i].value, item)) { return true; } } } // either m_buckets is null or wasn't found return false; } public bool Remove(T item) { if (_buckets != null) { int hashCode = InternalGetHashCode(item); int targetBucket = hashCode % _buckets.Length; int last = -1; var listHeadIndex = _buckets[targetBucket]; //链表头元素在slots中的索引 //每个桶对应一个单向链表, 遍历链表 for (int i = listHeadIndex; i >= 0; last = i, i = _slots[i].next) { if (_slots[i].hashCode == hashCode && _comparer.Equals(_slots[i].value, item)) { if (last < 0) { //相当于删除链表头 _buckets[targetBucket] = _slots[i].next; } else { //相当于在链表中间删除, 重新建立前后关系 _slots[last].next = _slots[i].next; } _slots[i].hashCode = -1; _slots[i].value = default(T); _slots[i].next = _freeList; _count--; _version++; if (_count == 0) { _slotsTail = 0; _freeList = -1; } else { _freeList = i; } return true; } } } // either m_buckets is null or wasn't found return false; } public int Count { get { return _count; } } public bool Add(T item) { return AddIfNotPresent(item); } public bool TryGetValue(T equalValue, out T actualValue) { if (_buckets != null) { int i = InternalIndexOf(equalValue); if (i >= 0) { actualValue = _slots[i].value; return true; } } actualValue = default(T); return false; } //并集, 加上别人的所有元素 public void UnionWith(IEnumerable<T> other) { //把另一个集合的所有元素添加进来 foreach (T item in other) { AddIfNotPresent(item); } } //交集, 自己有_别人没有的删除; 或对方有_自己也有的保留 public void IntersectWith(IEnumerable<T> other) { // intersection of anything with empty set is empty set, so return if count is 0 if (_count == 0) { return; } // if other is empty, intersection is empty set; remove all elements and we're done // can only figure this out if implements ICollection<T>. (IEnumerable<T> has no count) ICollection<T> otherAsCollection = other as ICollection<T>; if (otherAsCollection != null) { if (otherAsCollection.Count == 0) { Clear(); return; } HashSet<T> otherAsSet = other as HashSet<T>; // faster if other is a hashset using same equality comparer; so check // that other is a hashset using the same equality comparer. if (otherAsSet != null && AreEqualityComparersEqual(this, otherAsSet)) { IntersectWithHashSetWithSameEC(otherAsSet); return; } } IntersectWithEnumerable(other); } //差集, 减掉别人的所有元素 public void ExceptWith(IEnumerable<T> other) { // this is already the enpty set; return if (_count == 0) { return; } // special case if other is this; a set minus itself is the empty set if (other == this) { Clear(); return; } // remove every element in other from this foreach (T element in other) { Remove(element); } } //异或, 相同的删除, 不相同的添加 public void SymmetricExceptWith(IEnumerable<T> other) { // if set is empty, then symmetric difference is other if (_count == 0) { UnionWith(other); return; } // special case this; the symmetric difference of a set with itself is the empty set if (other == this) { Clear(); return; } HashSet<T> otherAsSet = other as HashSet<T>; // If other is a HashSet, it has unique elements according to its equality comparer, // but if they're using different equality comparers, then assumption of uniqueness // will fail. So first check if other is a hashset using the same equality comparer; // symmetric except is a lot faster and avoids bit array allocations if we can assume // uniqueness if (otherAsSet != null && AreEqualityComparersEqual(this, otherAsSet)) { SymmetricExceptWithUniqueHashSet(otherAsSet); } else { SymmetricExceptWithEnumerable(other); } } //是否为子集, 自己的元素other中都有 public bool IsSubsetOf(IEnumerable<T> other) { // The empty set is a subset of any set if (_count == 0) { return true; } HashSet<T> otherAsSet = other as HashSet<T>; // faster if other has unique elements according to this equality comparer; so check // that other is a hashset using the same equality comparer. if (otherAsSet != null && AreEqualityComparersEqual(this, otherAsSet)) { // if this has more elements then it can't be a subset if (_count > otherAsSet.Count) { return false; } // already checked that we're using same equality comparer. simply check that // each element in this is contained in other. return IsSubsetOfHashSetWithSameEC(otherAsSet); } else { ElementCount result = CheckUniqueAndUnfoundElements(other, false); return (result.uniqueCount == _count && result.unfoundCount >= 0); } } //真子集 public bool IsProperSubsetOf(IEnumerable<T> other) { ICollection<T> otherAsCollection = other as ICollection<T>; if (otherAsCollection != null) { // the empty set is a proper subset of anything but the empty set if (_count == 0) { return otherAsCollection.Count > 0; } HashSet<T> otherAsSet = other as HashSet<T>; // faster if other is a hashset (and we're using same equality comparer) if (otherAsSet != null && AreEqualityComparersEqual(this, otherAsSet)) { if (_count >= otherAsSet.Count) { return false; } // this has strictly less than number of items in other, so the following // check suffices for proper subset. return IsSubsetOfHashSetWithSameEC(otherAsSet); } } ElementCount result = CheckUniqueAndUnfoundElements(other, false); return (result.uniqueCount == _count && result.unfoundCount > 0); } //超集, 别人的所有元素, 自己都有 public bool IsSupersetOf(IEnumerable<T> other) { // try to fall out early based on counts ICollection<T> otherAsCollection = other as ICollection<T>; if (otherAsCollection != null) { // if other is the empty set then this is a superset if (otherAsCollection.Count == 0) { return true; } HashSet<T> otherAsSet = other as HashSet<T>; // try to compare based on counts alone if other is a hashset with // same equality comparer if (otherAsSet != null && AreEqualityComparersEqual(this, otherAsSet)) { if (otherAsSet.Count > _count) { return false; } } } return ContainsAllElements(other); } //真超集, 别人的所有元素, 自己都有 public bool IsProperSupersetOf(IEnumerable<T> other) { // the empty set isn't a proper superset of any set. if (_count == 0) { return false; } ICollection<T> otherAsCollection = other as ICollection<T>; if (otherAsCollection != null) { // if other is the empty set then this is a superset if (otherAsCollection.Count == 0) { // note that this has at least one element, based on above check return true; } HashSet<T> otherAsSet = other as HashSet<T>; // faster if other is a hashset with the same equality comparer if (otherAsSet != null && AreEqualityComparersEqual(this, otherAsSet)) { if (otherAsSet.Count >= _count) { return false; } // now perform element check return ContainsAllElements(otherAsSet); } } // couldn't fall out in the above cases; do it the long way ElementCount result = CheckUniqueAndUnfoundElements(other, true); return (result.uniqueCount < _count && result.unfoundCount == 0); } //是否存在重叠, 我有一个别人的元素就ok public bool Overlaps(IEnumerable<T> other) { if (_count == 0) { return false; } foreach (T element in other) { if (Contains(element)) { return true; } } return false; } public bool IsSetEquals(IEnumerable<T> other) { HashSet<T> otherAsSet = other as HashSet<T>; // faster if other is a hashset and we're using same equality comparer if (otherAsSet != null && AreEqualityComparersEqual(this, otherAsSet)) { // attempt to return early: since both contain unique elements, if they have // different counts, then they can't be equal if (_count != otherAsSet.Count) { return false; } // already confirmed that the sets have the same number of distinct elements, so if // one is a superset of the other then they must be equal return ContainsAllElements(otherAsSet); } else { ICollection<T> otherAsCollection = other as ICollection<T>; if (otherAsCollection != null) { // if this count is 0 but other contains at least one element, they can't be equal if (_count == 0 && otherAsCollection.Count > 0) { return false; } } ElementCount result = CheckUniqueAndUnfoundElements(other, true); return (result.uniqueCount == _count && result.unfoundCount == 0); } } public int RemoveWhere(Predicate<T> match) { int numRemoved = 0; for (int i = 0; i < _slotsTail; i++) { if (_slots[i].hashCode >= 0) { // cache value in case delegate removes it T value = _slots[i].value; if (match(value)) { // check again that remove actually removed it if (Remove(value)) { numRemoved++; } } } } return numRemoved; } public IEqualityComparer<T> Comparer { get { return _comparer; } } //去除多余的 public void TrimExcess() { if (_count == 0) { // if count is zero, clear references _buckets = null; _slots = null; _version++; } else { // similar to IncreaseCapacity but moves down elements in case add/remove/etc // caused fragmentation int newSize = HashHelpers.GetPrime(_count); Slot[] newSlots = new Slot[newSize]; int[] newBuckets = new int[newSize]; for (int i = 0; i < newBuckets.Length; i++) newBuckets[i] = -1; // move down slots and rehash at the same time. newIndex keeps track of current // position in newSlots array int newIndex = 0; for (int i = 0; i < _slotsTail; i++) { if (_slots[i].hashCode >= 0) { var slot = _slots[i]; newSlots[newIndex] = slot; // rehash int targetBucket = slot.hashCode % newSize; var oldListHeadIndex = newBuckets[targetBucket]; slot.next = oldListHeadIndex; //链接到新表头后面 newBuckets[targetBucket] = newIndex; newIndex++; } } _slotsTail = newIndex; _slots = newSlots; _buckets = newBuckets; _freeList = -1; } } private void Initialize(int capacity) { int size = HashHelpers.GetPrime(capacity); _buckets = new int[size]; for (var i = 0; i < size; ++i) _buckets[i] = -1; //-1表示还没有存放值 _slots = new Slot[size]; } private void IncreaseCapacity() { int newSize = HashHelpers.ExpandPrime(_count); SetCapacity(newSize, false); } //_buckets和_slots同时扩容 private void SetCapacity(int newSize, bool forceNewHashCodes) { Slot[] newSlots = new Slot[newSize]; if (_slots != null) { Array.Copy(_slots, 0, newSlots, 0, _slotsTail); } if(forceNewHashCodes) { for(int i = 0; i < _slotsTail; i++) { if(newSlots[i].hashCode != -1) { newSlots[i].hashCode = InternalGetHashCode(newSlots[i].value); } } } int[] newBuckets = new int[newSize]; for (int i = 0; i < newSize; ++i) newBuckets[i] = -1; for (int i = 0; i < _slotsTail; i++) { int targetBucket = newSlots[i].hashCode % newSize; var oldListHeadIndex = newBuckets[targetBucket]; newSlots[i].next = oldListHeadIndex; //链接到新表头后面 newBuckets[targetBucket] = i; } _slots = newSlots; _buckets = newBuckets; } private bool AddIfNotPresent(T value) { if (_buckets == null) { Initialize(0); } int hashCode = InternalGetHashCode(value); int targetBucket = hashCode % _buckets.Length; #if FEATURE_RANDOMIZED_STRING_HASHING && !FEATURE_NETCORE int collisionCount = 0; #endif var listHeadIndex = _buckets[targetBucket]; //链表头元素在slots中的索引 //每个桶对应一个单向链表, 遍历链表 for (int i = listHeadIndex; i >= 0; i = _slots[i].next) { if (_slots[i].hashCode == hashCode && _comparer.Equals(_slots[i].value, value)) { return false; //已存在 } #if FEATURE_RANDOMIZED_STRING_HASHING && !FEATURE_NETCORE collisionCount++; #endif } int index; if (_freeList >= 0) { index = _freeList; _freeList = _slots[index].next; } else { if (_slotsTail == _slots.Length) { IncreaseCapacity(); // this will change during resize targetBucket = hashCode % _buckets.Length; } index = _slotsTail; _slotsTail++; } _slots[index].hashCode = hashCode; _slots[index].value = value; _slots[index].next = _buckets[targetBucket]; _buckets[targetBucket] = index; //新添加条目总是加在链表头 _count++; _version++; #if FEATURE_RANDOMIZED_STRING_HASHING && !FEATURE_NETCORE if(collisionCount > HashHelpers.HashCollisionThreshold && HashHelpers.IsWellKnownEqualityComparer(m_comparer)) { m_comparer = (IEqualityComparer<T>) HashHelpers.GetRandomizedEqualityComparer(m_comparer); SetCapacity(_buckets.Length, true); } #endif // FEATURE_RANDOMIZED_STRING_HASHING return true; } //用自己的比较器(遍历别人的元素调用自己的方法), private bool ContainsAllElements(IEnumerable<T> other) { foreach (T element in other) { if (!Contains(element)) { return false; } } return true; } //相同的EqualityComparer<T>, 可以用别人的比较器(遍历自己的元素调用other的方法), private bool IsSubsetOfHashSetWithSameEC(HashSet<T> other) { foreach (T item in this) { if (!other.Contains(item)) { return false; } } return true; } //相同的EqualityComparer<T>, 可以用别人的比较器(遍历自己的元素调用other的方法), //遍历自己的元素, 别人没有的删除 private void IntersectWithHashSetWithSameEC(HashSet<T> other) { //如果对方没有, 就从自己这边删除 for (int i = 0; i < _slotsTail; i++) { if (_slots[i].hashCode >= 0) { T item = _slots[i].value; if (!other.Contains(item)) { Remove(item); } } } } //EqualityComparer<T>不同时, 只能用自己的比较器(遍历别人的元素调用自己的方法), //遍历对方的元素, 自己也有的保留 [System.Security.SecuritySafeCritical] private unsafe void IntersectWithEnumerable(IEnumerable<T> other) { //使用bit来作为bool使用 int originalLastIndex = _slotsTail; int intArrayLength = BitHelper.ToIntArrayLength(originalLastIndex); //x bits需要多少个int, 比如: 10bits需要1个int BitHelper bitHelper; if (intArrayLength <= StackAllocThreshold) { int* bitArrayPtr = stackalloc int[intArrayLength]; bitHelper = new BitHelper(bitArrayPtr, intArrayLength); } else { int[] bitArray = new int[intArrayLength]; bitHelper = new BitHelper(bitArray, intArrayLength); } //如果对方没有, 就从自己这边删除 // mark if contains: find index of in slots array and mark corresponding element in bit array foreach (T item in other) { int index = InternalIndexOf(item); if (index >= 0) { bitHelper.MarkBit(index); } } // if anything unmarked, remove it. Perf can be optimized here if BitHelper had a // FindFirstUnmarked method. for (int i = 0; i < originalLastIndex; i++) { if (_slots[i].hashCode >= 0 && !bitHelper.IsMarked(i)) { Remove(_slots[i].value); } } } private int InternalIndexOf(T item) { int hashCode = InternalGetHashCode(item); var targetBucket = hashCode % _buckets.Length; //每个桶对应一个单向链表, 遍历链表 var listHeadIndex = _buckets[targetBucket]; //链表头元素在slots中的索引 for (int i = listHeadIndex; i >= 0; i = _slots[i].next) { if ((_slots[i].hashCode) == hashCode && _comparer.Equals(_slots[i].value, item)) { return i; } } // wasn't found return -1; } //用自己的比较器(遍历别人的元素调用自己的方法), //遍历对方的元素, 相同的删除, 不相同的添加; //可以用的别人的比较器SameEC那种么? private void SymmetricExceptWithUniqueHashSet(HashSet<T> other) { foreach (T item in other) { if (!Remove(item)) { AddIfNotPresent(item); } } } //用自己的比较器(遍历别人的元素调用自己的方法), //遍历对方的元素, 相同的删除, 不相同的添加; [System.Security.SecuritySafeCritical] private unsafe void SymmetricExceptWithEnumerable(IEnumerable<T> other) { //使用bit来作为bool使用 int originalLastIndex = _slotsTail; int intArrayLength = BitHelper.ToIntArrayLength(originalLastIndex); BitHelper itemsToRemove; BitHelper itemsAddedFromOther; if (intArrayLength <= StackAllocThreshold / 2) { int* itemsToRemovePtr = stackalloc int[intArrayLength]; itemsToRemove = new BitHelper(itemsToRemovePtr, intArrayLength); int* itemsAddedFromOtherPtr = stackalloc int[intArrayLength]; itemsAddedFromOther = new BitHelper(itemsAddedFromOtherPtr, intArrayLength); } else { int[] itemsToRemoveArray = new int[intArrayLength]; itemsToRemove = new BitHelper(itemsToRemoveArray, intArrayLength); int[] itemsAddedFromOtherArray = new int[intArrayLength]; itemsAddedFromOther = new BitHelper(itemsAddedFromOtherArray, intArrayLength); } foreach (T item in other) { int location = 0; bool added = AddOrGetLocation(item, out location); if (added) { //不相同的添加 // wasn't already present in collection; flag it as something not to remove // *NOTE* if location is out of range, we should ignore. BitHelper will // detect that it's out of bounds and not try to mark it. But it's // expected that location could be out of bounds because adding the item // will increase m_lastIndex as soon as all the free spots are filled. itemsAddedFromOther.MarkBit(location); } else { //相同的删除 // already there...if not added from other, mark for remove. // *NOTE* Even though BitHelper will check that location is in range, we want // to check here. There's no point in checking items beyond originalLastIndex // because they could not have been in the original collection if (location < originalLastIndex && !itemsAddedFromOther.IsMarked(location) //像List这种允许有2个甚至多个"two"的 ) { itemsToRemove.MarkBit(location); } } } //不相同的添加, 相同的删除 // if anything marked, remove it for (int i = 0; i < originalLastIndex; i++) { if (itemsToRemove.IsMarked(i)) { Remove(_slots[i].value); } } } //不存在时, 添加并返回true; 已存在时, 返回false; private bool AddOrGetLocation(T value, out int location) { int hashCode = InternalGetHashCode(value); int targetBucket = hashCode % _buckets.Length; var listHeadIndex = _buckets[targetBucket]; //链表头元素在slots中的索引 for (int i = listHeadIndex; i >= 0; i = _slots[i].next) { if (_slots[i].hashCode == hashCode && _comparer.Equals(_slots[i].value, value)) { location = i; return false; //already present } } int index; if (_freeList >= 0) { index = _freeList; _freeList = _slots[index].next; } else { if (_slotsTail == _slots.Length) { IncreaseCapacity(); // this will change during resize targetBucket = hashCode % _buckets.Length; } index = _slotsTail; _slotsTail++; } _slots[index].hashCode = hashCode; _slots[index].value = value; var oldListHeadIndex = _buckets[targetBucket]; _slots[index].next = oldListHeadIndex; _buckets[targetBucket] = index; _count++; _version++; location = index; return true; } //EqualityComparer<T>不同时, 只能用自己的比较器(遍历别人的元素调用自己的方法), [System.Security.SecuritySafeCritical] private unsafe ElementCount CheckUniqueAndUnfoundElements(IEnumerable<T> other, bool returnIfUnfound) { ElementCount result; // need special case in case this has no elements. if (_count == 0) { int numElementsInOther = 0; foreach (T item in other) { numElementsInOther++; // break right away, all we want to know is whether other has 0 or 1 elements break; } result.uniqueCount = 0; result.unfoundCount = numElementsInOther; return result; } //使用bit来作为bool使用 int originalLastIndex = _slotsTail; int intArrayLength = BitHelper.ToIntArrayLength(originalLastIndex); BitHelper bitHelper; if (intArrayLength <= StackAllocThreshold) { int* bitArrayPtr = stackalloc int[intArrayLength]; bitHelper = new BitHelper(bitArrayPtr, intArrayLength); } else { int[] bitArray = new int[intArrayLength]; bitHelper = new BitHelper(bitArray, intArrayLength); } // 别人有, 自己没有 int unfoundCount = 0; // 别人有自己也有 int uniqueFoundCount = 0; foreach (T item in other) { int index = InternalIndexOf(item); if (index >= 0) { //别人有自己也有 if (!bitHelper.IsMarked(index)) { // item hasn't been seen yet bitHelper.MarkBit(index); uniqueFoundCount++; } } else { //别人有自己没有 unfoundCount++; if (returnIfUnfound) { break; } } } result.uniqueCount = uniqueFoundCount; result.unfoundCount = unfoundCount; return result; } private static bool AreEqualityComparersEqual(HashSet<T> set1, HashSet<T> set2) { return set1.Comparer.Equals(set2.Comparer); } private int InternalGetHashCode(T item) { if (item == null) return 0; return _comparer.GetHashCode(item) & Lower31BitMask; //符号位置为0, -1 & Lower31BitMask会变为正数 } // used for set checking operations (using enumerables) that rely on counting internal struct ElementCount { /// 别人有自己也有的数量 internal int uniqueCount; /// 别人有, 自己没有的数量 internal int unfoundCount; } internal struct Slot { internal int hashCode; // Lower 31 bits of hash code, -1 if unused internal int next; // Index of next entry, -1 if last internal T value; } }
【关于奇数和素数】
# 奇数: 不能被2整除的
1,3,5,7,9
# 素数(质数): 只能被1和自己整除的
2,3,5,7,11
# 不要混淆了
【参考】
浅析C# Dictionary实现原理_Phil Arist的博客-CSDN博客_c# dictionary
C# HashSet源码分享 自定义HashSet - 走看看 (zoukankan.com)
标签:return,HashSet,int,buckets,hashCode,解读,源码,slots,other 来源: https://www.cnblogs.com/sailJs/p/16070253.html