// unordered-set-map/Linux/HashTable.hpp

#pragma once

#include <iostream>
#include <vector>
#include <string>

// unordered_set<K>		->	HashTable<K, K>
// unordered_map<K, V>	->	HashTable<K, pair<K, V>>

namespace Lenyiin
{
    /// Key extractor for tables whose stored element is itself the key
    /// (the unordered_set case: HashTable<K, K>).
    template <class K>
    struct SetKeyOfT
    {
        /// Returns a reference to @p key unchanged.
        /// Const-qualified so the extractor can also be invoked through a const object.
        const K &operator()(const K &key) const
        {
            return key;
        }
    };

    // Occupancy marker for one slot of the closed-hashing table.
    enum State
    {
        EMPTY = 0,  // the slot holds nothing
        EXIST = 1,  // the slot currently holds an element
        DELETE = 2  // the element that was in the slot has been erased
    };

    // One slot of the closed-hashing table: the stored value together with its occupancy marker.
    template <class T>
    struct HashData
    {
        T _data;       // stored value; value-initialized by the constructor
        State _state;  // occupancy marker; a new slot starts out EMPTY

        // Builds an unoccupied slot.
        HashData()
            : _data(), _state(EMPTY)
        {
        }
    };

    /**
     * @brief Closed-hashing (open-addressing) hash table using double hashing.
     *
     * Elements live directly in a vector of slots. A collision is resolved by
     * stepping through the table with a key-dependent stride (double hashing).
     * Earlier experiments with linear probing (step 1) and quadratic probing
     * (offset i*i) suffered from clustering: an occupied home slot pushes the
     * element into a neighbour's slot, which in turn causes further collisions.
     *
     * Erased slots are kept as DELETE tombstones so that probe chains running
     * through them stay intact; tombstones are purged when the table is rebuilt.
     *
     * @tparam K      Key type; must support `key % size_t` and `operator==`.
     * @tparam T      Stored element type (K for a set, pair<K, V> for a map).
     * @tparam KeyOfT Functor that extracts the key from a T.
     */
    template <class K, class T, class KeyOfT>
    class Close_HashTable
    {
    private:
        typedef struct HashData<T> HashData;

        // Greatest common divisor (Euclid's algorithm).
        static size_t Gcd(size_t a, size_t b)
        {
            while (b != 0)
            {
                size_t r = a % b;
                a = b;
                b = r;
            }
            return a;
        }

        // Second hash: the probe stride for `key` in a table of `table_size` slots.
        // The stride is always in [1, table_size - 1] and is bumped until it is
        // coprime with table_size. Without that, a stride sharing a factor with the
        // table size only ever visits a fraction of the slots, and probing can spin
        // forever even though free slots exist (e.g. size 10 with stride 5).
        // The loop terminates because table_size - 1 is always coprime with table_size.
        size_t SecondHash(const K &key, size_t table_size)
        {
            if (table_size < 3)
            {
                return 1; // also avoids `% 0` for a one-slot table
            }

            size_t step = 1 + (key % (table_size - 1));
            while (Gcd(step, table_size) != 1)
            {
                ++step;
            }
            return step;
        }

    public:
        // Load factor = stored elements / table size; it measures how full the table is.
        // The fuller the table, the more likely an insertion collides and the slower
        // every operation gets, so an open-addressing table grows well before it is
        // full, typically around a load factor of 0.7. A smaller threshold means fewer
        // collisions but more wasted space, so 0.7 is a compromise.
        //
        // This function rebuilds the table when
        //   * it is empty or the live load factor reached 0.7  -> size doubles (10 initially);
        //   * live elements plus tombstones reached 0.7        -> same size, tombstones dropped.
        // The second rule guarantees that EMPTY slots always remain, so unsuccessful
        // lookups terminate.
        void CheckCapacity()
        {
            const size_t size = _tables.size();
            const bool grow = size == 0 || _num * 10 / size >= 7;
            const bool purge = !grow && (_num + _deleted) * 10 / size >= 7;
            if (!grow && !purge)
            {
                return;
            }

            size_t newsize = size; // purge: keep the size, only drop tombstones
            if (size == 0)
            {
                newsize = 10;
            }
            else if (grow)
            {
                newsize = size * 2;
            }

            // Re-insert every live element into a fresh table so each one is placed
            // according to the new size, then adopt that table's storage.
            Close_HashTable<K, T, KeyOfT> newht;
            newht._tables.resize(newsize);
            for (size_t i = 0; i < size; i++)
            {
                if (_tables[i]._state == EXIST)
                {
                    newht.Insert(_tables[i]._data);
                }
            }
            _tables.swap(newht._tables);
            _deleted = 0; // _num is unchanged: exactly the live elements were carried over
        }

        /**
         * @brief Inserts @p data unless an element with the same key is already stored.
         * @return true if the element was inserted, false if the key already exists.
         */
        bool Insert(const T &data)
        {
            KeyOfT koft;

            CheckCapacity();

            // Search the whole probe chain first. Stopping at the first tombstone
            // would miss a live duplicate stored further along the chain.
            if (Find(koft(data)) != nullptr)
            {
                return false;
            }

            // Double hashing: start at the home slot and advance by the key's stride
            // until a slot that is not occupied (EMPTY or DELETE) is found.
            // CheckCapacity() guarantees such a slot exists.
            const size_t size = _tables.size();
            size_t index = koft(data) % size;
            const size_t step = SecondHash(koft(data), size);
            while (_tables[index]._state == EXIST)
            {
                index = (index + step) % size;
            }

            if (_tables[index]._state == DELETE && _deleted > 0)
            {
                --_deleted; // a tombstone is being reused
            }
            _tables[index]._data = data;
            _tables[index]._state = EXIST;
            ++_num;

            // Closed hashing is not an ideal scheme: a collision is resolved by taking
            // somebody else's slot, so collisions influence one another and the overall
            // efficiency drops. Open hashing (hash buckets) avoids this problem.
            return true;
        }

        /**
         * @brief Looks up @p key using the same double-hashing probe sequence as Insert.
         * @return Pointer to the slot holding the key, or nullptr if the key is absent
         *         (including when the table has no slots yet).
         */
        HashData *Find(const K &key)
        {
            const size_t size = _tables.size();
            if (size == 0)
            {
                return nullptr; // nothing stored yet; also avoids `% 0`
            }

            KeyOfT koft;
            size_t index = key % size;
            const size_t step = SecondHash(key, size);

            // The stride is coprime with the table size, so `size` probes visit every
            // slot once; the bound keeps the loop finite even without an EMPTY slot.
            for (size_t probes = 0; probes < size; ++probes)
            {
                HashData &slot = _tables[index];
                if (slot._state == EMPTY)
                {
                    break; // end of the probe chain
                }
                if (slot._state == EXIST && koft(slot._data) == key)
                {
                    return &slot;
                }
                // Tombstones and other keys: keep probing.
                index = (index + step) % size;
            }

            return nullptr;
        }

        /**
         * @brief Removes the element with @p key by turning its slot into a tombstone.
         * @return true if an element was removed, false if the key was not found.
         */
        bool Erase(const K &key)
        {
            HashData *ret = Find(key);
            if (ret == nullptr)
            {
                return false;
            }

            ret->_state = DELETE;
            --_num;
            ++_deleted;
            return true;
        }

        // Direct, unchecked access to the slot at index `pos`.
        HashData &getHashData(int pos)
        {
            return _tables[pos];
        }

        // Prints one row of slot indices and one row of slot contents to std::cout;
        // slots that hold no live element are shown as '*'.
        void Print() const
        {
            const size_t size = _tables.size();
            for (size_t i = 0; i < size; i++)
            {
                std::cout << i << "\t";
            }
            std::cout << std::endl;

            for (size_t i = 0; i < size; i++)
            {
                const HashData &cur = _tables[i];
                if (cur._state == EXIST)
                {
                    std::cout << cur._data << "\t";
                }
                else
                {
                    std::cout << "*\t";
                }
            }
            std::cout << "\n\n";
        }

    private:
        std::vector<HashData> _tables;
        size_t _num = 0;     // number of live (EXIST) elements
        size_t _deleted = 0; // number of DELETE tombstones currently in _tables
    };

    // Singly linked node used for the bucket chains of the open-hashing table.
    template <class T>
    struct HashNode
    {
        T _data;                      // element stored in this node
        HashNode<T> *_next = nullptr; // following node in the same chain, if any

        // To iterate in insertion order, two more pointers forming a separate
        // doubly linked list could be added:
        // HashNode<T>* _linknext;
        // HashNode<T>* _linkprev;

        // Creates a detached node holding a copy of `data`.
        HashNode(const T &data)
            : _data(data)
        {
        }
    };

    // Forward declaration: the iterator defined next stores a pointer to the table,
    // whose full definition appears later in this file.
    template <class K, class T, class KeyOfT, class Hash>
    class Open_HashTable;

    // 哈希表只有单向迭代器, 只有 ++, 没有--
    template <class K, class T, class KeyOfT, class Hash>
    struct __HashTableIterator
    {
        typedef __HashTableIterator<K, T, KeyOfT, Hash> Self;
        typedef Open_HashTable<K, T, KeyOfT, Hash> HT;
        typedef HashNode<T> Node;

        Node *_node;
        HT *_pht;

        __HashTableIterator(Node *node, HT *pht)
            : _node(node), _pht(pht)
        {
        }

        T &operator*()
        {
            return _node->_data;
        }

        T *operator->()
        {
            return &_node->_data;
        }

        Self &operator++()
        {
            if (_node->_next)
            {
                _node = _node->_next;
            }
            else
            {
                // 如果一个桶走完了, 找到下一个桶继续便利
                KeyOfT koft;
                size_t index = _pht->HashFunc(koft(_node->_data)) % _pht->_tables.size();
                ++index;
                while (index < _pht->_tables.size())
                {
                    Node *cur = _pht->_tables[index];
                    if (cur)
                    {
                        _node = cur;
                        return *this;
                    }
                    ++index;
                }
                _node = nullptr;
            }
            return *this;
        }

        Self operator++(int)
        {
            Self tmp(*this);
            ++*this;
            return tmp;
        }

        bool operator!=(const Self &s)
        {
            return _node != s._node;
        }

        bool operator==(const Self &s)
        {
            return _node == s._node;
        }
    };

    /// Default hash functor: returns the key itself, so it suits key types that
    /// convert to size_t (the table converts the result when computing an index).
    template <class K>
    struct _Hash
    {
        /// Const-qualified so the functor can be invoked through a const object.
        const K &operator()(const K &key) const
        {
            return key;
        }
    };

    /// Specialization for std::string keys.
    template <>
    struct _Hash<std::string>
    {
        /// BKDR hash: hash = hash * 131 + character, over all characters of @p key.
        size_t operator()(const std::string &key) const
        {
            size_t hash = 0;
            for (size_t i = 0; i < key.size(); i++)
            {
                hash *= 131;
                // Go through unsigned char so bytes >= 0x80 contribute the same value
                // whether plain char is signed or unsigned on the target platform.
                hash += static_cast<unsigned char>(key[i]);
            }
            return hash;
        }
    };

    /// Stand-alone string hash functor (BKDR hash) that can be passed explicitly
    /// as the Hash template argument.
    struct _HashString
    {
        /// BKDR hash: hash = hash * 131 + character, over all characters of @p key.
        /// Const-qualified so the functor can be invoked through a const object.
        size_t operator()(const std::string &key) const
        {
            size_t hash = 0;
            for (size_t i = 0; i < key.size(); i++)
            {
                hash *= 131;
                // Go through unsigned char so bytes >= 0x80 contribute the same value
                // whether plain char is signed or unsigned on the target platform.
                hash += static_cast<unsigned char>(key[i]);
            }

            return hash;
        }
    };

    template <class K, class T, class KeyOfT, class Hash>
    // template <class K, class T, class KeyOfT, class Hash = _Hash<K>>
    class Open_HashTable
    {
    private:
        typedef HashNode<T> Node;

    public:
        friend struct __HashTableIterator<K, T, KeyOfT, Hash>;
        typedef __HashTableIterator<K, T, KeyOfT, Hash> iterator;

        iterator begin()
        {
            for (size_t i = 0; i < _tables.size(); i++)
            {
                if (_tables[i])
                {
                    return iterator(_tables[i], this);
                }
            }
            return end();
        }

        iterator end()
        {
            return iterator(nullptr, this);
        }

        Open_HashTable()
        {
        }

        Open_HashTable(size_t bucket_count)
            : _tables(bucket_count), _num(0)
        {
        }

        ~Open_HashTable()
        {
            Clear();
        }

        void Clear()
        {
            for (size_t i = 0; i < _tables.size(); i++)
            {
                Node *cur = _tables[i];
                while (cur)
                {
                    Node *next = cur->_next;
                    delete cur;
                    cur = next;
                }
                _tables[i] = nullptr;
            }
        }

        size_t HashFunc(const K &key)
        {
            Hash hash;
            return hash(key);
        }

        size_t GetNextPrime(size_t num)
        {
            const int PrimeSize = 28;
            static const unsigned long PrimeList[PrimeSize] =
                {
                    53ul, 97ul, 193ul, 389ul, 769ul,
                    1543ul, 3079ul, 6151ul, 12289ul, 24593ul,
                    49157ul, 98317ul, 196613ul, 393241ul, 786433ul,
                    1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,
                    50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,
                    1610612741ul, 3221225473ul, 4294967291ul};

            for (size_t i = 0; i < PrimeSize; i++)
            {
                if (PrimeList[i] > num)
                {
                    return PrimeList[i];
                }
            }

            return PrimeList[PrimeSize - 1]; // 如果已经是最后一个数的, 则不增容
        }

        // 重新哈希
        void Rehash(size_t newsize)
        {
            KeyOfT koft;
            std::vector<Node *> newtables;
            newtables.resize(newsize);
            for (size_t i = 0; i < _tables.size(); i++)
            {
                // 将旧表中的节点取下来, 重新计算在新表中的位置, 并插入进去
                Node *cur = _tables[i];
                while (cur)
                {
                    Node *next = cur->_next;
                    size_t index = HashFunc(koft(cur->_data)) % newtables.size();
                    cur->_next = newtables[index];
                    newtables[index] = cur;
                    cur = next;
                }
                _tables[i] = nullptr;
            }
            _tables.swap(newtables);
        }

        // 插入操作
        // 当大量的数据冲突, 这些哈希冲突的数据就会挂在同一个链式桶中, 查找时效率就会降低, 所以开散列-哈希桶也是要控制哈希冲突的。
        // 如何控制呢? 通过控制负载因子, 不过这里就把空间利用率提高一些, 负载因子也可以高一些, 一般开散列把负载因子控制到1, 会比较好一点
        std::pair<iterator, bool> Insert(const T &data)
        {
            KeyOfT koft;

            // 1. 检查负载因子
            // 如果负载因子等于 1 , 则增容, 避免大量的哈希冲突
            if (_tables.size() == _num)
            {
                // 1. 开两倍大小的新表（不一定是两倍）
                // 2. 遍历旧表的数据, 重新计算在新表中的位置
                // 3. 释放旧表
                size_t newsize = _tables.size() == 0 ? 10 : _tables.size() * 2;
                // size_t newsize = GetNextPrime(_tables.size());

                Rehash(newsize);
            }

            // 2. 计算数据在表中映射的位置
            size_t index = HashFunc(koft(data)) % _tables.size();

            // 3. 先查找这个值在不在表中, 是否有冲突
            Node *cur = _tables[index];
            while (cur)
            {
                if (HashFunc(koft(cur->_data)) == HashFunc(koft(data)))
                {
                    // 如果已经存在该键，返回失败
                    return std::make_pair(iterator(cur, this), false);
                }
                else
                {
                    // 查找下一个节点
                    cur = cur->_next;
                }
            }

            // 4. 头插挂到链表中(尾插也是可以的)
            Node *newnode = new Node(data);
            newnode->_next = _tables[index];
            _tables[index] = newnode;

            ++_num; // 更新已存储元素数量
            return std::make_pair(iterator(newnode, this), true);
        }

        // 查找操作
        Node *Find(const K &key)
        {
            KeyOfT koft;

            // 1. 计算键在表中映射的位置
            size_t index = HashFunc(key) % _tables.size();
            Node *cur = _tables[index];

            // 2. 遍历链表查找匹配的键
            while (cur)
            {
                if (HashFunc(koft(cur->_data)) == HashFunc(key))
                {
                    // 如果找到匹配的元素，返回其指针
                    return cur;
                }
                // 继续查找下一个节点
                cur = cur->_next;
            }
            // 如果未找到，返回空指针
            return nullptr;
        }

        bool Erase(const K &key)
        {
            KeyOfT koft;

            // 1. 计算要删除元素的哈希值
            size_t index = HashFunc(key) % _tables.size();
            Node *prev = nullptr;
            Node *cur = _tables[index];

            // 2. 遍历链表, 查找匹配的元素
            while (cur)
            {
                if (HashFunc(koft(cur->_data)) == HashFunc(key))
                {
                    // 3. 找到元素后, 调整链表结构
                    if (prev == nullptr)
                    {
                        // 如果要删除的元素是链表的第一个节点, 直接让桶指向下一个节点
                        _tables[index] = cur->_next;
                    }
                    else
                    {
                        // 否则，将前一个节点的 next 指向当前节点的下一个节点
                        prev->_next = cur->_next;
                    }
                    // 4. 释放节点内存
                    delete cur;
                    --_num; // 元素数量减少
                    return true;
                }
                else
                {
                    // 继续遍历链表
                    prev = cur;
                    cur = cur->_next;
                }
            }
            // 如果未找到该元素，返回 false
            return false;
        }

        void Print() const
        {
            KeyOfT koft;
            int size = _tables.size();
            for (int i = 0; i < size; i++)
            {
                std::cout << i << "\t";
                Node *cur = _tables[i];
                while (cur)
                {
                    std::cout << koft(cur->_data) << "\t";
                    cur = cur->_next;
                }
                std::cout << std::endl;
            }
            std::cout << std::endl;
        }

    private:
        std::vector<Node *> _tables; // 哈希表存储桶
        size_t _num;                 // 记录着存储的数据个数
    };
}