Dictionary object implementation using a hash table ,通过描述可知,python 的字典就是实现了一个 hash 表。
Python 字典概述
在 python 的字典中,一个键值对的对应保存就是 PyDictEntry 类型来保存;
源文件:Include/dict-common.h
1 2 3 4 5 6 7
// Objects/dict-common.h typedefstruct { /* Cached hash code of me_key. */ Py_hash_t me_hash; PyObject *me_key; PyObject *me_value; /* This field is only meaningful for combined tables */ } PyDictKeyEntry;
/* The ma_values pointer is NULL for a combined table * or points to an array of PyObject* for a split table */ typedefstruct { PyObject_HEAD
/* Number of items in the dictionary */ Py_ssize_t ma_used; // 使用的keys个数
/* Dictionary version: globally unique, value change each time the dictionary is modified */ uint64_t ma_version_tag;
PyDictKeysObject *ma_keys; // 如果有则是保存的keys数据
/* If ma_values is NULL, the table is "combined": keys and values are stored in ma_keys. If ma_values is not NULL, the table is splitted: keys are stored in ma_keys and values are stored in ma_values */ PyObject **ma_values; // 如果不为空则保存的是values } PyDictObject;
// Objects/dict-common.h /* See dictobject.c for actual layout of DictKeysObject */ struct _dictkeysobject { Py_ssize_t dk_refcnt; // 引用计数
/* Size of the hash table (dk_indices). It must be a power of 2. */ Py_ssize_t dk_size; // hash table 的大小必须是2的倍数
/* Function to lookup in the hash table (dk_indices): - lookdict(): general-purpose, and may return DKIX_ERROR if (and only if) a comparison raises an exception. - lookdict_unicode(): specialized to Unicode string keys, comparison of which can never raise an exception; that function can never return DKIX_ERROR. - lookdict_unicode_nodummy(): similar to lookdict_unicode() but further specialized for Unicode string keys that cannot be the <dummy> value. - lookdict_split(): Version of lookdict() for split tables. */ dict_lookup_func dk_lookup; // 哈希查找函数
/* Number of usable entries in dk_entries. */ Py_ssize_t dk_usable; // 可用的entry数量
/* Number of used entries in dk_entries. */ Py_ssize_t dk_nentries; // 已经使用的entry数量
/* Actual hash table of dk_size entries. It holds indices in dk_entries, or DKIX_EMPTY(-1) or DKIX_DUMMY(-2). Indices must be: 0 <= indice < USABLE_FRACTION(dk_size). The size in bytes of an indice depends on dk_size: - 1 byte if dk_size <= 0xff (char*) - 2 bytes if dk_size <= 0xffff (int16_t*) - 4 bytes if dk_size <= 0xffffffff (int32_t*) - 8 bytes otherwise (int64_t*) Dynamically sized, SIZEOF_VOID_P is minimum. */ char dk_indices[]; /* char is required to avoid strict aliasing. */// 存入的entries
/* "PyDictKeyEntry dk_entries[dk_usable];" array follows: see the DK_ENTRIES() macro */ };
/* There are no strict guarantee that returned dict can contain minused * items without resize. So we create medium size dict instead of very * large dict or MemoryError. */ if (minused > USABLE_FRACTION(max_presize)) { // 检查传入的数量是否超过最大值 newsize = max_presize; } else { Py_ssize_t minsize = ESTIMATE_SIZE(minused); // 获取最小的值,在新建一个空的字典的时候该值为0 newsize = PyDict_MINSIZE; // 设置字典的最小值 为8 while (newsize < minsize) { // 如果传入的值大于最小值则调整newsize 大小 newsize <<= 1; } } assert(IS_POWER_OF_2(newsize));
/* When insertion order is different from shared key, we can't share * the key anymore. Convert this instance to combine table. */ if (_PyDict_HasSplitTable(mp) && ((ix >= 0 && old_value == NULL && mp->ma_used != ix) || (ix == DKIX_EMPTY && mp->ma_used != mp->ma_keys->dk_nentries))) { // 检查是否是分离表,如果没查找到旧值并且 if (insertion_resize(mp) < 0) // 重新设置该字典大小 goto Fail; ix = DKIX_EMPTY; }
/* Faster version of lookdict_unicode when it is known that no <dummy> keys * will be present. */ static Py_ssize_t _Py_HOT_FUNCTION lookdict_unicode_nodummy(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject **value_addr) { assert(mp->ma_values == NULL); /* Make sure this function doesn't have to handle non-unicode keys, including subclasses of str; e.g., one reason to subclass unicodes is to override __eq__, and for speed we don't cater to that here. */ if (!PyUnicode_CheckExact(key)) { // 检查如果不是unicode则直接调用lookdict方法查找 mp->ma_keys->dk_lookup = lookdict; return lookdict(mp, key, hash, value_addr); }
// 测试代码 PyObject* key1 = PyLong_FromLong(20000); Py_hash_t hash1 = PyObject_Hash(key1); PyObject* old_value1; Py_ssize_t ix1 = mp->ma_keys->dk_lookup(mp, key1, hash1, &old_value1); if (ix1 == 0){ PyLongObject* give; give = (PyLongObject* )key1; printf("found value : %ld\n", give->ob_digit[0]); PyDictKeyEntry *ep01 = DK_ENTRIES(mp->ma_keys); int i, count; count = mp->ma_used; int size_count, j; size_count = mp->ma_keys->dk_size; printf("%s ", mp->ma_keys->dk_indices); int8_t *indices = (int8_t*)(mp->ma_keys->dk_indices); printf("indices index values :"); for (j=0; j<size_count;j++){ printf("%d ",(char) indices[j]); } printf("\n"); for (i=0; i<count;i++){ give = (PyLongObject* )ep01->me_key; printf("size : %d ", mp->ma_keys->dk_size); printf("found value while key : %ld ", give->ob_digit[0]); give = (PyLongObject* )ep01->me_value; printf("value : %ld\n", give->ob_digit[0]); ep01++; } }
然后编译运行;
1 2 3 4 5 6 7
Python 3.7.3 (default, May 222019, 16:17:57) [GCC 7.3.0] on linux Type"help", "copyright", "credits"or"license"for more information. >>> d = {20000:2} found value : 20000 indices index values :0 -1 -1 -1 -1 -1 -1 -1 size : 8 found value while key : 20000 value : 2
其中为什么初始化的时候输入 20000,是根据代码找到相关的 key 值,因为字典也被 python 自身实现的结构中引用了多次,所以我们就设置了一个特殊值来跟踪我们想要的字典;当 d 初始化的时候,就输出如上所示内容;我们接下来继续操作;
>>> d = {20000:2} found value : 20000 indices index values :0 -1 -1 -1 -1 -1 -1 -1 size : 8 found value while key : 20000 value : 2 >>> d[2] = 3 found value : 20000 indices index values :0 -11 -1 -1 -1 -1 -1 size : 8 found value while key : 20000 value : 2 size : 8 found value while key : 2 value : 3 >>> d[3] = 4 found value : 20000 indices index values :0 -112 -1 -1 -1 -1 size : 8 found value while key : 20000 value : 2 size : 8 found value while key : 2 value : 3 size : 8 found value while key : 3 value : 4 >>> d[5] = 6 found value : 20000 indices index values :0 -112 -13 -1 -1 size : 8 found value while key : 20000 value : 2 size : 8 found value while key : 2 value : 3 size : 8 found value while key : 3 value : 4 size : 8 found value while key : 5 value : 6 >>> d[7] = 8 found value : 20000 indices index values :0 -112 -13 -14 size : 8 found value while key : 20000 value : 2 size : 8 found value while key : 2 value : 3 size : 8 found value while key : 3 value : 4 size : 8 found value while key : 5 value : 6 size : 8 found value while key : 7 value : 8
此后我们一直添加值进 d,从输出信息可知,index 就是记录了 PyDictKeyEntry 的索引值,-1 就表示该处未使用。 当我们继续向 d 中添加内容时;
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
>>> d[9] = 10 found value : 20000 indices index values :0 -112 -13 -14 -15 -1 -1 -1 -1 -1 -1 size : 16 found value while key : 20000 value : 2 size : 16 found value while key : 2 value : 3 size : 16 found value while key : 3 value : 4 size : 16 found value while key : 5 value : 6 size : 16 found value while key : 7 value : 8 size : 16 found value while key : 9 value : 10 >>> d[10] = 11 found value : 20000 indices index values :0 -112 -13 -14 -156 -1 -1 -1 -1 -1 size : 16 found value while key : 20000 value : 2 size : 16 found value while key : 2 value : 3 size : 16 found value while key : 3 value : 4 size : 16 found value while key : 5 value : 6 size : 16 found value while key : 7 value : 8 size : 16 found value while key : 9 value : 10 size : 16 found value while key : 10 value : 11