【第1题】 Pythonn内存管理以及垃圾回收机制

时间 2019-11-06

标签第1题 pythonn 内存管理以及垃圾回收机制繁體版

原文原文链接

内存管理

Python解释器由c语言开发完成，py中全部的操做最终都由底层的c语言来实现并完成，因此想要了解底层内存管理须要结合python源码来进行解释。html

1. 两个重要的结构体

include/object.hpython

#define _PyObject_HEAD_EXTRA            \
    struct _object *_ob_next;           \
    struct _object *_ob_prev;
    
#define PyObject_HEAD       PyObject ob_base;

#define PyObject_VAR_HEAD      PyVarObject ob_base;


typedef struct _object {
    _PyObject_HEAD_EXTRA // 用于构造双向链表
    Py_ssize_t ob_refcnt;  // 引用计数器
    struct _typeobject *ob_type;	// 数据类型
} PyObject;


typedef struct {
    PyObject ob_base;	// PyObject对象
    Py_ssize_t ob_size; /* Number of items in variable part，即：元素个数 */
} PyVarObject;

以上源码是Python内存管理中的基石，其中包含了：git

2个结构体
- PyObject，此结构体中包含3个元素。
  - _PyObject_HEAD_EXTRA，用于构造双向链表。
  - ob_refcnt，引用计数器。
  - *ob_type，数据类型。
- PyVarObject，次结构体中包含4个元素（ob_base中包含3个元素）
  - ob_base，PyObject结构体对象，即：包含PyObject结构体中的三个元素。
  - ob_size，内部元素个数。
3个宏定义
- PyObject_HEAD，代指PyObject结构体。
- PyVarObject_HEAD，代指PyVarObject对象。
- _PyObject_HEAD_EXTRA，代指先后指针，用于构造双向队列。

Python中全部类型建立对象时，底层都是与PyObject和PyVarObject结构体实现，通常状况下由单个元素组成对象内部会使用PyObject结构体（float）、由多个元素组成的对象内部会使用PyVarObject结构体（str/int/list/dict/tuple/set/自定义类），由于由多个元素组成的话是须要为其维护一个 ob_size（内部元素个数）。express

typedef struct {
    PyObject_HEAD
    double ob_fval;
} PyFloatObject;

include/floatobject.h

// longintrepr.h

struct _longobject {
    PyObject_VAR_HEAD
    digit ob_digit[1];
};

// longobject.h

/* Long (arbitrary precision) integer object interface */
typedef struct _longobject PyLongObject; /* Revealed in longintrepr.h */

/*
1. python3中没有long类型，只有int类型，但py3内部的int是基于long实现。
2. python3中对int/long长度没有限制，因其内部不是用long存储而是使用相似于“字符串”存储。
*/

include/longobject.h

typedef struct {
    PyObject_VAR_HEAD
    Py_hash_t ob_shash;
    char ob_sval[1];
    /* Invariants:
     *     ob_sval contains space for 'ob_size+1' elements.
     *     ob_sval[ob_size] == 0.
     *     ob_shash is the hash of the string or -1 if not computed yet.
     */
} PyBytesObject;

include/bytesobject.h

typedef struct {
    PyObject_VAR_HEAD
      
    /* Vector of pointers to list elements.  list[0] is ob_item[0], etc. */
    PyObject **ob_item;
  
    /* ob_item contains space for 'allocated' elements.  The number
     * currently in use is ob_size.
     * Invariants:
     *     0 <= ob_size <= allocated
     *     len(list) == ob_size
     *     ob_item == NULL implies ob_size == allocated == 0
     * list.sort() temporarily sets allocated to -1 to detect mutations.
     *
     * Items must normally not be NULL, except during construction when
     * the list is not yet visible outside the function that builds it.
     */
    Py_ssize_t allocated;
} PyListObject;

include/listobject.h

typedef struct {
    PyObject_VAR_HEAD
    PyObject *ob_item[1];

    /* ob_item contains space for 'ob_size' elements.
     * Items must normally not be NULL, except during construction when
     * the tuple is not yet visible outside the function that builds it.
     */
} PyTupleObject;

include/tupleobject.h

typedef struct {
    PyObject_HEAD
    Py_ssize_t ma_used;
    PyDictKeysObject *ma_keys;
    PyObject **ma_values;
} PyDictObject;

include/dictobject.h

typedef struct {
    PyObject_HEAD

    Py_ssize_t fill;            /* Number active and dummy entries*/
    Py_ssize_t used;            /* Number active entries */

    /* The table contains mask + 1 slots, and that's a power of 2.
     * We store the mask instead of the size because the mask is more
     * frequently needed.
     */
    Py_ssize_t mask;

    /* The table points to a fixed-size smalltable for small tables
     * or to additional malloc'ed memory for bigger tables.
     * The table pointer is never NULL which saves us from repeated
     * runtime null-tests.
     */
    setentry *table;
    Py_hash_t hash;             /* Only used by frozenset objects */
    Py_ssize_t finger;          /* Search finger for pop() */

    setentry smalltable[PySet_MINSIZE];
    PyObject *weakreflist;      /* List of weak references */
} PySetObject;

include/setobject.h

typedef struct _typeobject {
    PyObject_VAR_HEAD
    const char *tp_name; /* For printing, in format "<module>.<name>" */
    Py_ssize_t tp_basicsize, tp_itemsize; /* For allocation */

    /* Methods to implement standard operations */
    ...
    
} PyTypeObject;

自定义类 include/object.h

注意：Python3只保留int类型，但此时的int就是Python2中的long类型，请看以下官方提示： PEP 0237: Essentially, long renamed to int. That is, there is only one built-in integral type, named int; but it behaves mostly like the old long type.点击查看原文。缓存

2. 内存管理

以float和list类型为例，分析python源码执行流程，了解内存管理机制。app

2.1 float类型

情景一：建立float对象时ide

val = 3.14

当按照上述方式建立一个Float类型对象时，源码内部会前后执行以下代码。函数

/* Special free list
   free_list is a singly-linked list of available PyFloatObjects, linked
   via abuse of their ob_type members.
*/
static PyFloatObject *free_list = NULL;
static int numfree = 0;
 
PyObject *
PyFloat_FromDouble(double fval)
{
    PyFloatObject *op = free_list;
    if (op != NULL) {
        free_list = (PyFloatObject *) Py_TYPE(op);
        numfree--;
    } else {
         
        // 第一步：根据float类型大小，为float对象开辟内存。
        op = (PyFloatObject*) PyObject_MALLOC(sizeof(PyFloatObject));
        if (!op)
            return PyErr_NoMemory();
    }
 
    // 第二步：在为float对象开辟的内存中进行初始化。
    /* Inline PyObject_New */
    (void)PyObject_INIT(op, &PyFloat_Type);
     
    // 第三步：将值赋值到float对象开辟的内存中。
    op->ob_fval = fval;
 
    // 第四步：返回已经建立的float对象的内存地址（引用/指针）
    return (PyObject *) op;
}

第一步：根据float类型所需的内存大小，为其开辟内存。post

static PyMemAllocatorEx _PyObject = {
#ifdef PYMALLOC_DEBUG
    &_PyMem_Debug.obj, PYDBG_FUNCS
#else
    NULL, PYOBJ_FUNCS
#endif
    };



void *
PyObject_Malloc(size_t size)
{
    /* see PyMem_RawMalloc() */
    if (size > (size_t)PY_SSIZE_T_MAX)
        return NULL;

    // 开辟内存
    return _PyObject.malloc(_PyObject.ctx, size);
}

Objects/obmalloc.c

Customize Memory Allocators
===========================

.. versionadded:: 3.4

.. c:type:: PyMemAllocatorEx

   Structure used to describe a memory block allocator. The structure has
   four fields:

   +----------------------------------------------------------+---------------------------------------+
   | Field                                                    | Meaning                               |
   +==========================================================+=======================================+
   | ``void *ctx``                                            | user context passed as first argument |
   +----------------------------------------------------------+---------------------------------------+
   | ``void* malloc(void *ctx, size_t size)``                 | allocate a memory block               |
   +----------------------------------------------------------+---------------------------------------+
   | ``void* calloc(void *ctx, size_t nelem, size_t elsize)`` | allocate a memory block initialized   |
   |                                                          | with zeros                            |
   +----------------------------------------------------------+---------------------------------------+
   | ``void* realloc(void *ctx, void *ptr, size_t new_size)`` | allocate or resize a memory block     |
   +----------------------------------------------------------+---------------------------------------+
   | ``void free(void *ctx, void *ptr)``                      | free a memory block                   |
   +----------------------------------------------------------+---------------------------------------+

   .. versionchanged:: 3.5
      The :c:type:`PyMemAllocator` structure was renamed to
      :c:type:`PyMemAllocatorEx` and a new ``calloc`` field was added.

PyMemAllocatorEx的方法说明

第二步：对新开辟的内存中进行类型和引用的初始化ui

/* Macros trading binary compatibility for speed. See also pymem.h.
   Note that these macros expect non-NULL object pointers.*/
#define PyObject_INIT(op, typeobj) \
    ( Py_TYPE(op) = (typeobj), _Py_NewReference((PyObject *)(op)), (op) )

include/objimpl.h

/* Head of circular doubly-linked list of all objects.  These are linked
 * together via the _ob_prev and _ob_next members of a PyObject, which
 * exist only in a Py_TRACE_REFS build.
 */
static PyObject refchain = {&refchain, &refchain};

/* Insert op at the front of the list of all objects.  If force is true,
 * op is added even if _ob_prev and _ob_next are non-NULL already.  If
 * force is false amd _ob_prev or _ob_next are non-NULL, do nothing.
 * force should be true if and only if op points to freshly allocated,
 * uninitialized memory, or you've unlinked op from the list and are
 * relinking it into the front.
 * Note that objects are normally added to the list via _Py_NewReference,
 * which is called by PyObject_Init.  Not all objects are initialized that
 * way, though; exceptions include statically allocated type objects, and
 * statically allocated singletons (like Py_True and Py_None).
 */
void
_Py_AddToAllObjects(PyObject *op, int force)
{

    if (force || op->_ob_prev == NULL) {
        op->_ob_next = refchain._ob_next;
        op->_ob_prev = &refchain;
        refchain._ob_next->_ob_prev = op;
        refchain._ob_next = op;
    }
}

void
_Py_NewReference(PyObject *op)
{
    _Py_INC_REFTOTAL;

    // 对新开辟的内存中的的引用计数器初始化为1。
    op->ob_refcnt = 1;

    // 将新开辟的内存的指针添加到一个双向链表refchain中。
    _Py_AddToAllObjects(op, 1);

    _Py_INC_TPALLOCS(op);
}

Objects/object.c

因此，float类型每次建立对象时都会把对象放到 refchain 的双向链表中。

情景二：float对象引用时

val = 7.8
data = val

这个过程比较简单，在给对象建立新引用时，会对其引用计数器+1的动做。

/*
The macros Py_INCREF(op) and Py_DECREF(op) are used to increment or decrement
reference counts.  Py_DECREF calls the object's deallocator function when
the refcount falls to 0; for
objects that don't contain references to other objects or heap memory
this can be the standard function free().  Both macros can be used
wherever a void expression is allowed.  The argument must not be a
NULL pointer.  If it may be NULL, use Py_XINCREF/Py_XDECREF instead.
The macro _Py_NewReference(op) initialize reference counts to 1, and
in special builds (Py_REF_DEBUG, Py_TRACE_REFS) performs additional
bookkeeping appropriate to the special build.


#define Py_INCREF(op) (                         \
    _Py_INC_REFTOTAL  _Py_REF_DEBUG_COMMA       \
    ((PyObject *)(op))->ob_refcnt++)

include/object.h

情景三：销毁float对象时

val = 3.14
# 主动删除对象
del val 

"""
主动del删除对象时，会执行对象销毁的动做。
一个函数执行完毕以后，其内部局部变量也会有销毁动做，如：
def func():
    val = 2.22

func()
"""

当进行销毁对象动做时，前后会执行以下代码：

The macros Py_INCREF(op) and Py_DECREF(op) are used to increment or decrement
reference counts.  Py_DECREF calls the object's deallocator function when
the refcount falls to 0; for
objects that don't contain references to other objects or heap memory
this can be the standard function free().  Both macros can be used
wherever a void expression is allowed.  The argument must not be a
NULL pointer.  If it may be NULL, use Py_XINCREF/Py_XDECREF instead.
The macro _Py_NewReference(op) initialize reference counts to 1, and
in special builds (Py_REF_DEBUG, Py_TRACE_REFS) performs additional
bookkeeping appropriate to the special build.


#define Py_DECREF(op)                                   \
    do {                                                \
        PyObject *_py_decref_tmp = (PyObject *)(op);    \
        if (_Py_DEC_REFTOTAL  _Py_REF_DEBUG_COMMA       \
        --(_py_decref_tmp)->ob_refcnt != 0)             \
            _Py_CHECK_REFCNT(_py_decref_tmp)            \
        else                                            \
        _Py_Dealloc(_py_decref_tmp);                    \
    } while (0)

include/object.h

void
_Py_Dealloc(PyObject *op)
{
    // 第一步：调用float类型的tp_dealloc，进行内存的销毁
    destructor dealloc = Py_TYPE(op)->tp_dealloc;

    // 第二步：在refchain双向链表中移除
    _Py_ForgetReference(op);

    (*dealloc)(op);
}

Objects/object.c

第一步，调用float类型的tp_dealloc进行内存的销毁。

按理此过程说应该直接将对象内存销毁，但float内部有缓存机制，因此他的执行流程是这样的：

float内部缓存的内存个数已经大于等于100，那么在执行`del val`的语句时，内存中就会直接删除此对象。
未达到100时，那么执行 `del val`语句，不会真的在内存中销毁对象，而是将对象放到一个free_list的单链表中，以便之后的对象使用。

/* Special free list
   free_list is a singly-linked list of available PyFloatObjects, linked
   via abuse of their ob_type members.
*/

#ifndef PyFloat_MAXFREELIST
#define PyFloat_MAXFREELIST    100
#endif
static int numfree = 0;
static PyFloatObject *free_list = NULL;



PyTypeObject PyFloat_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "float",
    sizeof(PyFloatObject),
    0,

    // tp_dealloc表示执行float_dealloc方法
    (destructor)float_dealloc,                  /* tp_dealloc */
    0,                                          /* tp_print */
    0,                                          /* tp_getattr */
    0,                                          /* tp_setattr */
    0,                                          /* tp_reserved */
    ...
};



static void
float_dealloc(PyFloatObject *op)
{
    // 检测是不是float类型
    if (PyFloat_CheckExact(op)) {

        // 检测缓冲池个数是否大于100个
        if (numfree >= PyFloat_MAXFREELIST)  {
            // 若是大于100个，则在内存中销毁对象
            PyObject_FREE(op);
            return;
        }
        // 不然，缓冲池个数+1
        // 并将要销毁的数据加入到free_list的单项链表中，以便之后建立float类型使用。
        numfree++;
        Py_TYPE(op) = (struct _typeobject *)free_list;
        free_list = op;
    }
    else
        Py_TYPE(op)->tp_free((PyObject *)op);
}

Objects/floatobject.c

"""
了解Python中float类型的缓存机制以后，就能够理解以下代码的两个内存地址竟然同样的现象的本质了。
"""

v1 = 3.8
print(id(v1)) # 内存地址：140454027861640
del v1

v2 = 88.7
print(id(v2)) # 内存地址：140454027861640

扩展：读源码了解现象本质

void
PyObject_Free(void *ptr)
{
    // 与上述开辟内存相似
    _PyObject.free(_PyObject.ctx, ptr);
}

Objects/obmalloc.c

第二步，在refchain双向链表中移除

/* Head of circular doubly-linked list of all objects.  These are linked
 * together via the _ob_prev and _ob_next members of a PyObject, which
 * exist only in a Py_TRACE_REFS build.
 */
static PyObject refchain = {&refchain, &refchain};


void
_Py_ForgetReference(PyObject *op)
{
#ifdef SLOW_UNREF_CHECK
    PyObject *p;
#endif
    if (op->ob_refcnt < 0)
        Py_FatalError("UNREF negative refcnt");
    if (op == &refchain ||
        op->_ob_prev->_ob_next != op || op->_ob_next->_ob_prev != op) {
        fprintf(stderr, "* ob\n");
        _PyObject_Dump(op);
        fprintf(stderr, "* op->_ob_prev->_ob_next\n");
        _PyObject_Dump(op->_ob_prev->_ob_next);
        fprintf(stderr, "* op->_ob_next->_ob_prev\n");
        _PyObject_Dump(op->_ob_next->_ob_prev);
        Py_FatalError("UNREF invalid object");
    }
#ifdef SLOW_UNREF_CHECK
    for (p = refchain._ob_next; p != &refchain; p = p->_ob_next) {
        if (p == op)
            break;
    }
    if (p == &refchain) /* Not found */
        Py_FatalError("UNREF unknown object");
#endif
    op->_ob_next->_ob_prev = op->_ob_prev;
    op->_ob_prev->_ob_next = op->_ob_next;
    op->_ob_next = op->_ob_prev = NULL;
    _Py_INC_TPFREES(op);
}

Objects/object.c

综上所述，float对象在建立对象时会把为其开辟内存并初始化引用计数器为1，而后将其加入到名为 refchain 的双向链表中；float对象在增长引用时，会执行 Py_INCREF在内部会让引用计数器+1；最后执行销毁float对象时，会先判断float内部free_list中缓存的个数，若是已达到300个，则直接在内存中销毁，不然不会真正销毁而是加入free_list单链表中，之后后续对象使用，销毁动做的最后再在refchain中移除便可。

2.2 list类型

垃圾回收机制

Python的垃圾回收机制是以：引用计数器为主，标记清除和分代回收为辅。

1. 引用计数器

每一个对象内部都维护了一个值，该值记录这此对象被引用的次数，若是次数为0，则Python垃圾回收机制会自动清除此对象。下图是Python源码中引用计数器存储的代码。

引用计数器的获取及代码示例：

import sys

# 在内存中建立一个字符串对象"武沛齐"，对象引用计数器的值为：1
nick_name = '武沛齐'

# 应该输入2，实际输出2，由于getrefcount方法时把 nick_name 当作参数传递了，引起引用计数器+1，因此打印时值为：2
# 注意：getrefcount 函数执行完毕后，会自动-1，因此本质上引用计数器仍是1.
print(sys.getrefcount(nick_name))

# 变量 real_name 也指向的字符串对象"武沛齐"，即：引用计数器再 +1，因此值为：2
real_name = nick_name

# 应该输出2，实际输出3. 由于getrefcount方法时把 real_name 当作参数传递了，引起引用计数器+1，因此打印时值为：3
# 注意：getrefcount 函数执行完毕后，会自动-1，因此本质上引用计数器仍是2.
print(sys.getrefcount(nick_name))

# 删除reald_name变量，并让其指向对象中的引用计数器-1
del real_name

# 应该输出1，实际输出2，由于getrefcount方法时把 real_name 当作参数传递了，引起引用计数器+1，因此打印时值为：2.
print(sys.getrefcount(nick_name))





# ############ getrefcount 注释信息 ############
'''
def getrefcount(p_object): # real signature unknown; restored from __doc__
    """
    getrefcount(object) -> integer
    
    Return the reference count of object.  The count returned is generally
    one higher than you might expect, because it includes the (temporary)
    reference as an argument to getrefcount().
    """
    return 0
'''

2. 循环引用

经过引用计数器的方式基本上能够完成Python的垃圾回收，但它仍是具备明显的缺陷，即：“循环引用” 。

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import gc
import objgraph


class Foo(object):
    def __init__(self):
        self.data = None


# 在内存建立两个对象，即：引用计数器值都是1
obj1 = Foo()
obj2 = Foo()

# 两个对象循环引用，致使内存中对象的应用+1，即：引用计数器值都是2
obj1.data = obj2
obj2.data = obj1

# 删除变量，并将引用计数器-1。
del obj1
del obj2

# 关闭垃圾回收机制，由于python的垃圾回收机制是：引用计数器、标记清除、分代回收 配合已解决循环引用的问题，关闭他便于以后查询内存中未被释放对象。
gc.disable()

# 至此，因为循环引用致使内存中建立的obj1和obj2两个对象引用计数器不为0，没法被垃圾回收机制回收。
# 因此，内存中Foo类的对象就还显示有2个。
print(objgraph.count('Foo'))

注意：gc.collect() 能够主动触发垃圾回收；

循环引用的问题会引起内存中的对象一直没法释放，从而内存逐渐增大，最终致使内存泄露。

为了解决循环引用的问题，Python又在引用计数器的基础上引入了标记清除和分代回收的机制。

so，没必要再担忧循环引用的问题了。

Reference cycles involving lists, tuples, instances, classes, dictionaries, and functions are found.

Python GC 源码文档：http://www.arctrix.com/nas/python/gc/

3. 标记清除&分代回收

Python为了解决循环引用，针对 lists, tuples, instances, classes, dictionaries, and functions 类型，每建立一个对象都会将对象放到一个双向链表中，每一个对象中都有 _ob_next 和 _ob_prev 指针，用于挂靠到链表中。

/* Nothing is actually declared to be a PyObject, but every pointer to
 * a Python object can be cast to a PyObject*.  This is inheritance built
 * by hand.  Similarly every pointer to a variable-size Python object can,
 * in addition, be cast to PyVarObject*.
 */
typedef struct _object {
    _PyObject_HEAD_EXTRA # 双向链表
    Py_ssize_t ob_refcnt;
    struct _typeobject *ob_type;
} PyObject;

typedef struct {
    PyObject ob_base;
    Py_ssize_t ob_size; /* Number of items in variable part */
} PyVarObject;


/* Define pointers to support a doubly-linked list of all live heap objects. */
#define _PyObject_HEAD_EXTRA            \
    struct _object *_ob_next;           \
    struct _object *_ob_prev;

随着对象的建立，该双向链表上的对象会愈来愈多。

当对象个数超过 700个时，Python解释器就会进行垃圾回收。
当代码中主动执行 gc.collect() 命令时，Python解释器就会进行垃圾回收。
```
import gc

gc.collect()
```

Python解释器在垃圾回收时，会遍历链表中的每一个对象，若是存在循环引用，就将存在循环引用的对象的引用计数器 -1，同时Python解释器也会将计数器等于0（可回收）和不等于0（不可回收）的一分为二，把计数器等于0的全部对象进行回收，把计数器不为0的对象放到另一个双向链表表（即：分代回收的下一代）。

关于分代回收（generations）：

The GC classifies objects into three generations depending on how many collection sweeps they have survived. New objects are placed in the youngest generation (generation 0). If an object survives a collection it is moved into the next older generation. Since generation 2 is the oldest generation, objects in that generation remain there after a collection. In order to decide when to run, the collector keeps track of the number object allocations and deallocations since the last collection. When the number of allocations minus the number of deallocations exceeds threshold0, collection starts. Initially only generation 0 is examined. If generation 0 has been examined more than threshold1 times since generation 1 has been examined, then generation 1 is examined as well. Similarly, threshold2 controls the number of collections of generation 1 before collecting generation 2.

# 默认状况下三个阈值为 (700,10,10) ，也能够主动去修改默认阈值。
import gc

gc.set_threshold(threshold0[, threshold1[, threshold2]])

官方文档： https://docs.python.org/3/library/gc.html

参考文档：

　　http://www.wklken.me/posts/2015/09/29/python-source-gc.html

　　https://yq.aliyun.com/users/yqzdoezsuvujg/album?spm=a2c4e.11155435.0.0.d07467451AwRxO