编程语言
首页 > 编程语言 > 深入探究Python:字符串加法

深入探究Python:字符串加法

作者:互联网

从Python 3开始,字符串统一使用Unicode编码。

1.字符串加法

# Source
a="dddddddddd"
b="sssssssssss"

# Concatenate and rebind `a`; the bytecode listing that follows is for this statement.
a = a+b
#字节码
              0 LOAD_NAME                0 (a)
              2 LOAD_NAME                1 (b)
              4 BINARY_ADD
              6 STORE_NAME               0 (a)
              8 LOAD_CONST               0 (None)
             10 RETURN_VALUE
【BINARY_ADD】#源码有删减
            PyObject *right = POP();#出栈right
            PyObject *left = TOP();#指向栈顶left
            PyObject *sum;#新对象sum
            #针对字符串加法运算的优化
            if (PyUnicode_CheckExact(left) &&PyUnicode_CheckExact(right)) {
                sum = unicode_concatenate(tstate, left, right, f, next_instr);
            }
            else {
                '''
            }
            SET_TOP(sum);#重新设置栈顶
//源码有删减
/* Concatenate two exact str operands on behalf of BINARY_ADD.
 *
 * tstate      current thread state (used to query the pending exception)
 * v, w        left and right operands; the reference to `v` is consumed
 * f           executing frame (gives access to locals and the code object)
 * next_instr  instruction stream position right after the add
 *
 * Returns the concatenated string, or NULL with an exception set.
 *
 * When `v` has exactly two references and the next instruction stores the
 * result back into a variable that currently refers to `v`, that binding is
 * removed first. This drops one reference, which gives PyUnicode_Append()
 * the chance to grow `v` in place instead of building a new string.
 */
static PyObject *
unicode_concatenate(PyThreadState *tstate, PyObject *v, PyObject *w,
                    PyFrameObject *f, const _Py_CODEUNIT *next_instr)
{
    PyObject *res;
    if (Py_REFCNT(v) == 2) {
        /* Decode the instruction that follows the add. */
        int opcode, oparg;
        NEXTOPARG();
        switch (opcode) {
        case STORE_FAST:
        {
            /* Target is a fast local: clear the slot if it holds `v`. */
            PyObject **fastlocals = f->f_localsplus;
            if (GETLOCAL(oparg) == v)
                SETLOCAL(oparg, NULL);
            break;
        }
        case STORE_DEREF:
        {
            /* Target is a cell variable: empty the cell if it holds `v`. */
            PyObject **freevars = (f->f_localsplus +
                                   f->f_code->co_nlocals);
            PyObject *c = freevars[oparg];
            if (PyCell_GET(c) == v) {
                PyCell_SET(c, NULL);
                Py_DECREF(v);
            }
            break;
        }
        case STORE_NAME:
        {
            /* Target is looked up by name in the frame's locals dict; this
               is the case hit by the module-level `a = a+b` example. */
            PyObject *names = f->f_code->co_names;
            PyObject *name = GETITEM(names, oparg);
            PyObject *locals = f->f_locals;
            if (locals && PyDict_CheckExact(locals)) {
                PyObject *bound = PyDict_GetItemWithError(locals, name);
                /* Bail out if deleting the binding failed, or if the lookup
                   itself raised. `v` is released because this function
                   consumes it on every path. */
                if ((bound == v && PyDict_DelItem(locals, name) != 0) ||
                    (bound == NULL && _PyErr_Occurred(tstate)))
                {
                    Py_DECREF(v);
                    return NULL;
                }
            }
            break;
        }
        default:
            /* Any other following instruction: no binding to drop. */
            break;
        }
    }
    /* Common tail: append `w` to `v`; on failure PyUnicode_Append() leaves
       `res` set to NULL. */
    res = v;
    PyUnicode_Append(&res, w);
    return res;
}
//源码有删减
/* Append `right` to the string referenced by `*p_left`.
 *
 * p_left  in/out. On entry it owns a reference to the left string. On
 *         success it refers to the concatenation: the same object grown in
 *         place, `right` itself (when left is empty), or a new object.
 *         On error it is set to NULL and an exception is set.
 * right   string to append; the caller keeps its reference.
 */
void
PyUnicode_Append(PyObject **p_left, PyObject *right)
{
    PyObject *left, *res;
    Py_UCS4 maxchar, maxchar2;
    Py_ssize_t left_len, right_len, new_len;

    /* Argument validation. */
    if (p_left == NULL) {
        if (!PyErr_Occurred())
            PyErr_BadInternalCall();
        return;
    }
    left = *p_left;
    if (right == NULL || left == NULL
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        if (!PyErr_Occurred())
            PyErr_BadInternalCall();
        goto error;
    }

    if (PyUnicode_READY(left) == -1)
        goto error;
    if (PyUnicode_READY(right) == -1)
        goto error;

    /* Shortcuts */
    if (left == unicode_empty) {
        /* '' + right: hand back `right` itself. */
        Py_DECREF(left);
        Py_INCREF(right);
        *p_left = right;
        return;
    }
    if (right == unicode_empty)
        /* left + '': nothing to do. */
        return;

    left_len = PyUnicode_GET_LENGTH(left);
    right_len = PyUnicode_GET_LENGTH(right);

    /* Guard the addition below: the combined length must fit in a
       Py_ssize_t (PY_SSIZE_T_MAX is its largest value, which depends on
       the platform's pointer size). */
    if (left_len > PY_SSIZE_T_MAX - right_len) {
        PyErr_SetString(PyExc_OverflowError,
                        "strings are too large to concat");
        goto error;
    }
    new_len = left_len + right_len;

    if (unicode_modifiable(left)
        && PyUnicode_CheckExact(right)
        /* right's code-unit width (1, 2 or 4 bytes) must fit into left's */
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
        /* Exclude "ASCII left + non-ASCII right": ASCII and non-ASCII
           strings use different header structs, so left could not simply
           be grown in place. */
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
    {
        /* Append in place: grow left, then copy right into the new tail. */
        if (unicode_resize(p_left, new_len) != 0)
            goto error;

        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
    }
    else {
        /* Left cannot be grown in place: build a new string wide enough
           for the largest character of either operand. */
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
        maxchar = Py_MAX(maxchar, maxchar2);

        res = PyUnicode_New(new_len, maxchar);
        if (res == NULL)
            goto error;
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
        /* Release the reference *p_left held on the old string before
           re-pointing it at the new one. */
        Py_DECREF(left);
        *p_left = res;
    }
    return;

error:
    Py_CLEAR(*p_left);
}
//源码有删减
/* Resize the string referenced by `*p_unicode` to `length` characters.
 *
 * p_unicode  in/out; may be re-pointed at a different object (the shared
 *            empty string, a fresh copy, or the reallocated object).
 * length     requested length in characters.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
{
    PyObject *unicode;
    Py_ssize_t old_length;

    unicode = *p_unicode;

    /* Current length, read from the field that matches the representation. */
    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
        old_length = PyUnicode_WSTR_LENGTH(unicode);
    else
        old_length = PyUnicode_GET_LENGTH(unicode);
    /* Already the requested size: nothing to do. */
    if (old_length == length)
        return 0;
    /* Shrinking to zero: switch to the shared empty string. */
    if (length == 0) {
        _Py_INCREF_UNICODE_EMPTY();
        if (!unicode_empty)
            return -1;
        Py_SETREF(*p_unicode, unicode_empty);
        return 0;
    }
    /* The object must not be changed in place: resize into a copy. */
    if (!unicode_modifiable(unicode)) {
        PyObject *copy = resize_copy(unicode, length);
        if (copy == NULL)
            return -1;
        /* Hand the copy to the caller and release the old string; without
           this the copy would leak and the caller would still see the
           unresized object. */
        Py_SETREF(*p_unicode, copy);
        return 0;
    }
    /* Compact object: reallocating may move the whole object, so the
       caller's pointer has to be updated. */
    if (PyUnicode_IS_COMPACT(unicode)) {
        PyObject *new_unicode = resize_compact(unicode, length);
        if (new_unicode == NULL)
            return -1;
        *p_unicode = new_unicode;
        return 0;
    }
    /* Otherwise only the object's buffers are reallocated; the object
       itself stays where it is. */
    return resize_inplace(unicode, length);
}
//有删减
/* Reallocate a compact string object so it can hold `length` characters.
 *
 * unicode  the object to resize; it may be moved by the reallocation.
 * length   new length in characters.
 *
 * Returns the (possibly relocated) object, or NULL after PyErr_NoMemory().
 */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;

    char_size = PyUnicode_KIND(unicode);
    // header size for ASCII strings
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    // header size for non-ASCII strings
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
    
    // reject lengths for which the byte size computed below would exceed PY_SSIZE_T_MAX
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    // total bytes: header plus (length + 1) code units; the extra unit holds
    // the terminating zero written at the end of this function
    new_size = (struct_size + (length + 1) * char_size);

    // free the separately allocated UTF-8 buffer, if there is one
    // (presumably a cached encoding that the resize invalidates -- TODO confirm)
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_DEL(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    
    // reallocate the whole object; on success only new_unicode may be used
    new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
    if (new_unicode == NULL) {
        // NOTE(review): upstream CPython pairs this call with a
        // _Py_ForgetReference() before the realloc, which this abridged
        // excerpt does not show -- confirm before reusing this code.
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    
    // record the new length
    _PyUnicode_LENGTH(unicode) = length;
    // wstr shares the character data: re-point it at the current data
    if (share_wstr) {
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    // wstr has its own buffer: free it and reset the related fields
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        PyObject_DEL(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }

// Store `value` at position `index` of `data`, using the code-unit width
// (1, 2 or 4 bytes) selected by `kind`.
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        } \
        default: { \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
        } \
        } \
    } while (0)

    // write the terminating zero just past the last character
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);

    return unicode;
}


/* Resize the buffers of a string object without moving the object itself.
 *
 * unicode  the object whose buffers are reallocated.
 * length   new length in characters.
 *
 * Returns 0 on success, -1 after PyErr_NoMemory() on failure.
 */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;

    /* Ready string: resize the canonical character data first. */
    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
        void *new_data;

        data = _PyUnicode_DATA_ANY(unicode);
        char_size = PyUnicode_KIND(unicode);
        /* Remember which auxiliary pointers alias `data`, so they can be
           re-pointed after the reallocation. */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* Reject lengths for which (length + 1) * char_size would exceed
           PY_SSIZE_T_MAX. */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        /* One extra code unit for the terminating zero. */
        new_size = (length + 1) * char_size;

        /* A UTF-8 buffer that is separate from `data` is freed here. */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* Reallocate into a temporary: on failure the object must keep its
           original buffer, and NULL must not be written through below. */
        new_data = PyObject_REALLOC(data, new_size);
        if (new_data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        data = new_data;

        _PyUnicode_DATA_ANY(unicode) = data;
        /* Re-point the aliases at the (possibly moved) buffer. */
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        /* Record the new length and terminate the data. */
        _PyUnicode_LENGTH(unicode) = length;
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);

        /* Done unless a separate wstr buffer exists that also needs
           resizing. */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }

    /* From here on: resize the separate wchar_t buffer. */
    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    /* The object's own wstr pointer is only updated after the NULL check,
       so the old buffer stays reachable on failure. */
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;

    return 0;
}

/* Create a new string of `length` characters holding as much of `unicode`
 * as fits.
 *
 * unicode  source string; it is not modified.
 * length   length of the new string in characters.
 *
 * Returns the new object, or NULL if allocation failed.
 */
static PyObject*
resize_copy(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t copy_length;

    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
        /* Source is not in the wchar_t representation: allocate a string
           sized for the source's widest character. */
        PyObject *copy;

        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
        /* Allocation can fail; do not copy into a NULL object. */
        if (copy == NULL)
            return NULL;

        /* Copy only what both strings can hold. */
        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
        return copy;
    }
    else {
        /* Source is in the wchar_t representation: copy the wchar_t buffer
           directly. */
        PyObject *w;

        w = (PyObject*)_PyUnicode_New(length);
        if (w == NULL)
            return NULL;
        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
        copy_length = Py_MIN(copy_length, length);
        memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
                  copy_length * sizeof(wchar_t));
        return w;
    }
}

 

标签:PyUnicode,return,Python,Py,length,PyObject,探究,unicode,加法
来源: https://blog.csdn.net/qq_33913982/article/details/104757198