首页 > 编程语言> > 深入探究Python：字符串JOIN

深入探究Python：字符串JOIN

2020-03-10 19:40:24 作者：互联网

2.字符串join

# Source code
a="dddddddddd"
b="sssssssssss"

# a is the separator; b is the iterable, so each character of b is one item.
a.join(b)

#字节码（LOAD_METHOD/CALL_METHOD 实际上调用的就是 unicode 对象方法表中的 join 函数）
              0 LOAD_NAME                0 (a)
              2 LOAD_METHOD              1 (join)
              4 LOAD_NAME                2 (b)
              6 CALL_METHOD              1
              8 RETURN_VALUE

//实际上也就是调用这个函数，有删减
/*
 * Join the items of `seq`, placing `separator` between consecutive items.
 *
 * separator: passed through unchanged to _PyUnicode_JoinArray().
 * seq:       any object accepted by PySequence_Fast().
 *
 * Returns whatever _PyUnicode_JoinArray() returns, or NULL when `seq`
 * cannot be turned into a fast sequence.
 */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *res;
    PyObject *fseq;
    Py_ssize_t seqlen;
    PyObject **items;

    fseq = PySequence_Fast(seq, "can only join an iterable");
    if (fseq == NULL) {
        /* PySequence_Fast() failed; fseq must not be dereferenced below. */
        return NULL;
    }

    items = PySequence_Fast_ITEMS(fseq);
    seqlen = PySequence_Fast_GET_SIZE(fseq);    /* number of items to join */

    res = _PyUnicode_JoinArray(separator, items, seqlen);
    Py_DECREF(fseq);    /* drop the reference obtained from PySequence_Fast() */
    return res;
}

/*
 * Return `v` as a list or tuple.
 *
 * v: object to convert; NULL is reported through null_error().
 * m: message used for the TypeError raised when `v` is not iterable.
 *
 * An exact list or tuple is returned as-is with its reference count
 * incremented. Anything else is iterated and collected with
 * PySequence_List(). Returns NULL on failure.
 */
PyObject *
PySequence_Fast(PyObject *v, const char *m)
{
    /* A NULL argument is an error. */
    if (v == NULL) {
        return null_error();
    }

    /* Not an exact list/tuple: go through the iterator protocol. */
    if (!PyList_CheckExact(v) && !PyTuple_CheckExact(v)) {
        PyObject *iterator = PyObject_GetIter(v);
        if (iterator == NULL) {
            /* Replace a TypeError's message with the caller's; leave
               other exception types untouched. */
            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
                PyErr_SetString(PyExc_TypeError, m);
            }
            return NULL;
        }

        /* Collect the iterator's items into a list. */
        PyObject *as_list = PySequence_List(iterator);
        Py_DECREF(iterator);
        return as_list;
    }

    /* Exact list or tuple: hand back the same object. */
    Py_INCREF(v);
    return v;
}

//有删减
/*
 * Concatenate the `seqlen` str objects in `items`, inserting `separator`
 * between consecutive items.
 *
 * separator: str placed between items; NULL selects a single space.
 * items:     array of `seqlen` objects; each must be a str or str subclass.
 * seqlen:    number of entries in `items`.
 *
 * Returns a new reference to the joined string, or NULL on failure:
 *   TypeError     - `separator` or an item is not a str
 *   OverflowError - the total length would exceed PY_SSIZE_T_MAX
 *
 * This is an abridged version of the CPython routine.
 */
PyObject *
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
{
    PyObject *res = NULL; /* the result */
    PyObject *sep = NULL; /* owned reference to the separator, if any */
    Py_ssize_t seplen;
    PyObject *item;
    Py_ssize_t sz, i, res_offset;
    Py_UCS4 maxchar;
    Py_UCS4 item_maxchar;
    int use_memcpy;
    unsigned char *res_data = NULL, *sep_data = NULL;
    PyObject *last_obj;
    unsigned int kind = 0;

    last_obj = NULL;
    /* A single exact str needs no processing: return it directly. */
    if (seqlen == 1) {
        if (PyUnicode_CheckExact(items[0])) {
            res = items[0];
            /* items[] only lends the object; the caller gets its own
               reference. */
            Py_INCREF(res);
            return res;
        }
        /* Single str-subclass item: no separator is ever written. */
        seplen = 0;
        maxchar = 0;
    }
    else {
        if (separator == NULL) {
            /* fall back to a blank space separator */
            sep = PyUnicode_FromOrdinal(' ');
            if (!sep)
                goto onerror;
            seplen = 1;
            maxchar = 32;
        }
        else {
            /* Validate the separator once, before reading its fields. */
            if (!PyUnicode_Check(separator)) {
                PyErr_Format(PyExc_TypeError,
                             "separator: expected str instance,"
                             " %.80s found",
                             Py_TYPE(separator)->tp_name);
                goto onerror;
            }
            sep = separator;
            seplen = PyUnicode_GET_LENGTH(separator);       /* separator length */
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);  /* widest code point bound */
            /* Take a reference so both branches own `sep` and the exit
               paths can release it unconditionally. */
            Py_INCREF(sep);
        }
        last_obj = sep;
    }

    /* There are at least two things to join, or else we have a subclass
     * of str in the sequence.
     * Do a pre-pass to figure out the total amount of space we'll
     * need (sz), and see whether all argument are strings.
     */
    sz = 0;

#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    use_memcpy = 1;
#endif

    for (i = 0; i < seqlen; i++) {
        size_t add_sz;
        item = items[i];    /* current element of the sequence */

        /* Every item must be a str. */
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onerror;
        }

        add_sz = PyUnicode_GET_LENGTH(item);            /* item length */
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);  /* item's max-char bound */
        maxchar = Py_MAX(maxchar, item_maxchar);        /* widest seen so far */
        if (i != 0) {
            add_sz += seplen;
        }
        /* Refuse results longer than the maximum string length. */
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python string");
            goto onerror;
        }
        /* Accumulate the total length. */
        sz += add_sz;
        /* Raw memcpy is only valid while all pieces share one storage kind. */
        if (use_memcpy && last_obj != NULL) {
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
                use_memcpy = 0;
        }
        last_obj = item;
    }

    /* Allocate the result with the computed length and max-char bound. */
    res = PyUnicode_New(sz, maxchar);
    if (res == NULL)
        goto onerror;

#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    /* Prepare raw pointers for the memcpy path. */
    if (use_memcpy) {
        res_data = PyUnicode_1BYTE_DATA(res);
        kind = PyUnicode_KIND(res);
        if (seplen != 0)
            sep_data = PyUnicode_1BYTE_DATA(sep);
    }
#endif
    if (use_memcpy) {
        /* Uniform kind: copy raw bytes. */
        for (i = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                memcpy(res_data,
                          sep_data,
                          kind * seplen);
                res_data += kind * seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                memcpy(res_data,
                          PyUnicode_DATA(item),
                          kind * itemlen);
                res_data += kind * itemlen;
            }
        }
    }
    else {
        /* Mixed kinds: copy through _PyUnicode_FastCopyCharacters(). */
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
                res_offset += seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
                res_offset += itemlen;
            }
        }
    }

    Py_XDECREF(sep);    /* sep is NULL on the single-item path */
    return res;

  onerror:
    Py_XDECREF(sep);
    Py_XDECREF(res);
    return NULL;
}

标签：PyUnicode,JOIN,item,Python,res,Py,PyObject,探究,NULL
来源： https://blog.csdn.net/qq_33913982/article/details/104761611