深入探究Python:字符串子串
作者:互联网
# Bind `a` to a 10-character string literal.
a='dddddddddd'
# Rebind `a` to the slice a[1:6]; the bytecode listing that follows shows this
# statement compiling to BUILD_SLICE followed by BINARY_SUBSCR.
a=a[1:6]
0 LOAD_CONST 0 ('dddddddddd')
2 STORE_NAME 0 (a)
4 LOAD_NAME 0 (a)
6 LOAD_CONST 1 (1)
8 LOAD_CONST 2 (6)
10 BUILD_SLICE 2
12 BINARY_SUBSCR
14 STORE_NAME 0 (a)
16 LOAD_CONST 3 (None)
18 RETURN_VALUE
【BUILD_SLICE】
// Body of the BUILD_SLICE opcode: builds a slice object from the operands
// taken from the value stack via POP()/TOP().
PyObject *start, *stop, *step, *slice;
// oparg == 3 means a step operand is present and is taken first;
// otherwise the slice is built with a NULL step.
if (oparg == 3)
step = POP();
else
step = NULL;
stop = POP();
// start is read with TOP() rather than POP(); the result is stored with
// SET_TOP(slice) further down.
start = TOP();
slice = PySlice_New(start, stop, step);
// Drop the references to the operands; step may be NULL, hence Py_XDECREF.
Py_DECREF(start);
Py_DECREF(stop);
Py_XDECREF(step);
// SET_TOP runs before the NULL check, so it also executes when
// PySlice_New failed.
SET_TOP(slice);
if (slice == NULL)
goto error;
DISPATCH();
【BINARY_SUBSCR】
// Body of the BINARY_SUBSCR opcode: evaluates container[sub] for the two
// operands taken from the value stack.
PyObject *sub = POP();
// The container is read with TOP() rather than POP(); the result is stored
// with SET_TOP(res) further down.
PyObject *container = TOP();
// Generic subscript entry point; for str it ends up in unicode_subscript.
PyObject *res = PyObject_GetItem(container, sub);
// Drop the references to both operands now that the lookup is done.
Py_DECREF(container);
Py_DECREF(sub);
// SET_TOP runs before the NULL check, so it also executes when the lookup
// failed.
SET_TOP(res);
if (res == NULL)
goto error;
DISPATCH();
// Abridged excerpt.
//
// Implements the generic subscript operation o[key].
//
// Lookup order:
//   1. the type's mapping slot mp_subscript (the path taken for str, where
//      the slot is unicode_subscript);
//   2. the type's sequence slot sq_item (body elided in this excerpt);
//   3. type objects (body elided in this excerpt).
// When nothing applies, the "not subscriptable" error is reported through
// type_error() and its result is returned.
PyObject *
PyObject_GetItem(PyObject *o, PyObject *key)
{
    PyMappingMethods *m;
    PySequenceMethods *ms;

    // Mapping protocol first: strings are handled here.
    m = Py_TYPE(o)->tp_as_mapping;
    if (m && m->mp_subscript) {
        PyObject *item = m->mp_subscript(o, key);  // str: unicode_subscript
        // Exactly one of "got a result" and "an error is set" must hold.
        assert((item != NULL) ^ (PyErr_Occurred() != NULL));
        return item;
    }

    // Sequence protocol (sq_item).
    ms = Py_TYPE(o)->tp_as_sequence;
    if (ms && ms->sq_item) {
        /* ... body elided in this excerpt ... */
    }

    if (PyType_Check(o)) {
        /* ... body elided in this excerpt ... */
    }

    return type_error("'%.200s' object is not subscriptable", o);
}
// Abridged excerpt.
//
// Subscript handler for str: implements self[item].
//
//  - index-like item: a negative index is offset by the string length and
//    the lookup is delegated to unicode_getitem();
//  - slice item: returns the selected characters (empty result, unchanged
//    string, contiguous substring, or a character-by-character copy);
//  - anything else: sets TypeError and returns NULL.
//
// Returns NULL whenever one of the helpers reports failure.
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    // Single index.
    if (PyIndex_Check(item)) {
        Py_ssize_t pos = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (pos == -1 && PyErr_Occurred())
            return NULL;
        if (pos < 0)
            pos += PyUnicode_GET_LENGTH(self);
        return unicode_getitem(self, pos);
    }

    // Neither an index nor a slice.
    if (!PySlice_Check(item)) {
        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
        return NULL;
    }

    // Slice: resolve start/stop/step against the string length.
    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    Py_ssize_t count = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
                                             &start, &stop, step);

    if (count <= 0) {
        // Nothing selected.
        _Py_RETURN_UNICODE_EMPTY();
    }
    else if (start == 0 && step == 1 &&
             count == PyUnicode_GET_LENGTH(self)) {
        // The slice covers the whole string.
        return unicode_result_unchanged(self);
    }
    else if (step == 1) {
        // Contiguous range.
        return PyUnicode_Substring(self, start, start + count);
    }

    // General case (step != 1): pick the characters one at a time.
    int from_kind = PyUnicode_KIND(self);
    void *from_data = PyUnicode_DATA(self);

    // Largest code point among the selected characters; it is passed to
    // PyUnicode_New() below.
    Py_UCS4 widest;
    if (PyUnicode_IS_ASCII(self)) {
        widest = 127;
    }
    else {
        Py_UCS4 ceiling = kind_maxchar_limit(from_kind);
        // Unsigned cursor: adding a negative step relies on modular
        // arithmetic.
        size_t at = start;
        Py_ssize_t n = 0;
        widest = 0;
        while (n < count) {
            Py_UCS4 c = PyUnicode_READ(from_kind, from_data, at);
            if (c > widest) {
                widest = c;
                // Stop scanning once the limit for this kind is reached.
                if (widest >= ceiling)
                    break;
            }
            at += step;
            n++;
        }
    }

    PyObject *out = PyUnicode_New(count, widest);
    if (out == NULL)
        return NULL;

    int to_kind = PyUnicode_KIND(out);
    void *to_data = PyUnicode_DATA(out);
    size_t at = start;
    for (Py_ssize_t n = 0; n < count; n++, at += step) {
        Py_UCS4 c = PyUnicode_READ(from_kind, from_data, at);
        PyUnicode_WRITE(to_kind, to_data, n, c);
    }
    assert(_PyUnicode_CheckConsistency(out, 1));
    return out;
}
// Single-index case.
// Abridged excerpt.
//
// Returns the character of `self` at position `index`, produced by
// unicode_char() from the code point read out of the string's buffer.
//
// `index` must already be non-negative (unicode_subscript offsets negative
// indexes before calling). Positions outside [0, length) set IndexError and
// return NULL instead of reading past the buffer.
static PyObject *
unicode_getitem(PyObject *self, Py_ssize_t index)
{
    void *data;
    enum PyUnicode_Kind kind;
    Py_UCS4 ch;

    /* ... further argument validation elided in this excerpt ... */

    // The caller performs no range check of its own, so it has to happen
    // here before the raw buffer read.
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
        PyErr_SetString(PyExc_IndexError, "string index out of range");
        return NULL;
    }

    kind = PyUnicode_KIND(self);             // storage kind
    data = PyUnicode_DATA(self);             // character buffer
    ch = PyUnicode_READ(kind, data, index);
    return unicode_char(ch);
}
// Read the element at position `index` from the character buffer `data`
// and yield it as a Py_UCS4. `kind` selects the element type of the buffer:
//   PyUnicode_1BYTE_KIND -> Py_UCS1 elements
//   PyUnicode_2BYTE_KIND -> Py_UCS2 elements
//   any other kind       -> Py_UCS4 elements
// `kind` may be evaluated twice, so pass an expression without side effects.
// No comment may follow a trailing backslash: the backslash has to be the
// last character on the line for the continuation to work.
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))
// step == 1 case.
// Abridged excerpt.
//
// Returns the characters of `self` in the half-open range [start, end).
//
// `end` is clamped to the string length. The whole-string range returns
// unicode_result_unchanged(self); a negative `start` or `end` sets IndexError
// and returns NULL; a range that starts at or past the end, or whose end lies
// before its start, yields the empty string. Otherwise the characters are
// copied through _PyUnicode_FromASCII() or PyUnicode_FromKindAndData().
PyObject*
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
{
    Py_ssize_t total = PyUnicode_GET_LENGTH(self);

    // Clamp the upper bound to the string length.
    if (end > total)
        end = total;

    // Whole string requested: nothing to copy.
    if (start == 0 && end == total)
        return unicode_result_unchanged(self);

    // Negative bounds are rejected.
    if (start < 0 || end < 0) {
        PyErr_SetString(PyExc_IndexError, "string index out of range");
        return NULL;
    }

    // Empty range.
    if (start >= total || end < start)
        _Py_RETURN_UNICODE_EMPTY();

    Py_ssize_t count = end - start;

    if (!PyUnicode_IS_ASCII(self)) {
        // Non-ASCII storage: the buffer is addressed in bytes, so the start
        // offset is scaled by the kind value.
        int kind = PyUnicode_KIND(self);
        unsigned char *base = PyUnicode_1BYTE_DATA(self);
        return PyUnicode_FromKindAndData(kind, base + kind * start, count);
    }

    // ASCII storage: one byte per character.
    unsigned char *ascii = PyUnicode_1BYTE_DATA(self);
    return _PyUnicode_FromASCII((char*)(ascii + start), count);
}
// ASCII case.
//
// Builds a str from the `size` bytes starting at `buffer`.
//
// A single byte is served by get_latin1_char(); otherwise a new object is
// created with PyUnicode_New(size, 127) and the bytes are copied into its
// 1-byte data area. Returns NULL when the allocation fails.
PyObject*
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
{
    const unsigned char *bytes = (const unsigned char *)buffer;

    // Single character.
    if (size == 1) {
        return get_latin1_char(bytes[0]);
    }

    // Allocate the result and copy the bytes over.
    PyObject *str = PyUnicode_New(size, 127);
    if (str == NULL) {
        return NULL;
    }
    memcpy(PyUnicode_1BYTE_DATA(str), bytes, size);
    assert(_PyUnicode_CheckConsistency(str, 1));
    return str;
}
// Non-ASCII case: dispatch on the storage kind.
//
// Builds a str from `size` elements stored at `buffer`, whose element width
// is given by `kind`. The work is delegated to _PyUnicode_FromUCS1/2/4.
//
// Returns NULL with ValueError set for a negative `size`, and NULL with
// SystemError set for an unknown `kind`; otherwise returns whatever the
// selected constructor returns.
PyObject*
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
{
    // A negative element count must not reach the constructors: they only
    // special-case size == 0 and assume a positive size afterwards.
    if (size < 0) {
        PyErr_SetString(PyExc_ValueError, "size must be positive");
        return NULL;
    }

    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        return _PyUnicode_FromUCS1(buffer, size);
    case PyUnicode_2BYTE_KIND:
        return _PyUnicode_FromUCS2(buffer, size);
    case PyUnicode_4BYTE_KIND:
        return _PyUnicode_FromUCS4(buffer, size);
    default:
        PyErr_SetString(PyExc_SystemError, "invalid kind");
        return NULL;
    }
}
// UCS1
//
// Builds a str from `size` Py_UCS1 elements at `u`.
//
// size == 0 yields the empty string and size == 1 is served by
// get_latin1_char(). Otherwise a new object sized for the largest element
// found is allocated and the bytes are copied into it.
// Returns NULL when the allocation fails.
static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
{
    PyObject *res;
    unsigned char max_char;

    // Empty string.
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();
    assert(size > 0);

    // Single character.
    if (size == 1)
        return get_latin1_char(u[0]);

    // Largest element in the input; passed to PyUnicode_New().
    max_char = ucs1lib_find_max_char(u, u + size);
    res = PyUnicode_New(size, max_char);
    // Allocation can fail; do not hand a NULL object to memcpy.
    if (res == NULL)
        return NULL;

    // Copy into the new object.
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;
}
// UCS2
//
// Builds a str from `size` Py_UCS2 elements at `u`.
//
// size == 0 yields the empty string and size == 1 is served by
// unicode_char(). Otherwise a new object sized for the largest element found
// is allocated; the data is copied as-is when that element is >= 256, and
// narrowed to Py_UCS1 when every element is below 256.
// Returns NULL when the allocation fails.
static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
{
    PyObject *res;
    Py_UCS2 max_char;

    // Empty string.
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();
    assert(size > 0);

    // Single character.
    if (size == 1)
        return unicode_char(u[0]);

    // Largest element in the input; passed to PyUnicode_New().
    max_char = ucs2lib_find_max_char(u, u + size);
    res = PyUnicode_New(size, max_char);
    // Allocation can fail; do not write through a NULL object.
    if (res == NULL)
        return NULL;

    if (max_char >= 256) {
        // General case: copy the 2-byte elements unchanged.
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
    }
    else {
        // Every element is below 256: narrow to 1-byte elements.
        _PyUnicode_CONVERT_BYTES(
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
    }
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;
}
// UCS4
//
// Builds a str from `size` Py_UCS4 elements at `u`.
//
// size == 0 yields the empty string and size == 1 is served by
// unicode_char(). Otherwise a new object sized for the largest element found
// is allocated; the data is narrowed to Py_UCS1 (all elements < 256) or
// Py_UCS2 (all elements < 0x10000), or copied as-is.
// Returns NULL when the allocation fails.
static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
{
    PyObject *res;
    Py_UCS4 max_char;

    // Empty string.
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();
    assert(size > 0);

    // Single character.
    if (size == 1)
        return unicode_char(u[0]);

    // Largest element in the input; passed to PyUnicode_New().
    max_char = ucs4lib_find_max_char(u, u + size);
    res = PyUnicode_New(size, max_char);
    // Allocation can fail; do not write through a NULL object.
    if (res == NULL)
        return NULL;

    if (max_char < 256) {
        // Narrow to 1-byte elements.
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
                                 PyUnicode_1BYTE_DATA(res));
    }
    else if (max_char < 0x10000) {
        // Narrow to 2-byte elements.
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
                                 PyUnicode_2BYTE_DATA(res));
    }
    else {
        // General case: copy the 4-byte elements unchanged.
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
    }
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;
}
标签:子串,PyUnicode,return,Python,Py,char,PyObject,探究,size 来源: https://blog.csdn.net/qq_33913982/article/details/104764708