Skip to content

Commit

Permalink
✨ fix version 0.6.3
Browse files Browse the repository at this point in the history
use unicode_contains_impl
  • Loading branch information
RF-Tar-Railt committed Oct 15, 2024
1 parent b9e138d commit fe27aec
Show file tree
Hide file tree
Showing 4 changed files with 205 additions and 146 deletions.
324 changes: 191 additions & 133 deletions src/tarina/_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,156 +19,214 @@ static inline PyObject * tupleitem(PyObject *a, Py_ssize_t i)
return ((PyTupleObject*)a)->ob_item[i];
}

// #include "stringlib/asciilib.h"
// #include "stringlib/fastsearch.h"
// #include "stringlib/partition.h"
// #include "stringlib/split.h"
// #include "stringlib/count.h"
// #include "stringlib/find.h"
// #include "stringlib/find_max_char.h"
// #include "stringlib/undef.h"

// #include "stringlib/ucs1lib.h"
// #include "stringlib/fastsearch.h"
// #include "stringlib/partition.h"
// #include "stringlib/split.h"
// #include "stringlib/count.h"
// #include "stringlib/find.h"
// #include "stringlib/replace.h"
// #include "stringlib/find_max_char.h"
// #include "stringlib/undef.h"

// #include "stringlib/ucs2lib.h"
// #include "stringlib/fastsearch.h"
// #include "stringlib/partition.h"
// #include "stringlib/split.h"
// #include "stringlib/count.h"
// #include "stringlib/find.h"
// #include "stringlib/replace.h"
// #include "stringlib/find_max_char.h"
// #include "stringlib/undef.h"

// #include "stringlib/ucs4lib.h"
// #include "stringlib/fastsearch.h"
// #include "stringlib/partition.h"
// #include "stringlib/split.h"
// #include "stringlib/count.h"
// #include "stringlib/find.h"
// #include "stringlib/replace.h"
// #include "stringlib/find_max_char.h"
// #include "stringlib/undef.h"

// static inline Py_ssize_t
// findchar(const void *s, int kind, Py_ssize_t size, Py_UCS4 ch)
// {
// switch (kind) {
// case PyUnicode_1BYTE_KIND:
// if ((Py_UCS1) ch != ch)
// return -1;
// return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
// case PyUnicode_2BYTE_KIND:
// if ((Py_UCS2) ch != ch)
// return -1;
// return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
// case PyUnicode_4BYTE_KIND:
// return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
// default:
// Py_UNREACHABLE();
// }
// }

// static int contains(PyObject *str, Py_UCS4 ch) {
// const void *buf;
// int result, kind, len;
// kind = PyUnicode_KIND(str);
// len = PyUnicode_GET_LENGTH(str);
// buf = PyUnicode_DATA(str);
// result = findchar((const char *)buf, kind, len, ch) != -1;
// return result;
// }
#define FAST_COUNT 0
#define FAST_SEARCH 1
#define FAST_RSEARCH 2

Py_LOCAL_INLINE(int)
unicode_eq(PyObject *str1, PyObject *str2)
Py_LOCAL_INLINE(Py_ssize_t)
ucs1lib_find_char(const Py_UCS1* s, Py_ssize_t n, Py_UCS1 ch)
{
Py_ssize_t len = PyUnicode_GET_LENGTH(str1);
if (PyUnicode_GET_LENGTH(str2) != len) {
return 0;
const Py_UCS1 *p, *e;

p = s;
e = s + n;
if (n > 15) {
p = memchr(s, ch, n);
if (p != NULL)
return (p - s);
return -1;
}

int kind = PyUnicode_KIND(str1);
if (PyUnicode_KIND(str2) != kind) {
return 0;
while (p < e) {
if (*p == ch)
return (p - s);
p++;
}

const void *data1 = PyUnicode_DATA(str1);
const void *data2 = PyUnicode_DATA(str2);
return (memcmp(data1, data2, len * kind) == 0);
return -1;
}

#if SIZEOF_WCHAR_T == 2
#define UCS2_FAST_MEMCHR(s, c, n) \
(Py_UCS2 *)wmemchr((const wchar_t *)(s), c, n)
#endif

static setentry *
set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
#ifdef UCS2_FAST_MEMCHR
# define UCS2_MEMCHR_CUT_OFF 15
#else
# define UCS2_MEMCHR_CUT_OFF 40
#endif

Py_LOCAL_INLINE(Py_ssize_t)
ucs2lib_find_char(const Py_UCS2* s, Py_ssize_t n, Py_UCS2 ch)
{
setentry *table;
setentry *entry;
size_t perturb = hash;
size_t mask = so->mask;
size_t i = (size_t)hash & mask; /* Unsigned for defined overflow behavior */
int probes;
int cmp;

while (1) {
entry = &so->table[i];
probes = (i + LINEAR_PROBES <= mask) ? LINEAR_PROBES: 0;
do {
if (entry->hash == 0 && entry->key == NULL)
return entry;
if (entry->hash == hash) {
PyObject *startkey = entry->key;
assert(startkey != dummy);
if (startkey == key)
return entry;
if (PyUnicode_CheckExact(startkey)
&& PyUnicode_CheckExact(key)
&& unicode_eq(startkey, key))
return entry;
table = so->table;
Py_INCREF(startkey);
cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
Py_DECREF(startkey);
if (cmp < 0)
return NULL;
if (table != so->table || entry->key != startkey)
return set_lookkey(so, key, hash);
if (cmp > 0)
return entry;
mask = so->mask;
const Py_UCS2 *p, *e;

p = s;
e = s + n;
if (n > UCS2_MEMCHR_CUT_OFF) {
#ifdef UCS2_FAST_MEMCHR
p = UCS2_FAST_MEMCHR(s, ch, n);
if (p != NULL)
return (p - s);
return -1;
#else
/* use memchr if we can choose a needle without too many likely
false positives */
const Py_UCS2 *s1, *e1;
unsigned char needle = ch & 0xff;
/* If looking for a multiple of 256, we'd have too
many false positives looking for the '\0' byte in UCS2
and UCS4 representations. */
if (needle != 0) {
do {
void *candidate = memchr(p, needle,
(e - p) * sizeof(Py_UCS2));
if (candidate == NULL)
return -1;
s1 = p;
p = (const Py_UCS2 *)
_Py_ALIGN_DOWN(candidate, sizeof(Py_UCS2));
if (*p == ch)
return (p - s);
/* False positive */
p++;
if (p - s1 > UCS2_MEMCHR_CUT_OFF)
continue;
if (e - p <= UCS2_MEMCHR_CUT_OFF)
break;
e1 = p + UCS2_MEMCHR_CUT_OFF;
while (p != e1) {
if (*p == ch)
return (p - s);
p++;
}
}
entry++;
} while (probes--);
perturb >>= PERTURB_SHIFT;
i = (i * 5 + 1 + perturb) & mask;
while (e - p > UCS2_MEMCHR_CUT_OFF);
}
#endif
}
while (p < e) {
if (*p == ch)
return (p - s);
p++;
}
return -1;
}


static int
set_contains_key(PyObject *so, PyObject *key)
#if SIZEOF_WCHAR_T == 4
#define UCS4_FAST_MEMCHR(s, c, n) \
(Py_UCS4 *)wmemchr((const wchar_t *)(s), c, n)
#endif

#ifdef UCS4_FAST_MEMCHR
# define UCS4_MEMCHR_CUT_OFF 15
#else
# define UCS4_MEMCHR_CUT_OFF 40
#endif

/*
 * Find the first occurrence of `ch` in the UCS4 buffer `s` holding `n`
 * code units.
 *
 * Returns the zero-based index of the first match, or -1 when `ch` does not
 * occur in the buffer.
 *
 * Buffers of at most UCS4_MEMCHR_CUT_OFF units are scanned with a plain loop.
 * Longer buffers use UCS4_FAST_MEMCHR when that macro is defined; otherwise
 * memchr() is used to look for the low byte of `ch`, and every byte hit is
 * verified against the full code unit.
 */
Py_LOCAL_INLINE(Py_ssize_t)
ucs4lib_find_char(const Py_UCS4* s, Py_ssize_t n, Py_UCS4 ch)
{
    const Py_UCS4 *p, *e;

    p = s;
    e = s + n;
    if (n > UCS4_MEMCHR_CUT_OFF) {
#ifdef UCS4_FAST_MEMCHR
        p = UCS4_FAST_MEMCHR(s, ch, n);
        if (p != NULL)
            return (p - s);
        return -1;
#else
        /* use memchr if we can choose a needle without too many likely
           false positives */
        const Py_UCS4 *s1, *e1;
        unsigned char needle = ch & 0xff;
        /* If looking for a multiple of 256, we'd have too
           many false positives looking for the '\0' byte in UCS2
           and UCS4 representations. */
        if (needle != 0) {
            do {
                void *candidate = memchr(p, needle,
                                         (e - p) * sizeof(Py_UCS4));
                if (candidate == NULL)
                    return -1;
                s1 = p;
                /* memchr matched a single byte; step back to the start of
                   the code unit that contains it before comparing. */
                p = (const Py_UCS4 *)
                        _Py_ALIGN_DOWN(candidate, sizeof(Py_UCS4));
                if (*p == ch)
                    return (p - s);
                /* False positive */
                p++;
                /* memchr skipped a long stretch before the false hit, so it
                   is still paying off: go straight back to it (subject to
                   the loop condition below). */
                if (p - s1 > UCS4_MEMCHR_CUT_OFF)
                    continue;
                /* Too little left for memchr to be worthwhile; let the
                   plain loop after the #endif finish the job. */
                if (e - p <= UCS4_MEMCHR_CUT_OFF)
                    break;
                /* False hits are coming quickly: scan one cut-off window
                   linearly before trying memchr again. */
                e1 = p + UCS4_MEMCHR_CUT_OFF;
                while (p != e1) {
                    if (*p == ch)
                        return (p - s);
                    p++;
                }
            }
            while (e - p > UCS4_MEMCHR_CUT_OFF);
        }
#endif
    }
    /* Linear scan: short inputs, a zero low byte, or the tail left over by
       the memchr loop above. */
    while (p < e) {
        if (*p == ch)
            return (p - s);
        p++;
    }
    return -1;
}


if (!PyUnicode_CheckExact(key) ||
(hash = ((PyASCIIObject *)key)->hash) == -1) {
hash = PyObject_Hash(key);
if (hash == -1)
/*
 * Locate the code point `ch` in a unicode data buffer.
 *
 * `s` is the raw buffer, `kind` is one of the PyUnicode_*BYTE_KIND values
 * selecting its element width, and `size` is forwarded unchanged to the
 * width-specific find_char helper.
 *
 * Returns whatever the helper returns, or -1 without searching when `ch`
 * does not fit in the buffer's element type (it cannot be present then).
 * Any other `kind` is treated as unreachable.
 */
static inline Py_ssize_t
findchar(const void *s, int kind, Py_ssize_t size, Py_UCS4 ch)
{
    if (kind == PyUnicode_1BYTE_KIND) {
        const Py_UCS1 narrow = (Py_UCS1) ch;
        if (narrow != ch) {
            return -1;
        }
        return ucs1lib_find_char((const Py_UCS1 *) s, size, narrow);
    }
    if (kind == PyUnicode_2BYTE_KIND) {
        const Py_UCS2 narrow = (Py_UCS2) ch;
        if (narrow != ch) {
            return -1;
        }
        return ucs2lib_find_char((const Py_UCS2 *) s, size, narrow);
    }
    if (kind == PyUnicode_4BYTE_KIND) {
        return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
    }
    Py_UNREACHABLE();
}

entry = set_lookkey(((PySetObject *)so), key, hash);
if (entry != NULL)
return entry->key != NULL;
return -1;
/*
 * Report whether the code point `ch` occurs in `str`.
 *
 * `str` is handed straight to PyUnicode_KIND / PyUnicode_GET_LENGTH /
 * PyUnicode_DATA; no type check is performed here, so the caller must pass
 * a unicode object.
 *
 * Returns 1 when findchar() locates `ch`, 0 otherwise.
 */
static int str_contains(PyObject *str, Py_UCS4 ch) {
    const int kind = PyUnicode_KIND(str);
    /* Keep the length as Py_ssize_t: storing it in an int would truncate
       the size of very long strings before it reaches findchar(). */
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    const void *buf = PyUnicode_DATA(str);

    return findchar(buf, kind, len, ch) != -1;
}

/*
 * Compare two unicode objects for equality of content.
 *
 * Returns 0 as soon as the lengths or the storage kinds differ; otherwise
 * returns 1 when the first `length * kind` bytes of both data buffers are
 * identical and 0 when they are not.
 */
Py_LOCAL_INLINE(int)
unicode_eq(PyObject *str1, PyObject *str2)
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(str1);
    const int kind = PyUnicode_KIND(str1);

    /* Differing length or element width means the raw buffers cannot be
       compared byte-for-byte, and the strings are reported as unequal. */
    if (length != PyUnicode_GET_LENGTH(str2)
        || kind != PyUnicode_KIND(str2)) {
        return 0;
    }

    return memcmp(PyUnicode_DATA(str1), PyUnicode_DATA(str2),
                  length * kind) == 0;
}

#define LEFTSTRIP 0
Expand Down Expand Up @@ -232,7 +290,7 @@ make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
}

PyObject *
xstrip(PyObject *self, int striptype, PyObject *sepobj)
str_strip(PyObject *self, int striptype, PyObject *sepobj)
{
const void *data;
int kind;
Expand Down
3 changes: 3 additions & 0 deletions src/tarina/_string_c.pyi
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
CRLF: str
QUOTATION: dict[str, str]

def split(text: str, separator: str, crlf: bool = True) -> list[str]:
"""尊重引号与转义的字符串切分
Expand Down
Loading

0 comments on commit fe27aec

Please sign in to comment.