1/* stringlib: split implementation */
2
3#ifndef STRINGLIB_FASTSEARCH_H
4#error must include "stringlib/fastsearch.h" before including this module
5#endif
6
7/* Overallocate the initial list to reduce the number of reallocs for small
8   split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
9   resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
10   text (roughly 11 words per line) and field delimited data (usually 1-10
11   fields).  For large strings the split algorithms are bandwidth limited
12   so increasing the preallocation likely will not improve things.*/
13
14#define MAX_PREALLOC 12
15
16/* 5 splits gives 6 elements */
17#define PREALLOC_SIZE(maxsplit) \
18    (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
19
20#define SPLIT_APPEND(data, left, right)         \
21    sub = STRINGLIB_NEW((data) + (left),        \
22                        (right) - (left));      \
23    if (sub == NULL)                            \
24        goto onError;                           \
25    if (PyList_Append(list, sub)) {             \
26        Py_DECREF(sub);                         \
27        goto onError;                           \
28    }                                           \
29    else                                        \
30        Py_DECREF(sub);
31
32#define SPLIT_ADD(data, left, right) {          \
33    sub = STRINGLIB_NEW((data) + (left),        \
34                        (right) - (left));      \
35    if (sub == NULL)                            \
36        goto onError;                           \
37    if (count < MAX_PREALLOC) {                 \
38        PyList_SET_ITEM(list, count, sub);      \
39    } else {                                    \
40        if (PyList_Append(list, sub)) {         \
41            Py_DECREF(sub);                     \
42            goto onError;                       \
43        }                                       \
44        else                                    \
45            Py_DECREF(sub);                     \
46    }                                           \
47    count++; }
48
49
50/* Always force the list to the expected size. */
51#define FIX_PREALLOC_SIZE(list) Py_SET_SIZE(list, count)
52
53Py_LOCAL_INLINE(PyObject *)
54STRINGLIB(split_whitespace)(PyObject* str_obj,
55                           const STRINGLIB_CHAR* str, Py_ssize_t str_len,
56                           Py_ssize_t maxcount)
57{
58    Py_ssize_t i, j, count=0;
59    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
60    PyObject *sub;
61
62    if (list == NULL)
63        return NULL;
64
65    i = j = 0;
66    while (maxcount-- > 0) {
67        while (i < str_len && STRINGLIB_ISSPACE(str[i]))
68            i++;
69        if (i == str_len) break;
70        j = i; i++;
71        while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
72            i++;
73#if !STRINGLIB_MUTABLE
74        if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
75            /* No whitespace in str_obj, so just use it as list[0] */
76            Py_INCREF(str_obj);
77            PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
78            count++;
79            break;
80        }
81#endif
82        SPLIT_ADD(str, j, i);
83    }
84
85    if (i < str_len) {
86        /* Only occurs when maxcount was reached */
87        /* Skip any remaining whitespace and copy to end of string */
88        while (i < str_len && STRINGLIB_ISSPACE(str[i]))
89            i++;
90        if (i != str_len)
91            SPLIT_ADD(str, i, str_len);
92    }
93    FIX_PREALLOC_SIZE(list);
94    return list;
95
96  onError:
97    Py_DECREF(list);
98    return NULL;
99}
100
101Py_LOCAL_INLINE(PyObject *)
102STRINGLIB(split_char)(PyObject* str_obj,
103                     const STRINGLIB_CHAR* str, Py_ssize_t str_len,
104                     const STRINGLIB_CHAR ch,
105                     Py_ssize_t maxcount)
106{
107    Py_ssize_t i, j, count=0;
108    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
109    PyObject *sub;
110
111    if (list == NULL)
112        return NULL;
113
114    i = j = 0;
115    while ((j < str_len) && (maxcount-- > 0)) {
116        for(; j < str_len; j++) {
117            /* I found that using memchr makes no difference */
118            if (str[j] == ch) {
119                SPLIT_ADD(str, i, j);
120                i = j = j + 1;
121                break;
122            }
123        }
124    }
125#if !STRINGLIB_MUTABLE
126    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
127        /* ch not in str_obj, so just use str_obj as list[0] */
128        Py_INCREF(str_obj);
129        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
130        count++;
131    } else
132#endif
133    if (i <= str_len) {
134        SPLIT_ADD(str, i, str_len);
135    }
136    FIX_PREALLOC_SIZE(list);
137    return list;
138
139  onError:
140    Py_DECREF(list);
141    return NULL;
142}
143
144Py_LOCAL_INLINE(PyObject *)
145STRINGLIB(split)(PyObject* str_obj,
146                const STRINGLIB_CHAR* str, Py_ssize_t str_len,
147                const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
148                Py_ssize_t maxcount)
149{
150    Py_ssize_t i, j, pos, count=0;
151    PyObject *list, *sub;
152
153    if (sep_len == 0) {
154        PyErr_SetString(PyExc_ValueError, "empty separator");
155        return NULL;
156    }
157    else if (sep_len == 1)
158        return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount);
159
160    list = PyList_New(PREALLOC_SIZE(maxcount));
161    if (list == NULL)
162        return NULL;
163
164    i = j = 0;
165    while (maxcount-- > 0) {
166        pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
167        if (pos < 0)
168            break;
169        j = i + pos;
170        SPLIT_ADD(str, i, j);
171        i = j + sep_len;
172    }
173#if !STRINGLIB_MUTABLE
174    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
175        /* No match in str_obj, so just use it as list[0] */
176        Py_INCREF(str_obj);
177        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
178        count++;
179    } else
180#endif
181    {
182        SPLIT_ADD(str, i, str_len);
183    }
184    FIX_PREALLOC_SIZE(list);
185    return list;
186
187  onError:
188    Py_DECREF(list);
189    return NULL;
190}
191
192Py_LOCAL_INLINE(PyObject *)
193STRINGLIB(rsplit_whitespace)(PyObject* str_obj,
194                            const STRINGLIB_CHAR* str, Py_ssize_t str_len,
195                            Py_ssize_t maxcount)
196{
197    Py_ssize_t i, j, count=0;
198    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
199    PyObject *sub;
200
201    if (list == NULL)
202        return NULL;
203
204    i = j = str_len - 1;
205    while (maxcount-- > 0) {
206        while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
207            i--;
208        if (i < 0) break;
209        j = i; i--;
210        while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
211            i--;
212#if !STRINGLIB_MUTABLE
213        if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
214            /* No whitespace in str_obj, so just use it as list[0] */
215            Py_INCREF(str_obj);
216            PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
217            count++;
218            break;
219        }
220#endif
221        SPLIT_ADD(str, i + 1, j + 1);
222    }
223
224    if (i >= 0) {
225        /* Only occurs when maxcount was reached */
226        /* Skip any remaining whitespace and copy to beginning of string */
227        while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
228            i--;
229        if (i >= 0)
230            SPLIT_ADD(str, 0, i + 1);
231    }
232    FIX_PREALLOC_SIZE(list);
233    if (PyList_Reverse(list) < 0)
234        goto onError;
235    return list;
236
237  onError:
238    Py_DECREF(list);
239    return NULL;
240}
241
242Py_LOCAL_INLINE(PyObject *)
243STRINGLIB(rsplit_char)(PyObject* str_obj,
244                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
245                      const STRINGLIB_CHAR ch,
246                      Py_ssize_t maxcount)
247{
248    Py_ssize_t i, j, count=0;
249    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
250    PyObject *sub;
251
252    if (list == NULL)
253        return NULL;
254
255    i = j = str_len - 1;
256    while ((i >= 0) && (maxcount-- > 0)) {
257        for(; i >= 0; i--) {
258            if (str[i] == ch) {
259                SPLIT_ADD(str, i + 1, j + 1);
260                j = i = i - 1;
261                break;
262            }
263        }
264    }
265#if !STRINGLIB_MUTABLE
266    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
267        /* ch not in str_obj, so just use str_obj as list[0] */
268        Py_INCREF(str_obj);
269        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
270        count++;
271    } else
272#endif
273    if (j >= -1) {
274        SPLIT_ADD(str, 0, j + 1);
275    }
276    FIX_PREALLOC_SIZE(list);
277    if (PyList_Reverse(list) < 0)
278        goto onError;
279    return list;
280
281  onError:
282    Py_DECREF(list);
283    return NULL;
284}
285
286Py_LOCAL_INLINE(PyObject *)
287STRINGLIB(rsplit)(PyObject* str_obj,
288                 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
289                 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
290                 Py_ssize_t maxcount)
291{
292    Py_ssize_t j, pos, count=0;
293    PyObject *list, *sub;
294
295    if (sep_len == 0) {
296        PyErr_SetString(PyExc_ValueError, "empty separator");
297        return NULL;
298    }
299    else if (sep_len == 1)
300        return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount);
301
302    list = PyList_New(PREALLOC_SIZE(maxcount));
303    if (list == NULL)
304        return NULL;
305
306    j = str_len;
307    while (maxcount-- > 0) {
308        pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH);
309        if (pos < 0)
310            break;
311        SPLIT_ADD(str, pos + sep_len, j);
312        j = pos;
313    }
314#if !STRINGLIB_MUTABLE
315    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
316        /* No match in str_obj, so just use it as list[0] */
317        Py_INCREF(str_obj);
318        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
319        count++;
320    } else
321#endif
322    {
323        SPLIT_ADD(str, 0, j);
324    }
325    FIX_PREALLOC_SIZE(list);
326    if (PyList_Reverse(list) < 0)
327        goto onError;
328    return list;
329
330  onError:
331    Py_DECREF(list);
332    return NULL;
333}
334
335Py_LOCAL_INLINE(PyObject *)
336STRINGLIB(splitlines)(PyObject* str_obj,
337                     const STRINGLIB_CHAR* str, Py_ssize_t str_len,
338                     int keepends)
339{
340    /* This does not use the preallocated list because splitlines is
341       usually run with hundreds of newlines.  The overhead of
342       switching between PyList_SET_ITEM and append causes about a
343       2-3% slowdown for that common case.  A smarter implementation
344       could move the if check out, so the SET_ITEMs are done first
345       and the appends only done when the prealloc buffer is full.
346       That's too much work for little gain.*/
347
348    Py_ssize_t i;
349    Py_ssize_t j;
350    PyObject *list = PyList_New(0);
351    PyObject *sub;
352
353    if (list == NULL)
354        return NULL;
355
356    for (i = j = 0; i < str_len; ) {
357        Py_ssize_t eol;
358
359        /* Find a line and append it */
360        while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
361            i++;
362
363        /* Skip the line break reading CRLF as one line break */
364        eol = i;
365        if (i < str_len) {
366            if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
367                i += 2;
368            else
369                i++;
370            if (keepends)
371                eol = i;
372        }
373#if !STRINGLIB_MUTABLE
374        if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
375            /* No linebreak in str_obj, so just use it as list[0] */
376            if (PyList_Append(list, str_obj))
377                goto onError;
378            break;
379        }
380#endif
381        SPLIT_APPEND(str, j, eol);
382        j = i;
383    }
384    return list;
385
386  onError:
387    Py_DECREF(list);
388    return NULL;
389}
390
391