author    Matti Picus <matti.picus@gmail.com>    2022-02-14 11:16:18 +0200
committer GitHub <noreply@github.com>            2022-02-14 11:16:18 +0200
commit    ee1722d287fb32a724ac3f5807731e36cbbc5567 (patch)
tree      02c1f285dda3128dfebd5c7644f8f00cedb91c9e
parent    3e8dedab8c126636013794f8d1130476f086be0a (diff)
parent    6fd9670a3e804d4d1bd2bd9249748bc82e2a3770 (diff)
download  numpy-ee1722d287fb32a724ac3f5807731e36cbbc5567.tar.gz
Merge pull request #21025 from seberg/tokenizer-cpp
MAINT: Use C++ for tokenizer unicode-kind templating
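For context: this merge replaces NumPy's string-substitution templating in `.c.src` files (the `/**begin repeat ... @type@ ... **end repeat**/` blocks visible in the diff) with ordinary C++ function templates, instantiated once per unicode code-unit width and dispatched at runtime on the buffer kind. The following is a minimal self-contained sketch of that pattern only, not NumPy's actual code; the `ucs1_t`/`ucs2_t`/`ucs4_t` typedefs and both functions are hypothetical stand-ins so the example compiles without Python.h (the real code uses `Py_UCS1`/`Py_UCS2`/`Py_UCS4` and `PyUnicode_*BYTE_KIND`).

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for CPython's Py_UCS1/Py_UCS2/Py_UCS4 code-unit types.
typedef std::uint8_t  ucs1_t;
typedef std::uint16_t ucs2_t;
typedef std::uint32_t ucs4_t;

// One definition covers all three code-unit widths; the compiler
// instantiates a copy per type, replacing the old @type@ substitution.
template <typename UCS>
static std::size_t count_code_units(const UCS *start, const UCS *end)
{
    return (std::size_t)(end - start);
}

// Runtime dispatch on the storage width (1, 2 or 4 bytes per unit),
// mirroring the unicode_kind branches in tokenize() in the diff below.
static std::size_t count_dispatch(const void *start, const void *end, int kind)
{
    if (kind == 1) {
        return count_code_units((const ucs1_t *)start, (const ucs1_t *)end);
    }
    if (kind == 2) {
        return count_code_units((const ucs2_t *)start, (const ucs2_t *)end);
    }
    return count_code_units((const ucs4_t *)start, (const ucs4_t *)end);
}

int main()
{
    ucs2_t buf[3] = {'a', 'b', 'c'};
    std::printf("%zu\n", count_dispatch(buf, buf + 3, 2));  /* prints 3 */
    return 0;
}

The payoff is the same as with the `.c.src` preprocessing (one source of truth, three generated instantiations) but with ordinary tooling: the compiler, not a custom template expander, produces the per-width copies.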
-rw-r--r--  numpy/core/setup.py                                                  2
-rw-r--r--  numpy/core/src/multiarray/textreading/growth.h                       8
-rw-r--r--  numpy/core/src/multiarray/textreading/parser_config.h                8
-rw-r--r--  numpy/core/src/multiarray/textreading/stream.h                       8
-rw-r--r--  numpy/core/src/multiarray/textreading/tokenize.cpp                  69
            (renamed from numpy/core/src/multiarray/textreading/tokenize.c.src)
-rw-r--r--  numpy/core/src/multiarray/textreading/tokenize.h                     8
6 files changed, 61 insertions(+), 42 deletions(-)
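The four header diffs below all apply the same mechanical fix: tokenize.cpp is now compiled as C++, but the functions it calls (and the ones it exports) are implemented and consumed as C, so each header wraps its declarations in extern "C" guards to keep C linkage and avoid name mangling. The shape of the pattern, with a hypothetical header and function name:

#ifndef EXAMPLE_H_
#define EXAMPLE_H_

#ifdef __cplusplus
extern "C" {          /* seen by a C++ compiler: declare with C linkage */
#endif

/* hypothetical declaration; left unmangled, so C and C++ objects link */
int example_function(int x);

#ifdef __cplusplus
}
#endif

#endif  /* EXAMPLE_H_ */

A plain C compiler never defines __cplusplus, so it sees only the bare declaration; the guards are invisible to existing C callers.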
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 219bca57c..f6b31075d 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -964,7 +964,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'textreading', 'rows.c'),
             join('src', 'multiarray', 'textreading', 'stream_pyobject.c'),
             join('src', 'multiarray', 'textreading', 'str_to_int.c'),
-            join('src', 'multiarray', 'textreading', 'tokenize.c.src'),
+            join('src', 'multiarray', 'textreading', 'tokenize.cpp'),
         ]
 
 #######################################################################
diff --git a/numpy/core/src/multiarray/textreading/growth.h b/numpy/core/src/multiarray/textreading/growth.h
index 237b77ad3..c7ebe3651 100644
--- a/numpy/core/src/multiarray/textreading/growth.h
+++ b/numpy/core/src/multiarray/textreading/growth.h
@@ -1,7 +1,15 @@
 #ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_
 #define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 NPY_NO_EXPORT npy_intp
 grow_size_and_multiply(npy_intp *size, npy_intp min_grow, npy_intp itemsize);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_ */
diff --git a/numpy/core/src/multiarray/textreading/parser_config.h b/numpy/core/src/multiarray/textreading/parser_config.h
index 00e911667..67b5c8483 100644
--- a/numpy/core/src/multiarray/textreading/parser_config.h
+++ b/numpy/core/src/multiarray/textreading/parser_config.h
@@ -4,6 +4,10 @@
 
 #include <stdbool.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 typedef struct {
     /*
      * Field delimiter character.
@@ -58,4 +62,8 @@ typedef struct {
 
 } parser_config;
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_PARSER_CONFIG_H_ */
diff --git a/numpy/core/src/multiarray/textreading/stream.h b/numpy/core/src/multiarray/textreading/stream.h
index 59bd14074..42ca654db 100644
--- a/numpy/core/src/multiarray/textreading/stream.h
+++ b/numpy/core/src/multiarray/textreading/stream.h
@@ -3,6 +3,10 @@
 
 #include <stdint.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * When getting the next line, we hope that the buffer provider can already
  * give some information about the newlines, because for Python iterables
@@ -38,4 +42,8 @@ typedef struct _stream {
         ((s)->stream_nextbuf((s), start, end, kind))
 #define stream_close(s)    ((s)->stream_close((s)))
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_ */
diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.cpp
index 6ddba3345..57bee1cdd 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.c.src
+++ b/numpy/core/src/multiarray/textreading/tokenize.cpp
@@ -1,11 +1,6 @@
 #include <Python.h>
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <string.h>
-
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
 #include "numpy/ndarraytypes.h"
 
@@ -15,7 +10,6 @@
 #include "textreading/parser_config.h"
 #include "textreading/growth.h"
 
-
 /*
     How parsing quoted fields works:
 
@@ -45,12 +39,10 @@
  */
 
 
-/**begin repeat
- * #type = Py_UCS1, Py_UCS2, Py_UCS4#
- */
+template <typename UCS>
 static NPY_INLINE int
-copy_to_field_buffer_@type@(tokenizer_state *ts,
-        const @type@ *chunk_start, const @type@ *chunk_end)
+copy_to_field_buffer(tokenizer_state *ts,
+        const UCS *chunk_start, const UCS *chunk_end)
 {
     npy_intp chunk_length = chunk_end - chunk_start;
     npy_intp size = chunk_length + ts->field_buffer_pos + 2;
@@ -62,8 +54,8 @@ copy_to_field_buffer_@type@(tokenizer_state *ts,
                 "line too long to handle while reading file.");
         return -1;
     }
-    Py_UCS4 *grown = PyMem_Realloc(ts->field_buffer, alloc_size);
-    if (grown == NULL) {
+    Py_UCS4 *grown = (Py_UCS4 *)PyMem_Realloc(ts->field_buffer, alloc_size);
+    if (grown == nullptr) {
         PyErr_NoMemory();
         return -1;
     }
@@ -79,7 +71,6 @@ copy_to_field_buffer_@type@(tokenizer_state *ts,
     ts->field_buffer_pos += chunk_length;
     return 0;
 }
-/**end repeat**/
 
 
 static NPY_INLINE int
@@ -99,8 +90,8 @@ add_field(tokenizer_state *ts)
                 "too many columns found; cannot read file.");
         return -1;
     }
-    field_info *fields = PyMem_Realloc(ts->fields, alloc_size);
-    if (fields == NULL) {
+    field_info *fields = (field_info *)PyMem_Realloc(ts->fields, alloc_size);
+    if (fields == nullptr) {
         PyErr_NoMemory();
         return -1;
     }
@@ -117,16 +108,13 @@ add_field(tokenizer_state *ts)
 }
 
 
-/**begin repeat
- * #kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND, PyUnicode_4BYTE_KIND#
- * #type = Py_UCS1, Py_UCS2, Py_UCS4#
- */
+template <typename UCS>
 static NPY_INLINE int
-tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
+tokenizer_core(tokenizer_state *ts, parser_config *const config)
 {
-    @type@ *pos = (@type@ *)ts->pos;
-    @type@ *stop = (@type@ *)ts->end;
-    @type@ *chunk_start;
+    UCS *pos = (UCS *)ts->pos;
+    UCS *stop = (UCS *)ts->end;
+    UCS *chunk_start;
 
     if (ts->state == TOKENIZE_CHECK_QUOTED) {
         /* before we can check for quotes, strip leading whitespace */
@@ -174,7 +162,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
                     break;
                 }
             }
-            if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+            if (copy_to_field_buffer(ts, chunk_start, pos) < 0) {
                 return -1;
             }
             pos++;
@@ -201,7 +189,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
                     break;
                 }
             }
-            if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+            if (copy_to_field_buffer(ts, chunk_start, pos) < 0) {
                 return -1;
             }
             pos++;
@@ -215,7 +203,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
                     break;
                 }
             }
-            if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+            if (copy_to_field_buffer(ts, chunk_start, pos) < 0) {
                 return -1;
            }
             pos++;
@@ -224,7 +212,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
         case TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE:
             if (*pos == config->quote) {
                 /* Copy the quote character directly from the config: */
-                if (copy_to_field_buffer_Py_UCS4(ts,
+                if (copy_to_field_buffer(ts,
                         &config->quote, &config->quote+1) < 0) {
                     return -1;
                 }
@@ -271,7 +259,6 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
     ts->pos = (char *)pos;
     return 0;
 }
-/**end repeat**/
 
 
 /*
@@ -308,7 +295,7 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
     ts->field_buffer_pos = 0;
     ts->num_fields = 0;
 
-    while (1) {
+    while (true) {
         /*
          * This loop adds new fields to the result (to make up a full row)
          * until the row ends (typically a line end or the file end)
@@ -352,14 +339,14 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
         }
         int status;
         if (ts->unicode_kind == PyUnicode_1BYTE_KIND) {
-            status = tokenizer_core_Py_UCS1(ts, config);
+            status = tokenizer_core<Py_UCS1>(ts, config);
         }
         else if (ts->unicode_kind == PyUnicode_2BYTE_KIND) {
-            status = tokenizer_core_Py_UCS2(ts, config);
+            status = tokenizer_core<Py_UCS2>(ts, config);
         }
         else {
             assert(ts->unicode_kind == PyUnicode_4BYTE_KIND);
-            status = tokenizer_core_Py_UCS4(ts, config);
+            status = tokenizer_core<Py_UCS4>(ts, config);
         }
         if (status < 0) {
             return -1;
@@ -408,11 +395,11 @@
 NPY_NO_EXPORT void
 tokenizer_clear(tokenizer_state *ts)
 {
     PyMem_FREE(ts->field_buffer);
-    ts->field_buffer = NULL;
+    ts->field_buffer = nullptr;
     ts->field_buffer_length = 0;
 
     PyMem_FREE(ts->fields);
-    ts->fields = NULL;
+    ts->fields = nullptr;
     ts->fields_size = 0;
 }
@@ -437,18 +424,18 @@ tokenizer_init(tokenizer_state *ts, parser_config *config)
     ts->num_fields = 0;
     ts->buf_state = 0;
 
-    ts->pos = NULL;
-    ts->end = NULL;
+    ts->pos = nullptr;
+    ts->end = nullptr;
 
-    ts->field_buffer = PyMem_Malloc(32 * sizeof(Py_UCS4));
-    if (ts->field_buffer == NULL) {
+    ts->field_buffer = (Py_UCS4 *)PyMem_Malloc(32 * sizeof(Py_UCS4));
+    if (ts->field_buffer == nullptr) {
         PyErr_NoMemory();
         return -1;
     }
     ts->field_buffer_length = 32;
 
-    ts->fields = PyMem_Malloc(4 * sizeof(*ts->fields));
-    if (ts->fields == NULL) {
+    ts->fields = (field_info *)PyMem_Malloc(4 * sizeof(*ts->fields));
+    if (ts->fields == nullptr) {
         PyErr_NoMemory();
         return -1;
     }
diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h
index fa10bb9b0..a78c6d936 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.h
+++ b/numpy/core/src/multiarray/textreading/tokenize.h
@@ -8,6 +8,10 @@
 #include "textreading/stream.h"
 #include "textreading/parser_config.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 
 typedef enum {
     /* Initialization of fields */
@@ -75,4 +79,8 @@ tokenizer_init(tokenizer_state *ts, parser_config *config);
 NPY_NO_EXPORT int
 tokenize(stream *s, tokenizer_state *ts, parser_config *const config);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ */
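One recurring mechanical change inside tokenize.cpp above deserves a note: C implicitly converts the void * returned by allocators, C++ does not, which is why every PyMem_Malloc/PyMem_Realloc result in the diff gains an explicit cast (and NULL becomes nullptr). A standalone illustration of the same language rule with plain malloc:

#include <cstdlib>

int main()
{
    /* int *p = std::malloc(4 * sizeof(int)); -- valid C, compile error in C++ */
    int *p = (int *)std::malloc(4 * sizeof(int));   /* explicit cast required */
    if (p == nullptr) {                             /* nullptr replaces NULL  */
        return 1;
    }
    std::free(p);
    return 0;
}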