author    Matti Picus <matti.picus@gmail.com>  2022-02-14 11:16:18 +0200
committer GitHub <noreply@github.com>  2022-02-14 11:16:18 +0200
commit    ee1722d287fb32a724ac3f5807731e36cbbc5567 (patch)
tree      02c1f285dda3128dfebd5c7644f8f00cedb91c9e
parent    3e8dedab8c126636013794f8d1130476f086be0a (diff)
parent    6fd9670a3e804d4d1bd2bd9249748bc82e2a3770 (diff)
download  numpy-ee1722d287fb32a724ac3f5807731e36cbbc5567.tar.gz
Merge pull request #21025 from seberg/tokenizer-cpp
MAINT: Use C++ for tokenizer unicode-kind templating
-rw-r--r--  numpy/core/setup.py                                      2
-rw-r--r--  numpy/core/src/multiarray/textreading/growth.h           8
-rw-r--r--  numpy/core/src/multiarray/textreading/parser_config.h    8
-rw-r--r--  numpy/core/src/multiarray/textreading/stream.h           8
-rw-r--r--  numpy/core/src/multiarray/textreading/tokenize.cpp      69
            (renamed from numpy/core/src/multiarray/textreading/tokenize.c.src)
-rw-r--r--  numpy/core/src/multiarray/textreading/tokenize.h         8
6 files changed, 61 insertions, 42 deletions
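
For context, this merge replaces NumPy's .c.src text-substitution templating with C++ function templates: the build-time preprocessing step that expanded each /**begin repeat**/ block into one copy of the function per unicode width is no longer needed, because the C++ compiler instantiates the template itself. A minimal sketch of the two styles, with the bodies elided (the full functions appear in the diff below):

    /* old .c.src style: a build-time preprocessor expands this into
     * copy_to_field_buffer_Py_UCS1, _Py_UCS2 and _Py_UCS4 */
    /**begin repeat
     * #type = Py_UCS1, Py_UCS2, Py_UCS4#
     */
    static NPY_INLINE int
    copy_to_field_buffer_@type@(tokenizer_state *ts,
            const @type@ *chunk_start, const @type@ *chunk_end);
    /**end repeat**/

    /* new C++ style: one definition, instantiated per character width */
    template <typename UCS>
    static NPY_INLINE int
    copy_to_field_buffer(tokenizer_state *ts,
            const UCS *chunk_start, const UCS *chunk_end);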
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 219bca57c..f6b31075d 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -964,7 +964,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'textreading', 'rows.c'),
join('src', 'multiarray', 'textreading', 'stream_pyobject.c'),
join('src', 'multiarray', 'textreading', 'str_to_int.c'),
- join('src', 'multiarray', 'textreading', 'tokenize.c.src'),
+ join('src', 'multiarray', 'textreading', 'tokenize.cpp'),
]
#######################################################################
diff --git a/numpy/core/src/multiarray/textreading/growth.h b/numpy/core/src/multiarray/textreading/growth.h
index 237b77ad3..c7ebe3651 100644
--- a/numpy/core/src/multiarray/textreading/growth.h
+++ b/numpy/core/src/multiarray/textreading/growth.h
@@ -1,7 +1,15 @@
#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_
#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_
+#ifdef __cplusplus
+extern "C" {
+#endif
+
NPY_NO_EXPORT npy_intp
grow_size_and_multiply(npy_intp *size, npy_intp min_grow, npy_intp itemsize);
+#ifdef __cplusplus
+}
+#endif
+
#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_ */
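
Note: the extern "C" guards above (and the identical guards added to the other headers in this commit) are needed because these headers are now included from tokenize.cpp as well as from C sources. C++ mangles function names by default, so without C linkage the declarations seen by the C++ compiler would not link against the definitions compiled as C. A minimal sketch of the pattern, using a hypothetical function name for illustration:

    #ifdef __cplusplus
    extern "C" {            /* request C linkage (no name mangling) */
    #endif

    /* hypothetical declaration; defined in a plain C source file */
    int parse_row(const char *line);

    #ifdef __cplusplus
    }                       /* close the extern "C" block */
    #endif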
diff --git a/numpy/core/src/multiarray/textreading/parser_config.h b/numpy/core/src/multiarray/textreading/parser_config.h
index 00e911667..67b5c8483 100644
--- a/numpy/core/src/multiarray/textreading/parser_config.h
+++ b/numpy/core/src/multiarray/textreading/parser_config.h
@@ -4,6 +4,10 @@
#include <stdbool.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
typedef struct {
/*
* Field delimiter character.
@@ -58,4 +62,8 @@ typedef struct {
} parser_config;
+#ifdef __cplusplus
+}
+#endif
+
#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_PARSER_CONFIG_H_ */
diff --git a/numpy/core/src/multiarray/textreading/stream.h b/numpy/core/src/multiarray/textreading/stream.h
index 59bd14074..42ca654db 100644
--- a/numpy/core/src/multiarray/textreading/stream.h
+++ b/numpy/core/src/multiarray/textreading/stream.h
@@ -3,6 +3,10 @@
#include <stdint.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
/*
* When getting the next line, we hope that the buffer provider can already
* give some information about the newlines, because for Python iterables
@@ -38,4 +42,8 @@ typedef struct _stream {
((s)->stream_nextbuf((s), start, end, kind))
#define stream_close(s) ((s)->stream_close((s)))
+#ifdef __cplusplus
+}
+#endif
+
#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_ */
diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.cpp
index 6ddba3345..57bee1cdd 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.c.src
+++ b/numpy/core/src/multiarray/textreading/tokenize.cpp
@@ -1,11 +1,6 @@
#include <Python.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <string.h>
-
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
#include "numpy/ndarraytypes.h"
@@ -15,7 +10,6 @@
#include "textreading/parser_config.h"
#include "textreading/growth.h"
-
/*
How parsing quoted fields works:
@@ -45,12 +39,10 @@
*/
-/**begin repeat
- * #type = Py_UCS1, Py_UCS2, Py_UCS4#
- */
+template <typename UCS>
static NPY_INLINE int
-copy_to_field_buffer_@type@(tokenizer_state *ts,
- const @type@ *chunk_start, const @type@ *chunk_end)
+copy_to_field_buffer(tokenizer_state *ts,
+ const UCS *chunk_start, const UCS *chunk_end)
{
npy_intp chunk_length = chunk_end - chunk_start;
npy_intp size = chunk_length + ts->field_buffer_pos + 2;
@@ -62,8 +54,8 @@ copy_to_field_buffer_@type@(tokenizer_state *ts,
"line too long to handle while reading file.");
return -1;
}
- Py_UCS4 *grown = PyMem_Realloc(ts->field_buffer, alloc_size);
- if (grown == NULL) {
+ Py_UCS4 *grown = (Py_UCS4 *)PyMem_Realloc(ts->field_buffer, alloc_size);
+ if (grown == nullptr) {
PyErr_NoMemory();
return -1;
}
@@ -79,7 +71,6 @@ copy_to_field_buffer_@type@(tokenizer_state *ts,
ts->field_buffer_pos += chunk_length;
return 0;
}
-/**end repeat**/
static NPY_INLINE int
@@ -99,8 +90,8 @@ add_field(tokenizer_state *ts)
"too many columns found; cannot read file.");
return -1;
}
- field_info *fields = PyMem_Realloc(ts->fields, alloc_size);
- if (fields == NULL) {
+ field_info *fields = (field_info *)PyMem_Realloc(ts->fields, alloc_size);
+ if (fields == nullptr) {
PyErr_NoMemory();
return -1;
}
@@ -117,16 +108,13 @@ add_field(tokenizer_state *ts)
}
-/**begin repeat
- * #kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND, PyUnicode_4BYTE_KIND#
- * #type = Py_UCS1, Py_UCS2, Py_UCS4#
- */
+template <typename UCS>
static NPY_INLINE int
-tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
+tokenizer_core(tokenizer_state *ts, parser_config *const config)
{
- @type@ *pos = (@type@ *)ts->pos;
- @type@ *stop = (@type@ *)ts->end;
- @type@ *chunk_start;
+ UCS *pos = (UCS *)ts->pos;
+ UCS *stop = (UCS *)ts->end;
+ UCS *chunk_start;
if (ts->state == TOKENIZE_CHECK_QUOTED) {
/* before we can check for quotes, strip leading whitespace */
@@ -174,7 +162,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
break;
}
}
- if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+ if (copy_to_field_buffer(ts, chunk_start, pos) < 0) {
return -1;
}
pos++;
@@ -201,7 +189,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
break;
}
}
- if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+ if (copy_to_field_buffer(ts, chunk_start, pos) < 0) {
return -1;
}
pos++;
@@ -215,7 +203,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
break;
}
}
- if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+ if (copy_to_field_buffer(ts, chunk_start, pos) < 0) {
return -1;
}
pos++;
@@ -224,7 +212,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
case TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE:
if (*pos == config->quote) {
/* Copy the quote character directly from the config: */
- if (copy_to_field_buffer_Py_UCS4(ts,
+ if (copy_to_field_buffer(ts,
&config->quote, &config->quote+1) < 0) {
return -1;
}
@@ -271,7 +259,6 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
ts->pos = (char *)pos;
return 0;
}
-/**end repeat**/
/*
@@ -308,7 +295,7 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
ts->field_buffer_pos = 0;
ts->num_fields = 0;
- while (1) {
+ while (true) {
/*
* This loop adds new fields to the result (to make up a full row)
* until the row ends (typically a line end or the file end)
@@ -352,14 +339,14 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
}
int status;
if (ts->unicode_kind == PyUnicode_1BYTE_KIND) {
- status = tokenizer_core_Py_UCS1(ts, config);
+ status = tokenizer_core<Py_UCS1>(ts, config);
}
else if (ts->unicode_kind == PyUnicode_2BYTE_KIND) {
- status = tokenizer_core_Py_UCS2(ts, config);
+ status = tokenizer_core<Py_UCS2>(ts, config);
}
else {
assert(ts->unicode_kind == PyUnicode_4BYTE_KIND);
- status = tokenizer_core_Py_UCS4(ts, config);
+ status = tokenizer_core<Py_UCS4>(ts, config);
}
if (status < 0) {
return -1;
@@ -408,11 +395,11 @@ NPY_NO_EXPORT void
tokenizer_clear(tokenizer_state *ts)
{
PyMem_FREE(ts->field_buffer);
- ts->field_buffer = NULL;
+ ts->field_buffer = nullptr;
ts->field_buffer_length = 0;
PyMem_FREE(ts->fields);
- ts->fields = NULL;
+ ts->fields = nullptr;
ts->fields_size = 0;
}
@@ -437,18 +424,18 @@ tokenizer_init(tokenizer_state *ts, parser_config *config)
ts->num_fields = 0;
ts->buf_state = 0;
- ts->pos = NULL;
- ts->end = NULL;
+ ts->pos = nullptr;
+ ts->end = nullptr;
- ts->field_buffer = PyMem_Malloc(32 * sizeof(Py_UCS4));
- if (ts->field_buffer == NULL) {
+ ts->field_buffer = (Py_UCS4 *)PyMem_Malloc(32 * sizeof(Py_UCS4));
+ if (ts->field_buffer == nullptr) {
PyErr_NoMemory();
return -1;
}
ts->field_buffer_length = 32;
- ts->fields = PyMem_Malloc(4 * sizeof(*ts->fields));
- if (ts->fields == NULL) {
+ ts->fields = (field_info *)PyMem_Malloc(4 * sizeof(*ts->fields));
+ if (ts->fields == nullptr) {
PyErr_NoMemory();
return -1;
}
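
Note on the dispatch above: the runtime unicode_kind check selects an explicit instantiation (tokenizer_core<Py_UCS1>, <Py_UCS2>, or <Py_UCS4>), while the calls inside tokenizer_core rely on template argument deduction, which is why the old _@type@ name suffixes disappear. A minimal sketch of the two call styles, assuming the signatures shown in the diff:

    /* explicit instantiation: UCS is named at the call site */
    status = tokenizer_core<Py_UCS2>(ts, config);

    /* deduced instantiation: UCS is inferred from the pointer
     * arguments; here &config->quote is Py_UCS4*, so UCS = Py_UCS4,
     * replacing the old explicit copy_to_field_buffer_Py_UCS4 call */
    copy_to_field_buffer(ts, &config->quote, &config->quote + 1);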
diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h
index fa10bb9b0..a78c6d936 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.h
+++ b/numpy/core/src/multiarray/textreading/tokenize.h
@@ -8,6 +8,10 @@
#include "textreading/stream.h"
#include "textreading/parser_config.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
typedef enum {
/* Initialization of fields */
@@ -75,4 +79,8 @@ tokenizer_init(tokenizer_state *ts, parser_config *config);
NPY_NO_EXPORT int
tokenize(stream *s, tokenizer_state *ts, parser_config *const config);
+#ifdef __cplusplus
+}
+#endif
+
#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ */