author    Matti Picus <matti.picus@gmail.com>    2022-02-14 11:16:18 +0200
committer GitHub <noreply@github.com>            2022-02-14 11:16:18 +0200
commit    ee1722d287fb32a724ac3f5807731e36cbbc5567 (patch)
tree      02c1f285dda3128dfebd5c7644f8f00cedb91c9e
parent    3e8dedab8c126636013794f8d1130476f086be0a (diff)
parent    6fd9670a3e804d4d1bd2bd9249748bc82e2a3770 (diff)
download  numpy-ee1722d287fb32a724ac3f5807731e36cbbc5567.tar.gz
Merge pull request #21025 from seberg/tokenizer-cpp
MAINT: Use C++ for tokenizer unicode-kind templating
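For context: this merge replaces NumPy's string-substitution templating in `.c.src` files (the `/**begin repeat ... @type@ ... **end repeat**/` blocks visible in the diff) with ordinary C++ function templates, instantiated once per unicode code-unit width and dispatched at runtime on the buffer kind. The following is a minimal self-contained sketch of that pattern only, not NumPy's actual code; the `ucs1_t`/`ucs2_t`/`ucs4_t` typedefs and both functions are hypothetical stand-ins so the example compiles without Python.h (the real code uses `Py_UCS1`/`Py_UCS2`/`Py_UCS4` and `PyUnicode_*BYTE_KIND`).

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for CPython's Py_UCS1/Py_UCS2/Py_UCS4 code-unit types.
typedef std::uint8_t  ucs1_t;
typedef std::uint16_t ucs2_t;
typedef std::uint32_t ucs4_t;

// One definition covers all three code-unit widths; the compiler
// instantiates a copy per type, replacing the old @type@ substitution.
template <typename UCS>
static std::size_t count_code_units(const UCS *start, const UCS *end)
{
    return (std::size_t)(end - start);
}

// Runtime dispatch on the storage width (1, 2 or 4 bytes per unit),
// mirroring the unicode_kind branches in tokenize() in the diff below.
static std::size_t count_dispatch(const void *start, const void *end, int kind)
{
    if (kind == 1) {
        return count_code_units((const ucs1_t *)start, (const ucs1_t *)end);
    }
    if (kind == 2) {
        return count_code_units((const ucs2_t *)start, (const ucs2_t *)end);
    }
    return count_code_units((const ucs4_t *)start, (const ucs4_t *)end);
}

int main()
{
    ucs2_t buf[3] = {'a', 'b', 'c'};
    std::printf("%zu\n", count_dispatch(buf, buf + 3, 2));  /* prints 3 */
    return 0;
}

The payoff is the same as with the `.c.src` preprocessing (one source of truth, three generated instantiations) but with ordinary tooling: the compiler, not a custom template expander, produces the per-width copies.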
-rw-r--r--  numpy/core/setup.py                                                  2
-rw-r--r--  numpy/core/src/multiarray/textreading/growth.h                       8
-rw-r--r--  numpy/core/src/multiarray/textreading/parser_config.h                8
-rw-r--r--  numpy/core/src/multiarray/textreading/stream.h                       8
-rw-r--r--  numpy/core/src/multiarray/textreading/tokenize.cpp                  69
            (renamed from numpy/core/src/multiarray/textreading/tokenize.c.src)
-rw-r--r--  numpy/core/src/multiarray/textreading/tokenize.h                     8
6 files changed, 61 insertions(+), 42 deletions(-)
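The four header diffs below all apply the same mechanical fix: tokenize.cpp is now compiled as C++, but the functions it calls (and the ones it exports) are implemented and consumed as C, so each header wraps its declarations in extern "C" guards to keep C linkage and avoid name mangling. The shape of the pattern, with a hypothetical header and function name:

#ifndef EXAMPLE_H_
#define EXAMPLE_H_

#ifdef __cplusplus
extern "C" {          /* seen by a C++ compiler: declare with C linkage */
#endif

/* hypothetical declaration; left unmangled, so C and C++ objects link */
int example_function(int x);

#ifdef __cplusplus
}
#endif

#endif  /* EXAMPLE_H_ */

A plain C compiler never defines __cplusplus, so it sees only the bare declaration; the guards are invisible to existing C callers.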
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 219bca57c..f6b31075d 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -964,7 +964,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'textreading', 'rows.c'),
             join('src', 'multiarray', 'textreading', 'stream_pyobject.c'),
             join('src', 'multiarray', 'textreading', 'str_to_int.c'),
-            join('src', 'multiarray', 'textreading', 'tokenize.c.src'),
+            join('src', 'multiarray', 'textreading', 'tokenize.cpp'),
         ]
 
 #######################################################################
diff --git a/numpy/core/src/multiarray/textreading/growth.h b/numpy/core/src/multiarray/textreading/growth.h
index 237b77ad3..c7ebe3651 100644
--- a/numpy/core/src/multiarray/textreading/growth.h
+++ b/numpy/core/src/multiarray/textreading/growth.h
@@ -1,7 +1,15 @@
 #ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_
 #define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 NPY_NO_EXPORT npy_intp
 grow_size_and_multiply(npy_intp *size, npy_intp min_grow, npy_intp itemsize);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_ */
diff --git a/numpy/core/src/multiarray/textreading/parser_config.h b/numpy/core/src/multiarray/textreading/parser_config.h
index 00e911667..67b5c8483 100644
--- a/numpy/core/src/multiarray/textreading/parser_config.h
+++ b/numpy/core/src/multiarray/textreading/parser_config.h
@@ -4,6 +4,10 @@
 
 #include <stdbool.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 typedef struct {
     /*
      * Field delimiter character.
@@ -58,4 +62,8 @@ typedef struct {
 
 } parser_config;
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_PARSER_CONFIG_H_ */
diff --git a/numpy/core/src/multiarray/textreading/stream.h b/numpy/core/src/multiarray/textreading/stream.h
index 59bd14074..42ca654db 100644
--- a/numpy/core/src/multiarray/textreading/stream.h
+++ b/numpy/core/src/multiarray/textreading/stream.h
@@ -3,6 +3,10 @@
 
 #include <stdint.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * When getting the next line, we hope that the buffer provider can already
  * give some information about the newlines, because for Python iterables
@@ -38,4 +42,8 @@ typedef struct _stream {
         ((s)->stream_nextbuf((s), start, end, kind))
 #define stream_close(s)    ((s)->stream_close((s)))
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_ */
diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.cpp
index 6ddba3345..57bee1cdd 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.c.src
+++ b/numpy/core/src/multiarray/textreading/tokenize.cpp
@@ -1,11 +1,6 @@
 #include <Python.h>
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <string.h>
-
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
 #include "numpy/ndarraytypes.h"
 
@@ -15,7 +10,6 @@
 #include "textreading/parser_config.h"
 #include "textreading/growth.h"
 
-
 /*
     How parsing quoted fields works:
 
@@ -45,12 +39,10 @@
  */
 
 
-/**begin repeat
- * #type = Py_UCS1, Py_UCS2, Py_UCS4#
- */
+template <typename UCS>
 static NPY_INLINE int
-copy_to_field_buffer_@type@(tokenizer_state *ts,
-        const @type@ *chunk_start, const @type@ *chunk_end)
+copy_to_field_buffer(tokenizer_state *ts,
+        const UCS *chunk_start, const UCS *chunk_end)
 {
     npy_intp chunk_length = chunk_end - chunk_start;
     npy_intp size = chunk_length + ts->field_buffer_pos + 2;
@@ -62,8 +54,8 @@ copy_to_field_buffer_@type@(tokenizer_state *ts,
                 "line too long to handle while reading file.");
         return -1;
     }
-    Py_UCS4 *grown = PyMem_Realloc(ts->field_buffer, alloc_size);
-    if (grown == NULL) {
+    Py_UCS4 *grown = (Py_UCS4 *)PyMem_Realloc(ts->field_buffer, alloc_size);
+    if (grown == nullptr) {
         PyErr_NoMemory();
         return -1;
     }
@@ -79,7 +71,6 @@ copy_to_field_buffer_@type@(tokenizer_state *ts,
     ts->field_buffer_pos += chunk_length;
     return 0;
 }
-/**end repeat**/
 
 
 static NPY_INLINE int
@@ -99,8 +90,8 @@ add_field(tokenizer_state *ts)
                 "too many columns found; cannot read file.");
         return -1;
     }
-    field_info *fields = PyMem_Realloc(ts->fields, alloc_size);
-    if (fields == NULL) {
+    field_info *fields = (field_info *)PyMem_Realloc(ts->fields, alloc_size);
+    if (fields == nullptr) {
         PyErr_NoMemory();
         return -1;
     }
@@ -117,16 +108,13 @@ add_field(tokenizer_state *ts)
 }
 
 
-/**begin repeat
- * #kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND, PyUnicode_4BYTE_KIND#
- * #type = Py_UCS1, Py_UCS2, Py_UCS4#
- */
+template <typename UCS>
 static NPY_INLINE int
-tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
+tokenizer_core(tokenizer_state *ts, parser_config *const config)
 {
-    @type@ *pos = (@type@ *)ts->pos;
-    @type@ *stop = (@type@ *)ts->end;
-    @type@ *chunk_start;
+    UCS *pos = (UCS *)ts->pos;
+    UCS *stop = (UCS *)ts->end;
+    UCS *chunk_start;
 
     if (ts->state == TOKENIZE_CHECK_QUOTED) {
         /* before we can check for quotes, strip leading whitespace */
@@ -174,7 +162,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
                     break;
                 }
             }
-            if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+            if (copy_to_field_buffer(ts, chunk_start, pos) < 0) {
                 return -1;
             }
             pos++;
@@ -201,7 +189,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
                     break;
                 }
             }
-            if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+            if (copy_to_field_buffer(ts, chunk_start, pos) < 0) {
                 return -1;
             }
             pos++;
@@ -215,7 +203,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
                     break;
                 }
             }
-            if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+            if (copy_to_field_buffer(ts, chunk_start, pos) < 0) {
                 return -1;
            }
             pos++;
@@ -224,7 +212,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
         case TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE:
             if (*pos == config->quote) {
                 /* Copy the quote character directly from the config: */
-                if (copy_to_field_buffer_Py_UCS4(ts,
+                if (copy_to_field_buffer(ts,
                         &config->quote, &config->quote+1) < 0) {
                     return -1;
                 }
@@ -271,7 +259,6 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
     ts->pos = (char *)pos;
     return 0;
 }
-/**end repeat**/
 
 
 /*
@@ -308,7 +295,7 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
     ts->field_buffer_pos = 0;
     ts->num_fields = 0;
 
-    while (1) {
+    while (true) {
         /*
          * This loop adds new fields to the result (to make up a full row)
          * until the row ends (typically a line end or the file end)
@@ -352,14 +339,14 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
         }
         int status;
         if (ts->unicode_kind == PyUnicode_1BYTE_KIND) {
-            status = tokenizer_core_Py_UCS1(ts, config);
+            status = tokenizer_core<Py_UCS1>(ts, config);
         }
         else if (ts->unicode_kind == PyUnicode_2BYTE_KIND) {
-            status = tokenizer_core_Py_UCS2(ts, config);
+            status = tokenizer_core<Py_UCS2>(ts, config);
         }
         else {
             assert(ts->unicode_kind == PyUnicode_4BYTE_KIND);
-            status = tokenizer_core_Py_UCS4(ts, config);
+            status = tokenizer_core<Py_UCS4>(ts, config);
         }
         if (status < 0) {
             return -1;
@@ -408,11 +395,11 @@
 NPY_NO_EXPORT void
 tokenizer_clear(tokenizer_state *ts)
 {
     PyMem_FREE(ts->field_buffer);
-    ts->field_buffer = NULL;
+    ts->field_buffer = nullptr;
     ts->field_buffer_length = 0;
 
     PyMem_FREE(ts->fields);
-    ts->fields = NULL;
+    ts->fields = nullptr;
     ts->fields_size = 0;
 }
@@ -437,18 +424,18 @@ tokenizer_init(tokenizer_state *ts, parser_config *config)
     ts->num_fields = 0;
     ts->buf_state = 0;
 
-    ts->pos = NULL;
-    ts->end = NULL;
+    ts->pos = nullptr;
+    ts->end = nullptr;
 
-    ts->field_buffer = PyMem_Malloc(32 * sizeof(Py_UCS4));
-    if (ts->field_buffer == NULL) {
+    ts->field_buffer = (Py_UCS4 *)PyMem_Malloc(32 * sizeof(Py_UCS4));
+    if (ts->field_buffer == nullptr) {
         PyErr_NoMemory();
         return -1;
     }
     ts->field_buffer_length = 32;
 
-    ts->fields = PyMem_Malloc(4 * sizeof(*ts->fields));
-    if (ts->fields == NULL) {
+    ts->fields = (field_info *)PyMem_Malloc(4 * sizeof(*ts->fields));
+    if (ts->fields == nullptr) {
         PyErr_NoMemory();
         return -1;
     }
diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h
index fa10bb9b0..a78c6d936 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.h
+++ b/numpy/core/src/multiarray/textreading/tokenize.h
@@ -8,6 +8,10 @@
 #include "textreading/stream.h"
 #include "textreading/parser_config.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 
 typedef enum {
     /* Initialization of fields */
@@ -75,4 +79,8 @@ tokenizer_init(tokenizer_state *ts, parser_config *config);
 NPY_NO_EXPORT int
 tokenize(stream *s, tokenizer_state *ts, parser_config *const config);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ */
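One recurring mechanical change inside tokenize.cpp above deserves a note: C implicitly converts the void * returned by allocators, C++ does not, which is why every PyMem_Malloc/PyMem_Realloc result in the diff gains an explicit cast (and NULL becomes nullptr). A standalone illustration of the same language rule with plain malloc:

#include <cstdlib>

int main()
{
    /* int *p = std::malloc(4 * sizeof(int)); -- valid C, compile error in C++ */
    int *p = (int *)std::malloc(4 * sizeof(int));   /* explicit cast required */
    if (p == nullptr) {                             /* nullptr replaces NULL  */
        return 1;
    }
    std::free(p);
    return 0;
}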