summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--benchmarks/benchmarks/bench_io.py177
1 files changed, 177 insertions, 0 deletions
diff --git a/benchmarks/benchmarks/bench_io.py b/benchmarks/benchmarks/bench_io.py
index 782d4ab30..ce30c4345 100644
--- a/benchmarks/benchmarks/bench_io.py
+++ b/benchmarks/benchmarks/bench_io.py
@@ -3,6 +3,7 @@ from __future__ import absolute_import, division, print_function
from .common import Benchmark, get_squares
import numpy as np
+from io import StringIO
class Copy(Benchmark):
@@ -62,3 +63,179 @@ class Savez(Benchmark):
def time_vb_savez_squares(self):
    # Time writing self.squares to an .npz archive in the current
    # working directory.
    archive_name = 'tmp.npz'
    np.savez(archive_name, self.squares)
+
class LoadtxtCSVComments(Benchmark):
    # Times np.loadtxt on in-memory CSV text in which every row ends
    # with a trailing '#' comment. Parametrized over the row count.

    params = [10, 100, 10000, 100000]
    param_names = ['num_lines']

    def setup(self, num_lines):
        # Build `num_lines` identical rows and wrap them in a StringIO.
        #
        # timeit only runs setup() between repeats, not between the
        # iterations inside a repeat, so this single buffer is shared
        # by many timed calls and has to be rewound in the benchmark
        # body itself.
        rows = [u'1,2,3 # comment'] * num_lines
        csv_text = u'\n'.join(rows)
        self.data_comments = StringIO(csv_text)

    def time_comment_loadtxt_csv(self, num_lines):
        # Benchmark comment handling while loading CSV data
        # (modelled on a similar pandas read_csv benchmark).
        buf = self.data_comments
        np.loadtxt(buf, delimiter=u',')
        # Rewind for the next iteration; this unavoidably adds a
        # little noise to the measured time.
        buf.seek(0)
+
class LoadtxtCSVdtypes(Benchmark):
    # Times np.loadtxt when the CSV fields are parsed / cast to a
    # range of dtypes. Parametrized over (dtype, number of rows).

    params = (
        ['float32', 'float64', 'int32', 'int64',
         'complex128', 'str', 'object'],
        [10, 100, 10000, 100000],
    )
    param_names = ['dtype', 'num_lines']

    def setup(self, dtype, num_lines):
        # Same three-field row repeated `num_lines` times; the dtype
        # parameter is only consumed by the timed method.
        rows = [u'5, 7, 888'] * num_lines
        self.csv_data = StringIO(u'\n'.join(rows))

    def time_loadtxt_dtypes_csv(self, dtype, num_lines):
        # Load the CSV buffer as an array of the requested dtype.
        buf = self.csv_data
        np.loadtxt(buf, delimiter=u',', dtype=dtype)
        # Reading consumes the StringIO, so rewind it for the next
        # timing iteration.
        buf.seek(0)
+
class LoadtxtCSVStructured(Benchmark):
    # Times np.loadtxt reading CSV rows into a structured dtype
    # with five named fields.

    def setup(self):
        # 50000 copies of one five-field row, held in a StringIO.
        row = u"M, 21, 72, X, 155"
        line_count = 50000
        self.csv_data = StringIO(u'\n'.join([row] * line_count))

    def time_loadtxt_csv_struct_dtype(self):
        # One (name, type) pair per CSV column.
        record_dtype = [('category_1', 'S1'),
                        ('category_2', 'i4'),
                        ('category_3', 'f8'),
                        ('category_4', 'S1'),
                        ('category_5', 'f8')]
        buf = self.csv_data
        np.loadtxt(buf, delimiter=u',', dtype=record_dtype)
        # The buffer must be rewound between iterations of a repeat.
        buf.seek(0)
+
+
class LoadtxtCSVSkipRows(Benchmark):
    # Times the row-skipping path of np.loadtxt on a CSV file on
    # disk; the pandas asv suite contains a comparable benchmark.

    params = [0, 500, 10000]
    param_names = ['skiprows']

    def setup(self, skiprows):
        # Write a seeded 100000 x 3 random array to 'test_array.csv'
        # so every run reads the same data.
        np.random.seed(123)
        random_rows = np.random.rand(100000, 3)
        self.fname = 'test_array.csv'
        np.savetxt(self.fname, random_rows, delimiter=',')

    def time_skiprows_csv(self, skiprows):
        # Read the file back, skipping the first `skiprows` lines.
        np.loadtxt(self.fname, delimiter=',', skiprows=skiprows)
+
class LoadtxtReadUint64Integers(Benchmark):
    # Times np.loadtxt on single-column text holding integers at and
    # above 2**63; adapted from a similar pandas CSV benchmark.

    params = [550, 1000, 10000]
    param_names = ['size']

    def setup(self, size):
        # data1: `size` consecutive values starting at 2**63.
        big_values = np.arange(size).astype('uint64') + 2**63
        lines = big_values.astype(str).tolist()
        self.data1 = StringIO(u'\n'.join(lines))

        # data2: the same values as Python objects, with the entry at
        # index 500 replaced by -1.
        mixed_values = big_values.astype(object)
        mixed_values[500] = -1
        lines = mixed_values.astype(str).tolist()
        self.data2 = StringIO(u'\n'.join(lines))

    def time_read_uint64(self, size):
        buf = self.data1
        np.loadtxt(buf)
        # Rewind: the buffer is reused by the next iteration of
        # the repeat.
        buf.seek(0)

    def time_read_uint64_neg_values(self, size):
        buf = self.data2
        np.loadtxt(buf)
        # Rewind: the buffer is reused by the next iteration of
        # the repeat.
        buf.seek(0)
+
class LoadtxtUseColsCSV(Benchmark):
    # Times np.loadtxt when only a subset of the CSV columns is
    # requested through `usecols`.

    params = [2, [1, 3], [1, 3, 5, 7]]
    param_names = ['usecols']

    def setup(self, usecols):
        # 5000 identical rows of ten comma-separated integers.
        row = u'0, 1, 2, 3, 4, 5, 6, 7, 8, 9'
        line_count = 5000
        self.csv_data = StringIO(u'\n'.join([row] * line_count))

    def time_loadtxt_usecols_csv(self, usecols):
        buf = self.csv_data
        np.loadtxt(buf, delimiter=u',', usecols=usecols)
        # File reading is stateful, so put the buffer back at the
        # start for the next call.
        buf.seek(0)
+
class LoadtxtCSVDateTime(Benchmark):
    # benchmarks for np.loadtxt operating with
    # datetime data in a CSV file

    params = [20, 200, 2000, 20000]
    param_names = ['num_lines']

    def setup(self, num_lines):
        # create the equivalent of a two-column CSV file
        # with date strings in the first column and random
        # floating point data in the second column
        dates = np.arange('today', 20, dtype=np.datetime64)
        np.random.seed(123)
        values = np.random.rand(20)

        # one "date,value" line per (date, value) pair, assembled
        # with a single join instead of repeated concatenation
        date_line = u''.join(
            str(date) + ',' + str(value) + '\n'
            for date, value in zip(dates, values))

        # expand data to specified number of lines; the block built
        # above is repeated whole, so num_lines is rounded down to a
        # multiple of 20
        data = date_line * (num_lines // 20)
        self.csv_data = StringIO(data)

    def time_loadtxt_csv_datetime(self, num_lines):
        # parse the two columns into a structured array of
        # (datetime64[us], float64); the result is discarded since
        # only the parsing time matters
        np.loadtxt(self.csv_data,
                   delimiter=u',',
                   dtype=([('dates', 'M8[us]'),
                           ('values', 'float64')]))
        # rewind StringIO object -- the timing iterations
        # are state-dependent
        self.csv_data.seek(0)