from __future__ import absolute_import
import collections
import io
import threading
import time
from ..codec import (has_gzip, has_snappy, has_lz4,
gzip_encode, snappy_encode,
lz4_encode, lz4_encode_old_kafka)
from .. import errors as Errors
from ..protocol.types import Int32, Int64
from ..protocol.message import MessageSet, Message


class MessageSetBuffer(object):
"""Wrap a buffer for writing MessageSet batches.
Arguments:
buf (IO stream): a buffer for writing data. Typically BytesIO.
batch_size (int): maximum number of bytes to write to the buffer.
Keyword Arguments:
        compression_type ('gzip', 'snappy', 'lz4', None): compress messages
            before publishing. Default: None.
        message_version (int): message format (magic byte) version, 0 or 1.
            Default: 0.
    """
_COMPRESSORS = {
'gzip': (has_gzip, gzip_encode, Message.CODEC_GZIP),
'snappy': (has_snappy, snappy_encode, Message.CODEC_SNAPPY),
'lz4': (has_lz4, lz4_encode, Message.CODEC_LZ4),
'lz4-old-kafka': (has_lz4, lz4_encode_old_kafka, Message.CODEC_LZ4),
}
def __init__(self, buf, batch_size, compression_type=None, message_version=0):
if compression_type is not None:
assert compression_type in self._COMPRESSORS, 'Unrecognized compression type'
            # Kafka 0.8/0.9 (message format v0) used a non-standard lz4 framing,
            # so use the old-kafka lz4 encoder when the message version is 0
if compression_type == 'lz4' and message_version == 0:
compression_type = 'lz4-old-kafka'
checker, encoder, attributes = self._COMPRESSORS[compression_type]
assert checker(), 'Compression Libraries Not Found'
self._compressor = encoder
self._compression_attributes = attributes
else:
self._compressor = None
self._compression_attributes = None
self._message_version = message_version
self._buffer = buf
# Init MessageSetSize to 0 -- update on close
self._buffer.seek(0)
self._buffer.write(Int32.encode(0))
self._batch_size = batch_size
self._closed = False
self._messages = 0
def append(self, offset, message):
"""Apend a Message to the MessageSet.
Arguments:
offset (int): offset of the message
message (Message or bytes): message struct or encoded bytes
"""
if isinstance(message, Message):
encoded = message.encode()
else:
encoded = bytes(message)
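        # MessageSet entry framing: [8-byte offset][4-byte size][message bytes]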
msg = Int64.encode(offset) + Int32.encode(len(encoded)) + encoded
self._buffer.write(msg)
self._messages += 1
def has_room_for(self, key, value):
if self._closed:
return False
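        # always accept at least one message, even if it exceeds batch_size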
if not self._messages:
return True
needed_bytes = MessageSet.HEADER_SIZE + Message.HEADER_SIZE
if key is not None:
needed_bytes += len(key)
if value is not None:
needed_bytes += len(value)
return self._buffer.tell() + needed_bytes < self._batch_size
def is_full(self):
if self._closed:
return True
return self._buffer.tell() >= self._batch_size
def close(self):
# This method may be called multiple times on the same batch
# i.e., on retries
# we need to make sure we only close it out once
# otherwise compressed messages may be double-compressed
# see Issue 718
if not self._closed:
if self._compressor:
# TODO: avoid copies with bytearray / memoryview
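                # Compress everything after the 4-byte size slot into the value
                # of a single wrapper Message, then rewrite the buffer as
                # [size][offset=0][length][wrapper message]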
self._buffer.seek(4)
msg = Message(self._compressor(self._buffer.read()),
attributes=self._compression_attributes,
magic=self._message_version)
encoded = msg.encode()
self._buffer.seek(4)
self._buffer.write(Int64.encode(0)) # offset 0 for wrapper msg
self._buffer.write(Int32.encode(len(encoded)))
self._buffer.write(encoded)
# Update the message set size, and return ready for full read()
size = self._buffer.tell() - 4
self._buffer.seek(0)
self._buffer.write(Int32.encode(size))
self._buffer.seek(0)
self._closed = True
def size_in_bytes(self):
return self._buffer.tell()
def buffer(self):
return self._buffer


class SimpleBufferPool(object):
"""A simple pool of BytesIO objects with a weak memory ceiling."""
def __init__(self, memory, poolable_size):
"""Create a new buffer pool.
Arguments:
memory (int): maximum memory that this buffer pool can allocate
poolable_size (int): memory size per buffer to cache in the free
list rather than deallocating
"""
self._poolable_size = poolable_size
self._lock = threading.RLock()
buffers = int(memory / poolable_size) if poolable_size else 0
self._free = collections.deque([io.BytesIO() for _ in range(buffers)])
self._waiters = collections.deque()
#self.metrics = metrics;
#self.waitTime = this.metrics.sensor("bufferpool-wait-time");
#MetricName metricName = metrics.metricName("bufferpool-wait-ratio", metricGrpName, "The fraction of time an appender waits for space allocation.");
#this.waitTime.add(metricName, new Rate(TimeUnit.NANOSECONDS));
def allocate(self, size, max_time_to_block_ms):
"""
        Allocate a buffer. This method blocks if no pooled buffer is free and
        the pool was created with a nonzero poolable_size.
Arguments:
size (int): The buffer size to allocate in bytes [ignored]
max_time_to_block_ms (int): The maximum time in milliseconds to
block for buffer memory to be available
Returns:
io.BytesIO
"""
with self._lock:
# check if we have a free buffer of the right size pooled
if self._free:
return self._free.popleft()
elif self._poolable_size == 0:
return io.BytesIO()
else:
# we are out of buffers and will have to block
buf = None
more_memory = threading.Condition(self._lock)
self._waiters.append(more_memory)
                # wait until another thread returns a buffer to the pool,
                # or we exceed the configured blocking time
while buf is None:
start_wait = time.time()
more_memory.wait(max_time_to_block_ms / 1000.0)
end_wait = time.time()
#this.waitTime.record(endWait - startWait, time.milliseconds());
if self._free:
buf = self._free.popleft()
else:
raise Errors.KafkaTimeoutError(
"Failed to allocate memory within the configured"
" max blocking time")
# remove the condition for this thread to let the next thread
# in line start getting memory
removed = self._waiters.popleft()
assert removed is more_memory, 'Wrong condition'
# signal any additional waiters if there is more memory left
# over for them
if self._free and self._waiters:
self._waiters[0].notify()
# unlock and return the buffer
return buf
def deallocate(self, buf):
"""
        Return a buffer to the pool. The buffer is truncated so it cannot
        expose prior data, then appended to the free list.
        Arguments:
            buf (io.BytesIO): The buffer to return
"""
with self._lock:
# BytesIO.truncate here makes the pool somewhat pointless
# but we stick with the BufferPool API until migrating to
            # bytearray / memoryview. The buffer we return must not
# expose any prior data on read().
buf.truncate(0)
self._free.append(buf)
if self._waiters:
self._waiters[0].notify()
def queued(self):
"""The number of threads blocked waiting on memory."""
with self._lock:
return len(self._waiters)
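

# Illustrative SimpleBufferPool usage (not from the original source; the sizes
# below are hypothetical):
#
#   pool = SimpleBufferPool(memory=32 * 1024 * 1024, poolable_size=16384)
#   buf = pool.allocate(16384, max_time_to_block_ms=5000)  # KafkaTimeoutError on timeout
#   ...                                                     # fill and send the buffer
#   pool.deallocate(buf)                                    # truncate and return to free list
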
'''
class BufferPool(object):
"""
A pool of ByteBuffers kept under a given memory limit. This class is fairly
specific to the needs of the producer. In particular it has the following
properties:
* There is a special "poolable size" and buffers of this size are kept in a
free list and recycled
    * It is fair. That is, all memory is given to the longest waiting thread
until it has sufficient memory. This prevents starvation or deadlock when
a thread asks for a large chunk of memory and needs to block until
multiple buffers are deallocated.
"""
def __init__(self, memory, poolable_size):
"""Create a new buffer pool.
Arguments:
memory (int): maximum memory that this buffer pool can allocate
poolable_size (int): memory size per buffer to cache in the free
list rather than deallocating
"""
self._poolable_size = poolable_size
self._lock = threading.RLock()
self._free = collections.deque()
self._waiters = collections.deque()
self._total_memory = memory
self._available_memory = memory
#self.metrics = metrics;
#self.waitTime = this.metrics.sensor("bufferpool-wait-time");
#MetricName metricName = metrics.metricName("bufferpool-wait-ratio", metricGrpName, "The fraction of time an appender waits for space allocation.");
#this.waitTime.add(metricName, new Rate(TimeUnit.NANOSECONDS));
def allocate(self, size, max_time_to_block_ms):
"""
Allocate a buffer of the given size. This method blocks if there is not
enough memory and the buffer pool is configured with blocking mode.
Arguments:
size (int): The buffer size to allocate in bytes
max_time_to_block_ms (int): The maximum time in milliseconds to
block for buffer memory to be available
Returns:
buffer
Raises:
InterruptedException If the thread is interrupted while blocked
IllegalArgumentException if size is larger than the total memory
controlled by the pool (and hence we would block forever)
"""
assert size <= self._total_memory, (
"Attempt to allocate %d bytes, but there is a hard limit of %d on"
" memory allocations." % (size, self._total_memory))
with self._lock:
# check if we have a free buffer of the right size pooled
if (size == self._poolable_size and len(self._free) > 0):
return self._free.popleft()
# now check if the request is immediately satisfiable with the
# memory on hand or if we need to block
free_list_size = len(self._free) * self._poolable_size
if self._available_memory + free_list_size >= size:
# we have enough unallocated or pooled memory to immediately
# satisfy the request
self._free_up(size)
self._available_memory -= size
raise NotImplementedError()
#return ByteBuffer.allocate(size)
else:
# we are out of memory and will have to block
accumulated = 0
buf = None
more_memory = threading.Condition(self._lock)
self._waiters.append(more_memory)
# loop over and over until we have a buffer or have reserved
# enough memory to allocate one
while (accumulated < size):
start_wait = time.time()
if not more_memory.wait(max_time_to_block_ms / 1000.0):
raise Errors.KafkaTimeoutError(
"Failed to allocate memory within the configured"
" max blocking time")
end_wait = time.time()
#this.waitTime.record(endWait - startWait, time.milliseconds());
# check if we can satisfy this request from the free list,
# otherwise allocate memory
if (accumulated == 0
and size == self._poolable_size
and self._free):
# just grab a buffer from the free list
buf = self._free.popleft()
accumulated = size
else:
# we'll need to allocate memory, but we may only get
# part of what we need on this iteration
self._free_up(size - accumulated)
got = min(size - accumulated, self._available_memory)
self._available_memory -= got
accumulated += got
# remove the condition for this thread to let the next thread
# in line start getting memory
removed = self._waiters.popleft()
assert removed is more_memory, 'Wrong condition'
# signal any additional waiters if there is more memory left
# over for them
if (self._available_memory > 0 or len(self._free) > 0):
if len(self._waiters) > 0:
self._waiters[0].notify()
# unlock and return the buffer
if buf is None:
raise NotImplementedError()
#return ByteBuffer.allocate(size)
else:
return buf
def _free_up(self, size):
"""
Attempt to ensure we have at least the requested number of bytes of
memory for allocation by deallocating pooled buffers (if needed)
"""
while self._free and self._available_memory < size:
self._available_memory += self._free.pop().capacity
def deallocate(self, buffer_, size=None):
"""
Return buffers to the pool. If they are of the poolable size add them
to the free list, otherwise just mark the memory as free.
Arguments:
buffer (io.BytesIO): The buffer to return
size (int): The size of the buffer to mark as deallocated, note
                that this may be smaller than buffer.capacity since the buffer
may re-allocate itself during in-place compression
"""
with self._lock:
if size is None:
size = buffer_.capacity
if (size == self._poolable_size and size == buffer_.capacity):
buffer_.seek(0)
buffer_.truncate()
self._free.append(buffer_)
else:
self._available_memory += size
if self._waiters:
more_mem = self._waiters[0]
more_mem.notify()
def available_memory(self):
"""The total free memory both unallocated and in the free list."""
with self._lock:
return self._available_memory + len(self._free) * self._poolable_size
def unallocated_memory(self):
"""Get the unallocated memory (not in the free list or in use)."""
with self._lock:
return self._available_memory
def queued(self):
"""The number of threads blocked waiting on memory."""
with self._lock:
return len(self._waiters)
def poolable_size(self):
"""The buffer size that will be retained in the free list after use."""
return self._poolable_size
def total_memory(self):
"""The total memory managed by this pool."""
return self._total_memory
'''