Source code for postgresql.copyman

##
# .copyman - COPY manager
##
"""
Manage complex COPY operations; one-to-many COPY streaming.

Primarily this module houses the `CopyManager` class, and the `transfer`
function for a high-level interface to using the `CopyManager`.
"""
import sys
from abc import abstractmethod, abstractproperty
from collections import Iterator
from .python.element import Element, ElementSet
from .python.structlib import ulong_unpack, ulong_pack
from .protocol.buffer import pq_message_stream
from .protocol.element3 import CopyData, CopyDone, Complete, cat_messages
from .protocol.xact3 import Complete as xactComplete

#: 10KB buffer for COPY messages by default.
default_buffer_size = 1024 * 10

class Fault(Exception):
	pass

[docs]class ProducerFault(Fault): """ Exception raised when the Producer caused an exception. Normally, Producer faults are fatal. """ def __init__(self, manager): self.manager = manager def __str__(self): return "producer raised exception"
[docs]class ReceiverFault(Fault): """ Exception raised when Receivers cause an exception. Faults should be trapped if recovery from an exception is possible, or if the failed receiver is optional to the succes of the operation. The 'manager' attribute is the CopyManager that raised the fault. The 'faults' attribute is a dictionary mapping the receiver to the exception instance raised. """ def __init__(self, manager, faults): self.manager = manager self.faults = faults def __str__(self): return "{0} faults occurred".format(len(self.faults))
[docs]class CopyFail(Exception): """ Exception thrown by the CopyManager when the COPY operation failed. The 'manager' attribute the CopyManager that raised the CopyFail. The 'reason' attribute is a string indicating why it failed. The 'receiver_faults' attribute is a mapping of receivers to exceptions that were raised on exit. The 'producer_fault' attribute specifies if the producer raise an exception on exit. """ def __init__(self, manager, reason = None, receiver_faults = None, producer_fault = None, ): self.manager = manager self.reason = reason self.receiver_faults = receiver_faults or {} self.producer_fault = producer_fault def __str__(self): return self.reason or 'copy aborted' # The identifier for PQv3 copy data.
PROTOCOL_PQv3 = "PQv3" # The identifier for iterables of copy data sequences. # iter([[row1, row2], [row3, row4]]) PROTOCOL_CHUNKS = "CHUNKS" # The protocol identifier for NULL producers and receivers. PROTOCOL_NULL = None class ChunkProtocol(object): __slots__ = ('buffer',) def __init__(self): self.buffer = pq_message_stream() def __call__(self, data): self.buffer.write(bytes(data)) return [x[1] for x in self.buffer.read()] # Null protocol mapping. def EmptyView(arg): return memoryview(b'') def EmptyList(arg): return [] def ReturnNone(arg): return None # Zero-Transformation def NoTransformation(arg): return arg # Copy protocols being at the Python level; *not* wire/serialization format. copy_protocol_mappings = { # PQv3 -> Chunks (PROTOCOL_PQv3, PROTOCOL_CHUNKS) : ChunkProtocol, # Chunks -> PQv3 (PROTOCOL_CHUNKS, PROTOCOL_PQv3) : lambda: cat_messages, # Null Producers and Receivers (PROTOCOL_NULL, PROTOCOL_PQv3) : lambda: EmptyView, (PROTOCOL_NULL, PROTOCOL_CHUNKS) : lambda: EmptyList, (PROTOCOL_PQv3, PROTOCOL_NULL) : lambda: ReturnNone, (PROTOCOL_CHUNKS, PROTOCOL_NULL) : lambda: ReturnNone, # Zero Transformations (PROTOCOL_NULL, PROTOCOL_NULL) : lambda: NoTransformation, (PROTOCOL_CHUNKS, PROTOCOL_CHUNKS) : lambda: NoTransformation, (PROTOCOL_PQv3, PROTOCOL_PQv3) : lambda: NoTransformation, } # Used to manage the conversions of COPY data. # Notably, chunks -> PQv3 or PQv3 -> chunks. class CopyTransformer(object): __slots__ = ('current', 'transformers', 'get') def __init__(self, source_protocol, target_protocols): self.current = {} self.transformers = { x : copy_protocol_mappings[(source_protocol, x)]() for x in set(target_protocols) } self.get = self.current.__getitem__ def __call__(self, data): for protocol, transformer in self.transformers.items(): self.current[protocol] = transformer(data) ## # This is the object that does the magic. # It tracks the state of the wire. # It ends when non-COPY data is found.
[docs]class WireState(object): """ Manages the state of the wire. This class manages three possible positions: 1. Between wire messages 2. Inside message header 3. Inside message (with complete header) The wire state will become unusable when the configured condition is True. """ __slots__ = ('remaining_bytes', 'size_fragment', 'final_view', 'condition',)
[docs] def update(self, view, getlen = ulong_unpack, len = len): """ Given the state of the COPY and new data, advance the position on the COPY stream. """ # Only usable until the terminating condition. if self.final_view is not None: raise RuntimeError("wire state encountered exceptional condition") nmessages = 0 # State carried over from prior run. remaining_bytes = self.remaining_bytes size_fragment = self.size_fragment # Terminating condition. CONDITION = self.condition # Is it a continuation of a message header? if remaining_bytes == -1: ## # Inside message header; after message type. # Continue adding to the 'size_fragment' # until there are four bytes to unpack. ## o = len(size_fragment) size_fragment += bytes(view[:4-o]) if len(size_fragment) == 4: # The size fragment is completed; only part # of the fragment remains to be consumed. remaining_bytes = getlen(size_fragment) - o size_fragment = b'' else: assert len(size_fragment) < 4 # size_fragment got updated.. if remaining_bytes >= 0: vlen = len(view) while True: if remaining_bytes: ## # Inside message body. Message length has been unpacked. ## view = view[remaining_bytes:] # How much is remaining now? rb = remaining_bytes - vlen if rb <= 0: # Finished it. vlen = -rb remaining_bytes = 0 nmessages += 1 else: vlen = 0 remaining_bytes = rb ## # In between protocol messages. ## if not view: # no more data to analyze break # There is at least one byte in the view. if CONDITION(view[0]): # State is dead now. # User needs to handle unexpected message, then continue. self.final_view = view assert remaining_bytes == 0 break if vlen < 5: # Header continuation. remaining_bytes = -1 view = view[1:] size_fragment += bytes(view) # Not enough left for the header of the next message? break # Update remaining_bytes to include the header, and start over. remaining_bytes = getlen(view[1:5]) + 1 # Update the state for the next update. self.remaining_bytes, self.size_fragment = ( remaining_bytes, size_fragment, ) # Emit the number of messages "consumed" this round. return nmessages
def __init__(self, condition = (CopyData.type[0].__ne__ if isinstance(memoryview(b'f')[0], int) else CopyData.type.__ne__)): self.remaining_bytes = 0 self.size_fragment = b'' self.final_view = None self.condition = condition
class Fitting(Element): _e_label = 'FITTING' def _e_metas(self): yield None, '[' + self.state + ']' @abstractproperty def protocol(self): """ The COPY data format produced or consumed. """ # Used to setup the Receiver/Producer def __enter__(self): pass # Used to tear down the Receiver/Producer def __exit__(self, typ, val, tb): pass class Producer(Fitting, Iterator): _e_label = 'PRODUCER' def _e_metas(self): for x in super()._e_metas(): yield x yield 'data', str(self.total_bytes / (1024**2)) + 'MB' yield 'messages', self.total_messages yield 'average size', (self.total_bytes / self.total_messages) def __init__(self): self.total_messages = 0 self.total_bytes = 0 @abstractmethod def realign(self): """ Method implemented by producers that emit COPY data that is not guaranteed to be aligned. This is only necessary in failure cases where receivers still need more data to complete the message. """ @abstractmethod def __next__(self): """ Produce the next set of data. """ class Receiver(Fitting): _e_label = 'RECEIVER' @abstractmethod def transmit(self): """ Finish the reception of the accepted data. """ @abstractmethod def accept(self, data): """ Take the data object to be processed. """
[docs]class NullProducer(Producer): """ Produces no copy data. """ _e_factors = () protocol = PROTOCOL_NULL def realign(self): # Never needs to realigned. pass def __next__(self): raise StopIteration
class IteratorProducer(Producer): _e_factors = ('iterator',) protocol = PROTOCOL_CHUNKS def __init__(self, iterator): self.iterator = iter(iterator) self.__next__ = self.iterator.__next__ super().__init__() def realign(self): # Never needs to realign; data is emitted on message boundaries. pass def __next__(self, next = next): n = next(self.iterator) self.total_messages += len(n) self.total_bytes += sum(map(len, n)) return n
[docs]class ProtocolProducer(Producer): """ Producer using a PQv3 data stream. Normally, this class needs to be subclassed as it assumes that the given recv_into function will write COPY messages. """ protocol = PROTOCOL_PQv3 @abstractmethod
[docs] def recover(self, view): """ Given a view containing data read from the wire, recover the controller's state. This needs to be implemented by subclasses in order for the ProtocolReceiver to pass control back to the original state machine. """ ## # When a COPY is interrupted, this can be used to accommodate # the original state machine to identify the message boundaries.
def realign(self): s = self._state if s is None: # It's already aligned. self.nextchunk = iter(()).__next__ return if s.final_view: # It was at the end or non-COPY. for_producer = bytes(s.final_view) for_receivers = b'' elif s.remaining_bytes == -1: # In the middle of a message header. for_producer = CopyData.type + s.size_fragment # receivers: header = (self._state.size_fragment.ljust(3, b'\x00') + b'\x04') # Don't include the already sent parts. buf = header[len(self._state.size_fragment):] bodylen = ulong_unpack(header) - 4 # This will often cause an invalid copy data error, # but it doesn't matter much because we will issue a copy fail. buf += b'\x00' * bodylen for_receivers = buf elif s.remaining_bytes > 0: # In the middle of a message. for_producer = CopyData.type + ulong_pack(s.remaining_bytes + 4) for_receivers = b'\x00' * self._state.remaining_bytes else: for_producer = for_receivers = b'' self.recover(for_producer) if for_receivers: self.nextchunk = iter((for_receivers,)).__next__ else: self.nextchunk = iter(()).__next__ def process_copy_data(self, view): self.total_messages += self._state.update(view) if self._state.final_view is not None: # It's not COPY data. fv = self._state.final_view # Only publish up to the final_view. if fv: view = view[:-len(fv)] # The next next() will handle the async, error, or completion. self.recover(fv) self._state = None self.total_bytes += len(view) return view # Given a view, begin tracking the state of the wire. def track_state(self, view): self._state = WireState() self.nextchunk = self.recv_view return self.process_copy_data(view) # The usual method for receiving more data. def recv_view(self): view = self.buffer_view[:self.recv_into(self.buffer, self.buffer_size)] if not view: # Zero read; let the subclass handle the situation. self.recover(memoryview(b'')) return self.nextchunk() view = self.process_copy_data(view) return view def nextchunk(self): raise RuntimeError("producer not properly initialized") def __next__(self): return self.nextchunk() def __init__(self, recv_into : "callable taking writable buffer and size", buffer_size = default_buffer_size ): super().__init__() self.recv_into = recv_into self.buffer_size = buffer_size self.buffer = bytearray(buffer_size) self.buffer_view = memoryview(self.buffer) self._state = None
class StatementProducer(ProtocolProducer): _e_factors = ('statement', 'parameters',) def _e_metas(self): for x in super()._e_metas(): yield x @property def state(self): if self._chunks is None: return 'created' return 'producing' def count(self): return self._chunks.count() def command(self): return self._chunks.command() def __init__(self, statement, *args, **kw): super().__init__(statement.database.pq.socket.recv_into, **kw) self.statement = statement self.parameters = args self._chunks = None ## # Take any data held by the statement's chunks and connection. def confiscate(self, next = next): current = [] try: while not current: current.extend(next(self._chunks)) except StopIteration: if not current: # End of COPY. raise pq = self._chunks.database.pq buffer = cat_messages(current) + pq.message_buffer.getvalue() + (pq.read_data or b'') view = memoryview(buffer) pq.read_data = None pq.message_buffer.truncate() # Reconstruct the buffer from the already parsed lines. r = self.track_state(view) # XXX: Better way? Probably shouldn't do the full track_state if complete.. if self._chunks._xact.state is xactComplete: # It's over, don't hand off to recv_view. self.nextchunk = self.confiscate assert self._state.final_view is None return r def recover(self, view): # Method used when non-COPY data is found. self._chunks.database.pq.message_buffer.write(bytes(view)) self.nextchunk = self.confiscate def __enter__(self): super().__enter__() if self._chunks is not None: raise RuntimeError("receiver already used") self._chunks = self.statement.chunks(*self.parameters) # Start by confiscating the connection state. self.nextchunk = self.confiscate def __exit__(self, typ, val, tb): if typ is None or issubclass(typ, Exception): db = self.statement.database if not db.closed and self._chunks._xact is not None: # The COPY transaction is still happening, # force an interrupt if the connection still exists. db.interrupt() if db.pq.xact: # Raise, CopyManager should trap. db._pq_complete() super().__exit__(typ, val, tb) class NullReceiver(Receiver): _e_factors = () protocol = PROTOCOL_NULL state = 'null' def transmit(self): # Nothing to do. pass def accept(self, data): pass class ProtocolReceiver(Receiver): protocol = PROTOCOL_PQv3 __slots__ = ('send', 'view') def __init__(self, send): super().__init__() self.send = send self.view = memoryview(b'') def accept(self, data): self.view = data def transmit(self): while self.view: self.view = self.view[self.send(self.view):] def __enter__(self): return self def __exit__(self, typ, val, tb): pass class StatementReceiver(ProtocolReceiver): _e_factors = ('statement', 'parameters',) __slots__ = ProtocolReceiver.__slots__ + _e_factors + ('xact',) def _e_metas(self): yield None, '[' + self.state + ']' def __init__(self, statement, *parameters): self.statement = statement self.parameters = parameters self.xact = None super().__init__(statement.database.pq.socket.send,) # XXX: A bit of a hack... # This is actually a good indication that statements need a .copy() # execution method for producing a "CopyCursor" that reads or writes. class WireReady(BaseException): pass def raise_wire_ready(self): raise self.WireReady() yield None def __enter__(self, iter = iter): super().__enter__() # Get the connection in the COPY state. try: self.statement.load_chunks( iter(self.raise_wire_ready()), *self.parameters ) except self.WireReady: # It's a BaseException; nothing should trap it. # Note the transaction object; we'll use it on exit. self.xact = self.statement.database.pq.xact def __exit__(self, typ, val, tb): if self.xact is None: # Nothing to do. return super().__exit__(typ, val, tb) if self.view: # The realigned producer emitted the necessary # data for message boundary alignment. # # In this case, we unconditionally fail. pq = self.statement.database.pq # There shouldn't be any message_data, atm. pq.message_data = bytes(self.view) self.statement.database._pq_complete() # It is possible for a non-alignment view to exist in cases of # faults. However, exit should *not* be called in those cases. ## elif typ is None: # Success? self.xact.messages = self.xact.CopyDoneSequence # If not, this will blow up. self.statement.database._pq_complete() # Find the complete message for command and count. for x in self.xact.messages_received(): if getattr(x, 'type', None) == Complete.type: self._complete_message = x elif issubclass(typ, Exception): # Likely raises. CopyManager should trap. self.statement.database._pq_complete() return super().__exit__(typ, val, tb) def count(self): return self._complete_message.extract_count() def command(self): return self._complete_message.extract_command().decode('ascii')
[docs]class CallReceiver(Receiver): """ Call the given object with a list of COPY lines. """ _e_factors = ('callable',) protocol = PROTOCOL_CHUNKS def __init__(self, callable): self.callable = callable self.lines = None super().__init__() def transmit(self): if self.lines is not None: self.callable(self.lines) self.lines = None def accept(self, lines): self.lines = lines
[docs]class CopyManager(Element, Iterator): """ A class for managing COPY operations. Connects the producer to the receivers. """ _e_label = 'COPY' _e_factors = ('producer', 'receivers',) def _e_metas(self): yield None, '[' + self.state + ']' @property def state(self): if self.transformer is None: return 'initialized' return str(self.producer.total_messages) + ' messages transferred' def __init__(self, producer, *receivers): self.producer = producer self.transformer = None self.receivers = ElementSet(receivers) self._seen_stop_iteration = False rp = set() add = rp.add for x in self.receivers: add(x.protocol) self.protocols = rp def __enter__(self): if self.transformer: raise RuntimeError("copy already started") self._stats = (0, 0) self.transformer = CopyTransformer(self.producer.protocol, self.protocols) self.producer.__enter__() try: for x in self.receivers: x.__enter__() except Exception: self.__exit__(*sys.exc_info()) return self def __exit__(self, typ, val, tb): ## # Exiting the CopyManager is a fairly complex operation. # # In cases of failure, re-alignment may need to happen # for when the receivers are not on a message boundary. ## if typ is not None and not issubclass(typ, Exception): # Don't bother, it's an interrupt or sufficient resources. return profail = None try: # Does nothing if the COPY was successful. self.producer.realign() try: ## # If the producer is not aligned to a message boundary, # it can emit completion data that will put the receivers # back on track. # This last service call will move that data onto the receivers. self._service_producer() ## # The receivers need to handle any new data in their __exit__. except StopIteration: # No re-alignment needed. pass self.producer.__exit__(typ, val, tb) except Exception as x: # reference profail later. profail = x # No receivers? It wasn't a success. if not self.receivers: raise CopyFail(self, "no receivers", producer_fault = profail) exit_faults = {} for x in self.receivers: try: x.__exit__(typ, val, tb) except Exception as e: exit_faults[x] = e if typ or exit_faults or profail or not self._seen_stop_iteration: raise CopyFail(self, "could not complete the COPY operation", receiver_faults = exit_faults or None, producer_fault = profail )
[docs] def reconcile(self, r): """ Reconcile a receiver that faulted. This method should be used to add back a receiver that failed to complete its write operation, but is capable of completing the operation at this time. """ if r.protocol not in self.protocols: raise RuntimeError("cannot add new receivers to copy operations") r.transmit() # Okay, add it back. self.receivers.add(r)
def _service_producer(self): # Setup current data. if not self.receivers: # No receivers to take the data. raise StopIteration try: nextdata = next(self.producer) except StopIteration: # Should be over. self._seen_stop_iteration = True raise except Exception: raise ProducerFault(self) self.transformer(nextdata) # Distribute data to receivers. for x in self.receivers: x.accept(self.transformer.get(x.protocol)) def _service_receivers(self): faults = {} for x in self.receivers: # Process all the receivers. try: x.transmit() except Exception as e: faults[x] = e if faults: # The CopyManager is eager to continue the operation. for x in faults: self.receivers.discard(x) raise ReceiverFault(self, faults) # Run the COPY to completion. def run(self): with self: try: while True: self._service_producer() self._service_receivers() except StopIteration: # It's done. pass def __iter__(self): return self def __next__(self): messages = self.producer.total_messages bytes = self.producer.total_bytes self._service_producer() # Record the progress in case a receiver faults. self._stats = ( self._stats[0] + (self.producer.total_messages - messages), self._stats[1] + (self.producer.total_bytes - bytes), ) self._service_receivers() # Return the progress. current_stats = self._stats self._stats = (0, 0) return current_stats
[docs]def transfer(producer, *receivers): """ Perform a COPY operation using the given statements:: >>> import copyman >>> copyman.transfer(src.prepare("COPY table TO STDOUT"), dst.prepare("COPY table FROM STDIN")) """ cm = CopyManager( StatementProducer(producer), *[x if isinstance(x, Receiver) else StatementReceiver(x) for x in receivers] ) cm.run() return (cm.producer.total_messages, cm.producer.total_bytes)