"""Custom implementation of multiprocessing.Pool with custom pickler This module provides efficient ways of working with data stored in shared memory with numpy.memmap arrays without inducing any memory copy between the parent and child processes. This module should not be imported if multiprocessing is not available as it implements subclasses of multiprocessing Pool that uses a custom alternative to SimpleQueue. """ # Author: Olivier Grisel # Copyright: 2012, Olivier Grisel # License: BSD 3 clause from mmap import mmap import errno import os import stat import sys import threading import atexit import tempfile import shutil import warnings from time import sleep try: WindowsError except NameError: WindowsError = None try: # Python 2 compat from cPickle import loads from cPickle import dumps except ImportError: from pickle import loads from pickle import dumps import copyreg # Customizable pure Python pickler in Python 2 # customizable C-optimized pickler under Python 3.3+ from pickle import Pickler from pickle import HIGHEST_PROTOCOL from io import BytesIO from ._multiprocessing_helpers import mp, assert_spawning # We need the class definition to derive from it not the multiprocessing.Pool # factory function from multiprocessing.pool import Pool try: import numpy as np from numpy.lib.stride_tricks import as_strided except ImportError: np = None from .numpy_pickle import load from .numpy_pickle import dump from .hashing import hash # Some system have a ramdisk mounted by default, we can use it instead of /tmp # as the default folder to dump big arrays to share with subprocesses SYSTEM_SHARED_MEM_FS = '/dev/shm' # Folder and file permissions to chmod temporary files generated by the # memmaping pool. Only the owner of the Python process can access the # temporary files and folder. FOLDER_PERMISSIONS = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR FILE_PERMISSIONS = stat.S_IRUSR | stat.S_IWUSR ############################################################################### # Support for efficient transient pickling of numpy data structures def _get_backing_memmap(a): """Recursively look up the original np.memmap instance base if any""" b = getattr(a, 'base', None) if b is None: # TODO: check scipy sparse datastructure if scipy is installed # a nor its descendants do not have a memmap base return None elif isinstance(b, mmap): # a is already a real memmap instance. return a else: # Recursive exploration of the base ancestry return _get_backing_memmap(b) def has_shareable_memory(a): """Return True if a is backed by some mmap buffer directly or not""" return _get_backing_memmap(a) is not None def _strided_from_memmap(filename, dtype, mode, offset, order, shape, strides, total_buffer_len): """Reconstruct an array view on a memmory mapped file""" if mode == 'w+': # Do not zero the original data when unpickling mode = 'r+' if strides is None: # Simple, contiguous memmap return np.memmap(filename, dtype=dtype, shape=shape, mode=mode, offset=offset, order=order) else: # For non-contiguous data, memmap the total enclosing buffer and then # extract the non-contiguous view with the stride-tricks API base = np.memmap(filename, dtype=dtype, shape=total_buffer_len, mode=mode, offset=offset, order=order) return as_strided(base, shape=shape, strides=strides) def _reduce_memmap_backed(a, m): """Pickling reduction for memmap backed arrays a is expected to be an instance of np.ndarray (or np.memmap) m is expected to be an instance of np.memmap on the top of the ``base`` attribute ancestry of a. ``m.base`` should be the real python mmap object. """ # offset that comes from the striding differences between a and m a_start, a_end = np.byte_bounds(a) m_start = np.byte_bounds(m)[0] offset = a_start - m_start # offset from the backing memmap offset += m.offset if m.flags['F_CONTIGUOUS']: order = 'F' else: # The backing memmap buffer is necessarily contiguous hence C if not # Fortran order = 'C' if a.flags['F_CONTIGUOUS'] or a.flags['C_CONTIGUOUS']: # If the array is a contiguous view, no need to pass the strides strides = None total_buffer_len = None else: # Compute the total number of items to map from which the strided # view will be extracted. strides = a.strides total_buffer_len = (a_end - a_start) // a.itemsize return (_strided_from_memmap, (m.filename, a.dtype, m.mode, offset, order, a.shape, strides, total_buffer_len)) def reduce_memmap(a): """Pickle the descriptors of a memmap instance to reopen on same file""" m = _get_backing_memmap(a) if m is not None: # m is a real mmap backed memmap instance, reduce a preserving striding # information return _reduce_memmap_backed(a, m) else: # This memmap instance is actually backed by a regular in-memory # buffer: this can happen when using binary operators on numpy.memmap # instances return (loads, (dumps(np.asarray(a), protocol=HIGHEST_PROTOCOL),)) class ArrayMemmapReducer(object): """Reducer callable to dump large arrays to memmap files. Parameters ---------- max_nbytes: int Threshold to trigger memmaping of large arrays to files created a folder. temp_folder: str Path of a folder where files for backing memmaped arrays are created. mmap_mode: 'r', 'r+' or 'c' Mode for the created memmap datastructure. See the documentation of numpy.memmap for more details. Note: 'w+' is coerced to 'r+' automatically to avoid zeroing the data on unpickling. verbose: int, optional, 0 by default If verbose > 0, memmap creations are logged. If verbose > 1, both memmap creations, reuse and array pickling are logged. prewarm: bool, optional, False by default. Force a read on newly memmaped array to make sure that OS pre-cache it memory. This can be useful to avoid concurrent disk access when the same data array is passed to different worker processes. """ def __init__(self, max_nbytes, temp_folder, mmap_mode, verbose=0, context_id=None, prewarm=True): self._max_nbytes = max_nbytes self._temp_folder = temp_folder self._mmap_mode = mmap_mode self.verbose = int(verbose) self._prewarm = prewarm if context_id is not None: warnings.warn('context_id is deprecated and ignored in joblib' ' 0.9.4 and will be removed in 0.11', DeprecationWarning) def __call__(self, a): m = _get_backing_memmap(a) if m is not None: # a is already backed by a memmap file, let's reuse it directly return _reduce_memmap_backed(a, m) if (not a.dtype.hasobject and self._max_nbytes is not None and a.nbytes > self._max_nbytes): # check that the folder exists (lazily create the pool temp folder # if required) try: os.makedirs(self._temp_folder) os.chmod(self._temp_folder, FOLDER_PERMISSIONS) except OSError as e: if e.errno != errno.EEXIST: raise e # Find a unique, concurrent safe filename for writing the # content of this array only once. basename = "%d-%d-%s.pkl" % ( os.getpid(), id(threading.current_thread()), hash(a)) filename = os.path.join(self._temp_folder, basename) # In case the same array with the same content is passed several # times to the pool subprocess children, serialize it only once # XXX: implement an explicit reference counting scheme to make it # possible to delete temporary files as soon as the workers are # done processing this data. if not os.path.exists(filename): if self.verbose > 0: print("Memmaping (shape=%r, dtype=%s) to new file %s" % ( a.shape, a.dtype, filename)) for dumped_filename in dump(a, filename): os.chmod(dumped_filename, FILE_PERMISSIONS) if self._prewarm: # Warm up the data to avoid concurrent disk access in # multiple children processes load(filename, mmap_mode=self._mmap_mode).max() elif self.verbose > 1: print("Memmaping (shape=%s, dtype=%s) to old file %s" % ( a.shape, a.dtype, filename)) # The worker process will use joblib.load to memmap the data return (load, (filename, self._mmap_mode)) else: # do not convert a into memmap, let pickler do its usual copy with # the default system pickler if self.verbose > 1: print("Pickling array (shape=%r, dtype=%s)." % ( a.shape, a.dtype)) return (loads, (dumps(a, protocol=HIGHEST_PROTOCOL),)) ############################################################################### # Enable custom pickling in Pool queues class CustomizablePickler(Pickler): """Pickler that accepts custom reducers. HIGHEST_PROTOCOL is selected by default as this pickler is used to pickle ephemeral datastructures for interprocess communication hence no backward compatibility is required. `reducers` is expected expected to be a dictionary with key/values being `(type, callable)` pairs where `callable` is a function that give an instance of `type` will return a tuple `(constructor, tuple_of_objects)` to rebuild an instance out of the pickled `tuple_of_objects` as would return a `__reduce__` method. See the standard library documentation on pickling for more details. """ # We override the pure Python pickler as its the only way to be able to # customize the dispatch table without side effects in Python 2.6 # to 3.2. For Python 3.3+ leverage the new dispatch_table # feature from http://bugs.python.org/issue14166 that makes it possible # to use the C implementation of the Pickler which is faster. def __init__(self, writer, reducers=None, protocol=HIGHEST_PROTOCOL): Pickler.__init__(self, writer, protocol=protocol) if reducers is None: reducers = {} if hasattr(Pickler, 'dispatch'): # Make the dispatch registry an instance level attribute instead of # a reference to the class dictionary under Python 2 self.dispatch = Pickler.dispatch.copy() else: # Under Python 3 initialize the dispatch table with a copy of the # default registry self.dispatch_table = copyreg.dispatch_table.copy() for type, reduce_func in reducers.items(): self.register(type, reduce_func) def register(self, type, reduce_func): if hasattr(Pickler, 'dispatch'): # Python 2 pickler dispatching is not explicitly customizable. # Let us use a closure to workaround this limitation. def dispatcher(self, obj): reduced = reduce_func(obj) self.save_reduce(obj=obj, *reduced) self.dispatch[type] = dispatcher else: self.dispatch_table[type] = reduce_func class CustomizablePicklingQueue(object): """Locked Pipe implementation that uses a customizable pickler. This class is an alternative to the multiprocessing implementation of SimpleQueue in order to make it possible to pass custom pickling reducers, for instance to avoid memory copy when passing memmory mapped datastructures. `reducers` is expected expected to be a dictionary with key/values being `(type, callable)` pairs where `callable` is a function that give an instance of `type` will return a tuple `(constructor, tuple_of_objects)` to rebuild an instance out of the pickled `tuple_of_objects` as would return a `__reduce__` method. See the standard library documentation on pickling for more details. """ def __init__(self, context, reducers=None): self._reducers = reducers self._reader, self._writer = context.Pipe(duplex=False) self._rlock = context.Lock() if sys.platform == 'win32': self._wlock = None else: self._wlock = context.Lock() self._make_methods() def __getstate__(self): assert_spawning(self) return (self._reader, self._writer, self._rlock, self._wlock, self._reducers) def __setstate__(self, state): (self._reader, self._writer, self._rlock, self._wlock, self._reducers) = state self._make_methods() def empty(self): return not self._reader.poll() def _make_methods(self): self._recv = recv = self._reader.recv racquire, rrelease = self._rlock.acquire, self._rlock.release def get(): racquire() try: return recv() finally: rrelease() self.get = get if self._reducers: def send(obj): buffer = BytesIO() CustomizablePickler(buffer, self._reducers).dump(obj) self._writer.send_bytes(buffer.getvalue()) self._send = send else: self._send = send = self._writer.send if self._wlock is None: # writes to a message oriented win32 pipe are atomic self.put = send else: wlock_acquire, wlock_release = ( self._wlock.acquire, self._wlock.release) def put(obj): wlock_acquire() try: return send(obj) finally: wlock_release() self.put = put class PicklingPool(Pool): """Pool implementation with customizable pickling reducers. This is useful to control how data is shipped between processes and makes it possible to use shared memory without useless copies induces by the default pickling methods of the original objects passed as arguments to dispatch. `forward_reducers` and `backward_reducers` are expected to be dictionaries with key/values being `(type, callable)` pairs where `callable` is a function that give an instance of `type` will return a tuple `(constructor, tuple_of_objects)` to rebuild an instance out of the pickled `tuple_of_objects` as would return a `__reduce__` method. See the standard library documentation on pickling for more details. """ def __init__(self, processes=None, forward_reducers=None, backward_reducers=None, **kwargs): if forward_reducers is None: forward_reducers = dict() if backward_reducers is None: backward_reducers = dict() self._forward_reducers = forward_reducers self._backward_reducers = backward_reducers poolargs = dict(processes=processes) poolargs.update(kwargs) super(PicklingPool, self).__init__(**poolargs) def _setup_queues(self): context = getattr(self, '_ctx', mp) self._inqueue = CustomizablePicklingQueue(context, self._forward_reducers) self._outqueue = CustomizablePicklingQueue(context, self._backward_reducers) self._quick_put = self._inqueue._send self._quick_get = self._outqueue._recv def delete_folder(folder_path): """Utility function to cleanup a temporary folder if still existing""" try: if os.path.exists(folder_path): shutil.rmtree(folder_path) except WindowsError: warnings.warn("Failed to clean temporary folder: %s" % folder_path) class MemmapingPool(PicklingPool): """Process pool that shares large arrays to avoid memory copy. This drop-in replacement for `multiprocessing.pool.Pool` makes it possible to work efficiently with shared memory in a numpy context. Existing instances of numpy.memmap are preserved: the child suprocesses will have access to the same shared memory in the original mode except for the 'w+' mode that is automatically transformed as 'r+' to avoid zeroing the original data upon instantiation. Furthermore large arrays from the parent process are automatically dumped to a temporary folder on the filesystem such as child processes to access their content via memmaping (file system backed shared memory). Note: it is important to call the terminate method to collect the temporary folder used by the pool. Parameters ---------- processes: int, optional Number of worker processes running concurrently in the pool. initializer: callable, optional Callable executed on worker process creation. initargs: tuple, optional Arguments passed to the initializer callable. temp_folder: str, optional Folder to be used by the pool for memmaping large arrays for sharing memory with worker processes. If None, this will try in order: - a folder pointed by the JOBLIB_TEMP_FOLDER environment variable, - /dev/shm if the folder exists and is writable: this is a RAMdisk filesystem available by default on modern Linux distributions, - the default system temporary folder that can be overridden with TMP, TMPDIR or TEMP environment variables, typically /tmp under Unix operating systems. max_nbytes int or None, optional, 1e6 by default Threshold on the size of arrays passed to the workers that triggers automated memmory mapping in temp_folder. Use None to disable memmaping of large arrays. forward_reducers: dictionary, optional Reducers used to pickle objects passed from master to worker processes: see below. backward_reducers: dictionary, optional Reducers used to pickle return values from workers back to the master process. verbose: int, optional Make it possible to monitor how the communication of numpy arrays with the subprocess is handled (pickling or memmaping) prewarm: bool or str, optional, "auto" by default. If True, force a read on newly memmaped array to make sure that OS pre- cache it in memory. This can be useful to avoid concurrent disk access when the same data array is passed to different worker processes. If "auto" (by default), prewarm is set to True, unless the Linux shared memory partition /dev/shm is available and used as temp_folder. `forward_reducers` and `backward_reducers` are expected to be dictionaries with key/values being `(type, callable)` pairs where `callable` is a function that give an instance of `type` will return a tuple `(constructor, tuple_of_objects)` to rebuild an instance out of the pickled `tuple_of_objects` as would return a `__reduce__` method. See the standard library documentation on pickling for more details. """ def __init__(self, processes=None, temp_folder=None, max_nbytes=1e6, mmap_mode='r', forward_reducers=None, backward_reducers=None, verbose=0, context_id=None, prewarm=False, **kwargs): if forward_reducers is None: forward_reducers = dict() if backward_reducers is None: backward_reducers = dict() if context_id is not None: warnings.warn('context_id is deprecated and ignored in joblib' ' 0.9.4 and will be removed in 0.11', DeprecationWarning) # Prepare a sub-folder name for the serialization of this particular # pool instance (do not create in advance to spare FS write access if # no array is to be dumped): use_shared_mem = False pool_folder_name = "joblib_memmaping_pool_%d_%d" % ( os.getpid(), id(self)) if temp_folder is None: temp_folder = os.environ.get('JOBLIB_TEMP_FOLDER', None) if temp_folder is None: if os.path.exists(SYSTEM_SHARED_MEM_FS): try: temp_folder = SYSTEM_SHARED_MEM_FS pool_folder = os.path.join(temp_folder, pool_folder_name) if not os.path.exists(pool_folder): os.makedirs(pool_folder) use_shared_mem = True except IOError: # Missing rights in the the /dev/shm partition, # fallback to regular temp folder. temp_folder = None if temp_folder is None: # Fallback to the default tmp folder, typically /tmp temp_folder = tempfile.gettempdir() temp_folder = os.path.abspath(os.path.expanduser(temp_folder)) pool_folder = os.path.join(temp_folder, pool_folder_name) self._temp_folder = pool_folder # Register the garbage collector at program exit in case caller forgets # to call terminate explicitly: note we do not pass any reference to # self to ensure that this callback won't prevent garbage collection of # the pool instance and related file handler resources such as POSIX # semaphores and pipes atexit.register(lambda: delete_folder(pool_folder)) if np is not None: # Register smart numpy.ndarray reducers that detects memmap backed # arrays and that is alse able to dump to memmap large in-memory # arrays over the max_nbytes threshold if prewarm == "auto": prewarm = not use_shared_mem forward_reduce_ndarray = ArrayMemmapReducer( max_nbytes, pool_folder, mmap_mode, verbose, prewarm=prewarm) forward_reducers[np.ndarray] = forward_reduce_ndarray forward_reducers[np.memmap] = reduce_memmap # Communication from child process to the parent process always # pickles in-memory numpy.ndarray without dumping them as memmap # to avoid confusing the caller and make it tricky to collect the # temporary folder backward_reduce_ndarray = ArrayMemmapReducer( None, pool_folder, mmap_mode, verbose) backward_reducers[np.ndarray] = backward_reduce_ndarray backward_reducers[np.memmap] = reduce_memmap poolargs = dict( processes=processes, forward_reducers=forward_reducers, backward_reducers=backward_reducers) poolargs.update(kwargs) super(MemmapingPool, self).__init__(**poolargs) def terminate(self): super(MemmapingPool, self).terminate() delete_folder(self._temp_folder)