diff --git a/doc/runtime_memory.rst b/doc/runtime_memory.rst index f4e01f266..ffe6661e8 100644 --- a/doc/runtime_memory.rst +++ b/doc/runtime_memory.rst @@ -281,6 +281,8 @@ Transfers .. autofunction:: enqueue_copy(queue, dest, src, **kwargs) +.. autofunction:: enqueue_fill(queue, dest, src, **kwargs) + Mapping Memory into Host Address Space -------------------------------------- diff --git a/examples/demo_array_svm.py b/examples/demo_array_svm.py new file mode 100644 index 000000000..eaba23209 --- /dev/null +++ b/examples/demo_array_svm.py @@ -0,0 +1,46 @@ +import pyopencl as cl +import pyopencl.array as cl_array +from pyopencl.tools import SVMAllocator +import numpy as np +import numpy.linalg as la + +n = 500000 +a = np.random.rand(n).astype(np.float32) +b = np.random.rand(n).astype(np.float32) + + +ctx = cl.create_some_context() +queue = cl.CommandQueue(ctx) + +alloc = SVMAllocator(ctx, 0, cl.svm_mem_flags.READ_WRITE, queue) + +a_dev = cl_array.to_device(queue, a, allocator=alloc) +print("A_DEV", a_dev.data) +b_dev = cl_array.to_device(queue, b, allocator=alloc) +dest_dev = cl_array.empty_like(a_dev) +print("DEST", dest_dev.data) + +prg = cl.Program(ctx, """ + __kernel void sum(__global const float *a, + __global const float *b, __global float *c) + { + int gid = get_global_id(0); + c[gid] = a[gid] + b[gid]; + } + """).build() + +knl = prg.sum # Use this Kernel object for repeated calls +knl(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data) + +# PROBLEM: numpy frees the temporary out of (a_dev+b_dev) before +# we're done with it +diff = dest_dev - (a_dev+b_dev) + +if 0: + diff = diff.get() + np.set_printoptions(linewidth=400) + print(dest_dev) + print((a_dev+b_dev).get()) + print(diff) + print(la.norm(diff)) + print("A_DEV", a_dev.data.mem.__array_interface__) diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index 6420e5afb..22562d1f9 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -21,6 +21,8 @@ """ from sys import intern +from warnings import warn +from typing import Union, Any from pyopencl.version import VERSION, VERSION_STATUS, VERSION_TEXT # noqa @@ -43,7 +45,6 @@ import os from os.path import dirname, join, realpath if realpath(join(os.getcwd(), "pyopencl")) == realpath(dirname(__file__)): - from warnings import warn warn("It looks like you are importing PyOpenCL from " "its source directory. This likely won't work.") raise @@ -199,11 +200,9 @@ if get_cl_header_version() >= (2, 0): from pyopencl._cl import ( # noqa - SVMAllocation, + SVMPointer, SVM, - - # FIXME - #enqueue_svm_migratemem, + SVMAllocation, ) if _cl.have_gl(): @@ -267,7 +266,6 @@ class CommandQueueUsedAfterExit(UserWarning): def compiler_output(text): import os - from warnings import warn if int(os.environ.get("PYOPENCL_COMPILER_OUTPUT", "0")): warn(text, CompilerWarning) else: @@ -389,7 +387,6 @@ def enable_debugging(platform_or_context): import os os.environ["CPU_MAX_COMPUTE_UNITS"] = "1" else: - from warnings import warn warn("do not know how to enable debugging on '%s'" % platform.name) @@ -428,7 +425,6 @@ def _get_prg(self): return self._prg else: # "no program" can only happen in from-source case. - from warnings import warn warn("Pre-build attribute access defeats compiler caching.", stacklevel=3) @@ -662,7 +658,6 @@ def device_hashable_model_and_version_identifier(self): return ("v1", self.vendor, self.vendor_id, self.name, self.version) def device_persistent_unique_id(self): - from warnings import warn warn("Device.persistent_unique_id is deprecated. 
" "Use Device.hashable_model_and_version_identifier instead.", DeprecationWarning, stacklevel=2) @@ -684,7 +679,6 @@ def device_persistent_unique_id(self): def context_init(self, devices, properties, dev_type, cache_dir=None): if cache_dir is not None: - from warnings import warn warn("The 'cache_dir' argument to the Context constructor " "is deprecated and no longer has an effect. " "It was removed because it only applied to the wrapper " @@ -970,7 +964,6 @@ def image_init(self, context, flags, format, shape=None, pitches=None, if hostbuf is not None and not \ (flags & (mem_flags.USE_HOST_PTR | mem_flags.COPY_HOST_PTR)): - from warnings import warn warn("'hostbuf' was passed, but no memory flags to make use of it.") if hostbuf is None and pitches is not None: @@ -1043,7 +1036,6 @@ def image_init(self, context, flags, format, shape=None, pitches=None, class _ImageInfoGetter: def __init__(self, event): - from warnings import warn warn("Image.image.attr is deprecated and will go away in 2021. " "Use Image.attr directly, instead.") @@ -1150,24 +1142,19 @@ def memory_map_exit(self, exc_type, exc_val, exc_tb): """ if get_cl_header_version() >= (2, 0): - svmallocation_old_init = SVMAllocation.__init__ - - def svmallocation_init(self, ctx, size, alignment, flags, _interface=None): - """ - :arg ctx: a :class:`Context` - :arg flags: some of :class:`svm_mem_flags`. - """ - svmallocation_old_init(self, ctx, size, alignment, flags) - - # mem_flags.READ_ONLY applies to kernels, not the host - read_write = True - _interface["data"] = ( - int(self._ptr_as_int()), not read_write) - - self.__array_interface__ = _interface - - if get_cl_header_version() >= (2, 0): - SVMAllocation.__init__ = svmallocation_init + class _ArrayInterfaceSVMAllocation(SVMAllocation): + def __init__(self, ctx, size, alignment, flags, _interface=None, + queue=None): + """ + :arg ctx: a :class:`Context` + :arg flags: some of :class:`svm_mem_flags`. + """ + super().__init__(ctx, size, alignment, flags, queue) + + # mem_flags.READ_ONLY applies to kernels, not the host + read_write = True + _interface["data"] = ( + int(self._ptr_as_int()), not read_write) # }}} @@ -1778,12 +1765,14 @@ def enqueue_copy(queue, dest, src, **kwargs): else: raise ValueError("invalid dest mem object type") - elif get_cl_header_version() >= (2, 0) and isinstance(dest, SVM): + elif get_cl_header_version() >= (2, 0) and isinstance(dest, SVMPointer): # to SVM - if not isinstance(src, SVM): + if not isinstance(src, SVMPointer): src = SVM(src) is_blocking = kwargs.pop("is_blocking", True) + assert kwargs.pop("src_offset", 0) == 0 + assert kwargs.pop("dest_offset", 0) == 0 return _cl._enqueue_svm_memcpy(queue, is_blocking, dest, src, **kwargs) else: @@ -1809,7 +1798,7 @@ def enqueue_copy(queue, dest, src, **kwargs): queue, src, origin, region, dest, **kwargs) else: raise ValueError("invalid src mem object type") - elif isinstance(src, SVM): + elif isinstance(src, SVMPointer): # from svm # dest is not a SVM instance, otherwise we'd be in the branch above is_blocking = kwargs.pop("is_blocking", True) @@ -1822,6 +1811,26 @@ def enqueue_copy(queue, dest, src, **kwargs): # }}} +# {{{ enqueue_fill + +def enqueue_fill(queue: CommandQueue, dest: Union[MemoryObjectHolder, SVMPointer], + pattern: Any, size: int, *, offset: int = 0, wait_for=None) -> Event: + """ + .. 
versionadded:: 2022.2 + """ + if isinstance(dest, MemoryObjectHolder): + return enqueue_fill_buffer(queue, dest, pattern, offset, size, wait_for) + elif isinstance(dest, SVMPointer): + if offset: + raise NotImplementedError("enqueue_fill with SVM does not yet support " + "offsets") + return enqueue_svm_memfill(queue, dest, pattern, size, wait_for) + else: + raise TypeError(f"enqueue_fill does not know how to fill '{type(dest)}'") + +# }}} + + # {{{ image creation DTYPE_TO_CHANNEL_TYPE = { @@ -1927,7 +1936,6 @@ def enqueue_barrier(queue, wait_for=None): def enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for=None): if not (queue._get_cl_version() >= (1, 2) and get_cl_header_version() >= (1, 2)): - from warnings import warn warn("The context for this queue does not declare OpenCL 1.2 support, so " "the next thing you might see is a crash") @@ -1944,7 +1952,7 @@ def enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for=None): def enqueue_svm_memfill(queue, dest, pattern, byte_count=None, wait_for=None): """Fill shared virtual memory with a pattern. - :arg dest: a Python buffer object, optionally wrapped in an :class:`SVM` object + :arg dest: a Python buffer object, or any implementation of :class:`SVMPointer`. :arg pattern: a Python buffer object (e.g. a :class:`numpy.ndarray` with the fill pattern to be used. :arg byte_count: The size of the memory to be fill. Defaults to the @@ -1955,8 +1963,8 @@ def enqueue_svm_memfill(queue, dest, pattern, byte_count=None, wait_for=None): .. versionadded:: 2016.2 """ - if not isinstance(dest, SVM): - dest = SVM(dest) + if not isinstance(dest, SVMPointer): + dest = SVMPointer(dest) return _cl._enqueue_svm_memfill( queue, dest, pattern, byte_count=None, wait_for=None) @@ -1965,7 +1973,7 @@ def enqueue_svm_memfill(queue, dest, pattern, byte_count=None, wait_for=None): def enqueue_svm_migratemem(queue, svms, flags, wait_for=None): """ :arg svms: a collection of Python buffer objects (e.g. :mod:`numpy` - arrays), optionally wrapped in :class:`SVM` objects. + arrays), or any implementation of :class:`SVMPointer`. :arg flags: a combination of :class:`mem_migration_flags` |std-enqueue-blurb| @@ -1983,7 +1991,7 @@ def enqueue_svm_migratemem(queue, svms, flags, wait_for=None): wait_for) -def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None): +def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None, queue=None): """Allocate an empty :class:`numpy.ndarray` of the given *shape*, *dtype* and *order*. (See :func:`numpy.empty` for the meaning of these arguments.) The array will be allocated in shared virtual memory belonging @@ -2001,6 +2009,10 @@ def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None): will likely want to wrap the returned array in an :class:`SVM` tag. .. versionadded:: 2016.2 + + .. versionchanged:: 2022.2 + + *queue* argument added. 
""" dtype = np.dtype(dtype) @@ -2047,7 +2059,9 @@ def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None): if alignment is None: alignment = itemsize - svm_alloc = SVMAllocation(ctx, nbytes, alignment, flags, _interface=interface) + svm_alloc = _ArrayInterfaceSVMAllocation( + ctx, nbytes, alignment, flags, _interface=interface, + queue=queue) return np.asarray(svm_alloc) diff --git a/pyopencl/array.py b/pyopencl/array.py index cb66121a5..540e43ad5 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -720,9 +720,14 @@ def set(self, ary, queue=None, async_=None, **kwargs): stacklevel=2) if self.size: - event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary, - device_offset=self.offset, - is_blocking=not async_) + if self.offset: + event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary, + device_offset=self.offset, + is_blocking=not async_) + else: + event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary, + is_blocking=not async_) + self.add_event(event1) def _get(self, queue=None, ary=None, async_=None, **kwargs): @@ -770,9 +775,14 @@ def _get(self, queue=None, ary=None, async_=None, **kwargs): "to associate one.") if self.size: - event1 = cl.enqueue_copy(queue, ary, self.base_data, - device_offset=self.offset, - wait_for=self.events, is_blocking=not async_) + if self.offset: + event1 = cl.enqueue_copy(queue, ary, self.base_data, + device_offset=self.offset, + wait_for=self.events, is_blocking=not async_) + else: + event1 = cl.enqueue_copy(queue, ary, self.base_data, + wait_for=self.events, is_blocking=not async_) + self.add_event(event1) else: event1 = None @@ -1458,8 +1468,8 @@ def _zero_fill(self, queue=None, wait_for=None): # https://github.com/inducer/pyopencl/issues/395 if cl_version_gtr_1_2 and not (on_nvidia and self.nbytes >= 2**31): self.add_event( - cl.enqueue_fill_buffer(queue, self.base_data, np.int8(0), - self.offset, self.nbytes, wait_for=wait_for)) + cl.enqueue_fill(queue, self.base_data, np.int8(0), + self.nbytes, offset=self.offset, wait_for=wait_for)) else: zero = np.zeros((), self.dtype) self.fill(zero, queue=queue) diff --git a/pyopencl/tools.py b/pyopencl/tools.py index 27adac75b..b414168c6 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -33,7 +33,7 @@ import numpy as np from pytools import memoize, memoize_method -from pyopencl._cl import bitlog2 # noqa: F401 +from pyopencl._cl import bitlog2, get_cl_header_version # noqa: F401 from pytools.persistent_dict import KeyBuilder as KeyBuilderBase import re @@ -59,10 +59,19 @@ def _register_types(): # {{{ imported names from pyopencl._cl import ( # noqa - PooledBuffer as PooledBuffer, + _tools_PooledBuffer as PooledBuffer, _tools_DeferredAllocator as DeferredAllocator, _tools_ImmediateAllocator as ImmediateAllocator, - MemoryPool as MemoryPool) + _tools_MemoryPool as MemoryPool, + ) + + +if get_cl_header_version() >= (2, 0): + from pyopencl._cl import ( # noqa + _tools_SVMemoryPool as SVMemoryPool, + _tools_PooledSVM as PooledSVM, + _tools_SVMAllocator as SVMAllocator, + ) # }}} diff --git a/src/mempool.hpp b/src/mempool.hpp index 44f0fd643..812bd2f1c 100644 --- a/src/mempool.hpp +++ b/src/mempool.hpp @@ -233,7 +233,8 @@ namespace PYGPU_PACKAGE std::cout << "[pool] allocation of size " << size << " served from bin " << bin_nr << " which contained " << bin.size() << " entries" << std::endl; - return pop_block_from_bin(bin, size); + return m_allocator->hand_out_existing_block( + pop_block_from_bin(bin, size)); } size_type alloc_sz = alloc_size(bin_nr); @@ -256,7 +257,8 
@@ namespace PYGPU_PACKAGE m_allocator->try_release_blocks(); if (bin.size()) - return pop_block_from_bin(bin, size); + return m_allocator->hand_out_existing_block( + pop_block_from_bin(bin, size)); if (m_trace) std::cout << "[pool] allocation still OOM after GC" << std::endl; @@ -282,7 +284,7 @@ namespace PYGPU_PACKAGE "failed to free memory for allocation"); } - void free(pointer_type p, size_type size) + void free(pointer_type &&p, size_type size) { --m_active_blocks; m_active_bytes -= size; @@ -291,7 +293,7 @@ namespace PYGPU_PACKAGE if (!m_stop_holding) { inc_held_blocks(); - get_bin(bin_nr).push_back(p); + get_bin(bin_nr).push_back(std::move(p)); if (m_trace) std::cout << "[pool] block of size " << size << " returned to bin " @@ -300,7 +302,7 @@ namespace PYGPU_PACKAGE } else { - m_allocator->free(p); + m_allocator->free(std::move(p)); m_managed_bytes -= alloc_size(bin_nr); } } @@ -313,7 +315,7 @@ namespace PYGPU_PACKAGE while (bin.size()) { - m_allocator->free(bin.back()); + m_allocator->free(std::move(bin.back())); m_managed_bytes -= alloc_size(bin_pair.first); bin.pop_back(); @@ -353,7 +355,7 @@ namespace PYGPU_PACKAGE if (bin.size()) { - m_allocator->free(bin.back()); + m_allocator->free(std::move(bin.back())); m_managed_bytes -= alloc_size(bin_pair.first); bin.pop_back(); @@ -379,7 +381,7 @@ namespace PYGPU_PACKAGE pointer_type pop_block_from_bin(bin_t &bin, size_type size) { - pointer_type result = bin.back(); + pointer_type result(std::move(bin.back())); bin.pop_back(); dec_held_blocks(); @@ -399,7 +401,7 @@ namespace PYGPU_PACKAGE typedef typename Pool::pointer_type pointer_type; typedef typename Pool::size_type size_type; - private: + protected: PYGPU_SHARED_PTR m_pool; pointer_type m_ptr; @@ -421,7 +423,7 @@ namespace PYGPU_PACKAGE { if (m_valid) { - m_pool->free(m_ptr, m_size); + m_pool->free(std::move(m_ptr), m_size); m_valid = false; } else @@ -435,16 +437,8 @@ namespace PYGPU_PACKAGE #endif ); } - - pointer_type ptr() const - { return m_ptr; } - - size_type size() const - { return m_size; } }; } - - #endif diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 98964056f..ad60372db 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -137,10 +137,8 @@ // }}} +// {{{ macros and typedefs for wrappers - - -// {{{ tools #if PY_VERSION_HEX >= 0x02050000 typedef Py_ssize_t PYOPENCL_BUFFER_SIZE_T; #else @@ -229,8 +227,6 @@ } - - #define PYOPENCL_RETRY_IF_MEM_ERROR(OPERATION) \ { \ bool failed_with_mem_error = false; \ @@ -260,8 +256,20 @@ } \ } + +#define PYOPENCL_GET_SVM_SIZE(NAME) \ + size_t NAME##_size; \ + bool NAME##_has_size = false; \ + try \ + { \ + NAME##_size = NAME.size(); \ + NAME##_has_size = true; \ + } \ + catch (size_not_available) { } + // }}} + // {{{ tracing and error reporting #ifdef PYOPENCL_TRACE #define PYOPENCL_PRINT_CALL_TRACE(NAME) \ @@ -329,6 +337,7 @@ // }}} + // {{{ get_info helpers #define PYOPENCL_GET_OPAQUE_INFO(WHAT, FIRST_ARG, SECOND_ARG, CL_TYPE, TYPE) \ { \ @@ -383,6 +392,7 @@ // }}} + // {{{ event helpers -------------------------------------------------------------- #define PYOPENCL_PARSE_WAIT_FOR \ cl_uint num_events_in_wait_list = 0; \ @@ -424,7 +434,9 @@ // }}} + // {{{ equality testing + #define PYOPENCL_EQUALITY_TESTS(cls) \ bool operator==(cls const &other) const \ { return data() == other.data(); } \ @@ -432,8 +444,8 @@ { return data() != other.data(); } \ long hash() const \ { return (long) (intptr_t) data(); } -// }}} +// }}} namespace pyopencl @@ -496,6 +508,19 @@ namespace pyopencl // }}} + // {{{ utility functions + + inline bool 
is_queue_out_of_order(cl_command_queue queue) + { + cl_command_queue_properties param_value; + PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo, + (queue, CL_QUEUE_PROPERTIES, sizeof(param_value), ¶m_value, 0)); + return param_value & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; + } + + // }}} + + // {{{ buffer interface helper @@ -1655,6 +1680,82 @@ namespace pyopencl // }}} + // {{{ command_queue_ref + + // In contrast to command_queue, command_queue_ref is "nullable", i.e. + // it is a RAII *optional* reference to a command queue. + + class command_queue_ref + { + private: + bool m_valid; + cl_command_queue m_queue; + + public: + command_queue_ref() + : m_valid(false) + {} + + command_queue_ref(cl_command_queue queue) + : m_valid(true), m_queue(queue) + { + PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue)); + } + + command_queue_ref(command_queue_ref &&src) + : m_valid(src.m_valid), m_queue(src.m_queue) + { + src.m_valid = false; + } + + command_queue_ref(const command_queue_ref &) + { + throw error("command_queue_ref", CL_INVALID_VALUE, + "command_queue_ref copy constructor is never supposed to be called; " + "all notional invocations should be eliminated because of NRVO"); + } + + command_queue_ref &operator=(const command_queue_ref &) = delete; + + ~command_queue_ref() + { + reset(); + } + + bool is_valid() const + { + return m_valid; + } + + cl_command_queue data() const + { + if (m_valid) + return m_queue; + else + throw error("command_queue_ref.data", CL_INVALID_VALUE, + "command_queue_ref is not valid"); + } + + void reset() + { + if (m_valid) + PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseCommandQueue, (m_queue)); + m_valid = false; + } + + void set(cl_command_queue queue) + { + if (m_valid) + PYOPENCL_CALL_GUARDED(clReleaseCommandQueue, (m_queue)); + m_queue = queue; + PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue)); + m_valid = true; + } + }; + + // }}} + + // {{{ event/synchronization class event : noncopyable @@ -3441,11 +3542,26 @@ namespace pyopencl // }}} - // {{{ svm - #if PYOPENCL_CL_VERSION >= 0x2000 - class svm_arg_wrapper + // {{{ svm pointer + + class size_not_available { }; + + class svm_pointer + { + public: + virtual void *svm_ptr() const = 0; + // may throw size_not_available + virtual size_t size() const = 0; + }; + + // }}} + + + // {{{ svm_arg_wrapper + + class svm_arg_wrapper : public svm_pointer { private: void *m_ptr; @@ -3468,7 +3584,7 @@ namespace pyopencl m_size = ward->m_buf.len; } - void *ptr() const + void *svm_ptr() const { return m_ptr; } @@ -3478,17 +3594,34 @@ namespace pyopencl } }; + // }}} + + + // {{{ svm_allocation - class svm_allocation : noncopyable + class svm_allocation : public svm_pointer { private: std::shared_ptr m_context; void *m_allocation; + size_t m_size; + command_queue_ref m_queue; + // FIXME Should keep a list of events so that we can wait for users + // to finish in the case of out-of-order queues. 
public: - svm_allocation(std::shared_ptr const &ctx, size_t size, cl_uint alignment, cl_svm_mem_flags flags) - : m_context(ctx) + svm_allocation(std::shared_ptr const &ctx, size_t size, cl_uint alignment, + cl_svm_mem_flags flags, const command_queue *queue = nullptr) + : m_context(ctx), m_size(size) { + if (queue) + { + m_queue.set(queue->data()); + if (is_queue_out_of_order(m_queue.data())) + throw error("SVMAllocation.__init__", CL_INVALID_VALUE, + "supplying an out-of-order queue to SVMAllocation is invalid"); + } + PYOPENCL_PRINT_CALL_TRACE("clSVMalloc"); m_allocation = clSVMAlloc( ctx->data(), @@ -3498,6 +3631,9 @@ namespace pyopencl throw pyopencl::error("clSVMAlloc", CL_OUT_OF_RESOURCES); } + svm_allocation(const svm_allocation &) = delete; + svm_allocation &operator=(const svm_allocation &) = delete; + ~svm_allocation() { if (m_allocation) @@ -3510,8 +3646,20 @@ namespace pyopencl throw error("SVMAllocation.release", CL_INVALID_VALUE, "trying to double-unref svm allocation"); - clSVMFree(m_context->data(), m_allocation); - m_allocation = nullptr; + if (m_queue.is_valid()) + { + PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueSVMFree, ( + m_queue.data(), 1, &m_allocation, + nullptr, nullptr, + 0, nullptr, nullptr)); + m_queue.reset(); + } + else + { + PYOPENCL_PRINT_CALL_TRACE("clSVMFree"); + clSVMFree(m_context->data(), m_allocation); + m_allocation = nullptr; + } } void enqueue_release(command_queue &queue, py::object py_wait_for) @@ -3532,11 +3680,16 @@ namespace pyopencl m_allocation = nullptr; } - void *ptr() const + void *svm_ptr() const { return m_allocation; } + size_t size() const + { + return m_size; + } + intptr_t ptr_as_int() const { return (intptr_t) m_allocation; @@ -3551,22 +3704,95 @@ namespace pyopencl { return m_allocation != other.m_allocation; } + + void bind_to_queue(command_queue const &queue) + { + if (is_queue_out_of_order(queue.data())) + throw error("SVMAllocation.bind_to_queue", CL_INVALID_VALUE, + "supplying an out-of-order queue to SVMAllocation is invalid"); + + if (m_queue.is_valid()) + { + // make sure synchronization promises stay valid in new queue + cl_event evt; + + PYOPENCL_CALL_GUARDED(clEnqueueMarker, (m_queue.data(), &evt)); + PYOPENCL_CALL_GUARDED(clEnqueueWaitForEvents, (queue.data(), 1, &evt)); + } + + m_queue.set(queue.data()); + } + + void unbind_from_queue() + { + // NOTE: This absolves the allocation from any synchronization promises + // made. Keeping those before calling this method is the responsibility + // of the user. 
+ m_queue.reset(); + } }; + // }}} + + + // {{{ svm operations inline event *enqueue_svm_memcpy( command_queue &cq, cl_bool is_blocking, - svm_arg_wrapper &dst, svm_arg_wrapper &src, - py::object py_wait_for + svm_pointer &dst, svm_pointer &src, + py::object py_wait_for, + py::object byte_count_py ) { PYOPENCL_PARSE_WAIT_FOR; - if (src.size() != dst.size()) + // {{{ process size + + PYOPENCL_GET_SVM_SIZE(src); + PYOPENCL_GET_SVM_SIZE(dst); + + size_t size; + bool have_size = false; + + if (src_has_size) + { + size = src_size; + have_size = true; + } + if (dst_has_size) + { + if (have_size) + { + if (!byte_count_py.is_none()) + size = std::min(size, dst_size); + else if (size != dst_size) + throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE, + "sizes of source and destination buffer do not match"); + } + else + { + size = dst_size; + have_size = true; + } + } + + if (!byte_count_py.is_none()) + { + size_t byte_count = byte_count_py.cast(); + if (have_size && byte_count > size) + throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE, + "specified byte_count larger than size of source or destination buffers"); + size = byte_count; + have_size = true; + } + + if (!have_size) throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE, - "sizes of source and destination buffer do not match"); + "size not passed and could not be determined"); + + // }}} cl_event evt; PYOPENCL_CALL_GUARDED( @@ -3574,8 +3800,8 @@ namespace pyopencl ( cq.data(), is_blocking, - dst.ptr(), src.ptr(), - dst.size(), + dst.svm_ptr(), src.svm_ptr(), + size, PYOPENCL_WAITLIST_ARGS, &evt )); @@ -3587,7 +3813,7 @@ namespace pyopencl inline event *enqueue_svm_memfill( command_queue &cq, - svm_arg_wrapper &dst, py::object py_pattern, + svm_pointer &dst, py::object py_pattern, py::object byte_count, py::object py_wait_for ) @@ -3604,18 +3830,41 @@ namespace pyopencl pattern_ptr = pattern_ward->m_buf.buf; pattern_len = pattern_ward->m_buf.len; - size_t fill_size = dst.size(); + // {{{ process size + + PYOPENCL_GET_SVM_SIZE(dst); + + size_t size; + bool have_size = false; + if (dst_has_size) + { + size = dst_size; + have_size = true; + } if (!byte_count.is_none()) - fill_size = py::cast(byte_count); + { + size_t user_size = py::cast(byte_count); + if (have_size && user_size > size) + throw error("enqueue_svm_memfill", CL_INVALID_VALUE, + "byte_count too large for specified SVM buffer"); + } + + if (!have_size) + { + throw error("enqueue_svm_memfill", CL_INVALID_VALUE, + "byte_count not passed and could not be determined"); + } + + // }}} cl_event evt; PYOPENCL_CALL_GUARDED( clEnqueueSVMMemFill, ( cq.data(), - dst.ptr(), pattern_ptr, + dst.svm_ptr(), pattern_ptr, pattern_len, - fill_size, + size, PYOPENCL_WAITLIST_ARGS, &evt )); @@ -3629,12 +3878,40 @@ namespace pyopencl command_queue &cq, cl_bool is_blocking, cl_map_flags flags, - svm_arg_wrapper &svm, - py::object py_wait_for + svm_pointer &svm, + py::object py_wait_for, + py::object user_size_py ) { PYOPENCL_PARSE_WAIT_FOR; + // {{{ process size + + PYOPENCL_GET_SVM_SIZE(svm); + + size_t size; + bool have_size = false; + if (svm_has_size) + { + size = svm_size; + have_size = true; + } + if (!user_size_py.is_none()) + { + size_t user_size = py::cast(user_size_py); + if (have_size && user_size > size) + throw error("enqueue_svm_memfill", CL_INVALID_VALUE, + "user-provided size too large for specified SVM buffer"); + } + + if (!have_size) + { + throw error("enqueue_svm_mem_map", CL_INVALID_VALUE, + "size not passed and could not be determined"); + } + + // }}} + cl_event evt; 
PYOPENCL_CALL_GUARDED( clEnqueueSVMMap, @@ -3642,7 +3919,7 @@ namespace pyopencl cq.data(), is_blocking, flags, - svm.ptr(), svm.size(), + svm.svm_ptr(), size, PYOPENCL_WAITLIST_ARGS, &evt )); @@ -3654,7 +3931,7 @@ namespace pyopencl inline event *enqueue_svm_unmap( command_queue &cq, - svm_arg_wrapper &svm, + svm_pointer &svm, py::object py_wait_for ) { @@ -3665,7 +3942,7 @@ namespace pyopencl clEnqueueSVMUnmap, ( cq.data(), - svm.ptr(), + svm.svm_ptr(), PYOPENCL_WAITLIST_ARGS, &evt )); @@ -3691,9 +3968,9 @@ namespace pyopencl for (py::handle py_svm: svms) { - svm_arg_wrapper &svm(py::cast(py_svm)); + svm_pointer &svm(py::cast(py_svm)); - svm_pointers.push_back(svm.ptr()); + svm_pointers.push_back(svm.svm_ptr()); sizes.push_back(svm.size()); } @@ -4486,10 +4763,10 @@ namespace pyopencl } #if PYOPENCL_CL_VERSION >= 0x2000 - void set_arg_svm(cl_uint arg_index, svm_arg_wrapper const &wrp) + void set_arg_svm(cl_uint arg_index, svm_pointer const &wrp) { PYOPENCL_CALL_GUARDED(clSetKernelArgSVMPointer, - (m_kernel, arg_index, wrp.ptr())); + (m_kernel, arg_index, wrp.svm_ptr())); } #endif @@ -4511,7 +4788,7 @@ namespace pyopencl #if PYOPENCL_CL_VERSION >= 0x2000 try { - set_arg_svm(arg_index, arg.cast()); + set_arg_svm(arg_index, arg.cast()); return; } catch (py::cast_error &) { } diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index 0c9a0d1b1..3c40e1c90 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -295,25 +295,50 @@ void pyopencl_expose_part_2(py::module &m) // {{{ svm #if PYOPENCL_CL_VERSION >= 0x2000 + { + typedef svm_pointer cls; + py::class_(m, "SVMPointer", py::dynamic_attr()) + .def("_ptr_as_int", [](cls &self) { return (intptr_t) self.svm_ptr(); }) + .def("_size", [](cls &self) -> py::object + { + try + { + return py::cast(self.size()); + } + catch (size_not_available) + { + return py::none(); + } + }) + ; + } + { typedef svm_arg_wrapper cls; - py::class_(m, "SVM", py::dynamic_attr()) + py::class_(m, "SVM", py::dynamic_attr()) .def(py::init()) ; } { typedef svm_allocation cls; - py::class_(m, "SVMAllocation", py::dynamic_attr()) - .def(py::init, size_t, cl_uint, cl_svm_mem_flags>()) + py::class_(m, "SVMAllocation", py::dynamic_attr()) + .def(py::init, size_t, cl_uint, cl_svm_mem_flags, const command_queue *>(), + py::arg("context"), + py::arg("size"), + py::arg("alignment"), + py::arg("flags"), + py::arg("queue").none(true)=py::none() + ) .DEF_SIMPLE_METHOD(release) .def("enqueue_release", &cls::enqueue_release, ":returns: a :class:`pyopencl.Event`\n\n" "|std-enqueue-blurb|") - .def("_ptr_as_int", &cls::ptr_as_int) .def(py::self == py::self) .def(py::self != py::self) - .def("__hash__", &cls::ptr_as_int) + .def("__hash__", [](cls &self) { return (intptr_t) self.svm_ptr(); }) + .DEF_SIMPLE_METHOD(bind_to_queue) + .DEF_SIMPLE_METHOD(unbind_from_queue) ; } @@ -322,7 +347,8 @@ void pyopencl_expose_part_2(py::module &m) py::arg("is_blocking"), py::arg("dst"), py::arg("src"), - py::arg("wait_for")=py::none() + py::arg("wait_for")=py::none(), + py::arg("byte_count")=py::none() ); m.def("_enqueue_svm_memfill", enqueue_svm_memfill, @@ -338,7 +364,8 @@ void pyopencl_expose_part_2(py::module &m) py::arg("is_blocking"), py::arg("flags"), py::arg("svm"), - py::arg("wait_for")=py::none() + py::arg("wait_for")=py::none(), + py::arg("size")=py::none() ); m.def("_enqueue_svm_unmap", enqueue_svm_unmap, diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp index 6b014ba5e..34b5e1cea 100644 --- a/src/wrap_mempool.cpp +++ b/src/wrap_mempool.cpp @@ -1,4 +1,4 @@ -// Warp 
memory pool +// Wrap memory pool // // Copyright (C) 2009 Andreas Kloeckner // @@ -57,12 +57,18 @@ namespace { return false; } + virtual pointer_type allocate(size_type s) { return nullptr; } - void free(pointer_type p) + virtual pointer_type hand_out_existing_block(pointer_type &&p) + { + return p; + } + + void free(pointer_type &&p) { } void try_release_blocks() @@ -70,14 +76,16 @@ namespace }; - class cl_allocator_base + // {{{ buffer allocators + + class buffer_allocator_base { protected: std::shared_ptr m_context; cl_mem_flags m_flags; public: - cl_allocator_base(std::shared_ptr const &ctx, + buffer_allocator_base(std::shared_ptr const &ctx, cl_mem_flags flags=CL_MEM_READ_WRITE) : m_context(ctx), m_flags(flags) { @@ -86,21 +94,26 @@ namespace "cannot specify USE_HOST_PTR or COPY_HOST_PTR flags"); } - cl_allocator_base(cl_allocator_base const &src) + buffer_allocator_base(buffer_allocator_base const &src) : m_context(src.m_context), m_flags(src.m_flags) { } - virtual ~cl_allocator_base() + virtual ~buffer_allocator_base() { } typedef cl_mem pointer_type; typedef size_t size_type; - virtual cl_allocator_base *copy() const = 0; + virtual buffer_allocator_base *copy() const = 0; virtual bool is_deferred() const = 0; virtual pointer_type allocate(size_type s) = 0; - void free(pointer_type p) + virtual pointer_type hand_out_existing_block(pointer_type p) + { + return p; + } + + void free(pointer_type &&p) { PYOPENCL_CALL_GUARDED(clReleaseMemObject, (p)); } @@ -111,20 +124,21 @@ namespace } }; - class cl_deferred_allocator : public cl_allocator_base + + class deferred_buffer_allocator : public buffer_allocator_base { private: - typedef cl_allocator_base super; + typedef buffer_allocator_base super; public: - cl_deferred_allocator(std::shared_ptr const &ctx, + deferred_buffer_allocator(std::shared_ptr const &ctx, cl_mem_flags flags=CL_MEM_READ_WRITE) : super(ctx, flags) { } - cl_allocator_base *copy() const + buffer_allocator_base *copy() const { - return new cl_deferred_allocator(*this); + return new deferred_buffer_allocator(*this); } bool is_deferred() const @@ -141,26 +155,26 @@ namespace const unsigned zero = 0; - class cl_immediate_allocator : public cl_allocator_base + class immediate_buffer_allocator : public buffer_allocator_base { private: - typedef cl_allocator_base super; + typedef buffer_allocator_base super; pyopencl::command_queue m_queue; public: - cl_immediate_allocator(pyopencl::command_queue &queue, + immediate_buffer_allocator(pyopencl::command_queue &queue, cl_mem_flags flags=CL_MEM_READ_WRITE) : super(std::shared_ptr(queue.get_context()), flags), m_queue(queue.data(), /*retain*/ true) { } - cl_immediate_allocator(cl_immediate_allocator const &src) + immediate_buffer_allocator(immediate_buffer_allocator const &src) : super(src), m_queue(src.m_queue) { } - cl_allocator_base *copy() const + buffer_allocator_base *copy() const { - return new cl_immediate_allocator(*this); + return new immediate_buffer_allocator(*this); } bool is_deferred() const @@ -210,11 +224,13 @@ namespace } }; + // }}} + // {{{ buffer_allocator_call inline - pyopencl::buffer *allocator_call(cl_allocator_base &alloc, size_t size) + pyopencl::buffer *buffer_allocator_call(buffer_allocator_base &alloc, size_t size) { cl_mem mem; int try_count = 0; @@ -256,16 +272,18 @@ namespace } } + // }}} + // {{{ pooled_buffer class pooled_buffer - : public pyopencl::pooled_allocation >, + : public pyopencl::pooled_allocation >, public pyopencl::memory_object_holder { private: typedef - pyopencl::pooled_allocation > + 
pyopencl::pooled_allocation > super; public: @@ -275,22 +293,233 @@ namespace { } const super::pointer_type data() const - { return ptr(); } + { return m_ptr; } + + size_t size() const + { + return m_size; + } }; + // }}} + // {{{ buffer_pool_allocate - pooled_buffer *device_pool_allocate( - std::shared_ptr > pool, - pyopencl::memory_pool::size_type sz) + pooled_buffer *buffer_pool_allocate( + std::shared_ptr > pool, + pyopencl::memory_pool::size_type sz) { return new pooled_buffer(pool, sz); } + // }}} + + +#if PYOPENCL_CL_VERSION >= 0x2000 + + // {{{ svm allocator + // FIXME: Does this need deferred and immediate just like the buffer-level + // allocators? (I.e. can I tell whether I am out of memory just from allocations?) + struct svm_held_pointer + { + void *ptr; + pyopencl::command_queue_ref queue; + }; + class svm_allocator + { + public: + typedef svm_held_pointer pointer_type; + typedef size_t size_type; + + protected: + std::shared_ptr m_context; + cl_uint m_alignment; + cl_svm_mem_flags m_flags; + pyopencl::command_queue_ref m_queue; + + + public: + svm_allocator(std::shared_ptr const &ctx, + cl_uint alignment, cl_svm_mem_flags flags=CL_MEM_READ_WRITE, + pyopencl::command_queue *queue=nullptr) + : m_context(ctx), m_alignment(alignment), m_flags(flags) + { + if (queue) + m_queue.set(queue->data()); + } + + svm_allocator(svm_allocator const &src) + : m_context(src.m_context), m_alignment(src.m_alignment), + m_flags(src.m_flags) + { } + + virtual ~svm_allocator() + { } + + virtual svm_allocator *copy() const + { + return new svm_allocator(m_context, m_alignment, m_flags); + } + + virtual bool is_deferred() const + { + // FIXME: I don't know whether that's true. + return false; + } + + pointer_type allocate(size_type size) + { + if (size == 0) + return { nullptr, nullptr }; + + PYOPENCL_PRINT_CALL_TRACE("clSVMalloc"); + return { + clSVMAlloc(m_context->data(), m_flags, size, m_alignment), + pyopencl::command_queue_ref(m_queue.data()) + }; + } + + virtual pointer_type hand_out_existing_block(pointer_type &&p) + { + if (m_queue.is_valid()) + { + if (p.queue.is_valid()) + { + // make sure synchronization promises stay valid in new queue + cl_event evt; + + PYOPENCL_CALL_GUARDED(clEnqueueMarker, (p.queue.data(), &evt)); + PYOPENCL_CALL_GUARDED(clEnqueueWaitForEvents, (m_queue.data(), 1, &evt)); + } + p.queue.set(m_queue.data()); + } + return p; + } + + void free(pointer_type &&p) + { + if (p.queue.is_valid()) + { + PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueSVMFree, ( + p.queue.data(), 1, &p.ptr, + nullptr, nullptr, + 0, nullptr, nullptr)); + p.queue.reset(); + } + else + { + PYOPENCL_PRINT_CALL_TRACE("clSVMFree"); + clSVMFree(m_context->data(), p.ptr); + } + } + + void try_release_blocks() + { + pyopencl::run_python_gc(); + } + }; + + // }}} + + + // {{{ svm_allocator_call + + inline + svm_held_pointer svm_allocator_call(svm_allocator &alloc, size_t size) + { + svm_held_pointer mem; + int try_count = 0; + while (true) + { + try + { + return alloc.allocate(size); + } + catch (pyopencl::error &e) + { + if (!e.is_out_of_memory()) + throw; + if (++try_count == 2) + throw; + } + + alloc.try_release_blocks(); + } + } + + // }}} + + + // {{{ pooled_svm + + class pooled_svm + : public pyopencl::pooled_allocation>, + public pyopencl::svm_pointer + { + private: + typedef + pyopencl::pooled_allocation> + super; + + public: + pooled_svm( + std::shared_ptr p, super::size_type s) + : super(p, s) + { } + + void *svm_ptr() const + { return m_ptr.ptr; } + + size_t size() const + { return m_size; } + + void 
bind_to_queue(pyopencl::command_queue const &queue) + { + if (pyopencl::is_queue_out_of_order(queue.data())) + throw pyopencl::error("PooledSVM.bind_to_queue", CL_INVALID_VALUE, + "supplying an out-of-order queue to SVMAllocation is invalid"); + + if (m_ptr.queue.is_valid()) + { + // make sure synchronization promises stay valid in new queue + cl_event evt; + + PYOPENCL_CALL_GUARDED(clEnqueueMarker, (m_ptr.queue.data(), &evt)); + PYOPENCL_CALL_GUARDED(clEnqueueWaitForEvents, (queue.data(), 1, &evt)); + } + + m_ptr.queue.set(queue.data()); + } + + void unbind_from_queue() + { + // NOTE: This absolves the allocation from any synchronization promises + // made. Keeping those before calling this method is the responsibility + // of the user. + m_ptr.queue.reset(); + } + }; + + // }}} + + + // {{{ svm_pool_allocate + + pooled_svm *svm_pool_allocate( + std::shared_ptr > pool, + pyopencl::memory_pool::size_type sz) + { + return new pooled_svm(pool, sz); + } + + // }}} + +#endif + template void expose_memory_pool(Wrapper &wrapper) { @@ -316,11 +545,11 @@ void pyopencl_expose_mempool(py::module &m) m.def("bitlog2", pyopencl::bitlog2); { - typedef cl_allocator_base cls; + typedef buffer_allocator_base cls; py::class_ wrapper( m, "_tools_AllocatorBase"/*, py::no_init */); wrapper - .def("__call__", allocator_call) + .def("__call__", buffer_allocator_call) ; } @@ -345,8 +574,8 @@ void pyopencl_expose_mempool(py::module &m) } { - typedef cl_deferred_allocator cls; - py::class_ wrapper( + typedef deferred_buffer_allocator cls; + py::class_ wrapper( m, "_tools_DeferredAllocator"); wrapper .def(py::init< @@ -359,8 +588,8 @@ void pyopencl_expose_mempool(py::module &m) } { - typedef cl_immediate_allocator cls; - py::class_ wrapper( + typedef immediate_buffer_allocator cls; + py::class_ wrapper( m, "_tools_ImmediateAllocator"); wrapper .def(py::init()) @@ -370,18 +599,18 @@ void pyopencl_expose_mempool(py::module &m) } { - typedef pyopencl::memory_pool cls; + typedef pyopencl::memory_pool cls; py::class_< cls, /* boost::noncopyable, */ - std::shared_ptr> wrapper( m, "MemoryPool"); + std::shared_ptr> wrapper( m, "_tools_MemoryPool"); wrapper - .def(py::init(), + .def(py::init(), py::arg("allocator"), py::arg("leading_bits_in_bin_id")=4 ) - .def("allocate", device_pool_allocate) - .def("__call__", device_pool_allocate) + .def("allocate", buffer_pool_allocate) + .def("__call__", buffer_pool_allocate) // undoc for now .DEF_SIMPLE_METHOD(set_trace) ; @@ -393,8 +622,70 @@ void pyopencl_expose_mempool(py::module &m) typedef pooled_buffer cls; py::class_( - m, "PooledBuffer"/* , py::no_init */) + m, "_tools_PooledBuffer"/* , py::no_init */) + .def("release", &cls::free) + // undocumented for now, for consistency with SVM + .def("bind_to_queue", [](cls &self, pyopencl::command_queue &queue) { /* no-op */ }) + .def("unbind_from_queue", [](cls &self) { /* no-op */ }) + ; + } + +#if PYOPENCL_CL_VERSION >= 0x2000 + { + typedef pyopencl::memory_pool cls; + + py::class_< + cls, /* boost::noncopyable, */ + std::shared_ptr> wrapper( m, "_tools_SVMemoryPool"); + wrapper + .def(py::init(), + py::arg("allocator"), + py::arg("leading_bits_in_bin_id")=4 + ) + .def("allocate", svm_pool_allocate) + .def("__call__", svm_pool_allocate) + // undoc for now + .DEF_SIMPLE_METHOD(set_trace) + ; + + expose_memory_pool(wrapper); + } + + { + typedef pooled_svm cls; + py::class_( + m, "_tools_PooledSVM"/* , py::no_init */) .def("release", &cls::free) + .def("__eq__", [](const cls &self, const cls &other) + { return self.svm_ptr() == 
other.svm_ptr(); }) + .def("__hash__", [](cls &self) { return (intptr_t) self.svm_ptr(); }) + .DEF_SIMPLE_METHOD(bind_to_queue) + .DEF_SIMPLE_METHOD(unbind_from_queue) ; } + + { + typedef svm_allocator cls; + py::class_ wrapper( + m, "_tools_SVMAllocator"); + wrapper + .def(py::init const &, cl_uint, cl_uint, pyopencl::command_queue *>(), + py::arg("context"), + py::arg("alignment"), + py::arg("flags")=CL_MEM_READ_WRITE, + py::arg("command_queue").none(true)=nullptr + ) + .def("__call__", svm_allocator_call) + ; + } + + { + typedef svm_held_pointer cls; + py::class_ wrapper( + m, "_tools_SVMHeldPointer"); + } +#endif } + +// vim: foldmethod=marker
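
A rough usage sketch of the entry points this patch introduces (`enqueue_fill`, queue-bound `SVMAllocation`, and the SVM memory pool). It is not part of the diff above; it assumes an OpenCL 2.0 device with coarse-grain SVM support, and the variable names and sizes are illustrative only:

    import numpy as np
    import pyopencl as cl

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # Buffer destination: enqueue_fill dispatches to clEnqueueFillBuffer
    # (requires OpenCL 1.2).
    buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=4096)
    cl.enqueue_fill(queue, buf, np.float32(0), size=4096)

    # SVM destination: the allocation remembers its (in-order) queue so
    # that freeing it can be ordered against still-enqueued work;
    # enqueue_fill dispatches to clEnqueueSVMMemFill.
    svm = cl.SVMAllocation(ctx, 4096, alignment=0,
                           flags=cl.svm_mem_flags.READ_WRITE, queue=queue)
    cl.enqueue_fill(queue, svm, np.float32(7), size=4096)
    queue.finish()

The pooled variant follows the same pattern; a sketch of the pool and of re-binding an allocation to a different in-order queue, again with illustrative names:

    from pyopencl.tools import SVMAllocator, SVMemoryPool

    svm_alloc = SVMAllocator(ctx, 0, cl.svm_mem_flags.READ_WRITE, queue)
    svm_pool = SVMemoryPool(svm_alloc)

    chunk = svm_pool(4096)        # a PooledSVM, which is an SVMPointer
    cl.enqueue_fill(queue, chunk, np.float32(1), size=4096)

    queue2 = cl.CommandQueue(ctx)
    chunk.bind_to_queue(queue2)   # later frees are ordered against queue2
    chunk.release()               # returns the block to the pool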