From 51bf88d9ad63930e5637a77f09ac50c95630929f Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 1 Jul 2022 01:09:44 -0500 Subject: [PATCH 01/15] __init__: import warn once at the top --- pyopencl/__init__.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index 6420e5afb..016ad4d82 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -21,6 +21,7 @@ """ from sys import intern +from warnings import warn from pyopencl.version import VERSION, VERSION_STATUS, VERSION_TEXT # noqa @@ -43,7 +44,6 @@ import os from os.path import dirname, join, realpath if realpath(join(os.getcwd(), "pyopencl")) == realpath(dirname(__file__)): - from warnings import warn warn("It looks like you are importing PyOpenCL from " "its source directory. This likely won't work.") raise @@ -267,7 +267,6 @@ class CommandQueueUsedAfterExit(UserWarning): def compiler_output(text): import os - from warnings import warn if int(os.environ.get("PYOPENCL_COMPILER_OUTPUT", "0")): warn(text, CompilerWarning) else: @@ -389,7 +388,6 @@ def enable_debugging(platform_or_context): import os os.environ["CPU_MAX_COMPUTE_UNITS"] = "1" else: - from warnings import warn warn("do not know how to enable debugging on '%s'" % platform.name) @@ -428,7 +426,6 @@ def _get_prg(self): return self._prg else: # "no program" can only happen in from-source case. - from warnings import warn warn("Pre-build attribute access defeats compiler caching.", stacklevel=3) @@ -662,7 +659,6 @@ def device_hashable_model_and_version_identifier(self): return ("v1", self.vendor, self.vendor_id, self.name, self.version) def device_persistent_unique_id(self): - from warnings import warn warn("Device.persistent_unique_id is deprecated. " "Use Device.hashable_model_and_version_identifier instead.", DeprecationWarning, stacklevel=2) @@ -684,7 +680,6 @@ def device_persistent_unique_id(self): def context_init(self, devices, properties, dev_type, cache_dir=None): if cache_dir is not None: - from warnings import warn warn("The 'cache_dir' argument to the Context constructor " "is deprecated and no longer has an effect. " "It was removed because it only applied to the wrapper " @@ -970,7 +965,6 @@ def image_init(self, context, flags, format, shape=None, pitches=None, if hostbuf is not None and not \ (flags & (mem_flags.USE_HOST_PTR | mem_flags.COPY_HOST_PTR)): - from warnings import warn warn("'hostbuf' was passed, but no memory flags to make use of it.") if hostbuf is None and pitches is not None: @@ -1043,7 +1037,6 @@ def image_init(self, context, flags, format, shape=None, pitches=None, class _ImageInfoGetter: def __init__(self, event): - from warnings import warn warn("Image.image.attr is deprecated and will go away in 2021. 
" "Use Image.attr directly, instead.") @@ -1927,7 +1920,6 @@ def enqueue_barrier(queue, wait_for=None): def enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for=None): if not (queue._get_cl_version() >= (1, 2) and get_cl_header_version() >= (1, 2)): - from warnings import warn warn("The context for this queue does not declare OpenCL 1.2 support, so " "the next thing you might see is a crash") From 3446d215118b36acbe95998ca67b1faed09ae248 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 1 Jul 2022 01:14:58 -0500 Subject: [PATCH 02/15] Add is_queue_in_order --- src/wrap_cl.hpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 98964056f..8de8cd0df 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -496,6 +496,19 @@ namespace pyopencl // }}} + // {{{ utility functions + + inline bool is_queue_in_order(cl_command_queue queue) + { + cl_command_queue_properties param_value; + PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo, + (queue, CL_QUEUE_PROPERTIES, sizeof(param_value), ¶m_value, 0)); + return !(param_value & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE); + } + + // }}} + + // {{{ buffer interface helper From 9722b58779d78599bed789a5ed5637ff0dce03b5 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 1 Jul 2022 01:15:32 -0500 Subject: [PATCH 03/15] Add command_queue_ref --- src/wrap_cl.hpp | 70 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 8de8cd0df..8147cb183 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -1668,6 +1668,76 @@ namespace pyopencl // }}} + // {{{ command_queue_ref + + // In contrast to command_queue, command_queue_ref is "nullable", i.e. + // it is a RAII *optional* reference to a command queue. 
+ + class command_queue_ref + { + private: + bool m_valid; + cl_command_queue m_queue; + + public: + command_queue_ref() + : m_valid(false) + {} + + command_queue_ref(cl_command_queue queue) + : m_valid(true), m_queue(queue) + { + PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue)); + } + + command_queue_ref(command_queue_ref &&src) + : m_valid(src.m_valid), m_queue(src.m_queue) + { + src.m_valid = false; + } + + command_queue_ref(const command_queue_ref &) = delete; + command_queue_ref &operator=(const command_queue_ref &) = delete; + + ~command_queue_ref() + { + reset(); + } + + bool is_valid() const + { + return m_valid; + } + + cl_command_queue data() const + { + if (m_valid) + return m_queue; + else + throw error("command_queue_ref.data", CL_INVALID_VALUE, + "command_queue_ref is not valid"); + } + + void reset() + { + if (m_valid) + PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseCommandQueue, (m_queue)); + m_valid = false; + } + + void set(cl_command_queue queue) + { + if (m_valid) + PYOPENCL_CALL_GUARDED(clReleaseCommandQueue, (m_queue)); + m_queue = queue; + PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue)); + m_valid = true; + } + }; + + // }}} + + // {{{ event/synchronization class event : noncopyable From b65cbe4c9700ae42d6ee8136682530adb6cdd4e2 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 30 Jun 2022 23:18:16 -0500 Subject: [PATCH 04/15] Add, use enqueue_fill --- doc/runtime_memory.rst | 2 ++ pyopencl/__init__.py | 21 +++++++++++++++++++++ pyopencl/array.py | 4 ++-- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/doc/runtime_memory.rst b/doc/runtime_memory.rst index f4e01f266..ffe6661e8 100644 --- a/doc/runtime_memory.rst +++ b/doc/runtime_memory.rst @@ -281,6 +281,8 @@ Transfers .. autofunction:: enqueue_copy(queue, dest, src, **kwargs) +.. autofunction:: enqueue_fill(queue, dest, src, **kwargs) + Mapping Memory into Host Address Space -------------------------------------- diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index 016ad4d82..86c77f05b 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -22,6 +22,7 @@ from sys import intern from warnings import warn +from typing import Union, Any from pyopencl.version import VERSION, VERSION_STATUS, VERSION_TEXT # noqa @@ -1815,6 +1816,26 @@ def enqueue_copy(queue, dest, src, **kwargs): # }}} +# {{{ enqueue_fill + +def enqueue_fill(queue: CommandQueue, dest: Union[MemoryObjectHolder, SVM], + pattern: Any, size: int, *, offset: int = 0, wait_for=None) -> Event: + """ + .. 
versionadded:: 2022.2 + """ + if isinstance(dest, MemoryObjectHolder): + return enqueue_fill_buffer(queue, dest, pattern, offset, size, wait_for) + elif isinstance(dest, SVM): + if offset: + raise NotImplementedError("enqueue_fill with SVM does not yet support " + "offsets") + return enqueue_svm_memfill(queue, dest, pattern, size, wait_for) + else: + raise TypeError(f"enqueue_fill does not know how to fill '{type(dest)}'") + +# }}} + + # {{{ image creation DTYPE_TO_CHANNEL_TYPE = { diff --git a/pyopencl/array.py b/pyopencl/array.py index cb66121a5..28c253c21 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -1458,8 +1458,8 @@ def _zero_fill(self, queue=None, wait_for=None): # https://github.com/inducer/pyopencl/issues/395 if cl_version_gtr_1_2 and not (on_nvidia and self.nbytes >= 2**31): self.add_event( - cl.enqueue_fill_buffer(queue, self.base_data, np.int8(0), - self.offset, self.nbytes, wait_for=wait_for)) + cl.enqueue_fill(queue, self.base_data, np.int8(0), + self.nbytes, offset=self.offset, wait_for=wait_for)) else: zero = np.zeros((), self.dtype) self.fill(zero, queue=queue) From fe0d49e32dbf6d8c5d1843e6ac20190632b8f617 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 29 Mar 2021 00:21:04 -0500 Subject: [PATCH 05/15] Experiment with SVM backing arrays --- examples/demo_array_svm.py | 57 ++++++++++++++++++++++++++++++++++++++ pyopencl/array.py | 22 +++++++++++---- 2 files changed, 73 insertions(+), 6 deletions(-) create mode 100644 examples/demo_array_svm.py diff --git a/examples/demo_array_svm.py b/examples/demo_array_svm.py new file mode 100644 index 000000000..07454f1fe --- /dev/null +++ b/examples/demo_array_svm.py @@ -0,0 +1,57 @@ +import pyopencl as cl +import pyopencl.array as cl_array +import numpy as np +import numpy.linalg as la + +n = 5000000 +a = np.random.rand(n).astype(np.float32) +b = np.random.rand(n).astype(np.float32) + + +class SVMAllocator: + def __init__(self, ctx, flags, alignment): + self._context = ctx + self._flags = flags + self._alignment = alignment + + def __call__(self, nbytes): + return cl.SVM(cl.svm_empty( + ctx, self._flags, (nbytes,), np.int8, "C", self._alignment)) + + +ctx = cl.create_some_context() +queue = cl.CommandQueue(ctx) + +alloc = SVMAllocator(ctx, + cl.svm_mem_flags.READ_WRITE | cl.svm_mem_flags.SVM_FINE_GRAIN_BUFFER, + 0) + +a_dev = cl_array.to_device(queue, a, allocator=alloc) +print("A_DEV", a_dev.data.mem.nbytes, a_dev.data.mem.__array_interface__) +b_dev = cl_array.to_device(queue, b, allocator=alloc) +dest_dev = cl_array.empty_like(a_dev) +print("DEST", dest_dev.data.mem.__array_interface__) + +prg = cl.Program(ctx, """ + __kernel void sum(__global const float *a, + __global const float *b, __global float *c) + { + int gid = get_global_id(0); + c[gid] = a[gid] + b[gid]; + } + """).build() + +knl = prg.sum # Use this Kernel object for repeated calls +knl(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data) + +# PROBLEM: numpy frees the temporary out of (a_dev+b_dev) before +# we're done with it +diff = dest_dev - (a_dev+b_dev) +if 0: + diff = diff.get() + np.set_printoptions(linewidth=400) + print(dest_dev) + print((a_dev+b_dev).get()) + print(diff) + print(la.norm(diff)) + print("A_DEV", a_dev.data.mem.__array_interface__) diff --git a/pyopencl/array.py b/pyopencl/array.py index 28c253c21..540e43ad5 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -720,9 +720,14 @@ def set(self, ary, queue=None, async_=None, **kwargs): stacklevel=2) if self.size: - event1 = cl.enqueue_copy(queue or self.queue, 
self.base_data, ary, - device_offset=self.offset, - is_blocking=not async_) + if self.offset: + event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary, + device_offset=self.offset, + is_blocking=not async_) + else: + event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary, + is_blocking=not async_) + self.add_event(event1) def _get(self, queue=None, ary=None, async_=None, **kwargs): @@ -770,9 +775,14 @@ def _get(self, queue=None, ary=None, async_=None, **kwargs): "to associate one.") if self.size: - event1 = cl.enqueue_copy(queue, ary, self.base_data, - device_offset=self.offset, - wait_for=self.events, is_blocking=not async_) + if self.offset: + event1 = cl.enqueue_copy(queue, ary, self.base_data, + device_offset=self.offset, + wait_for=self.events, is_blocking=not async_) + else: + event1 = cl.enqueue_copy(queue, ary, self.base_data, + wait_for=self.events, is_blocking=not async_) + self.add_event(event1) else: event1 = None From f3684b8905a2b5734980fd1d7d3f3028617c9b43 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 27 Jun 2022 01:12:54 -0500 Subject: [PATCH 06/15] Allow tying an svm_allocation to a queue --- pyopencl/__init__.py | 14 +++++++++---- src/wrap_cl.hpp | 45 ++++++++++++++++++++++++++++++++++++++---- src/wrap_cl_part_2.cpp | 10 +++++++++- 3 files changed, 60 insertions(+), 9 deletions(-) diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index 86c77f05b..5b8d8b72e 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -1146,12 +1146,13 @@ def memory_map_exit(self, exc_type, exc_val, exc_tb): if get_cl_header_version() >= (2, 0): svmallocation_old_init = SVMAllocation.__init__ - def svmallocation_init(self, ctx, size, alignment, flags, _interface=None): + def svmallocation_init(self, ctx, size, alignment, flags, _interface=None, + queue=None): """ :arg ctx: a :class:`Context` :arg flags: some of :class:`svm_mem_flags`. """ - svmallocation_old_init(self, ctx, size, alignment, flags) + svmallocation_old_init(self, ctx, size, alignment, flags, queue) # mem_flags.READ_ONLY applies to kernels, not the host read_write = True @@ -1996,7 +1997,7 @@ def enqueue_svm_migratemem(queue, svms, flags, wait_for=None): wait_for) -def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None): +def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None, queue=None): """Allocate an empty :class:`numpy.ndarray` of the given *shape*, *dtype* and *order*. (See :func:`numpy.empty` for the meaning of these arguments.) The array will be allocated in shared virtual memory belonging @@ -2014,6 +2015,10 @@ def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None): will likely want to wrap the returned array in an :class:`SVM` tag. .. versionadded:: 2016.2 + + .. versionchanged:: 2022.2 + + *queue* argument added. 
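+
+    A minimal usage sketch (illustrative only; *ctx* is assumed to be a
+    :class:`Context` on an OpenCL 2.0 platform)::
+
+        ary = svm_empty(ctx, svm_mem_flags.READ_WRITE, 10, np.float32)
+        svm_ary = SVM(ary)   # wrap for use as a kernel argument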
""" dtype = np.dtype(dtype) @@ -2060,7 +2065,8 @@ def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None): if alignment is None: alignment = itemsize - svm_alloc = SVMAllocation(ctx, nbytes, alignment, flags, _interface=interface) + svm_alloc = SVMAllocation(ctx, nbytes, alignment, flags, _interface=interface, + queue=queue) return np.asarray(svm_alloc) diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 8147cb183..2582a5103 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -3562,16 +3562,28 @@ namespace pyopencl }; - class svm_allocation : noncopyable + class svm_allocation { private: std::shared_ptr m_context; void *m_allocation; + command_queue_ref m_queue; + // FIXME Should keep a list of events so that we can wait for users + // to finish in the case of out-of-order queues. public: - svm_allocation(std::shared_ptr const &ctx, size_t size, cl_uint alignment, cl_svm_mem_flags flags) + svm_allocation(std::shared_ptr const &ctx, size_t size, cl_uint alignment, + cl_svm_mem_flags flags, const command_queue *queue = nullptr) : m_context(ctx) { + if (queue) + { + m_queue.set(queue->data()); + if (!is_queue_in_order(m_queue.data())) + throw error("SVMAllocation.__init__", CL_INVALID_VALUE, + "supplying an out-of-order queue to SVMAllocation is invalid"); + } + PYOPENCL_PRINT_CALL_TRACE("clSVMalloc"); m_allocation = clSVMAlloc( ctx->data(), @@ -3581,6 +3593,9 @@ namespace pyopencl throw pyopencl::error("clSVMAlloc", CL_OUT_OF_RESOURCES); } + svm_allocation(const svm_allocation &) = delete; + svm_allocation &operator=(const svm_allocation &) = delete; + ~svm_allocation() { if (m_allocation) @@ -3593,8 +3608,20 @@ namespace pyopencl throw error("SVMAllocation.release", CL_INVALID_VALUE, "trying to double-unref svm allocation"); - clSVMFree(m_context->data(), m_allocation); - m_allocation = nullptr; + if (m_queue.is_valid()) + { + PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueSVMFree, ( + m_queue.data(), 1, &m_allocation, + nullptr, nullptr, + 0, nullptr, nullptr)); + m_queue.reset(); + } + else + { + PYOPENCL_PRINT_CALL_TRACE("clSVMFree"); + clSVMFree(m_context->data(), m_allocation); + m_allocation = nullptr; + } } void enqueue_release(command_queue &queue, py::object py_wait_for) @@ -3634,6 +3661,16 @@ namespace pyopencl { return m_allocation != other.m_allocation; } + + void bind_to_queue(command_queue const &queue) + { + m_queue.set(queue.data()); + } + + void unbind_from_queue() + { + m_queue.reset(); + } }; diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index 0c9a0d1b1..ace5e9777 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -305,7 +305,13 @@ void pyopencl_expose_part_2(py::module &m) { typedef svm_allocation cls; py::class_(m, "SVMAllocation", py::dynamic_attr()) - .def(py::init, size_t, cl_uint, cl_svm_mem_flags>()) + .def(py::init, size_t, cl_uint, cl_svm_mem_flags, const command_queue *>(), + py::arg("context"), + py::arg("size"), + py::arg("alignment"), + py::arg("flags"), + py::arg("queue").none(true)=py::none() + ) .DEF_SIMPLE_METHOD(release) .def("enqueue_release", &cls::enqueue_release, ":returns: a :class:`pyopencl.Event`\n\n" @@ -314,6 +320,8 @@ void pyopencl_expose_part_2(py::module &m) .def(py::self == py::self) .def(py::self != py::self) .def("__hash__", &cls::ptr_as_int) + .DEF_SIMPLE_METHOD(bind_to_queue) + .DEF_SIMPLE_METHOD(unbind_from_queue) ; } From 18a52ae3cfad2a7e87923cef0261340c8e7c36d0 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 27 Jun 2022 01:13:52 -0500 Subject: [PATCH 07/15] Add fold markers in 
wrap_mempool --- src/wrap_mempool.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp index 6b014ba5e..2738ccca2 100644 --- a/src/wrap_mempool.cpp +++ b/src/wrap_mempool.cpp @@ -70,6 +70,8 @@ namespace }; + // {{{ cl allocators + class cl_allocator_base { protected: @@ -210,8 +212,10 @@ namespace } }; + // }}} + // {{{ allocator_call inline pyopencl::buffer *allocator_call(cl_allocator_base &alloc, size_t size) @@ -256,8 +260,10 @@ namespace } } + // }}} + // {{{ pooled_buffer class pooled_buffer : public pyopencl::pooled_allocation >, @@ -278,8 +284,10 @@ namespace { return ptr(); } }; + // }}} + // {{{{ device_pool_allocate pooled_buffer *device_pool_allocate( std::shared_ptr > pool, @@ -288,6 +296,9 @@ namespace return new pooled_buffer(pool, sz); } + // }}} + + @@ -398,3 +409,5 @@ void pyopencl_expose_mempool(py::module &m) ; } } + +// vim: foldmethod=marker From 130527217a651bc1ffd8785fc68c2d7bb5b08576 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 27 Jun 2022 01:14:15 -0500 Subject: [PATCH 08/15] Draft svm_allocator for mempool --- src/wrap_mempool.cpp | 54 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp index 2738ccca2..35630b036 100644 --- a/src/wrap_mempool.cpp +++ b/src/wrap_mempool.cpp @@ -299,6 +299,60 @@ namespace // }}} + // {{{ svm allocator + + // FIXME: Does this need deferred and immediate just like the buffer-level + // allocators? (I.e. can I tell whether I am out of memory just from allocations?) + class svm_allocator + { + protected: + std::shared_ptr m_context; + cl_uint m_alignment; + cl_mem_flags m_flags; + + public: + svm_allocator(std::shared_ptr const &ctx, + cl_uint alignment, cl_mem_flags flags=CL_MEM_READ_WRITE) + : m_context(ctx), m_alignment(alignment), m_flags(flags) + { + if (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) + throw pyopencl::error("Allocator", CL_INVALID_VALUE, + "cannot specify USE_HOST_PTR or COPY_HOST_PTR flags"); + } + + svm_allocator(svm_allocator const &src) + : m_context(src.m_context), m_alignment(src.m_alignment), + m_flags(src.m_flags) + { } + + virtual ~svm_allocator() + { } + + typedef void *pointer_type; + typedef size_t size_type; + + pointer_type allocate(size_type size) + { + if (size == 0) + return nullptr; + + PYOPENCL_PRINT_CALL_TRACE("clSVMalloc"); + return clSVMAlloc(m_context->data(), m_flags, size, m_alignment); + } + + void free(pointer_type p) + { + clSVMFree(m_context->data(), p); + } + + void try_release_blocks() + { + pyopencl::run_python_gc(); + } + }; + + // }}} + From 008ac676bd03fdfa6dd7e60afd594cf14cbfa728 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 1 Jul 2022 01:11:50 -0500 Subject: [PATCH 09/15] enqueue_copy for SVM: Accept no-op {src,dest}_offset, byte_count parameters --- pyopencl/__init__.py | 3 +++ src/wrap_cl_part_2.cpp | 1 + 2 files changed, 4 insertions(+) diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index 5b8d8b72e..12ae3be04 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -1779,6 +1779,9 @@ def enqueue_copy(queue, dest, src, **kwargs): src = SVM(src) is_blocking = kwargs.pop("is_blocking", True) + assert kwargs.pop("src_offset", 0) == 0 + assert kwargs.pop("dest_offset", 0) == 0 + assert "byte_count" not in kwargs or kwargs.pop("byte_count") == src._size() return _cl._enqueue_svm_memcpy(queue, is_blocking, dest, src, **kwargs) else: diff --git a/src/wrap_cl_part_2.cpp 
b/src/wrap_cl_part_2.cpp index ace5e9777..a6f604ee7 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -299,6 +299,7 @@ void pyopencl_expose_part_2(py::module &m) typedef svm_arg_wrapper cls; py::class_(m, "SVM", py::dynamic_attr()) .def(py::init()) + .def("_size", &cls::size) ; } From a71ba747a473b383dee1a5ccd5954865538250b7 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 1 Jul 2022 01:17:38 -0500 Subject: [PATCH 10/15] Add SVMAllocator (in Python) --- pyopencl/tools.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pyopencl/tools.py b/pyopencl/tools.py index 27adac75b..1b027f0ff 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -67,6 +67,25 @@ def _register_types(): # }}} +# {{{ svm allocator + +# FIXME: Replace me with C++ +class SVMAllocator: + def __init__(self, ctx, flags, *, alignment=0, queue=None): + self._context = ctx + self._flags = flags + self._alignment = alignment + self._queue = queue + + def __call__(self, nbytes): + import pyopencl as cl + return cl.SVM(cl.svm_empty( + self._context, self._flags, (nbytes,), np.int8, "C", self._alignment, + self._queue)) + +# }}} + + # {{{ first-arg caches _first_arg_dependent_caches = [] From 04b9bb22cf9fa1a5d38db2b54822f74371a11f29 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 1 Jul 2022 01:18:13 -0500 Subject: [PATCH 11/15] Jostle some section headers in wrap_cl.hpp --- src/wrap_cl.hpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 2582a5103..ec6089ccc 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -137,10 +137,8 @@ // }}} +// {{{ macros and typedefs for wrappers - - -// {{{ tools #if PY_VERSION_HEX >= 0x02050000 typedef Py_ssize_t PYOPENCL_BUFFER_SIZE_T; #else @@ -262,6 +260,7 @@ // }}} + // {{{ tracing and error reporting #ifdef PYOPENCL_TRACE #define PYOPENCL_PRINT_CALL_TRACE(NAME) \ @@ -329,6 +328,7 @@ // }}} + // {{{ get_info helpers #define PYOPENCL_GET_OPAQUE_INFO(WHAT, FIRST_ARG, SECOND_ARG, CL_TYPE, TYPE) \ { \ @@ -383,6 +383,7 @@ // }}} + // {{{ event helpers -------------------------------------------------------------- #define PYOPENCL_PARSE_WAIT_FOR \ cl_uint num_events_in_wait_list = 0; \ @@ -424,7 +425,9 @@ // }}} + // {{{ equality testing + #define PYOPENCL_EQUALITY_TESTS(cls) \ bool operator==(cls const &other) const \ { return data() == other.data(); } \ @@ -432,8 +435,8 @@ { return data() != other.data(); } \ long hash() const \ { return (long) (intptr_t) data(); } -// }}} +// }}} namespace pyopencl From c6685de8ca224382c8afdc41740e2ade3ecf736d Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 2 Jul 2022 00:07:27 -0500 Subject: [PATCH 12/15] Add SVM._ptr_as_int() --- src/wrap_cl_part_2.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index a6f604ee7..a2b5cde17 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -299,6 +299,7 @@ void pyopencl_expose_part_2(py::module &m) typedef svm_arg_wrapper cls; py::class_(m, "SVM", py::dynamic_attr()) .def(py::init()) + .def("_ptr_as_int", [](cls &self) { return (intptr_t) self.ptr(); }) .def("_size", &cls::size) ; } From 0dcd53e5aca9ab3c9d2a2fc4dd012cba8b47c03c Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 7 Jul 2022 17:52:07 +0200 Subject: [PATCH 13/15] Move from SVM(np.ndarray(SVMAllocation)) to bare SVMAllocation --- examples/demo_array_svm.py | 23 ++--- pyopencl/__init__.py | 60 +++++------ pyopencl/tools.py | 6 +- src/wrap_cl.hpp | 
200 ++++++++++++++++++++++++++++++++----- src/wrap_cl_part_2.cpp | 33 ++++-- 5 files changed, 234 insertions(+), 88 deletions(-) diff --git a/examples/demo_array_svm.py b/examples/demo_array_svm.py index 07454f1fe..054e582c0 100644 --- a/examples/demo_array_svm.py +++ b/examples/demo_array_svm.py @@ -1,36 +1,24 @@ import pyopencl as cl import pyopencl.array as cl_array +from pyopencl.tools import SVMAllocator import numpy as np import numpy.linalg as la -n = 5000000 +n = 500000 a = np.random.rand(n).astype(np.float32) b = np.random.rand(n).astype(np.float32) -class SVMAllocator: - def __init__(self, ctx, flags, alignment): - self._context = ctx - self._flags = flags - self._alignment = alignment - - def __call__(self, nbytes): - return cl.SVM(cl.svm_empty( - ctx, self._flags, (nbytes,), np.int8, "C", self._alignment)) - - ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) -alloc = SVMAllocator(ctx, - cl.svm_mem_flags.READ_WRITE | cl.svm_mem_flags.SVM_FINE_GRAIN_BUFFER, - 0) +alloc = SVMAllocator(ctx, cl.svm_mem_flags.READ_WRITE, queue=queue) a_dev = cl_array.to_device(queue, a, allocator=alloc) -print("A_DEV", a_dev.data.mem.nbytes, a_dev.data.mem.__array_interface__) +print("A_DEV", a_dev.data) b_dev = cl_array.to_device(queue, b, allocator=alloc) dest_dev = cl_array.empty_like(a_dev) -print("DEST", dest_dev.data.mem.__array_interface__) +print("DEST", dest_dev.data) prg = cl.Program(ctx, """ __kernel void sum(__global const float *a, @@ -47,6 +35,7 @@ def __call__(self, nbytes): # PROBLEM: numpy frees the temporary out of (a_dev+b_dev) before # we're done with it diff = dest_dev - (a_dev+b_dev) + if 0: diff = diff.get() np.set_printoptions(linewidth=400) diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index 12ae3be04..22562d1f9 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -200,11 +200,9 @@ if get_cl_header_version() >= (2, 0): from pyopencl._cl import ( # noqa - SVMAllocation, + SVMPointer, SVM, - - # FIXME - #enqueue_svm_migratemem, + SVMAllocation, ) if _cl.have_gl(): @@ -1144,25 +1142,19 @@ def memory_map_exit(self, exc_type, exc_val, exc_tb): """ if get_cl_header_version() >= (2, 0): - svmallocation_old_init = SVMAllocation.__init__ - - def svmallocation_init(self, ctx, size, alignment, flags, _interface=None, - queue=None): - """ - :arg ctx: a :class:`Context` - :arg flags: some of :class:`svm_mem_flags`. - """ - svmallocation_old_init(self, ctx, size, alignment, flags, queue) - - # mem_flags.READ_ONLY applies to kernels, not the host - read_write = True - _interface["data"] = ( - int(self._ptr_as_int()), not read_write) - - self.__array_interface__ = _interface - - if get_cl_header_version() >= (2, 0): - SVMAllocation.__init__ = svmallocation_init + class _ArrayInterfaceSVMAllocation(SVMAllocation): + def __init__(self, ctx, size, alignment, flags, _interface=None, + queue=None): + """ + :arg ctx: a :class:`Context` + :arg flags: some of :class:`svm_mem_flags`. 
+ """ + super().__init__(ctx, size, alignment, flags, queue) + + # mem_flags.READ_ONLY applies to kernels, not the host + read_write = True + _interface["data"] = ( + int(self._ptr_as_int()), not read_write) # }}} @@ -1773,15 +1765,14 @@ def enqueue_copy(queue, dest, src, **kwargs): else: raise ValueError("invalid dest mem object type") - elif get_cl_header_version() >= (2, 0) and isinstance(dest, SVM): + elif get_cl_header_version() >= (2, 0) and isinstance(dest, SVMPointer): # to SVM - if not isinstance(src, SVM): + if not isinstance(src, SVMPointer): src = SVM(src) is_blocking = kwargs.pop("is_blocking", True) assert kwargs.pop("src_offset", 0) == 0 assert kwargs.pop("dest_offset", 0) == 0 - assert "byte_count" not in kwargs or kwargs.pop("byte_count") == src._size() return _cl._enqueue_svm_memcpy(queue, is_blocking, dest, src, **kwargs) else: @@ -1807,7 +1798,7 @@ def enqueue_copy(queue, dest, src, **kwargs): queue, src, origin, region, dest, **kwargs) else: raise ValueError("invalid src mem object type") - elif isinstance(src, SVM): + elif isinstance(src, SVMPointer): # from svm # dest is not a SVM instance, otherwise we'd be in the branch above is_blocking = kwargs.pop("is_blocking", True) @@ -1822,14 +1813,14 @@ def enqueue_copy(queue, dest, src, **kwargs): # {{{ enqueue_fill -def enqueue_fill(queue: CommandQueue, dest: Union[MemoryObjectHolder, SVM], +def enqueue_fill(queue: CommandQueue, dest: Union[MemoryObjectHolder, SVMPointer], pattern: Any, size: int, *, offset: int = 0, wait_for=None) -> Event: """ .. versionadded:: 2022.2 """ if isinstance(dest, MemoryObjectHolder): return enqueue_fill_buffer(queue, dest, pattern, offset, size, wait_for) - elif isinstance(dest, SVM): + elif isinstance(dest, SVMPointer): if offset: raise NotImplementedError("enqueue_fill with SVM does not yet support " "offsets") @@ -1961,7 +1952,7 @@ def enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for=None): def enqueue_svm_memfill(queue, dest, pattern, byte_count=None, wait_for=None): """Fill shared virtual memory with a pattern. - :arg dest: a Python buffer object, optionally wrapped in an :class:`SVM` object + :arg dest: a Python buffer object, or any implementation of :class:`SVMPointer`. :arg pattern: a Python buffer object (e.g. a :class:`numpy.ndarray` with the fill pattern to be used. :arg byte_count: The size of the memory to be fill. Defaults to the @@ -1972,8 +1963,8 @@ def enqueue_svm_memfill(queue, dest, pattern, byte_count=None, wait_for=None): .. versionadded:: 2016.2 """ - if not isinstance(dest, SVM): - dest = SVM(dest) + if not isinstance(dest, SVMPointer): + dest = SVMPointer(dest) return _cl._enqueue_svm_memfill( queue, dest, pattern, byte_count=None, wait_for=None) @@ -1982,7 +1973,7 @@ def enqueue_svm_memfill(queue, dest, pattern, byte_count=None, wait_for=None): def enqueue_svm_migratemem(queue, svms, flags, wait_for=None): """ :arg svms: a collection of Python buffer objects (e.g. :mod:`numpy` - arrays), optionally wrapped in :class:`SVM` objects. + arrays), or any implementation of :class:`SVMPointer`. 
:arg flags: a combination of :class:`mem_migration_flags` |std-enqueue-blurb| @@ -2068,7 +2059,8 @@ def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None, queue=None): if alignment is None: alignment = itemsize - svm_alloc = SVMAllocation(ctx, nbytes, alignment, flags, _interface=interface, + svm_alloc = _ArrayInterfaceSVMAllocation( + ctx, nbytes, alignment, flags, _interface=interface, queue=queue) return np.asarray(svm_alloc) diff --git a/pyopencl/tools.py b/pyopencl/tools.py index 1b027f0ff..09e4d0566 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -79,9 +79,9 @@ def __init__(self, ctx, flags, *, alignment=0, queue=None): def __call__(self, nbytes): import pyopencl as cl - return cl.SVM(cl.svm_empty( - self._context, self._flags, (nbytes,), np.int8, "C", self._alignment, - self._queue)) + return cl.SVMAllocation( + self._context, nbytes, self._alignment, self._flags, + queue=self._queue) # }}} diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index ec6089ccc..76477f55f 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -227,8 +227,6 @@ } - - #define PYOPENCL_RETRY_IF_MEM_ERROR(OPERATION) \ { \ bool failed_with_mem_error = false; \ @@ -258,6 +256,17 @@ } \ } + +#define PYOPENCL_GET_SVM_SIZE(NAME) \ + size_t NAME##_size; \ + bool NAME##_has_size = false; \ + try \ + { \ + NAME##_size = NAME.size(); \ + NAME##_has_size = true; \ + } \ + catch (size_not_available) { } + // }}} @@ -501,12 +510,12 @@ namespace pyopencl // {{{ utility functions - inline bool is_queue_in_order(cl_command_queue queue) + inline bool is_queue_out_of_order(cl_command_queue queue) { cl_command_queue_properties param_value; PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo, (queue, CL_QUEUE_PROPERTIES, sizeof(param_value), ¶m_value, 0)); - return !(param_value & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE); + return param_value & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; } // }}} @@ -3527,11 +3536,26 @@ namespace pyopencl // }}} - // {{{ svm - #if PYOPENCL_CL_VERSION >= 0x2000 - class svm_arg_wrapper + // {{{ svm pointer + + class size_not_available { }; + + class svm_pointer + { + public: + virtual void *ptr() const = 0; + // may throw size_not_available + virtual size_t size() const = 0; + }; + + // }}} + + + // {{{ svm_arg_wrapper + + class svm_arg_wrapper : public svm_pointer { private: void *m_ptr; @@ -3564,12 +3588,17 @@ namespace pyopencl } }; + // }}} + - class svm_allocation + // {{{ svm_allocation + + class svm_allocation : public svm_pointer { private: std::shared_ptr m_context; void *m_allocation; + size_t m_size; command_queue_ref m_queue; // FIXME Should keep a list of events so that we can wait for users // to finish in the case of out-of-order queues. 
@@ -3577,12 +3606,12 @@ namespace pyopencl public: svm_allocation(std::shared_ptr const &ctx, size_t size, cl_uint alignment, cl_svm_mem_flags flags, const command_queue *queue = nullptr) - : m_context(ctx) + : m_context(ctx), m_size(size) { if (queue) { m_queue.set(queue->data()); - if (!is_queue_in_order(m_queue.data())) + if (is_queue_out_of_order(m_queue.data())) throw error("SVMAllocation.__init__", CL_INVALID_VALUE, "supplying an out-of-order queue to SVMAllocation is invalid"); } @@ -3650,6 +3679,11 @@ namespace pyopencl return m_allocation; } + size_t size() const + { + return m_size; + } + intptr_t ptr_as_int() const { return (intptr_t) m_allocation; @@ -3667,29 +3701,92 @@ namespace pyopencl void bind_to_queue(command_queue const &queue) { + if (is_queue_out_of_order(m_queue.data())) + throw error("SVMAllocation.bind_to_queue", CL_INVALID_VALUE, + "supplying an out-of-order queue to SVMAllocation is invalid"); + + if (m_queue.is_valid()) + { + // make sure synchronization promises stay valid in new queue + cl_event evt; + + PYOPENCL_CALL_GUARDED(clEnqueueMarker, (m_queue.data(), &evt)); + PYOPENCL_CALL_GUARDED(clEnqueueWaitForEvents, (queue.data(), 1, &evt)); + } + m_queue.set(queue.data()); } void unbind_from_queue() { + // NOTE: This absolves the allocation from any synchronization promises + // made. Keeping those before calling this method is the responsibility + // of the user. m_queue.reset(); } }; + // }}} + + + // {{{ svm operations inline event *enqueue_svm_memcpy( command_queue &cq, cl_bool is_blocking, - svm_arg_wrapper &dst, svm_arg_wrapper &src, - py::object py_wait_for + svm_pointer &dst, svm_pointer &src, + py::object py_wait_for, + py::object byte_count_py ) { PYOPENCL_PARSE_WAIT_FOR; - if (src.size() != dst.size()) + // {{{ process size + + PYOPENCL_GET_SVM_SIZE(src); + PYOPENCL_GET_SVM_SIZE(dst); + + size_t size; + bool have_size = false; + + if (src_has_size) + { + size = src_size; + have_size = true; + } + if (dst_has_size) + { + if (have_size) + { + if (!byte_count_py.is_none()) + size = std::min(size, dst_size); + else if (size != dst_size) + throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE, + "sizes of source and destination buffer do not match"); + } + else + { + size = dst_size; + have_size = true; + } + } + + if (!byte_count_py.is_none()) + { + size_t byte_count = byte_count_py.cast(); + if (have_size && byte_count > size) + throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE, + "specified byte_count larger than size of source or destination buffers"); + size = byte_count; + have_size = true; + } + + if (!have_size) throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE, - "sizes of source and destination buffer do not match"); + "size not passed and could not be determined"); + + // }}} cl_event evt; PYOPENCL_CALL_GUARDED( @@ -3698,7 +3795,7 @@ namespace pyopencl cq.data(), is_blocking, dst.ptr(), src.ptr(), - dst.size(), + size, PYOPENCL_WAITLIST_ARGS, &evt )); @@ -3710,7 +3807,7 @@ namespace pyopencl inline event *enqueue_svm_memfill( command_queue &cq, - svm_arg_wrapper &dst, py::object py_pattern, + svm_pointer &dst, py::object py_pattern, py::object byte_count, py::object py_wait_for ) @@ -3727,9 +3824,32 @@ namespace pyopencl pattern_ptr = pattern_ward->m_buf.buf; pattern_len = pattern_ward->m_buf.len; - size_t fill_size = dst.size(); + // {{{ process size + + PYOPENCL_GET_SVM_SIZE(dst); + + size_t size; + bool have_size = false; + if (dst_has_size) + { + size = dst_size; + have_size = true; + } if (!byte_count.is_none()) - fill_size = 
py::cast(byte_count); + { + size_t user_size = py::cast(byte_count); + if (have_size && user_size > size) + throw error("enqueue_svm_memfill", CL_INVALID_VALUE, + "byte_count too large for specified SVM buffer"); + } + + if (!have_size) + { + throw error("enqueue_svm_memfill", CL_INVALID_VALUE, + "byte_count not passed and could not be determined"); + } + + // }}} cl_event evt; PYOPENCL_CALL_GUARDED( @@ -3738,7 +3858,7 @@ namespace pyopencl cq.data(), dst.ptr(), pattern_ptr, pattern_len, - fill_size, + size, PYOPENCL_WAITLIST_ARGS, &evt )); @@ -3752,12 +3872,40 @@ namespace pyopencl command_queue &cq, cl_bool is_blocking, cl_map_flags flags, - svm_arg_wrapper &svm, - py::object py_wait_for + svm_pointer &svm, + py::object py_wait_for, + py::object user_size_py ) { PYOPENCL_PARSE_WAIT_FOR; + // {{{ process size + + PYOPENCL_GET_SVM_SIZE(svm); + + size_t size; + bool have_size = false; + if (svm_has_size) + { + size = svm_size; + have_size = true; + } + if (!user_size_py.is_none()) + { + size_t user_size = py::cast(user_size_py); + if (have_size && user_size > size) + throw error("enqueue_svm_memfill", CL_INVALID_VALUE, + "user-provided size too large for specified SVM buffer"); + } + + if (!have_size) + { + throw error("enqueue_svm_mem_map", CL_INVALID_VALUE, + "size not passed and could not be determined"); + } + + // }}} + cl_event evt; PYOPENCL_CALL_GUARDED( clEnqueueSVMMap, @@ -3765,7 +3913,7 @@ namespace pyopencl cq.data(), is_blocking, flags, - svm.ptr(), svm.size(), + svm.ptr(), size, PYOPENCL_WAITLIST_ARGS, &evt )); @@ -3777,7 +3925,7 @@ namespace pyopencl inline event *enqueue_svm_unmap( command_queue &cq, - svm_arg_wrapper &svm, + svm_pointer &svm, py::object py_wait_for ) { @@ -3814,7 +3962,7 @@ namespace pyopencl for (py::handle py_svm: svms) { - svm_arg_wrapper &svm(py::cast(py_svm)); + svm_pointer &svm(py::cast(py_svm)); svm_pointers.push_back(svm.ptr()); sizes.push_back(svm.size()); @@ -4609,7 +4757,7 @@ namespace pyopencl } #if PYOPENCL_CL_VERSION >= 0x2000 - void set_arg_svm(cl_uint arg_index, svm_arg_wrapper const &wrp) + void set_arg_svm(cl_uint arg_index, svm_pointer const &wrp) { PYOPENCL_CALL_GUARDED(clSetKernelArgSVMPointer, (m_kernel, arg_index, wrp.ptr())); @@ -4634,7 +4782,7 @@ namespace pyopencl #if PYOPENCL_CL_VERSION >= 0x2000 try { - set_arg_svm(arg_index, arg.cast()); + set_arg_svm(arg_index, arg.cast()); return; } catch (py::cast_error &) { } diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index a2b5cde17..453f34c38 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -295,18 +295,34 @@ void pyopencl_expose_part_2(py::module &m) // {{{ svm #if PYOPENCL_CL_VERSION >= 0x2000 + { + typedef svm_pointer cls; + py::class_(m, "SVMPointer", py::dynamic_attr()) + .def("_ptr_as_int", [](cls &self) { return (intptr_t) self.ptr(); }) + .def("_size", [](cls &self) -> py::object + { + try + { + return py::cast(self.size()); + } + catch (size_not_available) + { + return py::none(); + } + }) + ; + } + { typedef svm_arg_wrapper cls; - py::class_(m, "SVM", py::dynamic_attr()) + py::class_(m, "SVM", py::dynamic_attr()) .def(py::init()) - .def("_ptr_as_int", [](cls &self) { return (intptr_t) self.ptr(); }) - .def("_size", &cls::size) ; } { typedef svm_allocation cls; - py::class_(m, "SVMAllocation", py::dynamic_attr()) + py::class_(m, "SVMAllocation", py::dynamic_attr()) .def(py::init, size_t, cl_uint, cl_svm_mem_flags, const command_queue *>(), py::arg("context"), py::arg("size"), @@ -318,10 +334,9 @@ void pyopencl_expose_part_2(py::module &m) 
.def("enqueue_release", &cls::enqueue_release, ":returns: a :class:`pyopencl.Event`\n\n" "|std-enqueue-blurb|") - .def("_ptr_as_int", &cls::ptr_as_int) .def(py::self == py::self) .def(py::self != py::self) - .def("__hash__", &cls::ptr_as_int) + .def("__hash__", [](cls &self) { return (intptr_t) self.ptr(); }) .DEF_SIMPLE_METHOD(bind_to_queue) .DEF_SIMPLE_METHOD(unbind_from_queue) ; @@ -332,7 +347,8 @@ void pyopencl_expose_part_2(py::module &m) py::arg("is_blocking"), py::arg("dst"), py::arg("src"), - py::arg("wait_for")=py::none() + py::arg("wait_for")=py::none(), + py::arg("byte_count")=py::none() ); m.def("_enqueue_svm_memfill", enqueue_svm_memfill, @@ -348,7 +364,8 @@ void pyopencl_expose_part_2(py::module &m) py::arg("is_blocking"), py::arg("flags"), py::arg("svm"), - py::arg("wait_for")=py::none() + py::arg("wait_for")=py::none(), + py::arg("size")=py::none() ); m.def("_enqueue_svm_unmap", enqueue_svm_unmap, From 1f4f7699d38aade3aeaea71436f7a484eace5449 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 22 Jul 2022 15:59:45 +0200 Subject: [PATCH 14/15] WIP: Towards SVM memory pool --- pyopencl/tools.py | 15 +- src/mempool.hpp | 30 ++-- src/wrap_cl.hpp | 28 ++-- src/wrap_cl_part_2.cpp | 4 +- src/wrap_mempool.cpp | 320 ++++++++++++++++++++++++++++++++++------- 5 files changed, 312 insertions(+), 85 deletions(-) diff --git a/pyopencl/tools.py b/pyopencl/tools.py index 09e4d0566..831d445ce 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -33,7 +33,7 @@ import numpy as np from pytools import memoize, memoize_method -from pyopencl._cl import bitlog2 # noqa: F401 +from pyopencl._cl import bitlog2, get_cl_header_version # noqa: F401 from pytools.persistent_dict import KeyBuilder as KeyBuilderBase import re @@ -59,10 +59,19 @@ def _register_types(): # {{{ imported names from pyopencl._cl import ( # noqa - PooledBuffer as PooledBuffer, + _tools_PooledBuffer as PooledBuffer, _tools_DeferredAllocator as DeferredAllocator, _tools_ImmediateAllocator as ImmediateAllocator, - MemoryPool as MemoryPool) + _tools_MemoryPool as MemoryPool, + ) + + +if get_cl_header_version() >= (2, 0): + from pyopencl._cl import ( # noqa + _tools_SVMemoryPool as SVMemoryPool, + _tools_PooledSVM as PooledSVM, + _tools_SVMAllocator as SVMAllocator, + ) # }}} diff --git a/src/mempool.hpp b/src/mempool.hpp index 44f0fd643..812bd2f1c 100644 --- a/src/mempool.hpp +++ b/src/mempool.hpp @@ -233,7 +233,8 @@ namespace PYGPU_PACKAGE std::cout << "[pool] allocation of size " << size << " served from bin " << bin_nr << " which contained " << bin.size() << " entries" << std::endl; - return pop_block_from_bin(bin, size); + return m_allocator->hand_out_existing_block( + pop_block_from_bin(bin, size)); } size_type alloc_sz = alloc_size(bin_nr); @@ -256,7 +257,8 @@ namespace PYGPU_PACKAGE m_allocator->try_release_blocks(); if (bin.size()) - return pop_block_from_bin(bin, size); + return m_allocator->hand_out_existing_block( + pop_block_from_bin(bin, size)); if (m_trace) std::cout << "[pool] allocation still OOM after GC" << std::endl; @@ -282,7 +284,7 @@ namespace PYGPU_PACKAGE "failed to free memory for allocation"); } - void free(pointer_type p, size_type size) + void free(pointer_type &&p, size_type size) { --m_active_blocks; m_active_bytes -= size; @@ -291,7 +293,7 @@ namespace PYGPU_PACKAGE if (!m_stop_holding) { inc_held_blocks(); - get_bin(bin_nr).push_back(p); + get_bin(bin_nr).push_back(std::move(p)); if (m_trace) std::cout << "[pool] block of size " << size << " returned to bin " @@ -300,7 +302,7 @@ namespace 
PYGPU_PACKAGE } else { - m_allocator->free(p); + m_allocator->free(std::move(p)); m_managed_bytes -= alloc_size(bin_nr); } } @@ -313,7 +315,7 @@ namespace PYGPU_PACKAGE while (bin.size()) { - m_allocator->free(bin.back()); + m_allocator->free(std::move(bin.back())); m_managed_bytes -= alloc_size(bin_pair.first); bin.pop_back(); @@ -353,7 +355,7 @@ namespace PYGPU_PACKAGE if (bin.size()) { - m_allocator->free(bin.back()); + m_allocator->free(std::move(bin.back())); m_managed_bytes -= alloc_size(bin_pair.first); bin.pop_back(); @@ -379,7 +381,7 @@ namespace PYGPU_PACKAGE pointer_type pop_block_from_bin(bin_t &bin, size_type size) { - pointer_type result = bin.back(); + pointer_type result(std::move(bin.back())); bin.pop_back(); dec_held_blocks(); @@ -399,7 +401,7 @@ namespace PYGPU_PACKAGE typedef typename Pool::pointer_type pointer_type; typedef typename Pool::size_type size_type; - private: + protected: PYGPU_SHARED_PTR m_pool; pointer_type m_ptr; @@ -421,7 +423,7 @@ namespace PYGPU_PACKAGE { if (m_valid) { - m_pool->free(m_ptr, m_size); + m_pool->free(std::move(m_ptr), m_size); m_valid = false; } else @@ -435,16 +437,8 @@ namespace PYGPU_PACKAGE #endif ); } - - pointer_type ptr() const - { return m_ptr; } - - size_type size() const - { return m_size; } }; } - - #endif diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 76477f55f..ad60372db 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -1708,7 +1708,13 @@ namespace pyopencl src.m_valid = false; } - command_queue_ref(const command_queue_ref &) = delete; + command_queue_ref(const command_queue_ref &) + { + throw error("command_queue_ref", CL_INVALID_VALUE, + "command_queue_ref copy constructor is never supposed to be called; " + "all notional invocations should be eliminated because of NRVO"); + } + command_queue_ref &operator=(const command_queue_ref &) = delete; ~command_queue_ref() @@ -3545,7 +3551,7 @@ namespace pyopencl class svm_pointer { public: - virtual void *ptr() const = 0; + virtual void *svm_ptr() const = 0; // may throw size_not_available virtual size_t size() const = 0; }; @@ -3578,7 +3584,7 @@ namespace pyopencl m_size = ward->m_buf.len; } - void *ptr() const + void *svm_ptr() const { return m_ptr; } @@ -3674,7 +3680,7 @@ namespace pyopencl m_allocation = nullptr; } - void *ptr() const + void *svm_ptr() const { return m_allocation; } @@ -3701,7 +3707,7 @@ namespace pyopencl void bind_to_queue(command_queue const &queue) { - if (is_queue_out_of_order(m_queue.data())) + if (is_queue_out_of_order(queue.data())) throw error("SVMAllocation.bind_to_queue", CL_INVALID_VALUE, "supplying an out-of-order queue to SVMAllocation is invalid"); @@ -3794,7 +3800,7 @@ namespace pyopencl ( cq.data(), is_blocking, - dst.ptr(), src.ptr(), + dst.svm_ptr(), src.svm_ptr(), size, PYOPENCL_WAITLIST_ARGS, &evt @@ -3856,7 +3862,7 @@ namespace pyopencl clEnqueueSVMMemFill, ( cq.data(), - dst.ptr(), pattern_ptr, + dst.svm_ptr(), pattern_ptr, pattern_len, size, PYOPENCL_WAITLIST_ARGS, @@ -3913,7 +3919,7 @@ namespace pyopencl cq.data(), is_blocking, flags, - svm.ptr(), size, + svm.svm_ptr(), size, PYOPENCL_WAITLIST_ARGS, &evt )); @@ -3936,7 +3942,7 @@ namespace pyopencl clEnqueueSVMUnmap, ( cq.data(), - svm.ptr(), + svm.svm_ptr(), PYOPENCL_WAITLIST_ARGS, &evt )); @@ -3964,7 +3970,7 @@ namespace pyopencl { svm_pointer &svm(py::cast(py_svm)); - svm_pointers.push_back(svm.ptr()); + svm_pointers.push_back(svm.svm_ptr()); sizes.push_back(svm.size()); } @@ -4760,7 +4766,7 @@ namespace pyopencl void set_arg_svm(cl_uint arg_index, svm_pointer const &wrp) 
{ PYOPENCL_CALL_GUARDED(clSetKernelArgSVMPointer, - (m_kernel, arg_index, wrp.ptr())); + (m_kernel, arg_index, wrp.svm_ptr())); } #endif diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index 453f34c38..3c40e1c90 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -298,7 +298,7 @@ void pyopencl_expose_part_2(py::module &m) { typedef svm_pointer cls; py::class_(m, "SVMPointer", py::dynamic_attr()) - .def("_ptr_as_int", [](cls &self) { return (intptr_t) self.ptr(); }) + .def("_ptr_as_int", [](cls &self) { return (intptr_t) self.svm_ptr(); }) .def("_size", [](cls &self) -> py::object { try @@ -336,7 +336,7 @@ void pyopencl_expose_part_2(py::module &m) "|std-enqueue-blurb|") .def(py::self == py::self) .def(py::self != py::self) - .def("__hash__", [](cls &self) { return (intptr_t) self.ptr(); }) + .def("__hash__", [](cls &self) { return (intptr_t) self.svm_ptr(); }) .DEF_SIMPLE_METHOD(bind_to_queue) .DEF_SIMPLE_METHOD(unbind_from_queue) ; diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp index 35630b036..429e3e1b9 100644 --- a/src/wrap_mempool.cpp +++ b/src/wrap_mempool.cpp @@ -57,12 +57,18 @@ namespace { return false; } + virtual pointer_type allocate(size_type s) { return nullptr; } - void free(pointer_type p) + virtual pointer_type hand_out_existing_block(pointer_type &&p) + { + return p; + } + + void free(pointer_type &&p) { } void try_release_blocks() @@ -70,16 +76,16 @@ namespace }; - // {{{ cl allocators + // {{{ buffer allocators - class cl_allocator_base + class buffer_allocator_base { protected: std::shared_ptr m_context; cl_mem_flags m_flags; public: - cl_allocator_base(std::shared_ptr const &ctx, + buffer_allocator_base(std::shared_ptr const &ctx, cl_mem_flags flags=CL_MEM_READ_WRITE) : m_context(ctx), m_flags(flags) { @@ -88,21 +94,26 @@ namespace "cannot specify USE_HOST_PTR or COPY_HOST_PTR flags"); } - cl_allocator_base(cl_allocator_base const &src) + buffer_allocator_base(buffer_allocator_base const &src) : m_context(src.m_context), m_flags(src.m_flags) { } - virtual ~cl_allocator_base() + virtual ~buffer_allocator_base() { } typedef cl_mem pointer_type; typedef size_t size_type; - virtual cl_allocator_base *copy() const = 0; + virtual buffer_allocator_base *copy() const = 0; virtual bool is_deferred() const = 0; virtual pointer_type allocate(size_type s) = 0; - void free(pointer_type p) + virtual pointer_type hand_out_existing_block(pointer_type p) + { + return p; + } + + void free(pointer_type &&p) { PYOPENCL_CALL_GUARDED(clReleaseMemObject, (p)); } @@ -113,20 +124,21 @@ namespace } }; - class cl_deferred_allocator : public cl_allocator_base + + class deferred_buffer_allocator : public buffer_allocator_base { private: - typedef cl_allocator_base super; + typedef buffer_allocator_base super; public: - cl_deferred_allocator(std::shared_ptr const &ctx, + deferred_buffer_allocator(std::shared_ptr const &ctx, cl_mem_flags flags=CL_MEM_READ_WRITE) : super(ctx, flags) { } - cl_allocator_base *copy() const + buffer_allocator_base *copy() const { - return new cl_deferred_allocator(*this); + return new deferred_buffer_allocator(*this); } bool is_deferred() const @@ -143,26 +155,26 @@ namespace const unsigned zero = 0; - class cl_immediate_allocator : public cl_allocator_base + class immediate_buffer_allocator : public buffer_allocator_base { private: - typedef cl_allocator_base super; + typedef buffer_allocator_base super; pyopencl::command_queue m_queue; public: - cl_immediate_allocator(pyopencl::command_queue &queue, + 
immediate_buffer_allocator(pyopencl::command_queue &queue, cl_mem_flags flags=CL_MEM_READ_WRITE) : super(std::shared_ptr(queue.get_context()), flags), m_queue(queue.data(), /*retain*/ true) { } - cl_immediate_allocator(cl_immediate_allocator const &src) + immediate_buffer_allocator(immediate_buffer_allocator const &src) : super(src), m_queue(src.m_queue) { } - cl_allocator_base *copy() const + buffer_allocator_base *copy() const { - return new cl_immediate_allocator(*this); + return new immediate_buffer_allocator(*this); } bool is_deferred() const @@ -215,10 +227,10 @@ namespace // }}} - // {{{ allocator_call + // {{{ buffer_allocator_call inline - pyopencl::buffer *allocator_call(cl_allocator_base &alloc, size_t size) + pyopencl::buffer *buffer_allocator_call(buffer_allocator_base &alloc, size_t size) { cl_mem mem; int try_count = 0; @@ -266,12 +278,12 @@ namespace // {{{ pooled_buffer class pooled_buffer - : public pyopencl::pooled_allocation >, + : public pyopencl::pooled_allocation >, public pyopencl::memory_object_holder { private: typedef - pyopencl::pooled_allocation > + pyopencl::pooled_allocation > super; public: @@ -281,17 +293,22 @@ namespace { } const super::pointer_type data() const - { return ptr(); } + { return m_ptr; } + + size_t size() const + { + return m_size; + } }; // }}} - // {{{{ device_pool_allocate + // {{{ buffer_pool_allocate - pooled_buffer *device_pool_allocate( - std::shared_ptr > pool, - pyopencl::memory_pool::size_type sz) + pooled_buffer *buffer_pool_allocate( + std::shared_ptr > pool, + pyopencl::memory_pool::size_type sz) { return new pooled_buffer(pool, sz); } @@ -299,25 +316,40 @@ namespace // }}} +#if PYOPENCL_CL_VERSION >= 0x2000 + // {{{ svm allocator // FIXME: Does this need deferred and immediate just like the buffer-level // allocators? (I.e. can I tell whether I am out of memory just from allocations?) + struct svm_held_pointer + { + void *ptr; + pyopencl::command_queue_ref queue; + }; + + class svm_allocator { + public: + typedef svm_held_pointer pointer_type; + typedef size_t size_type; + protected: std::shared_ptr m_context; cl_uint m_alignment; - cl_mem_flags m_flags; + cl_svm_mem_flags m_flags; + pyopencl::command_queue_ref m_queue; + public: svm_allocator(std::shared_ptr const &ctx, - cl_uint alignment, cl_mem_flags flags=CL_MEM_READ_WRITE) + cl_uint alignment, cl_svm_mem_flags flags=CL_MEM_READ_WRITE, + pyopencl::command_queue *queue=nullptr) : m_context(ctx), m_alignment(alignment), m_flags(flags) { - if (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) - throw pyopencl::error("Allocator", CL_INVALID_VALUE, - "cannot specify USE_HOST_PTR or COPY_HOST_PTR flags"); + if (queue) + m_queue.set(m_queue.data()); } svm_allocator(svm_allocator const &src) @@ -328,21 +360,61 @@ namespace virtual ~svm_allocator() { } - typedef void *pointer_type; - typedef size_t size_type; + virtual svm_allocator *copy() const + { + return new svm_allocator(m_context, m_alignment, m_flags); + } + + virtual bool is_deferred() const + { + // FIXME: I don't know whether that's true. 
+ return false; + } pointer_type allocate(size_type size) { if (size == 0) - return nullptr; + return { nullptr, nullptr }; PYOPENCL_PRINT_CALL_TRACE("clSVMalloc"); - return clSVMAlloc(m_context->data(), m_flags, size, m_alignment); + return { + clSVMAlloc(m_context->data(), m_flags, size, m_alignment), + pyopencl::command_queue_ref(m_queue.data()) + }; + } + + virtual pointer_type hand_out_existing_block(pointer_type &&p) + { + if (m_queue.is_valid()) + { + if (p.queue.is_valid()) + { + // make sure synchronization promises stay valid in new queue + cl_event evt; + + PYOPENCL_CALL_GUARDED(clEnqueueMarker, (p.queue.data(), &evt)); + PYOPENCL_CALL_GUARDED(clEnqueueWaitForEvents, (m_queue.data(), 1, &evt)); + } + p.queue.set(m_queue.data()); + } + return p; } - void free(pointer_type p) + void free(pointer_type &&p) { - clSVMFree(m_context->data(), p); + if (p.queue.is_valid()) + { + PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueSVMFree, ( + p.queue.data(), 1, &p.ptr, + nullptr, nullptr, + 0, nullptr, nullptr)); + p.queue.reset(); + } + else + { + PYOPENCL_PRINT_CALL_TRACE("clSVMFree"); + clSVMFree(m_context->data(), p.ptr); + } } void try_release_blocks() @@ -354,7 +426,99 @@ namespace // }}} + // {{{ svm_allocator_call + + inline + svm_held_pointer svm_allocator_call(svm_allocator &alloc, size_t size) + { + svm_held_pointer mem; + int try_count = 0; + while (true) + { + try + { + return alloc.allocate(size); + } + catch (pyopencl::error &e) + { + if (!e.is_out_of_memory()) + throw; + if (++try_count == 2) + throw; + } + alloc.try_release_blocks(); + } + } + + // }}} + + + // {{{ pooled_svm + + class pooled_svm + : public pyopencl::pooled_allocation>, + public pyopencl::svm_pointer + { + private: + typedef + pyopencl::pooled_allocation> + super; + + public: + pooled_svm( + std::shared_ptr p, super::size_type s) + : super(p, s) + { } + + void *svm_ptr() const + { return m_ptr.ptr; } + + size_t size() const + { return m_size; } + + void bind_to_queue(pyopencl::command_queue const &queue) + { + if (pyopencl::is_queue_out_of_order(queue.data())) + throw pyopencl::error("PooledSVM.bind_to_queue", CL_INVALID_VALUE, + "supplying an out-of-order queue to SVMAllocation is invalid"); + + if (m_ptr.queue.is_valid()) + { + // make sure synchronization promises stay valid in new queue + cl_event evt; + + PYOPENCL_CALL_GUARDED(clEnqueueMarker, (m_ptr.queue.data(), &evt)); + PYOPENCL_CALL_GUARDED(clEnqueueWaitForEvents, (queue.data(), 1, &evt)); + } + + m_ptr.queue.set(queue.data()); + } + + void unbind_from_queue() + { + // NOTE: This absolves the allocation from any synchronization promises + // made. Keeping those before calling this method is the responsibility + // of the user. 
+ m_ptr.queue.reset(); + } + }; + + // }}} + + + // {{{ svm_pool_allocate + + pooled_svm *svm_pool_allocate( + std::shared_ptr > pool, + pyopencl::memory_pool::size_type sz) + { + return new pooled_svm(pool, sz); + } + + // }}} + +#endif template void expose_memory_pool(Wrapper &wrapper) @@ -381,11 +545,11 @@ void pyopencl_expose_mempool(py::module &m) m.def("bitlog2", pyopencl::bitlog2); { - typedef cl_allocator_base cls; + typedef buffer_allocator_base cls; py::class_ wrapper( m, "_tools_AllocatorBase"/*, py::no_init */); wrapper - .def("__call__", allocator_call) + .def("__call__", buffer_allocator_call) ; } @@ -410,8 +574,8 @@ void pyopencl_expose_mempool(py::module &m) } { - typedef cl_deferred_allocator cls; - py::class_ wrapper( + typedef deferred_buffer_allocator cls; + py::class_ wrapper( m, "_tools_DeferredAllocator"); wrapper .def(py::init< @@ -424,8 +588,8 @@ void pyopencl_expose_mempool(py::module &m) } { - typedef cl_immediate_allocator cls; - py::class_ wrapper( + typedef immediate_buffer_allocator cls; + py::class_ wrapper( m, "_tools_ImmediateAllocator"); wrapper .def(py::init()) @@ -435,18 +599,18 @@ void pyopencl_expose_mempool(py::module &m) } { - typedef pyopencl::memory_pool cls; + typedef pyopencl::memory_pool cls; py::class_< cls, /* boost::noncopyable, */ - std::shared_ptr> wrapper( m, "MemoryPool"); + std::shared_ptr> wrapper( m, "_tools_MemoryPool"); wrapper - .def(py::init(), + .def(py::init(), py::arg("allocator"), py::arg("leading_bits_in_bin_id")=4 ) - .def("allocate", device_pool_allocate) - .def("__call__", device_pool_allocate) + .def("allocate", buffer_pool_allocate) + .def("__call__", buffer_pool_allocate) // undoc for now .DEF_SIMPLE_METHOD(set_trace) ; @@ -458,10 +622,64 @@ void pyopencl_expose_mempool(py::module &m) typedef pooled_buffer cls; py::class_( - m, "PooledBuffer"/* , py::no_init */) + m, "_tools_PooledBuffer"/* , py::no_init */) + .def("release", &cls::free) + // undocumented for now, for consistency with SVM + .def("bind_to_queue", [](cls &self, pyopencl::command_queue &queue) { /* no-op */ }) + .def("unbind_from_queue", [](cls &self) { /* no-op */ }) + ; + } + +#if PYOPENCL_CL_VERSION >= 0x2000 + { + typedef pyopencl::memory_pool cls; + + py::class_< + cls, /* boost::noncopyable, */ + std::shared_ptr> wrapper( m, "_tools_SVMemoryPool"); + wrapper + .def(py::init(), + py::arg("allocator"), + py::arg("leading_bits_in_bin_id")=4 + ) + .def("allocate", svm_pool_allocate) + .def("__call__", svm_pool_allocate) + // undoc for now + .DEF_SIMPLE_METHOD(set_trace) + ; + + expose_memory_pool(wrapper); + } + + { + typedef pooled_svm cls; + py::class_( + m, "_tools_PooledSVM"/* , py::no_init */) .def("release", &cls::free) + .def("__eq__", [](const cls &self, const cls &other) + { return self.svm_ptr() == other.svm_ptr(); }) + .def("__hash__", [](cls &self) { return (intptr_t) self.svm_ptr(); }) + .DEF_SIMPLE_METHOD(bind_to_queue) + .DEF_SIMPLE_METHOD(unbind_from_queue) + ; + } + + { + typedef svm_allocator cls; + py::class_ wrapper( + m, "_tools_SVMAllocator"); + wrapper + .def(py::init const &, cl_uint, cl_uint, pyopencl::command_queue *>(), + py::arg("context"), + py::arg("alignment"), + py::arg("flags")=CL_MEM_READ_WRITE, + py::arg("command_queue").none(true)=nullptr + ) + .def("__call__", svm_allocator_call) ; } +#endif } // vim: foldmethod=marker From f721a96f07a0316cc73a2c2805b96c04c4a7af38 Mon Sep 17 00:00:00 2001 From: Matthias Diener Date: Wed, 27 Jul 2022 12:32:11 -0500 Subject: [PATCH 15/15] Some SVM fixes --- examples/demo_array_svm.py | 
2 +- pyopencl/tools.py | 19 ------------------- src/wrap_mempool.cpp | 10 ++++++++-- 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/examples/demo_array_svm.py b/examples/demo_array_svm.py index 054e582c0..eaba23209 100644 --- a/examples/demo_array_svm.py +++ b/examples/demo_array_svm.py @@ -12,7 +12,7 @@ ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) -alloc = SVMAllocator(ctx, cl.svm_mem_flags.READ_WRITE, queue=queue) +alloc = SVMAllocator(ctx, 0, cl.svm_mem_flags.READ_WRITE, queue) a_dev = cl_array.to_device(queue, a, allocator=alloc) print("A_DEV", a_dev.data) diff --git a/pyopencl/tools.py b/pyopencl/tools.py index 831d445ce..b414168c6 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -76,25 +76,6 @@ def _register_types(): # }}} -# {{{ svm allocator - -# FIXME: Replace me with C++ -class SVMAllocator: - def __init__(self, ctx, flags, *, alignment=0, queue=None): - self._context = ctx - self._flags = flags - self._alignment = alignment - self._queue = queue - - def __call__(self, nbytes): - import pyopencl as cl - return cl.SVMAllocation( - self._context, nbytes, self._alignment, self._flags, - queue=self._queue) - -# }}} - - # {{{ first-arg caches _first_arg_dependent_caches = [] diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp index 429e3e1b9..34b5e1cea 100644 --- a/src/wrap_mempool.cpp +++ b/src/wrap_mempool.cpp @@ -1,4 +1,4 @@ -// Warp memory pool +// Wrap memory pool // // Copyright (C) 2009 Andreas Kloeckner // @@ -349,7 +349,7 @@ namespace : m_context(ctx), m_alignment(alignment), m_flags(flags) { if (queue) - m_queue.set(m_queue.data()); + m_queue.set(queue->data()); } svm_allocator(svm_allocator const &src) @@ -679,6 +679,12 @@ void pyopencl_expose_mempool(py::module &m) .def("__call__", svm_allocator_call) ; } + + { + typedef svm_held_pointer cls; + py::class_ wrapper( + m, "_tools_SVMHeldPointer"); + } #endif }
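Taken together, this series lets pyopencl.array arrays be backed by shared virtual
memory: SVMPointer/SVMAllocation on the C++ side, the C++-backed SVMAllocator (and
memory-pool variants) in pyopencl.tools, and enqueue_fill as a uniform fill entry
point. The sketch below is a minimal end-to-end example modeled on
examples/demo_array_svm.py as it stands after the last patch; it assumes a device
and platform with OpenCL 2.0 coarse-grain buffer SVM support, and uses only names
introduced or touched in the patches above.

    import numpy as np

    import pyopencl as cl
    import pyopencl.array as cl_array
    from pyopencl.tools import SVMAllocator

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # SVMAllocator(context, alignment, flags, queue), as in the updated demo
    alloc = SVMAllocator(ctx, 0, cl.svm_mem_flags.READ_WRITE, queue)

    a = cl_array.to_device(queue, np.arange(8, dtype=np.float32), allocator=alloc)
    b = cl_array.empty_like(a)

    # enqueue_fill dispatches to clEnqueueSVMMemFill for SVM-backed memory;
    # this mirrors the byte-wise zero fill used by Array._zero_fill.
    cl.enqueue_fill(queue, b.base_data, np.int8(0), b.nbytes, offset=b.offset)

    print((a + b).get())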