HenriquesLab · brunomsaraiva · Jan 30, 2026 · Feb 3, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -39,7 +39,7 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-all = ["vlab4mic[dev,test]"]
+all = ["vlab4mic[dev,test,gpu]"]
 dev = [
     "pre-commit>=3.7.0",
     "ruff>=0.4.3",
@@ -54,6 +54,9 @@ test = [
     "nbmake>=1.5.3",
     "mypy>=1.10.0"
 ]
+gpu = [
+    "pyopencl>=2022.3.1"
+]
 
 [tool.setuptools]
 package-dir = { "" = "src" }

diff --git a/src/vlab4mic/utils/__opencl__.py b/src/vlab4mic/utils/__opencl__.py
@@ -0,0 +1,196 @@
+import os
+import inspect
+import warnings
+from pathlib import Path
+
+# Ask PyOpenCL to show OpenCL compiler messages when kernels are built.
+# Set before pyopencl is imported below; note that this overwrites any value
+# already present in the environment.
+os.environ["PYOPENCL_COMPILER_OUTPUT"] = "1"
+
+
+class NoCL(object):
+    """
+    Falsy placeholder bound to ``cl`` when pyopencl cannot be imported or queried
+
+    Instances evaluate to ``False`` so availability can be tested with
+    ``if not cl``. The nested exception classes share their names with
+    pyopencl's error classes so that ``except cl.MemoryError`` style clauses
+    still resolve when the real module is missing.
+    """
+
+    class MemoryError(Exception):
+        """Stand-in for ``cl.MemoryError``"""
+
+        def __init__(self, message="NoCL memory error"):
+            # Forward the text to Exception so ``args`` carries it and the
+            # instance can be copied/pickled
+            super().__init__(message)
+            self.message = message
+
+        def __str__(self):
+            # __str__ has to return the text; returning None makes str(exc)
+            # raise TypeError
+            return self.message
+
+    class LogicError(Exception):
+        """Stand-in for ``cl.LogicError``"""
+
+        def __init__(self, message="NoCL logic error"):
+            super().__init__(message)
+            self.message = message
+
+        def __str__(self):
+            return self.message
+
+    class Error(Exception):
+        """Stand-in for ``cl.Error``"""
+
+        def __init__(self, message="NoCL error"):
+            super().__init__(message)
+            self.message = message
+
+        def __str__(self):
+            return self.message
+
+    def __init__(self):
+        pass
+
+    def __bool__(self):
+        # Lets callers write ``if not cl`` to detect the missing backend
+        return False
+
+
+# Probe the OpenCL runtime once at import time. On success ``cl`` is the
+# pyopencl module, ``devices`` lists the accepted devices and
+# ``_fastest_device`` holds the best-scoring one (or None if none qualified).
+# On any failure ``cl`` becomes a falsy NoCL instance and the other names are
+# set to None.
+try:
+    import pyopencl as cl
+    import pyopencl.array as cl_array
+
+    devices = []
+    _fastest_device = None
+    max_perf = 0
+
+    # Enumerate the platforms once instead of re-querying the runtime
+    _platforms = cl.get_platforms()
+    _sole_devices = (
+        _platforms[0].get_devices() if len(_platforms) == 1 else []
+    )
+
+    if len(_sole_devices) == 1:
+        # Exactly one platform exposing exactly one device: accept it as-is,
+        # without the vendor / GPU filtering applied in the general case
+        dev = _sole_devices[0]
+        _fastest_device = {"device": dev, "DP": False}
+        devices.append({"device": dev, "DP": False})
+
+    else:
+        for platform in _platforms:
+            if (
+                "Microsoft" in platform.vendor
+            ):  # TODO this takes out emulated GPUs
+                continue
+            for dev in platform.get_devices():
+                # check if the device is a GPU
+                if "GPU" not in cl.device_type.to_string(dev.type):
+                    continue
+                # Double precision is reported as unavailable for every
+                # device, whether or not it advertises cl_khr_fp64.
+                # NOTE(review): presumably intentional (single-precision
+                # kernels) - confirm before enabling it per device
+                cl_dp = False
+                if "Intel" in platform.vendor:
+                    # penalty for Intel based integrated GPUs as compute units here refer to threads
+                    perf = (
+                        dev.max_compute_units
+                        / 2
+                        * dev.max_clock_frequency
+                        * dev.max_mem_alloc_size
+                    )
+                else:
+                    perf = (
+                        dev.max_compute_units
+                        * dev.max_clock_frequency
+                        * dev.max_mem_alloc_size
+                    )
+                # Keep the device with the highest heuristic score
+                if perf > max_perf:
+                    max_perf = perf
+                    _fastest_device = {"device": dev, "DP": cl_dp}
+                devices.append({"device": dev, "DP": cl_dp})
+
+
+# ImportError and OSError are subclasses of Exception, so one clause covers
+# a missing pyopencl, a missing/broken OpenCL driver and runtime query errors
+except Exception as e:
+    print("This exception is what's causing cl equals None:", e)
+    cl = NoCL()
+    cl_array = None
+    devices = None
+    _fastest_device = None
+
+
+def print_opencl_info():
+    """
+    Builds a text report about the OpenCL devices on the system
+
+    Despite the name, nothing is printed: the report is returned so the caller
+    decides where to send it.
+
+    :return: multi-line string with one section per platform and one
+        sub-section per device. When OpenCL is unavailable (``cl`` is falsy)
+        only the header followed by a short notice is returned.
+    """
+    # REF: https://github.com/benshope/PyOpenCL-Tutorial
+
+    parts = ["\n" + "=" * 60 + "\nOpenCL Platforms and Devices \n"]
+
+    if not cl:
+        # cl is the falsy placeholder here and has no get_platforms()
+        parts.append("OpenCL is not available on this system\n")
+        return "".join(parts)
+
+    # One section per platform on this computer
+    for platform in cl.get_platforms():
+        parts.extend(
+            [
+                "=" * 60 + "\n",
+                "Platform - Name:  " + platform.name + "\n",
+                "Platform - Vendor:  " + platform.vendor + "\n",
+                "Platform - Version:  " + platform.version + "\n",
+                "Platform - Profile:  " + platform.profile + "\n",
+            ]
+        )
+        # One sub-section per device of the platform
+        for device in platform.get_devices():
+            parts.extend(
+                [
+                    "\t" + "-" * 56 + "\n",
+                    "\tDevice - Name: " + device.name + "\n",
+                    "\tDevice - Type: "
+                    + cl.device_type.to_string(device.type)
+                    + "\n",
+                    f"\tDevice - Max Clock Speed:  {device.max_clock_frequency} Mhz"
+                    + "\n",
+                    f"\tDevice - Compute Units:  {device.max_compute_units}"
+                    + "\n",
+                    # Sizes are reported by the device in bytes
+                    f"\tDevice - Local Memory:  {device.local_mem_size / 1024.0:.0f} KB"
+                    + "\n",
+                    f"\tDevice - Constant Memory:  {device.max_constant_buffer_size / 1024.0:.0f} KB"
+                    + "\n",
+                    f"\tDevice - Global Memory: {device.global_mem_size / 1073741824.0:.0f} GB"
+                    + "\n",
+                    f"\tDevice - Max Buffer/Image Size: {device.max_mem_alloc_size / 1048576.0:.0f} MB"
+                    + "\n",
+                    f"\tDevice - Max Work Group Size: {device.max_work_group_size:.0f}"
+                    + "\n",
+                ]
+            )
+
+    return "".join(parts)
+
+
+def opencl_works():
+    """
+    Reports whether OpenCL may be used in this process
+
+    OpenCL counts as switched off when NANOPYX_DISABLE_OPENCL is "1" or when
+    NANOPYX_ENABLE_OPENCL is anything other than "1" (it defaults to "1").
+    When it is switched on but ``cl`` is falsy, NANOPYX_DISABLE_OPENCL is set
+    to "1" in the environment before returning. A warning is issued on every
+    path that returns False.
+
+    :return: True if the system has OpenCL compatibility, False otherwise
+    """
+    opted_out = os.environ.get("NANOPYX_DISABLE_OPENCL", "0") == "1"
+    opted_in = os.environ.get("NANOPYX_ENABLE_OPENCL", "1") == "1"
+
+    if opted_in and not opted_out:
+        if cl:
+            return True
+        # Backend missing: record that in the environment for later checks
+        warnings.warn("tap... tap... tap... COMPUTER SAYS NO (OpenCL)!")
+        os.environ["NANOPYX_DISABLE_OPENCL"] = "1"
+        return False
+
+    warnings.warn(
+        "OpenCL is disabled. To enable it, set the environment variable NANOPYX_ENABLE_OPENCL=1"
+    )
+    return False
+
+
+def _get_cl_code(file_name, cl_dp):
+    """
+    Retrieves the OpenCL code from the corresponding .cl file
+
+    The extension of ``file_name`` is swapped for ``.cl``. If that path does
+    not exist as given, the same ``.cl`` path is looked up relative to the
+    directory of the module that called this function.
+
+    :param file_name: path of the kernel file, or of a file sharing its stem
+    :param cl_dp: if falsy, every occurrence of ``double`` in the source is
+        replaced by ``float``
+    :return: the kernel source as a string
+    :raises AssertionError: if the .cl file cannot be found
+    """
+    cl_file = os.path.splitext(file_name)[0] + ".cl"
+
+    if not os.path.exists(cl_file):
+        # Use the path of the file that called this function
+        caller_frame = inspect.stack()[1]
+        caller_path = os.path.dirname(os.path.abspath(caller_frame.filename))
+        # Join the .cl name rather than the raw file_name, otherwise an
+        # existing .py path would be accepted and read as kernel source
+        cl_file = os.path.join(caller_path, cl_file)
+
+    if not os.path.exists(cl_file):
+        # Raised explicitly so the check is not stripped under ``python -O``
+        raise AssertionError("Could not find OpenCL file: " + str(cl_file))
+
+    with open(cl_file) as f:
+        kernel_str = f.read()
+
+    if not cl_dp:
+        # Plain textual substitution to downgrade the kernel to single precision
+        kernel_str = kernel_str.replace("double", "float")
+
+    return kernel_str
diff --git a/src/vlab4mic/utils/transform/_convolution.cl b/src/vlab4mic/utils/transform/_convolution.cl
@@ -0,0 +1,85 @@
+// Image sampler: unnormalized coordinates, clamp-to-edge addressing, nearest-neighbour filtering.
+// NOTE(review): none of the kernels visible in this file reference it (they read plain buffers) - confirm it is still needed.
+__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+
+// Convolves every row of a row-major buffer with a 1D kernel along the column (x) direction.
+// Work-item ids: 0 -> row, 1 -> column, 2 -> z-slice. Samples falling outside
+// [0, ncols) contribute nothing (zero padding, output has the input size).
+// NOTE(review): z is not range-checked - presumably the launch uses exactly one
+// work-item per slice along dimension 2; confirm against the host code.
+__kernel void
+conv1d_x(__global float *image, __global float *image_out, __global float *kernel_array, int kernel_size, int nrows, int ncols){
+
+  const int r = get_global_id(0);
+  const int c = get_global_id(1);
+  const int plane = get_global_id(2);
+
+  // Work-items outside the image have nothing to do
+  if (r >= nrows || c >= ncols) return;
+
+  const int half_width = (kernel_size-1)/2;
+
+  // Linear index of the first element of this row inside its z-slice
+  const int row_start = plane * nrows * ncols + r * ncols;
+
+  float sum = 0.0f;
+  for (int tap = 0; tap < kernel_size; tap++) {
+    const int src_col = c + (tap - half_width);
+    // Skip taps that land outside the row (zero padding)
+    if (src_col < 0 || src_col >= ncols) continue;
+    sum += kernel_array[tap] * image[row_start + src_col];
+  }
+
+  image_out[row_start + c] = sum;
+}
+
+// Convolves every column of a row-major buffer with a 1D kernel along the row (y) direction.
+// Work-item ids: 0 -> row, 1 -> column, 2 -> z-slice. Samples falling outside
+// [0, nrows) contribute nothing (zero padding, output has the input size).
+// NOTE(review): z is not range-checked - presumably the launch uses exactly one
+// work-item per slice along dimension 2; confirm against the host code.
+__kernel void
+conv1d_y(__global float *image, __global float *image_out, __global float *kernel_array, int kernel_size, int nrows, int ncols){
+
+  const int r = get_global_id(0);
+  const int c = get_global_id(1);
+  const int plane = get_global_id(2);
+
+  // Work-items outside the image have nothing to do
+  if (r >= nrows || c >= ncols) return;
+
+  const int half_width = (kernel_size-1)/2;
+
+  // Linear index of the first element of this z-slice
+  const int slice_start = plane * nrows * ncols;
+
+  float sum = 0.0f;
+  for (int tap = 0; tap < kernel_size; tap++) {
+    const int src_row = r + (tap - half_width);
+    // Skip taps that land outside the column (zero padding)
+    if (src_row < 0 || src_row >= nrows) continue;
+    sum += kernel_array[tap] * image[slice_start + src_row * ncols + c];
+  }
+
+  image_out[slice_start + r * ncols + c] = sum;
+}
+
+// Convolves a row-major 3D buffer with a 1D kernel along the depth (z) direction.
+// Work-item ids: 0 -> row, 1 -> column. Each work-item walks all ndepth slices
+// of its (row, column) position; samples falling outside [0, ndepth) contribute
+// nothing (zero padding, output has the input size).
+__kernel void
+conv1d_z(__global float *image, __global float *image_out, __global float *kernel_array, int kernel_size, int nrows, int ncols, int ndepth){
+
+  const int r = get_global_id(0);
+  const int c = get_global_id(1);
+
+  // Work-items outside the image have nothing to do
+  if (r >= nrows || c >= ncols) return;
+
+  const int half_width = (kernel_size-1)/2;
+
+  // Number of elements in one slice, and this pixel's offset inside a slice
+  const int plane_size = nrows * ncols;
+  const int pixel = r * ncols + c;
+
+  for (int plane = 0; plane < ndepth; plane++) {
+    float sum = 0.0f;
+
+    for (int tap = 0; tap < kernel_size; tap++) {
+      const int src_plane = plane + (tap - half_width);
+      // Skip taps that land outside the stack (zero padding)
+      if (src_plane < 0 || src_plane >= ndepth) continue;
+      sum += kernel_array[tap] * image[src_plane * plane_size + pixel];
+    }
+
+    image_out[plane * plane_size + pixel] = sum;
+  }
+}