diff --git a/pyproject.toml b/pyproject.toml index cbcceb1..d98d5a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ ] [project.optional-dependencies] -all = ["vlab4mic[dev,test]"] +all = ["vlab4mic[dev,test,gpu]"] dev = [ "pre-commit>=3.7.0", "ruff>=0.4.3", @@ -54,6 +54,9 @@ test = [ "nbmake>=1.5.3", "mypy>=1.10.0" ] +gpu = [ + "pyopencl>=2022.3.1" +] [tool.setuptools] package-dir = { "" = "src" } diff --git a/src/vlab4mic/utils/__opencl__.py b/src/vlab4mic/utils/__opencl__.py new file mode 100644 index 0000000..51cf65a --- /dev/null +++ b/src/vlab4mic/utils/__opencl__.py @@ -0,0 +1,196 @@ +import os +import inspect +import warnings +from pathlib import Path + +os.environ["PYOPENCL_COMPILER_OUTPUT"] = "1" + + +class NoCL(object): + class MemoryError(Exception): + def __init__(self): + self.message = "NoCL memory error" + + def __str__(self): + print(self.message) + + class LogicError(Exception): + def __init__(self): + self.message = "NoCL logic error" + + def __str__(self): + print(self.message) + + class Error(Exception): + def __init__(self): + self.message = "NoCL error" + + def __str__(self): + print(self.message) + + def __init__(self): + pass + + def __bool__(self): + return False + + +try: + import pyopencl as cl + import pyopencl.array as cl_array + + devices = [] + _fastest_device = None + max_perf = 0 + + if ( + len(cl.get_platforms()) == 1 + and len(cl.get_platforms()[0].get_devices()) == 1 + ): + dev = cl.get_platforms()[0].get_devices()[0] + _fastest_device = {"device": dev, "DP": False} + devices.append({"device": dev, "DP": False}) + + else: + for platform in cl.get_platforms(): + if ( + "Microsoft" in platform.vendor + ): # TODO this takes out emulated GPUs + continue + for dev in platform.get_devices(): + # check if the device is a GPU + if "GPU" not in cl.device_type.to_string(dev.type): + continue + if "cl_khr_fp64" in dev.extensions.strip().split(" "): + cl_dp = False + else: + cl_dp = False + if "Intel" in platform.vendor: + # penalty for Intel based integrated GPUs as compute units here refer to threads + perf = ( + dev.max_compute_units + / 2 + * dev.max_clock_frequency + * dev.max_mem_alloc_size + ) + else: + perf = ( + dev.max_compute_units + * dev.max_clock_frequency + * dev.max_mem_alloc_size + ) + if perf > max_perf: + max_perf = perf + _fastest_device = {"device": dev, "DP": cl_dp} + devices.append({"device": dev, "DP": cl_dp}) + + +except (ImportError, OSError, Exception) as e: + print("This exception is what's causing cl equals None:", e) + cl = NoCL() + cl_array = None + devices = None + _fastest_device = None + + +def print_opencl_info(): + """ + Prints information about the OpenCL devices on the system + """ + # REF: https://github.com/benshope/PyOpenCL-Tutorial + + msg = "\n" + "=" * 60 + "\nOpenCL Platforms and Devices \n" + # Print each platform on this computer + for platform in cl.get_platforms(): + msg += "=" * 60 + "\n" + msg += "Platform - Name: " + platform.name + "\n" + msg += "Platform - Vendor: " + platform.vendor + "\n" + msg += "Platform - Version: " + platform.version + "\n" + msg += "Platform - Profile: " + platform.profile + "\n" + # Print each device per-platform + for device in platform.get_devices(): + msg += "\t" + "-" * 56 + "\n" + msg += "\tDevice - Name: " + device.name + "\n" + msg += ( + "\tDevice - Type: " + + cl.device_type.to_string(device.type) + + "\n" + ) + msg += ( + f"\tDevice - Max Clock Speed: {device.max_clock_frequency} Mhz" + + "\n" + ) + + msg += ( + f"\tDevice - Compute Units: 
{device.max_compute_units}" + "\n" + ) + msg += ( + f"\tDevice - Local Memory: {device.local_mem_size / 1024.0:.0f} KB" + + "\n" + ) + msg += ( + f"\tDevice - Constant Memory: {device.max_constant_buffer_size / 1024.0:.0f} KB" + + "\n" + ) + msg += ( + f"\tDevice - Global Memory: {device.global_mem_size / 1073741824.0:.0f} GB" + + "\n" + ) + msg += ( + f"\tDevice - Max Buffer/Image Size: {device.max_mem_alloc_size / 1048576.0:.0f} MB" + + "\n" + ) + msg += ( + f"\tDevice - Max Work Group Size: {device.max_work_group_size:.0f}" + + "\n" + ) + + return msg + + +def opencl_works(): + """ + Checks if the system has OpenCL compatibility + :return: True if the system has OpenCL compatibility, False otherwise + """ + disabled = os.environ.get("NANOPYX_DISABLE_OPENCL", "0") == "1" + enabled = os.environ.get("NANOPYX_ENABLE_OPENCL", "1") == "1" + + if disabled or not enabled: + warnings.warn( + "OpenCL is disabled. To enable it, set the environment variable NANOPYX_ENABLE_OPENCL=1" + ) + return False + + elif enabled: + if not cl: + warnings.warn("tap... tap... tap... COMPUTER SAYS NO (OpenCL)!") + os.environ["NANOPYX_DISABLE_OPENCL"] = "1" + return False + + return True + + +def _get_cl_code(file_name, cl_dp): + """ + Retrieves the OpenCL code from the corresponding .cl file + """ + cl_file = os.path.splitext(file_name)[0] + ".cl" + + if not os.path.exists(cl_file): + # Use the path of the file that called this function + caller_frame = inspect.stack()[1] + caller_path = os.path.dirname(os.path.abspath(caller_frame.filename)) + cl_file = os.path.join(caller_path, file_name) + + assert os.path.exists(cl_file), "Could not find OpenCL file: " + str( + cl_file + ) + + with open(cl_file) as f: + kernel_str = f.read() + + if not cl_dp: + kernel_str = kernel_str.replace("double", "float") + + return kernel_str diff --git a/src/vlab4mic/utils/transform/_convolution.cl b/src/vlab4mic/utils/transform/_convolution.cl new file mode 100644 index 0000000..71ea932 --- /dev/null +++ b/src/vlab4mic/utils/transform/_convolution.cl @@ -0,0 +1,85 @@ +__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; + +// 1D convolution along x-axis (columns) for buffer (handles 3D data) +__kernel void +conv1d_x(__global float *image, __global float *image_out, __global float *kernel_array, int kernel_size, int nrows, int ncols){ + + int row = get_global_id(0); + int col = get_global_id(1); + int z = get_global_id(2); + + // Return if out of bounds + if (row >= nrows || col >= ncols) return; + + int kernel_center = (kernel_size-1)/2; + float acc = 0.0f; + + // Calculate the linear index for this z-slice + int slice_offset = z * nrows * ncols; + + for (int k = 0; k < kernel_size; k++) { + int localcol = col + (k - kernel_center); + // Zero-padding at boundaries (mode='same') + if (localcol >= 0 && localcol < ncols) { + acc += kernel_array[k] * image[slice_offset + row * ncols + localcol]; + } + } + + image_out[slice_offset + row * ncols + col] = acc; +} + +// 1D convolution along y-axis (rows) for buffer (handles 3D data) +__kernel void +conv1d_y(__global float *image, __global float *image_out, __global float *kernel_array, int kernel_size, int nrows, int ncols){ + + int row = get_global_id(0); + int col = get_global_id(1); + int z = get_global_id(2); + + // Return if out of bounds + if (row >= nrows || col >= ncols) return; + + int kernel_center = (kernel_size-1)/2; + float acc = 0.0f; + + // Calculate the linear index for this z-slice + int slice_offset = z * nrows * ncols; 
+
+    for (int k = 0; k < kernel_size; k++) {
+        int localrow = row + (k - kernel_center);
+        // Zero-padding at boundaries (mode='same')
+        if (localrow >= 0 && localrow < nrows) {
+            acc += kernel_array[k] * image[slice_offset + localrow * ncols + col];
+        }
+    }
+
+    image_out[slice_offset + row * ncols + col] = acc;
+}
+
+// 1D convolution along z-axis (depth) for buffer (handles 3D data)
+__kernel void
+conv1d_z(__global float *image, __global float *image_out, __global float *kernel_array, int kernel_size, int nrows, int ncols, int ndepth){
+
+    int row = get_global_id(0);
+    int col = get_global_id(1);
+
+    // Return if out of bounds
+    if (row >= nrows || col >= ncols) return;
+
+    int kernel_center = (kernel_size-1)/2;
+
+    // For each depth slice at this row,col position
+    for (int z = 0; z < ndepth; z++) {
+        float acc = 0.0f;
+
+        for (int k = 0; k < kernel_size; k++) {
+            int localz = z + (k - kernel_center);
+            // Zero-padding at boundaries (mode='same')
+            if (localz >= 0 && localz < ndepth) {
+                acc += kernel_array[k] * image[localz * nrows * ncols + row * ncols + col];
+            }
+        }
+
+        image_out[z * nrows * ncols + row * ncols + col] = acc;
+    }
+}
\ No newline at end of file
diff --git a/src/vlab4mic/utils/transform/image_convolution.py b/src/vlab4mic/utils/transform/image_convolution.py
index f063bd1..ff88c48 100644
--- a/src/vlab4mic/utils/transform/image_convolution.py
+++ b/src/vlab4mic/utils/transform/image_convolution.py
@@ -5,22 +5,178 @@ import skimage
 from scipy.ndimage import zoom
 import copy

+from ..__opencl__ import (
+    opencl_works,
+    cl_array,
+    _fastest_device,
+    cl,
+    _get_cl_code,
+)
+
+
+def test_kernel_separability(psf):
+    cz, cy, cx = (s // 2 for s in psf.shape)
+
+    kx = psf[cz, cy, :]
+    ky = psf[cz, :, cx]
+    kz = psf[:, cy, cx]
+
+    recon = kz[:, None, None] * ky[None, :, None] * kx[None, None, :]
+    # for an exactly separable PSF, recon == psf * psf[cz, cy, cx] ** 2, so rescale
+    center_val = psf[cz, cy, cx]
+    recon = recon / center_val**2 if center_val != 0 else recon
+
+    return np.linalg.norm(psf - recon) / np.linalg.norm(psf)
+

-# keep
 def convolve3D(voxelated_field, psf, method="fft"):
     """
     psf is the 3D superpixelated psf.
     The pixelsize of both psf and voxelated field must be equal
     psf is also assumed to be centered
     """
-    result = convolve(voxelated_field, psf, mode="same", method=method)
+    if opencl_works() and test_kernel_separability(psf) < 1e-2:
+        result = convolve_opencl(voxelated_field, psf)
+    else:
+        result = convolve(voxelated_field, psf, mode="same", method=method)
     return result


-"""
-Section: convolve in 3D, use zrange to get emitter that are to be considered
-"""
+def convolve_opencl(input, kernel):
+    # Convolve in 3D using a separable (rank-1) decomposition of the kernel
+    # For a rank-1 separable 3D kernel K[z, y, x] = kz[z] * ky[y] * kx[x],
+    # extract the 1D kernels from its center slices and normalize them
+
+    cz, cy, cx = (s // 2 for s in kernel.shape)
+
+    # For a separable kernel, each center slice is proportional to a 1D kernel:
+    # K[z, cy, cx] = kz[z] * ky[cy] * kx[cx]  ->  kz[z] = K[z, cy, cx] / (ky[cy] * kx[cx])
+    # K[cz, y, cx] = kz[cz] * ky[y] * kx[cx]  ->  ky[y] = K[cz, y, cx] / (kz[cz] * kx[cx])
+    # K[cz, cy, x] = kz[cz] * ky[cy] * kx[x]  ->  kx[x] = K[cz, cy, x] / (kz[cz] * ky[cy])
+
+    # The simplest approach: extract center slices and normalize them to sum=1
+    center_val = kernel[cz, cy, cx]
+
+    kz = (
+        kernel[:, cy, cx] / center_val
+        if center_val != 0
+        else kernel[:, cy, cx]
+    )
+    ky = (
+        kernel[cz, :, cx] / center_val
+        if center_val != 0
+        else kernel[cz, :, cx]
+    )
+    kx = (
+        kernel[cz, cy, :] / center_val
+        if center_val != 0
+        else kernel[cz, cy, :]
+    )
+
+    # Normalize each to sum to 1
+    kz = kz / np.sum(kz) if np.sum(kz) > 0 else kz
+    ky = ky / np.sum(ky) if np.sum(ky) > 0 else ky
+    kx = kx / np.sum(kx) if np.sum(kx) > 0 else kx
+
+    # Apply the separable 1D convolutions, each profile along its own axis
+    intermediate = convolve1D_opencl(input, kz, axis=0)
+    intermediate = convolve1D_opencl(intermediate, ky, axis=1)
+    output = convolve1D_opencl(intermediate, kx, axis=2)
+    # Rescale so the result matches convolution with the original (non unit-sum) kernel
+    output = output * np.sum(kernel)
+    return output
+
+
+def convolve1D_opencl(input, kernel, axis=0, device=None):
+    """
+    Perform 1D convolution along a specified axis.
+    Matches scipy.signal.convolve behavior (kernel is flipped internally).
+ + Parameters: + - input: 3D input array + - kernel: 1D kernel array + - axis: Which axis to convolve along (0=depth, 1=rows, 2=columns) + - device: OpenCL device to use + """ + + if device is None: + device = _fastest_device + + # scipy.signal.convolve flips the kernel, so do it here + kernel_flipped = kernel[::-1] + + # Ensure input and kernel are contiguous + input_contig = np.ascontiguousarray(input, dtype=np.float32) + kernel_contig = np.ascontiguousarray(kernel_flipped, dtype=np.float32) + + # QUEUE AND CONTEXT + cl_ctx = cl.Context([device["device"]]) + cl_queue = cl.CommandQueue(cl_ctx) + + image_out = np.zeros_like(input_contig, dtype=np.float32) + mf = cl.mem_flags + + input_image = cl.Buffer( + cl_ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=input_contig + ) + input_kernel = cl.Buffer( + cl_ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=kernel_contig + ) + output_opencl = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out.nbytes) + + kernelsize = np.int32(kernel_contig.shape[0]) + ndepth = np.int32(input_contig.shape[0]) # z-dimension (depth) + nrows = np.int32(input_contig.shape[1]) # y-dimension (rows/height) + ncols = np.int32(input_contig.shape[2]) # x-dimension (columns/width) + + code = _get_cl_code("_convolution.cl", device["DP"]) + prg = cl.Program(cl_ctx, code).build() + + if axis == 0: + # Convolution along dimension 0 (z/depth) + knl = prg.conv1d_z + knl( + cl_queue, + (int(nrows), int(ncols)), + None, + input_image, + output_opencl, + input_kernel, + kernelsize, + nrows, + ncols, + ndepth, + ).wait() + elif axis == 1: + # Convolution along dimension 1 (y/rows) + knl = prg.conv1d_y + knl( + cl_queue, + (int(nrows), int(ncols), int(ndepth)), + None, + input_image, + output_opencl, + input_kernel, + kernelsize, + nrows, + ncols, + ).wait() + elif axis == 2: + # Convolution along dimension 2 (x/columns) + knl = prg.conv1d_x + knl( + cl_queue, + (int(nrows), int(ncols), int(ndepth)), + None, + input_image, + output_opencl, + input_kernel, + kernelsize, + nrows, + ncols, + ).wait() + + cl.enqueue_copy(cl_queue, image_out, output_opencl).wait() + cl_queue.finish() + return image_out # keep @@ -64,11 +220,12 @@ def voxelize_active_fluorophores_withrange( coordinates_intensity = np.repeat( activated_coordinates, repeats=photons_per_emitter, axis=0 ) - voxel = voxelate_points_withrange(coordinates_intensity, bin_pixelsize, ranges) + voxel = voxelate_points_withrange( + coordinates_intensity, bin_pixelsize, ranges + ) return voxel -## keep def frame_by_volume_convolution( coordinates, photon_vector, @@ -82,21 +239,21 @@ def frame_by_volume_convolution( as_mask=False, ): """ - Generate a 2D image from the 3D field of active emitters + Generate a 2D input from the 3D field of active emitters convolved with the 3D PSF - The resulting 2D image can be generated by considering the whole + The resulting 2D input can be generated by considering the whole volume or a subset from it. 
projection_depth tells the depth of the volume to be considered in units of frames - IMPORTANT: All units are in the same scale, except image dimensions + IMPORTANT: All units are in the same scale, except input dimensions """ projection_depth_half = int(projection_depth / 2) intensity_voxel = voxelize_active_fluorophores_withrange( coordinates, photon_vector, ranges, bin_pixelsize=bin_size ) if as_mask: - frame = np.sum(np.array(intensity_voxel),axis=2) + frame = np.sum(np.array(intensity_voxel), axis=2) return frame if activation_only: return intensity_voxel @@ -115,7 +272,11 @@ def frame_by_volume_convolution( frame = np.sum( np.array( convolved_intensity[ - :, :, zfocus_slice - projection_depth_half : zfocus_slice + projection_depth_half + :, + :, + zfocus_slice + - projection_depth_half : zfocus_slice + + projection_depth_half, ] ), axis=2, @@ -123,7 +284,7 @@ def frame_by_volume_convolution( return frame else: return convolved_intensity - + # keep def generate_frames_volume_convolution( @@ -174,7 +335,7 @@ def generate_frames_volume_convolution( psf_array, psf_focus_slice, projection_depth=psf_projection_depth, - as_mask=as_mask + as_mask=as_mask, ) cframes.append(frame_n) @@ -213,7 +374,7 @@ def generate_frames_volume_convolution( psf_focus_slice, projection_depth=psf_projection_depth, asframe=asframes, - activation_only=activation_only + activation_only=activation_only, ) volume_frames.append(v_frame) return volume_frames @@ -226,7 +387,7 @@ def lateral_binning_stack(stack, currentpixelsizes, newpixelsizes, field_size): field_size is a touple that contains the absolute size of each dimension """ - print("Binning image stack") + print("Binning input stack") nframes = stack.shape[0] ## Z in in the dimension 0 binned_images = np.zeros((nframes, field_size[0], field_size[0])) blocks = np.floor_divide( @@ -244,7 +405,9 @@ def lateral_binning_stack(stack, currentpixelsizes, newpixelsizes, field_size): # keep -def discretise_field_xyz(coordinates: np.ndarray, binsizes: list, ranges: list): +def discretise_field_xyz( + coordinates: np.ndarray, binsizes: list, ranges: list +): """ Generate a 3D histogram from a set of coordinates. 
@@ -287,7 +450,8 @@ def _prepare_data_4convolutions( **kwargs, ): psf_half_range = ( - min((psf_array.shape[2] - psf_focus_slice), psf_focus_slice) * psf_zstep + min((psf_array.shape[2] - psf_focus_slice), psf_focus_slice) + * psf_zstep ) zrange = psf_half_range * 2 nframes = np.shape(photons_frames)[1] @@ -357,21 +521,27 @@ def generate_frames_projection_conv_optimised2( # keep -def construct_frames(convolved_emitters_list, nframes, photons_frames, **kwargs): +def construct_frames( + convolved_emitters_list, nframes, photons_frames, **kwargs +): print("constructing frames") timeserie = [] for f in tqdm(range(int(nframes))): # extract the vector that corresponds to the current frame photons_in_frame_vect = photons_frames[:, f] timeserie.append( - merge_emissions_lookup(photons_in_frame_vect, convolved_emitters_list) + merge_emissions_lookup( + photons_in_frame_vect, convolved_emitters_list + ) ) return np.array(timeserie) # keep -def bin_timeseries(timeseires, currentpixelsizes, newpixelsizes, field_size, **kwargs): - # field_size is equivalent to image size +def bin_timeseries( + timeseires, currentpixelsizes, newpixelsizes, field_size, **kwargs +): + # field_size is equivalent to input size # assumes the stack has dimensions tXY print( f"current pixels size for bining: {currentpixelsizes}; new ones {newpixelsizes}" @@ -463,7 +633,9 @@ def merge_emissions_lookup(photons_in_frame_vect, convolved_emitters_list): if total_photons_frame > 0: for pos in range(len(photons_in_frame_vect)): if photons_in_frame_vect[pos] > 0: - intensity = convolved_emitters_list[pos] * photons_in_frame_vect[pos] + intensity = ( + convolved_emitters_list[pos] * photons_in_frame_vect[pos] + ) frame = np.sum([frame, intensity], axis=0) - # shall return a single image + # shall return a single input return frame
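
A quick way to sanity-check the new OpenCL path is to compare convolve3D against the existing scipy fallback on a synthetic, exactly separable PSF. The snippet below is a sketch, not part of the diff: it assumes the package is installed with the new gpu extra (pip install "vlab4mic[gpu]"), that the module is importable as vlab4mic.utils.transform.image_convolution, and that an OpenCL-capable GPU is available; otherwise opencl_works() returns False and both calls take the same scipy code path.

# Sketch: compare the OpenCL separable path against the scipy fallback.
import numpy as np
from scipy.signal import convolve

from vlab4mic.utils.transform.image_convolution import (
    convolve3D,
    test_kernel_separability,
)

rng = np.random.default_rng(0)
field = rng.random((32, 64, 64)).astype(np.float32)

# Rank-1 (separable) Gaussian PSF: outer product of three 1D Gaussians,
# broader along z (axis 0) as is typical for a microscope PSF.
u = np.arange(9) - 4
kz = np.exp(-0.5 * (u / 2.0) ** 2)
ky = np.exp(-0.5 * (u / 1.0) ** 2)
kx = np.exp(-0.5 * (u / 1.0) ** 2)
psf = kz[:, None, None] * ky[None, :, None] * kx[None, None, :]
psf /= psf.sum()

print("separability error:", test_kernel_separability(psf))  # ~0 for a rank-1 PSF

gpu = convolve3D(field, psf)  # OpenCL path when a device is available and the PSF is separable
ref = convolve(field, psf, mode="same", method="fft")  # same call as the fallback branch
print("max abs difference:", np.max(np.abs(gpu - ref)))  # expected small (float32 + FFT rounding)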