Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ dependencies = [
]

[project.optional-dependencies]
all = ["vlab4mic[dev,test]"]
all = ["vlab4mic[dev,test,gpu]"]
dev = [
"pre-commit>=3.7.0",
"ruff>=0.4.3",
Expand All @@ -54,6 +54,9 @@ test = [
"nbmake>=1.5.3",
"mypy>=1.10.0"
]
gpu = [
"pyopencl>=2022.3.1"
]

[tool.setuptools]
package-dir = { "" = "src" }
Expand Down
196 changes: 196 additions & 0 deletions src/vlab4mic/utils/__opencl__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
import os
import inspect
import warnings
from pathlib import Path

# Import-time side effect: mutates the process environment so that PyOpenCL
# reports OpenCL compiler output. It is set here, ahead of the
# `import pyopencl` further down in this module.
os.environ["PYOPENCL_COMPILER_OUTPUT"] = "1"


class NoCL(object):
    """Falsy placeholder bound to ``cl`` when PyOpenCL is unusable.

    Instances evaluate to ``False`` so callers can write ``if not cl: ...``.
    The nested exception classes reuse the names ``MemoryError``,
    ``LogicError`` and ``Error`` so that ``except cl.Error:`` style clauses
    still resolve to a real exception type when this placeholder is in use.
    """

    class MemoryError(Exception):
        """Placeholder memory error; constructed without arguments."""

        def __init__(self):
            # No args are forwarded so ``exc.args`` stays ``()`` and the
            # exception can still be re-created with a zero-argument call.
            super().__init__()
            self.message = "NoCL memory error"

        def __str__(self):
            # __str__ must *return* a string; printing and returning None
            # makes str(exc) raise TypeError.
            return self.message

    class LogicError(Exception):
        """Placeholder logic error; constructed without arguments."""

        def __init__(self):
            super().__init__()
            self.message = "NoCL logic error"

        def __str__(self):
            return self.message

    class Error(Exception):
        """Placeholder generic error; constructed without arguments."""

        def __init__(self):
            super().__init__()
            self.message = "NoCL error"

        def __str__(self):
            return self.message

    def __init__(self):
        pass

    def __bool__(self):
        # Always falsy: this object means "OpenCL is not available".
        return False


# Probe PyOpenCL once, at import time. On success this leaves the module
# globals:
#   cl, cl_array     - the pyopencl modules
#   devices          - list of {"device": <device>, "DP": bool} candidates
#   _fastest_device  - the candidate with the highest heuristic score, or
#                      None when the scan found no candidate
# On any failure `cl` becomes a falsy NoCL() placeholder and the other three
# are set to None.
try:
    import pyopencl as cl
    import pyopencl.array as cl_array

    devices = []
    _fastest_device = None
    max_perf = 0

    # Enumerate once instead of re-querying the driver for every check.
    _platforms = cl.get_platforms()
    _sole_devices = (
        _platforms[0].get_devices() if len(_platforms) == 1 else []
    )

    if len(_sole_devices) == 1:
        # Exactly one platform exposing exactly one device: use it as-is,
        # without the vendor / device-type filtering applied below.
        dev = _sole_devices[0]
        _fastest_device = {"device": dev, "DP": False}
        devices.append({"device": dev, "DP": False})

    else:
        for platform in _platforms:
            if (
                "Microsoft" in platform.vendor
            ):  # TODO this takes out emulated GPUs
                continue
            for dev in platform.get_devices():
                # Only GPUs are considered in the multi-device case.
                if "GPU" not in cl.device_type.to_string(dev.type):
                    continue

                # NOTE(review): the previous code tested for the
                # "cl_khr_fp64" extension but assigned False in both
                # branches, so "DP" has never been True. That behaviour is
                # kept; confirm that host buffers and kernels agree on
                # precision before reporting True for fp64-capable devices.
                cl_dp = False

                compute_units = dev.max_compute_units
                if "Intel" in platform.vendor:
                    # penalty for Intel based integrated GPUs as compute
                    # units here refer to threads
                    compute_units = compute_units / 2

                # Heuristic score: units * clock * largest single allocation.
                perf = (
                    compute_units
                    * dev.max_clock_frequency
                    * dev.max_mem_alloc_size
                )

                if perf > max_perf:
                    max_perf = perf
                    _fastest_device = {"device": dev, "DP": cl_dp}
                devices.append({"device": dev, "DP": cl_dp})


except Exception as e:
    # ImportError and OSError are subclasses of Exception, so one clause
    # covers a missing package, a missing/broken driver library and any
    # error raised by the OpenCL runtime while enumerating devices.
    print("This exception is what's causing cl equals None:", e)
    cl = NoCL()
    cl_array = None
    devices = None
    _fastest_device = None


def print_opencl_info():
    """
    Build a text report of the OpenCL platforms and devices on the system.

    Despite its name, this function does not print anything: it returns the
    report so the caller can print or log it.

    :return: multi-line string with one section per platform and one
        sub-section per device. When OpenCL is unavailable (``cl`` is the
        falsy placeholder) the report contains a single line saying so
        instead of raising ``AttributeError``.
    """
    # REF: https://github.com/benshope/PyOpenCL-Tutorial

    rule = "=" * 60 + "\n"
    parts = ["\n" + "=" * 60 + "\nOpenCL Platforms and Devices \n"]

    if not cl:
        # The placeholder has no get_platforms(); report instead of crashing.
        parts.append(rule)
        parts.append("OpenCL is not available on this system\n")
        return "".join(parts)

    # One section per platform on this computer
    for platform in cl.get_platforms():
        parts.append(rule)
        parts.append("Platform - Name: " + platform.name + "\n")
        parts.append("Platform - Vendor: " + platform.vendor + "\n")
        parts.append("Platform - Version: " + platform.version + "\n")
        parts.append("Platform - Profile: " + platform.profile + "\n")
        # One sub-section per device on this platform
        for device in platform.get_devices():
            parts.append("\t" + "-" * 56 + "\n")
            parts.append("\tDevice - Name: " + device.name + "\n")
            parts.append(
                "\tDevice - Type: "
                + cl.device_type.to_string(device.type)
                + "\n"
            )
            parts.append(
                f"\tDevice - Max Clock Speed: {device.max_clock_frequency} Mhz\n"
            )
            parts.append(
                f"\tDevice - Compute Units: {device.max_compute_units}\n"
            )
            # Sizes are reported by the device in bytes and scaled here.
            parts.append(
                f"\tDevice - Local Memory: {device.local_mem_size / 1024.0:.0f} KB\n"
            )
            parts.append(
                f"\tDevice - Constant Memory: {device.max_constant_buffer_size / 1024.0:.0f} KB\n"
            )
            parts.append(
                f"\tDevice - Global Memory: {device.global_mem_size / 1073741824.0:.0f} GB\n"
            )
            parts.append(
                f"\tDevice - Max Buffer/Image Size: {device.max_mem_alloc_size / 1048576.0:.0f} MB\n"
            )
            parts.append(
                f"\tDevice - Max Work Group Size: {device.max_work_group_size:.0f}\n"
            )

    return "".join(parts)


def opencl_works():
    """
    Report whether OpenCL may be used in this process.

    Two environment variables are consulted: ``NANOPYX_DISABLE_OPENCL``
    (default ``"0"``) and ``NANOPYX_ENABLE_OPENCL`` (default ``"1"``).
    OpenCL counts as switched off when the former is ``"1"`` or the latter
    is anything other than ``"1"``.

    Side effects: a warning is emitted whenever the answer is False, and if
    the ``cl`` module global is falsy ``NANOPYX_DISABLE_OPENCL`` is set to
    ``"1"`` in ``os.environ``.

    :return: True if OpenCL is switched on and ``cl`` is usable, False otherwise
    """
    env = os.environ
    switched_off = env.get("NANOPYX_DISABLE_OPENCL", "0") == "1"
    switched_on = env.get("NANOPYX_ENABLE_OPENCL", "1") == "1"

    # Guard 1: the user turned OpenCL off through the environment.
    if switched_off or not switched_on:
        warnings.warn(
            "OpenCL is disabled. To enable it, set the environment variable NANOPYX_ENABLE_OPENCL=1"
        )
        return False

    # Guard 2: OpenCL is allowed and the runtime was loaded successfully.
    if cl:
        return True

    # Allowed but unusable: remember that in the environment.
    warnings.warn("tap... tap... tap... COMPUTER SAYS NO (OpenCL)!")
    env["NANOPYX_DISABLE_OPENCL"] = "1"
    return False


def _get_cl_code(file_name, cl_dp):
    """
    Retrieves the OpenCL code from the corresponding .cl file

    The extension of ``file_name`` is replaced by ``.cl``. That path is tried
    as given first; if it does not exist it is resolved against the
    directory of the module that called this function.

    :param file_name: path of the kernel file, or of a file sharing its stem
        (for example the calling module's ``__file__``)
    :param cl_dp: when falsy, every occurrence of the text ``double`` in the
        source is replaced by ``float``
    :return: the kernel source as a string
    :raises FileNotFoundError: if no ``.cl`` file is found in either location
    """
    cl_file = os.path.splitext(file_name)[0] + ".cl"

    if not os.path.exists(cl_file):
        # Fall back to the directory of the calling module. context=0 skips
        # loading source lines for every frame, which is not needed here.
        caller_frame = inspect.stack(0)[1]
        caller_path = os.path.dirname(os.path.abspath(caller_frame.filename))
        # Join with the ".cl" name, not the original file_name: otherwise an
        # existing non-kernel file (e.g. the caller's .py) would be read and
        # returned as kernel source.
        cl_file = os.path.join(caller_path, cl_file)

    # Explicit raise rather than assert, so the check survives `python -O`.
    if not os.path.exists(cl_file):
        raise FileNotFoundError("Could not find OpenCL file: " + str(cl_file))

    with open(cl_file, encoding="utf-8") as f:
        kernel_str = f.read()

    if not cl_dp:
        # Plain textual substitution; it also rewrites "double" inside
        # comments or identifiers.
        kernel_str = kernel_str.replace("double", "float")

    return kernel_str
85 changes: 85 additions & 0 deletions src/vlab4mic/utils/transform/_convolution.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// Image sampler: unnormalized (pixel) coordinates, clamp-to-edge addressing,
// nearest-neighbour filtering. The buffer-based conv1d_* kernels visible in
// this file index __global float buffers directly and do not reference it.
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

// 1D convolution along x-axis (columns) for buffer (handles 3D data)
//
// One work-item per output element: global id 0 = row, 1 = column, 2 = z-slice.
// The buffers are indexed as z * nrows * ncols + row * ncols + col.
//   image        : input buffer
//   image_out    : output buffer, indexed the same way as image
//   kernel_array : kernel_size filter taps
// Taps falling outside [0, ncols) contribute nothing (zero padding).
// kernel_array[k] is applied to column col + k - (kernel_size-1)/2, i.e. the
// taps are not flipped. NOTE(review): for an asymmetric kernel this is a
// correlation - confirm the host passes a symmetric or pre-flipped kernel.
// The z id is not bounds-checked (no depth argument is passed), so the global
// size in dimension 2 must not exceed the number of slices in the buffers.
__kernel void
conv1d_x(__global float *image, __global float *image_out, __global float *kernel_array, int kernel_size, int nrows, int ncols){

    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int z = get_global_id(2);

    // Return if out of bounds
    if (row >= nrows || col >= ncols) return;

    const int kernel_center = (kernel_size - 1) / 2;

    // Offset of the first element of this (z, row) line. Computed in size_t
    // because z * nrows * ncols can exceed the range of a 32-bit int for
    // large volumes.
    const size_t line_offset = ((size_t)z * (size_t)nrows + (size_t)row) * (size_t)ncols;

    float acc = 0.0f;

    for (int k = 0; k < kernel_size; k++) {
        const int localcol = col + (k - kernel_center);
        // Zero-padding at boundaries (mode='same')
        if (localcol >= 0 && localcol < ncols) {
            acc += kernel_array[k] * image[line_offset + (size_t)localcol];
        }
    }

    image_out[line_offset + (size_t)col] = acc;
}

// 1D convolution along y-axis (rows) for buffer (handles 3D data)
//
// One work-item per output element: global id 0 = row, 1 = column, 2 = z-slice.
// The buffers are indexed as z * nrows * ncols + row * ncols + col.
//   image        : input buffer
//   image_out    : output buffer, indexed the same way as image
//   kernel_array : kernel_size filter taps
// Taps falling outside [0, nrows) contribute nothing (zero padding).
// kernel_array[k] is applied to row row + k - (kernel_size-1)/2, i.e. the
// taps are not flipped. NOTE(review): for an asymmetric kernel this is a
// correlation - confirm the host passes a symmetric or pre-flipped kernel.
// The z id is not bounds-checked (no depth argument is passed), so the global
// size in dimension 2 must not exceed the number of slices in the buffers.
__kernel void
conv1d_y(__global float *image, __global float *image_out, __global float *kernel_array, int kernel_size, int nrows, int ncols){

    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int z = get_global_id(2);

    // Return if out of bounds
    if (row >= nrows || col >= ncols) return;

    const int kernel_center = (kernel_size - 1) / 2;

    // Linear offsets are computed in size_t because z * nrows * ncols can
    // exceed the range of a 32-bit int for large volumes.
    const size_t row_stride = (size_t)ncols;
    const size_t slice_offset = (size_t)z * (size_t)nrows * row_stride;

    float acc = 0.0f;

    for (int k = 0; k < kernel_size; k++) {
        const int localrow = row + (k - kernel_center);
        // Zero-padding at boundaries (mode='same')
        if (localrow >= 0 && localrow < nrows) {
            acc += kernel_array[k] * image[slice_offset + (size_t)localrow * row_stride + (size_t)col];
        }
    }

    image_out[slice_offset + (size_t)row * row_stride + (size_t)col] = acc;
}

// 1D convolution along z-axis (depth) for buffer (handles 3D data)
//
// One work-item per (row, col) position: global id 0 = row, 1 = column. Each
// work-item loops over all ndepth slices itself; global dimension 2 is not
// read. The buffers are indexed as z * nrows * ncols + row * ncols + col.
//   image        : input buffer
//   image_out    : output buffer, indexed the same way as image
//   kernel_array : kernel_size filter taps
// Taps falling outside [0, ndepth) contribute nothing (zero padding).
// kernel_array[k] is applied to slice z + k - (kernel_size-1)/2, i.e. the
// taps are not flipped. NOTE(review): for an asymmetric kernel this is a
// correlation - confirm the host passes a symmetric or pre-flipped kernel.
__kernel void
conv1d_z(__global float *image, __global float *image_out, __global float *kernel_array, int kernel_size, int nrows, int ncols, int ndepth){

    const int row = get_global_id(0);
    const int col = get_global_id(1);

    // Return if out of bounds
    if (row >= nrows || col >= ncols) return;

    const int kernel_center = (kernel_size - 1) / 2;

    // Linear offsets are computed in size_t because z * nrows * ncols can
    // exceed the range of a 32-bit int for large volumes.
    const size_t slice_stride = (size_t)nrows * (size_t)ncols;
    const size_t in_slice = (size_t)row * (size_t)ncols + (size_t)col;

    // For each depth slice at this row,col position
    for (int z = 0; z < ndepth; z++) {
        float acc = 0.0f;

        for (int k = 0; k < kernel_size; k++) {
            const int localz = z + (k - kernel_center);
            // Zero-padding at boundaries (mode='same')
            if (localz >= 0 && localz < ndepth) {
                acc += kernel_array[k] * image[(size_t)localz * slice_stride + in_slice];
            }
        }

        image_out[(size_t)z * slice_stride + in_slice] = acc;
    }
}
Loading
Loading