diff --git a/pyproject.toml b/pyproject.toml index cbcceb1..d98d5a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ ] [project.optional-dependencies] -all = ["vlab4mic[dev,test]"] +all = ["vlab4mic[dev,test,gpu]"] dev = [ "pre-commit>=3.7.0", "ruff>=0.4.3", @@ -54,6 +54,9 @@ test = [ "nbmake>=1.5.3", "mypy>=1.10.0" ] +gpu = [ + "pyopencl>=2022.3.1" +] [tool.setuptools] package-dir = { "" = "src" } diff --git a/src/vlab4mic/utils/__opencl__.py b/src/vlab4mic/utils/__opencl__.py new file mode 100644 index 0000000..51cf65a --- /dev/null +++ b/src/vlab4mic/utils/__opencl__.py @@ -0,0 +1,196 @@ +import os +import inspect +import warnings +from pathlib import Path + +os.environ["PYOPENCL_COMPILER_OUTPUT"] = "1" + + +class NoCL(object): + class MemoryError(Exception): + def __init__(self): + self.message = "NoCL memory error" + + def __str__(self): + print(self.message) + + class LogicError(Exception): + def __init__(self): + self.message = "NoCL logic error" + + def __str__(self): + print(self.message) + + class Error(Exception): + def __init__(self): + self.message = "NoCL error" + + def __str__(self): + print(self.message) + + def __init__(self): + pass + + def __bool__(self): + return False + + +try: + import pyopencl as cl + import pyopencl.array as cl_array + + devices = [] + _fastest_device = None + max_perf = 0 + + if ( + len(cl.get_platforms()) == 1 + and len(cl.get_platforms()[0].get_devices()) == 1 + ): + dev = cl.get_platforms()[0].get_devices()[0] + _fastest_device = {"device": dev, "DP": False} + devices.append({"device": dev, "DP": False}) + + else: + for platform in cl.get_platforms(): + if ( + "Microsoft" in platform.vendor + ): # TODO this takes out emulated GPUs + continue + for dev in platform.get_devices(): + # check if the device is a GPU + if "GPU" not in cl.device_type.to_string(dev.type): + continue + if "cl_khr_fp64" in dev.extensions.strip().split(" "): + cl_dp = False + else: + cl_dp = False + if "Intel" in platform.vendor: + # penalty for Intel based integrated GPUs as compute units here refer to threads + perf = ( + dev.max_compute_units + / 2 + * dev.max_clock_frequency + * dev.max_mem_alloc_size + ) + else: + perf = ( + dev.max_compute_units + * dev.max_clock_frequency + * dev.max_mem_alloc_size + ) + if perf > max_perf: + max_perf = perf + _fastest_device = {"device": dev, "DP": cl_dp} + devices.append({"device": dev, "DP": cl_dp}) + + +except (ImportError, OSError, Exception) as e: + print("This exception is what's causing cl equals None:", e) + cl = NoCL() + cl_array = None + devices = None + _fastest_device = None + + +def print_opencl_info(): + """ + Prints information about the OpenCL devices on the system + """ + # REF: https://github.com/benshope/PyOpenCL-Tutorial + + msg = "\n" + "=" * 60 + "\nOpenCL Platforms and Devices \n" + # Print each platform on this computer + for platform in cl.get_platforms(): + msg += "=" * 60 + "\n" + msg += "Platform - Name: " + platform.name + "\n" + msg += "Platform - Vendor: " + platform.vendor + "\n" + msg += "Platform - Version: " + platform.version + "\n" + msg += "Platform - Profile: " + platform.profile + "\n" + # Print each device per-platform + for device in platform.get_devices(): + msg += "\t" + "-" * 56 + "\n" + msg += "\tDevice - Name: " + device.name + "\n" + msg += ( + "\tDevice - Type: " + + cl.device_type.to_string(device.type) + + "\n" + ) + msg += ( + f"\tDevice - Max Clock Speed: {device.max_clock_frequency} Mhz" + + "\n" + ) + + msg += ( + f"\tDevice - Compute Units: 
{device.max_compute_units}" + "\n" + ) + msg += ( + f"\tDevice - Local Memory: {device.local_mem_size / 1024.0:.0f} KB" + + "\n" + ) + msg += ( + f"\tDevice - Constant Memory: {device.max_constant_buffer_size / 1024.0:.0f} KB" + + "\n" + ) + msg += ( + f"\tDevice - Global Memory: {device.global_mem_size / 1073741824.0:.0f} GB" + + "\n" + ) + msg += ( + f"\tDevice - Max Buffer/Image Size: {device.max_mem_alloc_size / 1048576.0:.0f} MB" + + "\n" + ) + msg += ( + f"\tDevice - Max Work Group Size: {device.max_work_group_size:.0f}" + + "\n" + ) + + return msg + + +def opencl_works(): + """ + Checks if the system has OpenCL compatibility + :return: True if the system has OpenCL compatibility, False otherwise + """ + disabled = os.environ.get("NANOPYX_DISABLE_OPENCL", "0") == "1" + enabled = os.environ.get("NANOPYX_ENABLE_OPENCL", "1") == "1" + + if disabled or not enabled: + warnings.warn( + "OpenCL is disabled. To enable it, set the environment variable NANOPYX_ENABLE_OPENCL=1" + ) + return False + + elif enabled: + if not cl: + warnings.warn("tap... tap... tap... COMPUTER SAYS NO (OpenCL)!") + os.environ["NANOPYX_DISABLE_OPENCL"] = "1" + return False + + return True + + +def _get_cl_code(file_name, cl_dp): + """ + Retrieves the OpenCL code from the corresponding .cl file + """ + cl_file = os.path.splitext(file_name)[0] + ".cl" + + if not os.path.exists(cl_file): + # Use the path of the file that called this function + caller_frame = inspect.stack()[1] + caller_path = os.path.dirname(os.path.abspath(caller_frame.filename)) + cl_file = os.path.join(caller_path, file_name) + + assert os.path.exists(cl_file), "Could not find OpenCL file: " + str( + cl_file + ) + + with open(cl_file) as f: + kernel_str = f.read() + + if not cl_dp: + kernel_str = kernel_str.replace("double", "float") + + return kernel_str diff --git a/src/vlab4mic/utils/transform/_convolution.cl b/src/vlab4mic/utils/transform/_convolution.cl new file mode 100644 index 0000000..71ea932 --- /dev/null +++ b/src/vlab4mic/utils/transform/_convolution.cl @@ -0,0 +1,85 @@ +__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; + +// 1D convolution along x-axis (columns) for buffer (handles 3D data) +__kernel void +conv1d_x(__global float *image, __global float *image_out, __global float *kernel_array, int kernel_size, int nrows, int ncols){ + + int row = get_global_id(0); + int col = get_global_id(1); + int z = get_global_id(2); + + // Return if out of bounds + if (row >= nrows || col >= ncols) return; + + int kernel_center = (kernel_size-1)/2; + float acc = 0.0f; + + // Calculate the linear index for this z-slice + int slice_offset = z * nrows * ncols; + + for (int k = 0; k < kernel_size; k++) { + int localcol = col + (k - kernel_center); + // Zero-padding at boundaries (mode='same') + if (localcol >= 0 && localcol < ncols) { + acc += kernel_array[k] * image[slice_offset + row * ncols + localcol]; + } + } + + image_out[slice_offset + row * ncols + col] = acc; +} + +// 1D convolution along y-axis (rows) for buffer (handles 3D data) +__kernel void +conv1d_y(__global float *image, __global float *image_out, __global float *kernel_array, int kernel_size, int nrows, int ncols){ + + int row = get_global_id(0); + int col = get_global_id(1); + int z = get_global_id(2); + + // Return if out of bounds + if (row >= nrows || col >= ncols) return; + + int kernel_center = (kernel_size-1)/2; + float acc = 0.0f; + + // Calculate the linear index for this z-slice + int slice_offset = z * nrows * ncols; 
+
+    for (int k = 0; k < kernel_size; k++) {
+        int localrow = row + (k - kernel_center);
+        // Zero-padding at boundaries (mode='same')
+        if (localrow >= 0 && localrow < nrows) {
+            acc += kernel_array[k] * image[slice_offset + localrow * ncols + col];
+        }
+    }
+
+    image_out[slice_offset + row * ncols + col] = acc;
+}
+
+// 1D convolution along z-axis (depth) for buffer (handles 3D data)
+__kernel void
+conv1d_z(__global float *image, __global float *image_out, __global float *kernel_array, int kernel_size, int nrows, int ncols, int ndepth){
+
+    int row = get_global_id(0);
+    int col = get_global_id(1);
+
+    // Return if out of bounds
+    if (row >= nrows || col >= ncols) return;
+
+    int kernel_center = (kernel_size-1)/2;
+
+    // For each depth slice at this row,col position
+    for (int z = 0; z < ndepth; z++) {
+        float acc = 0.0f;
+
+        for (int k = 0; k < kernel_size; k++) {
+            int localz = z + (k - kernel_center);
+            // Zero-padding at boundaries (mode='same')
+            if (localz >= 0 && localz < ndepth) {
+                acc += kernel_array[k] * image[localz * nrows * ncols + row * ncols + col];
+            }
+        }
+
+        image_out[z * nrows * ncols + row * ncols + col] = acc;
+    }
+}
\ No newline at end of file
diff --git a/src/vlab4mic/utils/transform/image_convolution.py b/src/vlab4mic/utils/transform/image_convolution.py
index f063bd1..ff88c48 100644
--- a/src/vlab4mic/utils/transform/image_convolution.py
+++ b/src/vlab4mic/utils/transform/image_convolution.py
@@ -5,22 +5,178 @@ import skimage
 from scipy.ndimage import zoom
 import copy

+from ..__opencl__ import (
+    opencl_works,
+    cl_array,
+    _fastest_device,
+    cl,
+    _get_cl_code,
+)
+
+
+def test_kernel_separability(psf):
+    cz, cy, cx = (s // 2 for s in psf.shape)
+
+    kx = psf[cz, cy, :]
+    ky = psf[cz, :, cx]
+    kz = psf[:, cy, cx]
+
+    recon = kz[:, None, None] * ky[None, :, None] * kx[None, None, :]
+    # for an exactly separable PSF, recon == psf * psf[cz, cy, cx] ** 2, so rescale
+    center_val = psf[cz, cy, cx]
+    recon = recon / center_val**2 if center_val != 0 else recon
+
+    return np.linalg.norm(psf - recon) / np.linalg.norm(psf)
+

-# keep
 def convolve3D(voxelated_field, psf, method="fft"):
     """
     psf is the 3D superpixelated psf.
     The pixelsize of both psf and voxelated field must be equal
     psf is also assumed to be centered
     """
-    result = convolve(voxelated_field, psf, mode="same", method=method)
+    if opencl_works() and test_kernel_separability(psf) < 1e-2:
+        result = convolve_opencl(voxelated_field, psf)
+    else:
+        result = convolve(voxelated_field, psf, mode="same", method=method)
     return result


-"""
-Section: convolve in 3D, use zrange to get emitter that are to be considered
-"""
+def convolve_opencl(input, kernel):
+    # Convolve in 3D using a separable (rank-1) decomposition of the kernel
+    # For a rank-1 separable 3D kernel K[z, y, x] = kz[z] * ky[y] * kx[x],
+    # extract the 1D kernels from its center slices and normalize them
+
+    cz, cy, cx = (s // 2 for s in kernel.shape)
+
+    # For a separable kernel, each center slice is proportional to a 1D kernel:
+    # K[z, cy, cx] = kz[z] * ky[cy] * kx[cx]  ->  kz[z] = K[z, cy, cx] / (ky[cy] * kx[cx])
+    # K[cz, y, cx] = kz[cz] * ky[y] * kx[cx]  ->  ky[y] = K[cz, y, cx] / (kz[cz] * kx[cx])
+    # K[cz, cy, x] = kz[cz] * ky[cy] * kx[x]  ->  kx[x] = K[cz, cy, x] / (kz[cz] * ky[cy])
+
+    # The simplest approach: extract center slices and normalize them to sum=1
+    center_val = kernel[cz, cy, cx]
+
+    kz = (
+        kernel[:, cy, cx] / center_val
+        if center_val != 0
+        else kernel[:, cy, cx]
+    )
+    ky = (
+        kernel[cz, :, cx] / center_val
+        if center_val != 0
+        else kernel[cz, :, cx]
+    )
+    kx = (
+        kernel[cz, cy, :] / center_val
+        if center_val != 0
+        else kernel[cz, cy, :]
+    )
+
+    # Normalize each to sum to 1
+    kz = kz / np.sum(kz) if np.sum(kz) > 0 else kz
+    ky = ky / np.sum(ky) if np.sum(ky) > 0 else ky
+    kx = kx / np.sum(kx) if np.sum(kx) > 0 else kx
+
+    # Apply the separable 1D convolutions, each profile along its own axis
+    intermediate = convolve1D_opencl(input, kz, axis=0)
+    intermediate = convolve1D_opencl(intermediate, ky, axis=1)
+    output = convolve1D_opencl(intermediate, kx, axis=2)
+    # Rescale so the result matches convolution with the original (non unit-sum) kernel
+    output = output * np.sum(kernel)
+    return output
+
+
+def convolve1D_opencl(input, kernel, axis=0, device=None):
+    """
+    Perform 1D convolution along a specified axis.
+    Matches scipy.signal.convolve behavior (kernel is flipped internally).
+ + Parameters: + - input: 3D input array + - kernel: 1D kernel array + - axis: Which axis to convolve along (0=depth, 1=rows, 2=columns) + - device: OpenCL device to use + """ + + if device is None: + device = _fastest_device + + # scipy.signal.convolve flips the kernel, so do it here + kernel_flipped = kernel[::-1] + + # Ensure input and kernel are contiguous + input_contig = np.ascontiguousarray(input, dtype=np.float32) + kernel_contig = np.ascontiguousarray(kernel_flipped, dtype=np.float32) + + # QUEUE AND CONTEXT + cl_ctx = cl.Context([device["device"]]) + cl_queue = cl.CommandQueue(cl_ctx) + + image_out = np.zeros_like(input_contig, dtype=np.float32) + mf = cl.mem_flags + + input_image = cl.Buffer( + cl_ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=input_contig + ) + input_kernel = cl.Buffer( + cl_ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=kernel_contig + ) + output_opencl = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out.nbytes) + + kernelsize = np.int32(kernel_contig.shape[0]) + ndepth = np.int32(input_contig.shape[0]) # z-dimension (depth) + nrows = np.int32(input_contig.shape[1]) # y-dimension (rows/height) + ncols = np.int32(input_contig.shape[2]) # x-dimension (columns/width) + + code = _get_cl_code("_convolution.cl", device["DP"]) + prg = cl.Program(cl_ctx, code).build() + + if axis == 0: + # Convolution along dimension 0 (z/depth) + knl = prg.conv1d_z + knl( + cl_queue, + (int(nrows), int(ncols)), + None, + input_image, + output_opencl, + input_kernel, + kernelsize, + nrows, + ncols, + ndepth, + ).wait() + elif axis == 1: + # Convolution along dimension 1 (y/rows) + knl = prg.conv1d_y + knl( + cl_queue, + (int(nrows), int(ncols), int(ndepth)), + None, + input_image, + output_opencl, + input_kernel, + kernelsize, + nrows, + ncols, + ).wait() + elif axis == 2: + # Convolution along dimension 2 (x/columns) + knl = prg.conv1d_x + knl( + cl_queue, + (int(nrows), int(ncols), int(ndepth)), + None, + input_image, + output_opencl, + input_kernel, + kernelsize, + nrows, + ncols, + ).wait() + + cl.enqueue_copy(cl_queue, image_out, output_opencl).wait() + cl_queue.finish() + return image_out # keep @@ -64,11 +220,12 @@ def voxelize_active_fluorophores_withrange( coordinates_intensity = np.repeat( activated_coordinates, repeats=photons_per_emitter, axis=0 ) - voxel = voxelate_points_withrange(coordinates_intensity, bin_pixelsize, ranges) + voxel = voxelate_points_withrange( + coordinates_intensity, bin_pixelsize, ranges + ) return voxel -## keep def frame_by_volume_convolution( coordinates, photon_vector, @@ -82,21 +239,21 @@ def frame_by_volume_convolution( as_mask=False, ): """ - Generate a 2D image from the 3D field of active emitters + Generate a 2D input from the 3D field of active emitters convolved with the 3D PSF - The resulting 2D image can be generated by considering the whole + The resulting 2D input can be generated by considering the whole volume or a subset from it. 
projection_depth tells the depth of the volume to be considered in units of frames - IMPORTANT: All units are in the same scale, except image dimensions + IMPORTANT: All units are in the same scale, except input dimensions """ projection_depth_half = int(projection_depth / 2) intensity_voxel = voxelize_active_fluorophores_withrange( coordinates, photon_vector, ranges, bin_pixelsize=bin_size ) if as_mask: - frame = np.sum(np.array(intensity_voxel),axis=2) + frame = np.sum(np.array(intensity_voxel), axis=2) return frame if activation_only: return intensity_voxel @@ -115,7 +272,11 @@ def frame_by_volume_convolution( frame = np.sum( np.array( convolved_intensity[ - :, :, zfocus_slice - projection_depth_half : zfocus_slice + projection_depth_half + :, + :, + zfocus_slice + - projection_depth_half : zfocus_slice + + projection_depth_half, ] ), axis=2, @@ -123,7 +284,7 @@ def frame_by_volume_convolution( return frame else: return convolved_intensity - + # keep def generate_frames_volume_convolution( @@ -174,7 +335,7 @@ def generate_frames_volume_convolution( psf_array, psf_focus_slice, projection_depth=psf_projection_depth, - as_mask=as_mask + as_mask=as_mask, ) cframes.append(frame_n) @@ -213,7 +374,7 @@ def generate_frames_volume_convolution( psf_focus_slice, projection_depth=psf_projection_depth, asframe=asframes, - activation_only=activation_only + activation_only=activation_only, ) volume_frames.append(v_frame) return volume_frames @@ -226,7 +387,7 @@ def lateral_binning_stack(stack, currentpixelsizes, newpixelsizes, field_size): field_size is a touple that contains the absolute size of each dimension """ - print("Binning image stack") + print("Binning input stack") nframes = stack.shape[0] ## Z in in the dimension 0 binned_images = np.zeros((nframes, field_size[0], field_size[0])) blocks = np.floor_divide( @@ -244,7 +405,9 @@ def lateral_binning_stack(stack, currentpixelsizes, newpixelsizes, field_size): # keep -def discretise_field_xyz(coordinates: np.ndarray, binsizes: list, ranges: list): +def discretise_field_xyz( + coordinates: np.ndarray, binsizes: list, ranges: list +): """ Generate a 3D histogram from a set of coordinates. 
@@ -287,7 +450,8 @@ def _prepare_data_4convolutions( **kwargs, ): psf_half_range = ( - min((psf_array.shape[2] - psf_focus_slice), psf_focus_slice) * psf_zstep + min((psf_array.shape[2] - psf_focus_slice), psf_focus_slice) + * psf_zstep ) zrange = psf_half_range * 2 nframes = np.shape(photons_frames)[1] @@ -357,21 +521,27 @@ def generate_frames_projection_conv_optimised2( # keep -def construct_frames(convolved_emitters_list, nframes, photons_frames, **kwargs): +def construct_frames( + convolved_emitters_list, nframes, photons_frames, **kwargs +): print("constructing frames") timeserie = [] for f in tqdm(range(int(nframes))): # extract the vector that corresponds to the current frame photons_in_frame_vect = photons_frames[:, f] timeserie.append( - merge_emissions_lookup(photons_in_frame_vect, convolved_emitters_list) + merge_emissions_lookup( + photons_in_frame_vect, convolved_emitters_list + ) ) return np.array(timeserie) # keep -def bin_timeseries(timeseires, currentpixelsizes, newpixelsizes, field_size, **kwargs): - # field_size is equivalent to image size +def bin_timeseries( + timeseires, currentpixelsizes, newpixelsizes, field_size, **kwargs +): + # field_size is equivalent to input size # assumes the stack has dimensions tXY print( f"current pixels size for bining: {currentpixelsizes}; new ones {newpixelsizes}" @@ -463,7 +633,9 @@ def merge_emissions_lookup(photons_in_frame_vect, convolved_emitters_list): if total_photons_frame > 0: for pos in range(len(photons_in_frame_vect)): if photons_in_frame_vect[pos] > 0: - intensity = convolved_emitters_list[pos] * photons_in_frame_vect[pos] + intensity = ( + convolved_emitters_list[pos] * photons_in_frame_vect[pos] + ) frame = np.sum([frame, intensity], axis=0) - # shall return a single image + # shall return a single input return frame
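
A quick way to sanity-check the new OpenCL path is to compare convolve3D against the existing scipy fallback on a synthetic, exactly separable PSF. The snippet below is a sketch, not part of the diff: it assumes the package is installed with the new gpu extra (pip install "vlab4mic[gpu]"), that the module is importable as vlab4mic.utils.transform.image_convolution, and that an OpenCL-capable GPU is available; otherwise opencl_works() returns False and both calls take the same scipy code path.

# Sketch: compare the OpenCL separable path against the scipy fallback.
import numpy as np
from scipy.signal import convolve

from vlab4mic.utils.transform.image_convolution import (
    convolve3D,
    test_kernel_separability,
)

rng = np.random.default_rng(0)
field = rng.random((32, 64, 64)).astype(np.float32)

# Rank-1 (separable) Gaussian PSF: outer product of three 1D Gaussians,
# broader along z (axis 0) as is typical for a microscope PSF.
u = np.arange(9) - 4
kz = np.exp(-0.5 * (u / 2.0) ** 2)
ky = np.exp(-0.5 * (u / 1.0) ** 2)
kx = np.exp(-0.5 * (u / 1.0) ** 2)
psf = kz[:, None, None] * ky[None, :, None] * kx[None, None, :]
psf /= psf.sum()

print("separability error:", test_kernel_separability(psf))  # ~0 for a rank-1 PSF

gpu = convolve3D(field, psf)  # OpenCL path when a device is available and the PSF is separable
ref = convolve(field, psf, mode="same", method="fft")  # same call as the fallback branch
print("max abs difference:", np.max(np.abs(gpu - ref)))  # expected small (float32 + FFT rounding)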