-
Notifications
You must be signed in to change notification settings - Fork 79
Description
TLDR: defining array sizes with unsigned integers and then tiling the resulting kernel will break the per-thread check to see if it is inside the loop domain. This is because the check is of the form if (N - something >= 0) {, which is always true if N is unsigned (at least in my testing)
Hefty mvp to reproduce (including PyCuda boilerplate)
import numpy as np
import loopy as lp
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
domain = '{ [ic, jc] : 2 <= ic < nxc and 2 <= jc < nyc }'
body = '''
for jc, ic
qc[ic-1, jc-1] = (jc-1) + (ic-1)*nxc
end
'''
arguments_unsigned = [
lp.ValueArg('nxc', dtype=np.uint32),
lp.ValueArg('nyc', dtype=np.uint32),
lp.ArrayArg('qc', dtype=np.float64,
address_space=lp.AddressSpace.GLOBAL,
shape=('nxc', 'nyc'))
]
arguments_signed = [
lp.ValueArg('nxc', dtype=np.int32),
lp.ValueArg('nyc', dtype=np.int32),
lp.ArrayArg('qc', dtype=np.float64,
address_space=lp.AddressSpace.GLOBAL,
shape=('nxc', 'nyc'))
]
tile_size = 32
# Boilerplate code ...
def split_inames(kernel):
kernel = lp.split_iname(kernel, 'ic', tile_size, outer_tag="g.0", inner_tag="l.0")
kernel = lp.split_iname(kernel, 'jc', tile_size, outer_tag="g.1", inner_tag="l.1")
return kernel
def print_code(kernel):
kernel = kernel.copy(target=lp.CudaTarget())
cgr = lp.generate_code_v2(kernel)
print(cgr.device_code())
def run_cuda_kernel(kernel, signed, nxc=6, nyc=6):
# Grab device code from Loopy
kernel = kernel.copy(target=lp.CudaTarget())
cgr = lp.generate_code_v2(kernel)
dev_code = cgr.device_code()
print('device code:')
print(dev_code)
# Compile kernel w/ PyCuda
module = SourceModule(dev_code)
cuda_fn = module.get_function('loopy_kernel')
# Allocate out memory
qch = np.empty((nxc, nyc))
qc = cuda.mem_alloc(8 * nxc * nyc)
# Set launch arguments
grid = (1, 1, 1)
block = (tile_size, tile_size, 1)
if signed:
nxc = np.intc(nxc)
nyc = np.intc(nyc)
else:
nxc = np.uintc(nxc)
nyc = np.uintc(nyc)
cuda_fn(nxc, nyc, qc, grid=grid, block=block)
cuda.memcpy_dtoh(qch, qc)
print('output buffer:')
print(qch)
# Define kernels
print(' -- unsigned array limits -- ')
kernel_unsigned = split_inames(lp.make_kernel(
domains=domain, instructions=body, kernel_data=arguments_unsigned
))
run_cuda_kernel(kernel_unsigned, False)
print()
print()
print(' -- signed array limits -- ')
kernel_signed = split_inames(lp.make_kernel(
domains=domain, instructions=body, kernel_data=arguments_signed
))
run_cuda_kernel(kernel_signed, True)For the unsigned case, I get the following device code:
extern "C" __global__ void __launch_bounds__(1024) loopy_kernel(unsigned const nxc, unsigned const nyc, double *__restrict__ qc)
{
if (-1 + -32 * bIdx(y) + -1 * tIdx(y) + nyc >= 0 && -2 + 32 * bIdx(x) + tIdx(x) >= 0 && -2 + 32 * bIdx(y) + tIdx(y) >= 0 && -1 + -32 * bIdx(x) + -1 * tIdx(x) + nxc >= 0)
qc[nyc * (-1 + 32 * bIdx(x) + tIdx(x)) + -1 + 32 * bIdx(y) + tIdx(y)] = (tIdx(x) + bIdx(x) * 32.0 + -1.0) * (tIdx(y) + bIdx(y) * 32.0 + -1.0) * nxc;
}which specifically breaks on the parts on the if statement. Expressions like -1 + -32 * bIdx(y) + -1 * tIdx(y) + nyc>=0 are always true in unsigned arithmetic, so the above kernel will go past the correct array bounds.
Output I get with the unsigned kernel:
[[ 0. 0. 0. 0. 0. 0.]
[ 0. 7. 8. 9. 10. 11.]
[12. 13. 14. 15. 16. 17.]
[18. 19. 20. 21. 22. 23.]
[24. 25. 26. 27. 28. 29.]
[30. 31. 32. 33. 34. 35.]]
versus what I (correctly) get with a signed version, which is a ring of 0's around the exterior:
[[ 0. 0. 0. 0. 0. 0.]
[ 0. 7. 8. 9. 10. 0.]
[ 0. 13. 14. 15. 16. 0.]
[ 0. 19. 20. 21. 22. 0.]
[ 0. 25. 26. 27. 28. 0.]
[ 0. 0. 0. 0. 0. 0.]]