Unsigned array bounds break in CUDA

TLDR: defining array sizes with unsigned integers and then tiling the resulting kernel will break the per-thread check to see if it is inside the loop domain.  This is because the check is of the form `if (N - something >= 0) {`, which is always true if `N` is unsigned (at least in my testing)

Hefty mvp to reproduce (including PyCuda boilerplate)

```python
import numpy as np
import loopy as lp
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

domain = '{ [ic, jc] : 2 <= ic < nxc and 2 <= jc < nyc }'

body = '''
for jc, ic
    qc[ic-1, jc-1] = (jc-1) + (ic-1)*nxc
end
'''

arguments_unsigned = [
    lp.ValueArg('nxc', dtype=np.uint32),
    lp.ValueArg('nyc', dtype=np.uint32),
    lp.ArrayArg('qc', dtype=np.float64,
                address_space=lp.AddressSpace.GLOBAL,
                shape=('nxc', 'nyc'))
]

arguments_signed = [
    lp.ValueArg('nxc', dtype=np.int32),
    lp.ValueArg('nyc', dtype=np.int32),
    lp.ArrayArg('qc', dtype=np.float64,
                address_space=lp.AddressSpace.GLOBAL,
                shape=('nxc', 'nyc'))
]

tile_size = 32

# Boilerplate code ...

def split_inames(kernel):
    kernel = lp.split_iname(kernel, 'ic', tile_size, outer_tag="g.0", inner_tag="l.0")
    kernel = lp.split_iname(kernel, 'jc', tile_size, outer_tag="g.1", inner_tag="l.1")
    return kernel

def print_code(kernel):
    kernel = kernel.copy(target=lp.CudaTarget())
    cgr = lp.generate_code_v2(kernel)
    print(cgr.device_code())

def run_cuda_kernel(kernel, signed, nxc=6, nyc=6):
    # Grab device code from Loopy
    kernel = kernel.copy(target=lp.CudaTarget())
    cgr = lp.generate_code_v2(kernel)
    dev_code = cgr.device_code()

    print('device code:')
    print(dev_code)

    # Compile kernel w/ PyCuda
    module = SourceModule(dev_code)
    cuda_fn = module.get_function('loopy_kernel')

    # Allocate out memory
    qch = np.empty((nxc, nyc))
    qc = cuda.mem_alloc(8 * nxc * nyc)

    # Set launch arguments
    grid = (1, 1, 1)
    block = (tile_size, tile_size, 1)

    if signed:
        nxc = np.intc(nxc)
        nyc = np.intc(nyc)
    else:
        nxc = np.uintc(nxc)
        nyc = np.uintc(nyc)

    cuda_fn(nxc, nyc, qc, grid=grid, block=block)
    cuda.memcpy_dtoh(qch, qc)

    print('output buffer:')
    print(qch)

# Define kernels

print(' -- unsigned array limits -- ')
kernel_unsigned = split_inames(lp.make_kernel(
    domains=domain, instructions=body, kernel_data=arguments_unsigned
))
run_cuda_kernel(kernel_unsigned, False)

print()
print()

print(' -- signed array limits -- ')
kernel_signed = split_inames(lp.make_kernel(
    domains=domain, instructions=body, kernel_data=arguments_signed
))
run_cuda_kernel(kernel_signed, True)
```

For the unsigned case, I get the following device code:
```cuda
extern "C" __global__ void __launch_bounds__(1024) loopy_kernel(unsigned const nxc, unsigned const nyc, double *__restrict__ qc)
{
  if (-1 + -32 * bIdx(y) + -1 * tIdx(y) + nyc >= 0 && -2 + 32 * bIdx(x) + tIdx(x) >= 0 && -2 + 32 * bIdx(y) + tIdx(y) >= 0 && -1 + -32 * bIdx(x) + -1 * tIdx(x) + nxc >= 0)
    qc[nyc * (-1 + 32 * bIdx(x) + tIdx(x)) + -1 + 32 * bIdx(y) + tIdx(y)] = (tIdx(x) + bIdx(x) * 32.0 + -1.0) * (tIdx(y) + bIdx(y) * 32.0 + -1.0) * nxc;
}
```
which specifically breaks on the parts on the if statement.  Expressions like `-1 + -32 * bIdx(y) + -1 * tIdx(y) + nyc>=0` are always true in unsigned arithmetic, so the above kernel will go past the correct array bounds.

Output I get with the unsigned kernel:
```
[[ 0.  0.  0.  0.  0.  0.]
 [ 0.  7.  8.  9. 10. 11.]
 [12. 13. 14. 15. 16. 17.]
 [18. 19. 20. 21. 22. 23.]
 [24. 25. 26. 27. 28. 29.]
 [30. 31. 32. 33. 34. 35.]]
```
versus what I (correctly) get with a signed version, which is a ring of 0's around the exterior:
```
[[ 0.  0.  0.  0.  0.  0.]
 [ 0.  7.  8.  9. 10.  0.]
 [ 0. 13. 14. 15. 16.  0.]
 [ 0. 19. 20. 21. 22.  0.]
 [ 0. 25. 26. 27. 28.  0.]
 [ 0.  0.  0.  0.  0.  0.]]
```


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Unsigned array bounds break in CUDA #808

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Unsigned array bounds break in CUDA #808

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions