diff --git a/cunumeric/module.py b/cunumeric/module.py index 4b9d028ddb..032921a7be 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -2046,7 +2046,6 @@ def repeat(a, repeats, axis=None): -------- Multiple GPUs, Multiple CPUs """ - # when array is a scalar if np.ndim(a) == 0: if np.ndim(repeats) == 0: @@ -2075,7 +2074,7 @@ def repeat(a, repeats, axis=None): axis = np.int32(axis) if axis >= array.ndim: - return ValueError("axis exceeds dimension of the input array") + raise ValueError("axis exceeds dimension of the input array") # If repeats is on a zero sized axis, then return the array. if array.shape[axis] == 0: @@ -2100,11 +2099,36 @@ def repeat(a, repeats, axis=None): category=UserWarning, ) repeats = np.int64(repeats) - result = array._thunk.repeat( - repeats=repeats, - axis=axis, - scalar_repeats=True, - ) + if repeats < 0: + raise ValueError( + "'repeats' should not be negative: {}".format(repeats) + ) + + # check output shape (if it will fit to GPU or not) + out_shape = list(array.shape) + out_shape[axis] *= repeats + out_shape = tuple(out_shape) + size = sum(out_shape) * array.itemsize + # check if size of the output array is less than 8GB. 
In this case we can + # use output regions, otherwise we will use statcally allocated + # array + if size < 8589934592 / 2: + + result = array._thunk.repeat( + repeats=repeats, axis=axis, scalar_repeats=True + ) + else: + # this implementation is taken from CuPy + result = ndarray(shape=out_shape, dtype=array.dtype) + a_index = [slice(None)] * len(out_shape) + res_index = list(a_index) + offset = 0 + for i in range(a._shape[axis]): + a_index[axis] = slice(i, i + 1) + res_index[axis] = slice(offset, offset + repeats) + result[res_index] = array[a_index] + offset += repeats + return result # repeats is an array else: # repeats should be integer type @@ -2115,10 +2139,32 @@ def repeat(a, repeats, axis=None): ) repeats = repeats.astype(np.int64) if repeats.shape[0] != array.shape[axis]: - return ValueError("incorrect shape of repeats array") - result = array._thunk.repeat( - repeats=repeats._thunk, axis=axis, scalar_repeats=False - ) + raise ValueError("incorrect shape of repeats array") + + # check output shape (if it will fit to GPU or not) + out_shape = list(array.shape) + n_repeats = sum(repeats) + out_shape[axis] = n_repeats + out_shape = tuple(out_shape) + size = sum(out_shape) * array.itemsize + # check if size of the output array is less 8GB. 
In this case we can + # use output regions, otherwise we will use statically allocated + # array + if size < 8589934592 / 2: + result = array._thunk.repeat( + repeats=repeats._thunk, axis=axis, scalar_repeats=False + ) + else: # this implementation is taken from CuPy + result = ndarray(shape=out_shape, dtype=array.dtype) + a_index = [slice(None)] * len(out_shape) + res_index = list(a_index) + offset = 0 + for i in range(a._shape[axis]): + a_index[axis] = slice(i, i + 1) + res_index[axis] = slice(offset, offset + repeats[i]) + result[res_index] = array[a_index] + offset += repeats[i] + return result return ndarray(shape=result.shape, thunk=result) diff --git a/src/cunumeric/index/repeat.cc b/src/cunumeric/index/repeat.cc index bfd134cb4a..e49ae611bc 100644 --- a/src/cunumeric/index/repeat.cc +++ b/src/cunumeric/index/repeat.cc @@ -69,6 +69,8 @@ struct RepeatImplBody { int64_t out_idx = 0; for (size_t in_idx = 0; in_idx < volume; ++in_idx) { auto p = in_pitches.unflatten(in_idx, in_rect.lo); + // TODO replace assert with Legate exception handling interface when available + assert(repeats[p] >= 0); for (size_t r = 0; r < repeats[p]; r++) out[out_idx++] = in[p]; } } @@ -88,6 +90,8 @@ struct RepeatImplBody { for (int64_t idx = in_rect.lo[axis]; idx <= in_rect.hi[axis]; ++idx) { p[axis] = idx; offsets[off_idx++] = sum; + // TODO replace assert with Legate exception handling interface when available + assert(repeats[p] >= 0); sum += repeats[p]; } diff --git a/src/cunumeric/index/repeat.cu b/src/cunumeric/index/repeat.cu index 09d6c71978..8d0b8c86b3 100644 --- a/src/cunumeric/index/repeat.cu +++ b/src/cunumeric/index/repeat.cu @@ -41,6 +41,8 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) if (offset < extent) { auto p = origin; p[axis] += offset; + // TODO replace assert with Legate exception handling interface when available + assert(repeats[p] >= 0); auto val = repeats[p]; offsets[offset] = val; SumReduction::fold(value, val); diff --git 
a/src/cunumeric/index/repeat_omp.cc b/src/cunumeric/index/repeat_omp.cc index 823a1a16a4..8a1b3e46a7 100644 --- a/src/cunumeric/index/repeat_omp.cc +++ b/src/cunumeric/index/repeat_omp.cc @@ -77,7 +77,9 @@ struct RepeatImplBody { int64_t axis_lo = p[axis]; #pragma omp for schedule(static) private(p) for (int64_t idx = 0; idx < axis_extent; ++idx) { - p[axis] = axis_lo + idx; + p[axis] = axis_lo + idx; + // TODO replace assert with Legate exception handling interface when available + assert(repeats[p] >= 0); auto val = repeats[p]; offsets[idx] = val; local_sums[tid] += val;