Commit 490677a

[lang] Fix ndarray cuda dealloc when using preallocated memory (#3829)
* Identify preallocate case
* Add test for caching behavior
* Auto Format
* Add a note
* Auto Format

Co-authored-by: Taichi Gardener <[email protected]>
1 parent 420e6b6 commit 490677a

3 files changed: +26 -1 lines changed


taichi/backends/cuda/cuda_device.cpp

Lines changed: 3 additions & 1 deletion
```diff
@@ -23,6 +23,7 @@ DeviceAllocation CudaDevice::allocate_memory(const AllocParams &params) {
   info.size = params.size;
   info.is_imported = false;
   info.use_cached = false;
+  info.use_preallocated = false;
 
   DeviceAllocation alloc;
   alloc.alloc_id = allocations_.size();
@@ -48,6 +49,7 @@ DeviceAllocation CudaDevice::allocate_memory_runtime(
   info.size = taichi::iroundup(params.size, taichi_page_size);
   info.is_imported = false;
   info.use_cached = params.use_cached;
+  info.use_preallocated = true;
 
   DeviceAllocation alloc;
   alloc.alloc_id = allocations_.size();
@@ -69,7 +71,7 @@ void CudaDevice::dealloc_memory(DeviceAllocation handle) {
       TI_ERROR("the CudaCachingAllocator is not initialized");
     }
     caching_allocator_->release(info.size, (uint64_t *)info.ptr);
-  } else {
+  } else if (!info.use_preallocated) {
     CUDADriver::get_instance().mem_free(info.ptr);
     info.ptr = nullptr;
   }
```
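To make the control flow easier to see outside the diff, here is a minimal, self-contained sketch of the deallocation decision tree after this commit. It is not the Taichi source: `AllocInfo` is trimmed down, and `caching_release`/`driver_mem_free` are hypothetical stand-ins for `CudaCachingAllocator::release` and `CUDADriver::mem_free`.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Trimmed-down allocation record; the flags mirror AllocInfo in the diff.
struct AllocInfo {
  void *ptr{nullptr};
  std::size_t size{0};
  bool use_preallocated{true};
  bool use_cached{false};
};

// Hypothetical stand-ins for CudaCachingAllocator::release and
// CUDADriver::get_instance().mem_free.
void caching_release(std::size_t /*size*/, uint64_t * /*ptr*/) {}
void driver_mem_free(void *ptr) { std::free(ptr); }

// The branch structure this commit introduces: cached blocks go back to the
// caching allocator, blocks the device itself allocated are freed, and
// preallocated non-cached blocks are deliberately left alone because the
// runtime (LlvmProgramImpl) owns them.
void dealloc_memory(AllocInfo &info) {
  if (info.use_cached) {
    caching_release(info.size, static_cast<uint64_t *>(info.ptr));
  } else if (!info.use_preallocated) {
    driver_mem_free(info.ptr);
    info.ptr = nullptr;
  }
  // else: no-op -- handing preallocated memory to the driver's free here
  // was the bug this commit fixes.
}

int main() {
  // Device-owned allocation: freed via the driver stand-in.
  AllocInfo device_owned{std::malloc(64), 64, false, false};
  dealloc_memory(device_owned);

  // Preallocated, non-cached allocation: dealloc is a no-op.
  AllocInfo preallocated{nullptr, 64, true, false};
  dealloc_memory(preallocated);

  std::printf("device_owned.ptr is now %p\n", device_owned.ptr);
  return 0;
}
```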

taichi/backends/cuda/cuda_device.h

Lines changed: 11 additions & 0 deletions
```diff
@@ -81,6 +81,17 @@ class CudaDevice : public Device {
   void *ptr{nullptr};
   size_t size{0};
   bool is_imported{false};
+  /* Note: memory allocation on the CUDA device.
+   * CudaDevice can either use its own CUDA malloc mechanism via
+   * `allocate_memory`, or the preallocated memory managed by LlvmProgramImpl
+   * via `allocate_memory_runtime`. `use_preallocated` tracks which path an
+   * allocation came from. For now we keep both options, and the preallocated
+   * path is the default for the CUDA backend. `use_cached` enables/disables
+   * the caching behavior in `allocate_memory_runtime`. It should eventually
+   * always be enabled; for now both flags are kept so that preallocated
+   * memory can be used with caching disabled.
+   */
+  bool use_preallocated{true};
   bool use_cached{false};
 };
 
```
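Read alongside the .cpp hunks above, the two flags combine as follows (a summary inferred from this commit, not text from the repository): `allocate_memory` yields `use_preallocated = false` and `use_cached = false`, so `dealloc_memory` frees the pointer through the CUDA driver; `allocate_memory_runtime` yields `use_preallocated = true` with `use_cached` taken from the allocation params, so deallocation either returns the block to `CudaCachingAllocator` or, with caching off, leaves it to the runtime's preallocated pool.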

tests/python/test_ndarray.py

Lines changed: 12 additions & 0 deletions
```diff
@@ -309,6 +309,18 @@ def _test_ndarray_deepcopy():
     assert y[4][1, 0] == 9
 
 
+def test_ndarray_cuda_caching_allocator():
+    ti.init(arch=ti.cuda,
+            ndarray_use_torch=False,
+            ndarray_use_cached_allocator=True)
+    n = 8
+    a = ti.ndarray(ti.i32, shape=(n))
+    a.fill(2)
+    a = 1
+    b = ti.ndarray(ti.i32, shape=(n))
+    b.fill(2)
+
+
 @ti.test(arch=supported_archs_taichi_ndarray, ndarray_use_torch=False)
 def test_ndarray_rw_cache():
     a = ti.Vector.ndarray(3, ti.f32, ())
```
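A note on what the new test exercises (an inference from the diff rather than a comment in the source): with `ndarray_use_cached_allocator=True`, rebinding `a = 1` drops the last Python reference to the first ndarray, so its backing block should be released into the caching allocator instead of being passed to the driver's `mem_free`; allocating `b` with the same dtype and shape can then be served from that cache. Under the old unconditional `else` branch, that preallocated block would have been freed directly, which is exactly what the `dealloc_memory` change above prevents.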
