3 files changed: +26 −1 lines changed
@@ -23,6 +23,7 @@ DeviceAllocation CudaDevice::allocate_memory(const AllocParams &params) {
   info.size = params.size;
   info.is_imported = false;
   info.use_cached = false;
+  info.use_preallocated = false;

   DeviceAllocation alloc;
   alloc.alloc_id = allocations_.size();
@@ -48,6 +49,7 @@ DeviceAllocation CudaDevice::allocate_memory_runtime(
   info.size = taichi::iroundup(params.size, taichi_page_size);
   info.is_imported = false;
   info.use_cached = params.use_cached;
+  info.use_preallocated = true;

   DeviceAllocation alloc;
   alloc.alloc_id = allocations_.size();
@@ -69,7 +71,7 @@ void CudaDevice::dealloc_memory(DeviceAllocation handle) {
       TI_ERROR("the CudaCachingAllocator is not initialized");
     }
     caching_allocator_->release(info.size, (uint64_t *)info.ptr);
-  } else {
+  } else if (!info.use_preallocated) {
     CUDADriver::get_instance().mem_free(info.ptr);
     info.ptr = nullptr;
   }
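Taken together, the hunks above leave `dealloc_memory` with three outcomes. A condensed sketch of that decision follows; the names are taken from the diff, while the validation and bookkeeping in the real function are omitted:

// Condensed sketch of the free path after this change.
if (info.use_cached) {
  // Cached blocks are handed back to the caching allocator for later reuse.
  caching_allocator_->release(info.size, (uint64_t *)info.ptr);
} else if (!info.use_preallocated) {
  // Only memory obtained through CUDADriver in allocate_memory is freed here.
  CUDADriver::get_instance().mem_free(info.ptr);
  info.ptr = nullptr;
} else {
  // Preallocated (but uncached) memory stays owned by the runtime's pool
  // and is not freed per allocation.
}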
@@ -81,6 +81,17 @@ class CudaDevice : public Device {
     void *ptr{nullptr};
     size_t size{0};
     bool is_imported{false};
+    /* Note: Memory allocation on the CUDA device.
+     * CudaDevice can either use its own CUDA malloc mechanism via
+     * `allocate_memory`, or the preallocated memory managed by LlvmProgramImpl
+     * via `allocate_memory_runtime`. The `use_preallocated` flag tracks which
+     * option was used. For now we keep both options, and the preallocated
+     * method is the default for the CUDA backend. The `use_cached` flag
+     * enables/disables the caching behavior in `allocate_memory_runtime`.
+     * It should eventually always be enabled; for now we keep both flags so
+     * that preallocated memory can be used with caching disabled.
+     */
+    bool use_preallocated{true};
     bool use_cached{false};
   };
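The note added to AllocInfo above distinguishes caching from preallocation without showing what the caching behavior looks like. Below is a minimal, hypothetical sketch of the size-keyed reuse that `use_cached` refers to. It is not the real CudaCachingAllocator: only the release(size, ptr) call appears in the diff; the allocate() method and the map-based bookkeeping are assumptions for illustration.

// Hypothetical sketch only: a size-keyed cache of released device blocks.
// The real CudaCachingAllocator's interface and bookkeeping may differ.
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

class CachingAllocatorSketch {
 public:
  // Try to reuse a previously released block of exactly `size` bytes.
  uint64_t *allocate(std::size_t size) {
    auto it = free_blocks_.find(size);
    if (it != free_blocks_.end() && !it->second.empty()) {
      uint64_t *ptr = it->second.back();
      it->second.pop_back();
      return ptr;  // reuse without a new driver allocation
    }
    return nullptr;  // caller falls back to the preallocated pool
  }

  // Keep a released block instead of returning it to the driver.
  void release(std::size_t size, uint64_t *ptr) {
    free_blocks_[size].push_back(ptr);
  }

 private:
  std::map<std::size_t, std::vector<uint64_t *>> free_blocks_;
};

With behavior of this kind, releasing an ndarray and immediately allocating another of the same size should hit the cache rather than the CUDA driver, which is what the new test below exercises.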
@@ -309,6 +309,18 @@ def _test_ndarray_deepcopy():
     assert y[4][1, 0] == 9


+def test_ndarray_cuda_caching_allocator():
+    ti.init(arch=ti.cuda,
+            ndarray_use_torch=False,
+            ndarray_use_cached_allocator=True)
+    n = 8
+    a = ti.ndarray(ti.i32, shape=(n))
+    a.fill(2)
+    # Rebind `a` so the first ndarray is released and its device memory can be
+    # handed back to the caching allocator for reuse by `b`.
+    a = 1
+    b = ti.ndarray(ti.i32, shape=(n))
+    b.fill(2)
+
+
 @ti.test(arch=supported_archs_taichi_ndarray, ndarray_use_torch=False)
 def test_ndarray_rw_cache():
     a = ti.Vector.ndarray(3, ti.f32, ())