pytorch · kimishpatel · Nov 5, 2025 · Nov 6, 2025 · Nov 6, 2025 · Nov 10, 2025
@@ -0,0 +1,48 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Please keep this file formatted by running:
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+
+cmake_minimum_required(VERSION 3.19)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+# Prefix every source path of this library with the ExecuTorch root. The list
+# transformed here must be the same one that is passed to add_library() below.
+# NOTE(review): _extension_memory_allocator__srcs is not defined in this file;
+# presumably it is provided by the including build -- confirm.
+list(TRANSFORM _extension_memory_allocator__srcs PREPEND "${EXECUTORCH_ROOT}/")
+if(CMAKE_TOOLCHAIN_IOS
+   OR CMAKE_TOOLCHAIN_ANDROID
+   OR APPLE
+)
+  # Building a shared library on iOS requires code signing. On Android we see
+  # duplicated registration when using a shared lib.
+  add_library(
+    extension_memory_allocator STATIC ${_extension_memory_allocator__srcs}
+  )
+else()
+  add_library(extension_memory_allocator ${_extension_memory_allocator__srcs})
+endif()
+target_link_libraries(extension_memory_allocator PRIVATE executorch_core)
+target_include_directories(
+  extension_memory_allocator PUBLIC ${_common_include_directories}
+)
+target_compile_options(
+  extension_memory_allocator
+  PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
+         $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations -fPIC>
+)
+
+# Install libraries
+install(
+  TARGETS extension_memory_allocator
+  EXPORT ExecuTorchTargets
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  INCLUDES
+  DESTINATION ${_common_include_directories}
+)
@@ -0,0 +1,88 @@
+#include <algorithm>
+#include <cstdlib>
+
+#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
+
+namespace executorch::extension {
+
+namespace {
+// Rounds `size` up to a multiple of the effective alignment, where the
+// effective alignment is the larger of `alignment` and kDefaultAlignment.
+// This is needed for aligned_alloc to work. The mask arithmetic below assumes
+// the effective alignment is a power of two.
+size_t get_alignment_adjusted_size(size_t size, size_t alignment) {
+  const size_t effective_alignment = std::max(alignment, kDefaultAlignment);
+  const bool already_multiple = (size % effective_alignment) == 0;
+  if (already_multiple) {
+    return size;
+  }
+  // size is not a multiple here, so rounding (size + alignment) down to an
+  // alignment boundary yields the next multiple above size.
+  const size_t low_bits = effective_alignment - 1;
+  return (size + effective_alignment) & ~low_bits;
+}
+} // namespace
+
+// Creates an allocator with nothing allocated and nothing cached. The
+// MemoryAllocator base is constructed with a zero size and a null buffer;
+// allocate() in this file obtains its memory from std::aligned_alloc.
+CPUCachingAllocator::CPUCachingAllocator(uint32_t max_size)
+    : MemoryAllocator(0, nullptr), max_size_(max_size), current_size_(0) {}
+
+// Returns a block of at least `size` bytes, or nullptr when `alignment` is not
+// a power of two or the system allocation fails.
+//
+// The request size is rounded up by get_alignment_adjusted_size(). If the
+// cache holds a block of exactly that rounded size it is reused; otherwise a
+// new block is obtained from std::aligned_alloc. Every block handed out is
+// recorded in allocation_map_ so that reset() can move it back into the cache.
+void* CPUCachingAllocator::allocate(size_t size, size_t alignment) {
+  EXECUTORCH_TRACK_ALLOCATION(prof_id(), size);
+
+  if (!isPowerOf2(alignment)) {
+    ET_LOG(Error, "Alignment %zu is not a power of 2", alignment);
+    return nullptr;
+  }
+  // Allocate with at least the default alignment so that a cached block can be
+  // reused by any later request whose alignment is <= kDefaultAlignment.
+  // NOTE(review): the cache is keyed by size only, so a request with an
+  // alignment larger than kDefaultAlignment can still receive a cached block
+  // that was allocated with a smaller alignment -- confirm whether callers
+  // ever pass such alignments.
+  alignment = std::max(alignment, kDefaultAlignment);
+  size = get_alignment_adjusted_size(size, alignment);
+
+  std::lock_guard<std::mutex> guard(mutex_);
+  auto cached = available_map_.find(size);
+  if (cached != available_map_.end() && !cached->second.empty()) {
+    void* ptr = cached->second.back();
+    cached->second.pop_back();
+    // reset() subtracts every tracked block from current_size_, so a reused
+    // block has to be counted again here; otherwise the counter underflows.
+    current_size_ += size;
+    allocation_map_[ptr] = size;
+    return ptr;
+  }
+
+  if (current_size_ + size > max_size_) {
+    // Freeing while holding the lock will cause performance issues
+    // we probably should log how often this happens so as to allow
+    // for calling site to adjust the max_size_ parameter
+    free_cached();
+  }
+  void* ptr = std::aligned_alloc(alignment, size);
+  if (ptr == nullptr) {
+    ET_LOG(Error, "Failed to allocate memory");
+    return nullptr;
+  }
+  // Only account for the block once the allocation is known to have succeeded.
+  current_size_ += size;
+  allocation_map_[ptr] = size;
+  return ptr;
+}
+
+// Frees every block currently held in the cache (available_map_) and empties
+// the cache. Blocks tracked in allocation_map_ are not touched.
+//
+// We don't lock mutex_ here because it will cause deadlock otherwise:
+// allocate() calls this while already holding mutex_. We could use
+// recursive_mutex but we just design this differently since free_cached is
+// not a public API anyways.
+void CPUCachingAllocator::free_cached() {
+  for (const auto& bucket : available_map_) {
+    const auto& cached_blocks = bucket.second;
+    for (void* const block : cached_blocks) {
+      std::free(block);
+    }
+  }
+  available_map_.clear();
+}
+
+// Moves every block recorded in allocation_map_ into the cache, keyed by its
+// recorded size, and subtracts that size from current_size_. Nothing is
+// returned to the system here; the blocks become available for reuse by
+// later allocate() calls of the same size.
+void CPUCachingAllocator::reset() {
+  std::lock_guard<std::mutex> guard(mutex_);
+  for (const auto& [block, block_size] : allocation_map_) {
+    // Cache the memory
+    available_map_[block_size].push_back(block);
+    current_size_ -= block_size;
+  }
+  allocation_map_.clear();
+}
+
+// Returns all memory held by the allocator to the system: reset() first moves
+// the outstanding blocks into the cache, then free_cached() frees the cache.
+//
+// The destructor must be called in a thread safe manner, i.e. not
+// concurrently with any other call on this object. mutex_ is deliberately NOT
+// locked here: reset() locks mutex_ itself and std::mutex is not recursive,
+// so holding it across the reset() call would deadlock.
+CPUCachingAllocator::~CPUCachingAllocator() {
+  reset();
+  free_cached();
+}
+
+} // namespace executorch::extension
@@ -0,0 +1,81 @@
+#pragma once
+
+#include <cstddef>
+#include <mutex>
+
+#include <executorch/runtime/core/memory_allocator.h>
+
+#ifdef USE_C10_SMALL_VECTOR
+#include <c10/util/SmallVector.h>
+#else
+#include <vector>
+#endif
+
+#ifdef USE_C10_FLAT_HASH_MAP
+#include <c10/util/flat_hash_map.h>
+#else
+#include <unordered_map>
+#endif
+
+/*
+ * CPUCachingAllocator:
+ * This file is copied over from c10/mobile/CPUCachingAllocator.h
+ * It is a thread safe caching allocator.
+ */
+
+namespace executorch::extension {
+
+// Vector type used for the per-size lists of cached blocks. When
+// USE_C10_SMALL_VECTOR is defined this is c10::SmallVector; otherwise it is
+// std::vector and the inline-capacity parameter is ignored.
+#ifndef USE_C10_SMALL_VECTOR
+template <typename ElementType, unsigned InlineCapacity>
+using SmallVector = std::vector<ElementType>;
+#else
+template <typename ElementType, unsigned InlineCapacity>
+using SmallVector = c10::SmallVector<ElementType, InlineCapacity>;
+#endif
+
+// Hash map type used by the allocator's bookkeeping. When
+// USE_C10_FLAT_HASH_MAP is defined this is ska::flat_hash_map; otherwise it is
+// std::unordered_map.
+#ifndef USE_C10_FLAT_HASH_MAP
+template <typename Key, typename Value>
+using FlatHashMap = std::unordered_map<Key, Value>;
+#else
+template <typename Key, typename Value>
+using FlatHashMap = ska::flat_hash_map<Key, Value>;
+#endif
+
+// Alignment used by CPUCachingAllocator::allocate() when the caller does not
+// pass one.
+constexpr size_t kDefaultAlignment = 64;
+
+/*
+ * What it does:
+ * Caches all the allocations carried out by this allocator.
+ * Cache key is the size of the allocation.
+ * If requested size is found in the cache returns the cached pointer.
+ * What it does not do:
+ * No speculative allocation for any future allocations.
+ */
+class CPUCachingAllocator : public executorch::runtime::MemoryAllocator {
+ public:
+  /*
+    max_size: Maximum size of memory to cache. Never cache more than that.
+  */
+  // NOTE(review): the parameter is uint32_t while max_size_ is size_t.
+  // Widening it to size_t requires changing the out-of-line definition in the
+  // same change, otherwise declaration and definition no longer match.
+  CPUCachingAllocator(uint32_t max_size);
+  // Checks the cache to see if allocation of size bytes can be found.
+  // If so return cached memory, else
+  // allocates memory, records it for caching and returns.
+  void* allocate(size_t size, size_t alignment = kDefaultAlignment) override;
+  // Makes the memory handed out so far available for reuse by later
+  // allocate() calls.
+  void reset() override;
+  ~CPUCachingAllocator();
+
+ protected:
+  // Cached blocks that are free for reuse, keyed by block size.
+  FlatHashMap<size_t, SmallVector<void*, 16>> available_map_;
+  // Blocks currently handed out by allocate(), mapped to their size.
+  FlatHashMap<void*, size_t> allocation_map_;
+  // Since allocation_map_ and other member variables are mutated/read via
+  // all public APIs, we need a mutex to protect concurrent access to these
+  // instance members.
+  std::mutex mutex_;
+  // Limit passed to the constructor.
+  size_t max_size_;
+  // Running byte count maintained by allocate() and reset().
+  size_t current_size_;
+
+ private:
+  // Frees all cached blocks. Does not lock mutex_ itself.
+  void free_cached();
+};
+
+} // namespace executorch::extension
@@ -20,3 +20,20 @@ def define_common_targets():
             "@EXECUTORCH_CLIENTS",
         ],
     )
+
+    runtime.cxx_library(
+        name = "cpu_caching_allocator",
+        srcs = [
+            "cpu_caching_malloc_allocator.cpp",
+        ],
+        exported_headers = [
+            "cpu_caching_malloc_allocator.h",
+        ],
+        exported_deps = [
+            "//executorch/runtime/core:memory_allocator",
+        ],
+        visibility = [
+            "//executorch/extension/memory_allocator/test/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )