pytorch
diff --git a/backends/arm/_passes/decompose_meandim_pass.py b/backends/arm/_passes/decompose_meandim_pass.py
Lines changed: 80 additions & 2 deletions
diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp
Lines changed: 172 additions & 14 deletions
diff --git a/backends/arm/test/models/test_lstm_arm.py b/backends/arm/test/models/test_lstm_arm.py
Lines changed: 15 additions & 4 deletions
@@ -13,6 +13,7 @@
 from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass
 from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT
 from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass
+from executorch.backends.arm.constants import DQ_OPS, Q_OPS
 from executorch.exir.backend.utils import WhyNoPartitionReporter
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
@@ -50,6 +51,15 @@ def get_view(op):
     raise RuntimeError(f"Can't get meandim decomposition for op {op}")
 
 
+def get_quantization(op):
+    """Returns quant and dequant op of same type (per_channel/ tensor) as op if op is a dequant node, None otherwise."""
+    if op in DQ_OPS:
+        # Input of op can be placeholder, can't use that to get quant node directly.
+        quant_type_index = DQ_OPS.index(op)
+        return Q_OPS[quant_type_index], op
+    return None
+
+
 class DecomposeMeanDimPass(ArmPass):
     """
     Decomposes a meandim into avg_pool and/or sum + mul (1/N) depending on which dims the mean is taken for:
@@ -121,6 +131,7 @@ def call_operator(self, op, args, kwargs, meta):
                 dims_to_reduce = [dim - 1 for dim in dims_to_reduce]
 
             x = super().call_operator(view_op, (x, new_shape), {}, meta, True)
+            x = self._maybe_insert_q_dq_after(x, meta)
 
         # Reduce (h,w) dims by avg pool if possible
         x, dims_to_reduce = self._reduce_by_average_pool(op, x, dims_to_reduce, meta)
@@ -133,7 +144,7 @@ def call_operator(self, op, args, kwargs, meta):
             dims_to_reduce = [dim + len(original_dims) - 1 for dim in dims_to_reduce]
 
             x = super().call_operator(view_op, (x, temp_shape), {}, meta, True)
-
+            x = self._maybe_insert_q_dq_after(x, meta)
         # Reduce remaining dims by sum
         x = self._reduce_by_sum(op, x, dims_to_reduce, meta, dtype)
 
@@ -156,6 +167,45 @@ def _reduce_by_sum(self, op, input_node, dims, meta, dtype):
         full = super().call_operator(
             full_op, ([1] * len(output_shape), 1 / N), {"dtype": dtype}, meta, True
         )
+        if (quant_ops := get_quantization(input_node.node.target)) is not None:
+            # Insert Q and DQ nodes after full op.
+            # Since the value of full (1/N) is known, we can compute quant params such that dq(qmax) == 1/N.
+            q_op, dq_op = quant_ops
+            qmax = input_node.node.args[4]
+            full_quant_args = (
+                1 / (N * qmax),  # Scale to map qmax to 1/N
+                0,  # Zero point
+                *input_node.node.args[3:],
+            )
+            q_args = (full, *full_quant_args)
+            full = super().call_operator(
+                q_op,
+                q_args,
+                kwargs={},
+                meta=meta,
+                updated=True,
+            )
+            dq_args = (full, *full_quant_args)
+            full = super().call_operator(
+                dq_op, dq_args, kwargs={}, meta=meta, updated=True
+            )
+
+            # Insert Q and DQ nodes after sum op.
+            # Scale needs to be adjusted with N, since it was computed on data after the division with N.
+            sum_quant_args = (input_node.node.args[1] * N, *input_node.node.args[2:])
+            q_args = (sum, *sum_quant_args)
+            sum = super().call_operator(
+                q_op,
+                q_args,
+                kwargs={},
+                meta=meta,
+                updated=True,
+            )
+            dq_args = (sum, *sum_quant_args)
+            sum = super().call_operator(
+                dq_op, dq_args, kwargs={}, meta=meta, updated=True
+            )
+
         return super().call_operator(mul_op, (sum, full), {}, meta, True)
 
     def _reduce_by_average_pool(self, op, input_node, dims, meta):
@@ -190,10 +240,38 @@ def _reduce_by_average_pool(self, op, input_node, dims, meta):
         )
 
         if is_supported:
+            out = super().call_operator(avgpool_op, args, {}, meta, True)
+            out = self._maybe_insert_q_dq_after(out, meta)
             return (
-                super().call_operator(avgpool_op, args, {}, meta, True),
+                out,
                 dims_to_reduce_by_sum,
             )
 
         else:
             return input_node, dims
+
+    def _maybe_insert_q_dq_after(self, op, meta):
+        """If the input node of op is a dequant node, insert a q-dq pair after op with identical quantization parameters."""
+
+        if len(op.node.all_input_nodes) > 1:
+            raise ValueError(
+                f"Expected one input to {op.node}, got inputs {op.node.all_input_nodes}"
+            )
+        input_node = op.node.all_input_nodes[0]
+        if (quant_ops := get_quantization(input_node.target)) is not None:
+            q_op, dq_op = quant_ops
+            quant_args = list(input_node.args[1:])
+            q_args = (op, *quant_args)
+            out = super().call_operator(
+                q_op,
+                q_args,
+                kwargs={},
+                meta=meta,
+                updated=True,
+            )
+            dq_args = (out, *quant_args)
+            return super().call_operator(
+                dq_op, dq_args, kwargs={}, meta=meta, updated=True
+            )
+        else:
+            return op
@@ -326,7 +326,8 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
       ET_LOG(Error, "Ethos-U invocation failed error (%d)", result);
       return Error::InvalidProgram;
     }
-    int tensor_dim = 0, io_dim = 0;
+    size_t tensor_bytes_total = 0;
+    size_t io_bytes_total = 0;
     // Write outputs from scratch into EValue pointers
     for (int i = 0; i < handles.outputs->count; i++) {
       int tensor_count = 1, io_count = 1;
@@ -338,23 +339,39 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
       calculate_dimensions(
           tensor_out, &handles.outputs->io[i], &tensor_count, &io_count);
 
-      // At times the topological order of the outputs may change.
-      // Lets instead ensure that the sum of dimensions match.
-      tensor_dim = tensor_dim + tensor_count;
-      io_dim = io_dim + io_count;
+      size_t tensor_bytes = tensor_out.nbytes();
+      size_t io_bytes = static_cast<size_t>(io_count) *
+          static_cast<size_t>(handles.outputs->io[i].elem_size);
+
+      if (tensor_bytes != io_bytes) {
+        Error status = copy_with_layout_adjustment(
+            handles.outputs->io[i], i, output_addr, tensor_out, tensor_bytes);
+        if (status != Error::Ok) {
+          return status;
+        }
+        io_bytes_total += tensor_bytes;
+      } else {
+        EXECUTORCH_PROF_SCOPE(
+            event_tracer, "+EthosUBackend::execute()handles.output.memcpy()");
 
-      EXECUTORCH_PROF_SCOPE(
-          event_tracer, "+EthosUBackend::execute()handles.output.memcpy()");
+        memcpy(
+            tensor_out.mutable_data_ptr<char>(),
+            static_cast<const char*>(output_addr),
+            tensor_bytes);
+        io_bytes_total += io_bytes;
+      }
 
-      memcpy(
-          tensor_out.mutable_data_ptr<char>(),
-          static_cast<const char*>(output_addr),
-          tensor_out.nbytes());
+      // At times the topological order of the outputs may change.
+      // Lets instead ensure that the sum of output bytes match.
+      tensor_bytes_total += tensor_bytes;
     }
-    if (tensor_dim != io_dim) {
+    if (tensor_bytes_total != io_bytes_total) {
       ET_LOG(Error, "Total output tensor sizes do not match");
       ET_LOG(
-          Error, "Program expects size of %d but got %d", tensor_dim, io_dim);
+          Error,
+          "Program expects %zu bytes but got %zu",
+          io_bytes_total,
+          tensor_bytes_total);
       return Error::InvalidProgram;
     }
     return Error::Ok;
@@ -365,6 +382,147 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
   }
 
  private:
+  // Copies Vela output into the ExecuTorch tensor, adjusting for padding or
+  // packed layouts produced by the delegate.
+  //
+  // The output is treated as `chunk_count` rows, where `chunk_count` is the
+  // product of all Vela shape dimensions except the last (a dimension of 0
+  // counts as 1). Each row holds `vela_chunk_size` bytes on the Vela side and
+  // `tensor_bytes / chunk_count` bytes on the tensor side.
+  //  - Vela row smaller than tensor row: only a 2x expansion of 1-byte
+  //    elements into a ScalarType::Char tensor is handled, by unpacking 4-bit
+  //    values to int8.
+  //  - Otherwise: the leading `chunk_size` bytes of every Vela row are copied
+  //    and the rest of the row is skipped.
+  //
+  // `output_index` is only used in log messages.
+  // Returns Error::InvalidProgram when the element size is zero, when the byte
+  // counts do not divide evenly, or when the expansion is not supported;
+  // otherwise returns the status of the unpack / strip helper.
+  Error copy_with_layout_adjustment(
+      const VelaIO& output_io,
+      int output_index,
+      const char* src,
+      executorch::aten::Tensor& tensor_out,
+      size_t tensor_bytes) const {
+    const int elem_size = output_io.elem_size;
+    // A zero element size would make vela_chunk_size zero and break the
+    // modulo / division further down.
+    if (elem_size == 0) {
+      ET_LOG(
+          Error, "Ethos-U output %d reports zero element size", output_index);
+      return Error::InvalidProgram;
+    }
+
+    // Number of rows: product of every dimension but the last, with zero-sized
+    // dimensions counted as 1.
+    // NOTE(review): shapeDim is declared outside this function; presumably the
+    // fixed rank of VelaIO::shape - confirm.
+    size_t chunk_count = 1;
+    for (int dim = 0; dim < shapeDim - 1; ++dim) {
+      const int vela_dim = output_io.shape[dim];
+      chunk_count *= static_cast<size_t>(vela_dim == 0 ? 1 : vela_dim);
+    }
+    // Bytes per row as laid out by Vela: innermost dimension times element
+    // size (a zero innermost dimension is again counted as 1).
+    const int last_dim = output_io.shape[shapeDim - 1];
+    const size_t vela_chunk_elems =
+        static_cast<size_t>(last_dim == 0 ? 1 : last_dim);
+    const size_t vela_chunk_size =
+        vela_chunk_elems * static_cast<size_t>(elem_size);
+
+    // The tensor must split into the same number of equally sized rows.
+    if (tensor_bytes % chunk_count != 0) {
+      ET_LOG(
+          Error,
+          "Ethos-U output %d tensor bytes %zu not divisible by chunk count %zu",
+          output_index,
+          tensor_bytes,
+          chunk_count);
+      return Error::InvalidProgram;
+    }
+
+    // Bytes per row as expected by the ExecuTorch tensor.
+    const size_t chunk_size = tensor_bytes / chunk_count;
+
+    // If Vela writes fewer bytes than the tensor expects we may need to
+    // expand 4-bit data to 8-bit. Ethos-U outputs may be
+    // packed 4-bit values but ExecuTorch tensors are at least 8-bit.
+    if (vela_chunk_size < chunk_size) {
+      if (chunk_size % vela_chunk_size != 0) {
+        ET_LOG(
+            Error,
+            "Ethos-U output %d chunk bytes %zu not divisible by vela chunk bytes %zu",
+            output_index,
+            chunk_size,
+            vela_chunk_size);
+        return Error::InvalidProgram;
+      }
+
+      // Only one expansion is implemented: two int8 values per packed byte.
+      const size_t expand_factor = chunk_size / vela_chunk_size;
+      if (expand_factor == 2 && elem_size == 1 &&
+          tensor_out.scalar_type() == ScalarType::Char) {
+        return unpack_chunks_4bit_to_int8(
+            reinterpret_cast<const uint8_t*>(src),
+            tensor_out.mutable_data_ptr<int8_t>(),
+            chunk_count,
+            chunk_size,
+            vela_chunk_size);
+      }
+
+      ET_LOG(
+          Error,
+          "Ethos-U output %d expansion factor %zu with element size %d not supported",
+          output_index,
+          expand_factor,
+          elem_size);
+      return Error::InvalidProgram;
+    }
+
+    // Vela rows are at least as large as tensor rows here: copy the leading
+    // chunk_size bytes of each row and skip whatever follows.
+    return strip_delegate_padding(
+        src,
+        tensor_out.mutable_data_ptr<char>(),
+        chunk_count,
+        chunk_size,
+        vela_chunk_size);
+  }
+
+  // Expands packed 4-bit output into int8, one chunk at a time.
+  //
+  // `src` holds `chunk_count` chunks of `src_chunk_size` packed bytes; `dest`
+  // receives `chunk_count` chunks spaced `dest_chunk_size` bytes apart. Every
+  // packed byte yields two int8 values, so the caller is expected to pass
+  // dest_chunk_size == 2 * src_chunk_size.
+  // Returns Error::InvalidState if either buffer is null, Error::Ok otherwise.
+  Error unpack_chunks_4bit_to_int8(
+      const uint8_t* src,
+      int8_t* dest,
+      size_t chunk_count,
+      size_t dest_chunk_size,
+      size_t src_chunk_size) const {
+    // Mirror the null check done by strip_delegate_padding so a missing
+    // buffer is reported instead of being dereferenced.
+    if (src == nullptr || dest == nullptr) {
+      ET_LOG(Error, "Ethos-U 4-bit unpack received null buffer");
+      return Error::InvalidState;
+    }
+    const uint8_t* chunk_src = src;
+    int8_t* chunk_dest = dest;
+    for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
+      unpack_single_chunk_4bit_to_int8(chunk_src, chunk_dest, src_chunk_size);
+      chunk_src += src_chunk_size;
+      chunk_dest += dest_chunk_size;
+    }
+    return Error::Ok;
+  }
+
+  // Unpacks `chunk_size` packed bytes from `src` into 2 * `chunk_size` int8
+  // values in `dest`. The low nibble of each byte is written first, the high
+  // nibble second; both are sign-extended from 4 bits (8..15 map to -8..-1).
+  void unpack_single_chunk_4bit_to_int8(
+      const uint8_t* src,
+      int8_t* dest,
+      size_t chunk_size) const {
+    // Interprets a 4-bit value as two's complement.
+    const auto sign_extend_nibble = [](uint8_t nibble) -> int8_t {
+      return static_cast<int8_t>(nibble >= 8 ? nibble - 16 : nibble);
+    };
+    for (size_t i = 0; i < chunk_size; ++i) {
+      const uint8_t packed_byte = src[i];
+      const uint8_t low_nibble = packed_byte & 0x0F;
+      const uint8_t high_nibble = (packed_byte >> 4) & 0x0F;
+      int8_t* out_pair = dest + 2 * i;
+      out_pair[0] = sign_extend_nibble(low_nibble);
+      out_pair[1] = sign_extend_nibble(high_nibble);
+    }
+  }
+
+  // Copies `chunk_count` chunks from `src` to `dest`, keeping only the first
+  // `dest_chunk_size` bytes of every `src_chunk_size`-byte source chunk.
+  // Returns Error::InvalidProgram if a destination chunk is larger than a
+  // source chunk, Error::InvalidState if either buffer is null, and Error::Ok
+  // once all chunks are copied.
+  Error strip_delegate_padding(
+      const char* src,
+      char* dest,
+      size_t chunk_count,
+      size_t dest_chunk_size,
+      size_t src_chunk_size) const {
+    // The size check comes first so its error takes precedence over the null
+    // check when both conditions hold.
+    if (src_chunk_size < dest_chunk_size) {
+      ET_LOG(
+          Error,
+          "dest chunk size %zu must not exceed src chunk size %zu",
+          dest_chunk_size,
+          src_chunk_size);
+      return Error::InvalidProgram;
+    }
+    if (!src || !dest) {
+      ET_LOG(Error, "Ethos-U padded copy received null buffer");
+      return Error::InvalidState;
+    }
+    for (size_t chunk = 0; chunk < chunk_count; ++chunk) {
+      // Source and destination advance by different strides; the bytes
+      // between dest_chunk_size and src_chunk_size are dropped.
+      const char* chunk_begin = src + chunk * src_chunk_size;
+      char* chunk_out = dest + chunk * dest_chunk_size;
+      memcpy(chunk_out, chunk_begin, dest_chunk_size);
+    }
+    return Error::Ok;
+  }
+
   void calculate_dimensions(
       const executorch::aten::Tensor tensor,
       VelaIO* io,
@@ -389,4 +547,4 @@ static auto registered = register_backend(backend_id);
 
 } // namespace arm
 } // namespace backends
-} // namespace executorch
+} // namespace executorch
@@ -51,7 +51,9 @@ def test_lstm_tosa_FP():
         exir_op=[],
         use_to_edge_transform_and_lower=True,
     )
-    pipeline.change_args("run_method_and_compare_outputs", get_test_inputs(), atol=3e-1)
+    pipeline.change_args(
+        "run_method_and_compare_outputs", inputs=get_test_inputs(), atol=3e-1
+    )
     pipeline.run()
 
 
@@ -64,7 +66,10 @@ def test_lstm_tosa_INT():
         use_to_edge_transform_and_lower=True,
     )
     pipeline.change_args(
-        "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0
+        "run_method_and_compare_outputs",
+        inputs=get_test_inputs(),
+        atol=3e-1,
+        qtol=1.0,
     )
     pipeline.run()
 
@@ -79,7 +84,10 @@ def test_lstm_u55_INT():
         use_to_edge_transform_and_lower=True,
     )
     pipeline.change_args(
-        "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0
+        "run_method_and_compare_outputs",
+        inputs=get_test_inputs(),
+        atol=3e-1,
+        qtol=1.0,
     )
     pipeline.run()
 
@@ -94,7 +102,10 @@ def test_lstm_u85_INT():
         use_to_edge_transform_and_lower=True,
     )
     pipeline.change_args(
-        "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0
+        "run_method_and_compare_outputs",
+        inputs=get_test_inputs(),
+        atol=3e-1,
+        qtol=1.0,
     )
     pipeline.run()
Original file line number	Diff line number	Diff line change
`@@ -51,7 +51,9 @@ def test_lstm_tosa_FP():`
`51`	`51`	`exir_op=[],`
`52`	`52`	`use_to_edge_transform_and_lower=True,`
`53`	`53`	`)`
`54`		`- pipeline.change_args("run_method_and_compare_outputs", get_test_inputs(), atol=3e-1)`
	`54`	`+ pipeline.change_args(`
	`55`	`+ "run_method_and_compare_outputs", inputs=get_test_inputs(), atol=3e-1`
	`56`	`+ )`
`55`	`57`	`pipeline.run()`
`56`	`58`
`57`	`59`
`@@ -64,7 +66,10 @@ def test_lstm_tosa_INT():`
`64`	`66`	`use_to_edge_transform_and_lower=True,`
`65`	`67`	`)`
`66`	`68`	`pipeline.change_args(`
`67`		`- "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0`
	`69`	`+ "run_method_and_compare_outputs",`
	`70`	`+ inputs=get_test_inputs(),`
	`71`	`+ atol=3e-1,`
	`72`	`+ qtol=1.0,`
`68`	`73`	`)`
`69`	`74`	`pipeline.run()`
`70`	`75`
`@@ -79,7 +84,10 @@ def test_lstm_u55_INT():`
`79`	`84`	`use_to_edge_transform_and_lower=True,`
`80`	`85`	`)`
`81`	`86`	`pipeline.change_args(`
`82`		`- "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0`
	`87`	`+ "run_method_and_compare_outputs",`
	`88`	`+ inputs=get_test_inputs(),`
	`89`	`+ atol=3e-1,`
	`90`	`+ qtol=1.0,`
`83`	`91`	`)`
`84`	`92`	`pipeline.run()`
`85`	`93`
`@@ -94,7 +102,10 @@ def test_lstm_u85_INT():`
`94`	`102`	`use_to_edge_transform_and_lower=True,`
`95`	`103`	`)`
`96`	`104`	`pipeline.change_args(`
`97`		`- "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0`
	`105`	`+ "run_method_and_compare_outputs",`
	`106`	`+ inputs=get_test_inputs(),`
	`107`	`+ atol=3e-1,`
	`108`	`+ qtol=1.0,`
`98`	`109`	`)`
`99`	`110`	`pipeline.run()`
`100`	`111`