Skip to content

Commit d28f1a5

Browse files
committed
Add softmax support for int8 in Cortex M (dim=-1)
- Integrate the CMSIS-NN softmax kernel into the Cortex-M backend.
- Add a fusion pass and tests for quantized (int8) softmax.
- Apply lint cleanup passes.
- Resolve merge conflicts.
Change-Id: I0ec19f011069fa1482e2de2ab62b9e7d7f56b2a8
Signed-off-by: Xingguo Li <xingguo.li@arm.com>
1 parent 288edb4 commit d28f1a5

File tree

3 files changed

+18
-12
lines changed

3 files changed

+18
-12
lines changed

backends/cadence/aot/program_builder.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -77,13 +77,18 @@ def placeholder(
7777
return placeholder
7878

7979
def output(
80-
self, results: list[ProxyValue], output_kinds: Optional[list[OutputKind]] = None
80+
self,
81+
results: list[ProxyValue],
82+
output_kinds: Optional[list[OutputKind]] = None,
83+
output_targets: Optional[list[str | None]] = None,
8184
) -> ProxyValue:
8285
if output_kinds is None:
8386
output_kinds = [OutputKind.USER_OUTPUT] * len(results)
84-
for result, out_kind in zip(results, output_kinds):
87+
if output_targets is None:
88+
output_targets = [None] * len(results)
89+
for result, out_kind, target in zip(results, output_kinds, output_targets):
8590
self.output_specs.append(
86-
OutputSpec(out_kind, TensorArgument(result.node.name), target=None)
91+
OutputSpec(out_kind, TensorArgument(result.node.name), target=target)
8792
)
8893
return super().output(results)
8994

backends/cortex_m/ops/op_softmax.cpp

Lines changed: 6 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -71,14 +71,15 @@ Tensor& softmax_out(
7171
return out;
7272
}
7373

74-
const int32_t input_zp_val = static_cast<int32_t>(input_zero_point);
7574
const int32_t output_zp_val = static_cast<int32_t>(output_zero_point);
76-
(void)input_zp_val; // Zero-point difference cancels out during subtraction.
75+
const int32_t input_multiplier_val = static_cast<int32_t>(input_multiplier);
76+
const int32_t input_shift_val = static_cast<int32_t>(input_shift);
77+
const int32_t diff_min_val = static_cast<int32_t>(diff_min);
7778

7879
validate_single_quant_params(
79-
Scalar(input_zp_val),
80-
Scalar(input_multiplier),
81-
Scalar(input_shift),
80+
Scalar(static_cast<int32_t>(input_zero_point)),
81+
Scalar(input_multiplier_val),
82+
Scalar(input_shift_val),
8283
"softmax input");
8384

8485
const auto positive_dim = normalize_dim(input, dim);
@@ -118,10 +119,6 @@ Tensor& softmax_out(
118119
return out;
119120
}
120121

121-
const int32_t input_multiplier_val = static_cast<int32_t>(input_multiplier);
122-
const int32_t input_shift_val = static_cast<int32_t>(input_shift);
123-
const int32_t diff_min_val = static_cast<int32_t>(diff_min);
124-
125122
if (output_zp_val != kCmsisSoftmaxZeroPoint) {
126123
ET_LOG(
127124
Error,

backends/cortex_m/test/ops/test_softmax.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
4747
CortexMSoftmax(dim=1),
4848
(ramp_tensor(-2, 2, (2, 3, 4)),),
4949
),
50+
"large_tensor": McuTestCase(
51+
CortexMSoftmax(dim=-1),
52+
(ramp_tensor(-10, 10, (8, 1024)),),
53+
),
5054
}
5155

5256

0 commit comments

Comments
 (0)