[Mlir-commits] [mlir] Fix for TOSA-to-linalg lowering of tosa.transpose op (PR #72698)

Fri Nov 17 12:36:30 PST 2023

https://github.com/rafaelubalmw created https://github.com/llvm/llvm-project/pull/72698

The TOSA-to-linalg conversion of `tosa.transpose` contains a bug in the computation of the result tensor shape when using dynamic dimensions. This bug may have widespread implication in projects such as Tensorflow, where `tosa.transpose` is frequently generated.

Consider the following TOSA code using only static dimensions. The code transposes a tensor of shape 10x11x12 into 12x10x11 by permuting dimensions [2, 0, 1] into [0, 1, 2].
 
```
func.func @test_tosa_transpose(%input: tensor<10x11x12xf32>) -> tensor<12x10x11xf32> {
  %perms = "tosa.const"() <{value = dense<[2, 0, 1]> : tensor<3xi32>}> : () -> tensor<3xi32>
  %transposed = "tosa.transpose"(%input, %perms) : (tensor<10x11x12xf32>, tensor<3xi32>) -> tensor<12x10x11xf32>
  return %transposed : tensor<12x10x11xf32>
}
```
 
The code is correctly lowered to:
 
```
#map = affine_map<(d0, d1, d2) -> (d1, d2, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
module {
  func.func @test_tosa_transpose(%arg0: tensor<10x11x12xf32>) -> tensor<12x10x11xf32> {
    %empty = tensor.empty() : tensor<12x10x11xf32>
    %transposed = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0 : tensor<10x11x12xf32>) outs(%empty : tensor<12x10x11xf32>) {
    ^bb0(%in: f32, %out: f32):
      linalg.yield %in : f32
    } -> tensor<12x10x11xf32>
    return %transposed : tensor<12x10x11xf32>
  }
}
```
 
Now let's make all dimensions dynamic in the TOSA code:
 
```
func.func @test_tosa_transpose(%input: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
  %perms = "tosa.const"() <{value = dense<[2, 0, 1]> : tensor<3xi32>}> : () -> tensor<3xi32>
  %transposed = "tosa.transpose"(%input, %perms) : (tensor<?x?x?xf32>, tensor<3xi32>) -> tensor<?x?x?xf32>
  return %transposed : tensor<?x?x?xf32>
}
```
 
The `tensor.empty()` op now needs additional information about the size of the output tensor, which is computed dynamically with a set of `tensor.dim` ops. The comments below assume an input tensor of size 10x11x12, as before. The code is lowered as:
 
```
#map = affine_map<(d0, d1, d2) -> (d1, d2, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
module {
  func.func @test_tosa_transpose(%arg0: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
 
    %arg0_dim0 = tensor.dim %arg0, %c0 : tensor<?x?x?xf32>   // Evaluates to 10
    %arg0_dim1 = tensor.dim %arg0, %c1 : tensor<?x?x?xf32>   // Evaluates to 11
    %arg0_dim2 = tensor.dim %arg0, %c2 : tensor<?x?x?xf32>   // Evaluates to 12
 
    %empty = tensor.empty(%arg0_dim1, %arg0_dim2, %arg0_dim0) : tensor<?x?x?xf32>   // Output of type tensor<11x12x10>  WRONG!
    %transposed = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0 : tensor<?x?x?xf32>) outs(%empty : tensor<?x?x?xf32>) {
    ^bb0(%in: f32, %out: f32):
      linalg.yield %in : f32
    } -> tensor<?x?x?xf32>
    return %transposed : tensor<?x?x?xf32>
  }
}
```
 
The output tensor shape is dynamically computed as 11x12x10 instead of 12x10x11. Since the total size of the output tensor is still the same, the code does not segfault after bufferization. However, index computations are invalid and lead to SWAs.

>From a0d80ba954c6d5b5629fa7dd8851317c235cc3e6 Mon Sep 17 00:00:00 2001
From: Rafael Ubal Tena <rubal at mathworks.com>
Date: Fri, 17 Nov 2023 15:29:10 -0500
Subject: [PATCH] Fix for TOSA-to-linalg lowering of tosa.transpose op

---
 .../Conversion/TosaToLinalg/TosaToLinalg.cpp  |  5 ++-
 .../TosaToLinalg/tosa-to-linalg.mlir          | 35 +++++++++++++++----
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index 3bf7bf12b5e96ff..ca37bd2b6643860 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -1072,12 +1072,11 @@ class TransposeConverter : public OpRewritePattern<tosa::TransposeOp> {
 
     SmallVector<AffineExpr, 2> inputExprs;
     inputExprs.resize(resultTy.getRank());
-    auto operandTy = cast<ShapedType>(input.getType());
     for (const auto &permutation : llvm::enumerate(perms.getValues<APInt>())) {
       auto index = permutation.index();
       auto value = permutation.value().getZExtValue();
-      if (!operandTy.hasRank() || operandTy.isDynamicDim(index)) {
-        dynDims[value] = rewriter.create<tensor::DimOp>(loc, input, index);
+      if (!resultTy.hasRank() || resultTy.isDynamicDim(index)) {
+        dynDims[index] = rewriter.create<tensor::DimOp>(loc, input, value);
       }
       inputExprs[value] = rewriter.getAffineDimExpr(index);
     }
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
index aa53b366f6da684..e0e041139fe4dc2 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
@@ -877,14 +877,14 @@ func.func @test_transpose_dyn(%arg0: tensor<1x?x3x4xi32>) -> () {
 // CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d1, d0)>
 // CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d0, d1)>
 
-// CHECK-LABEL: @test_transpose_dyn
+// CHECK-LABEL: @test_transpose_dyn_multiple_2d
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?xf32>)
-func.func @test_transpose_dyn_multiple(%arg0: tensor<?x?xf32>) -> () {
+func.func @test_transpose_dyn_multiple_2d(%arg0: tensor<?x?xf32>) -> () {
   %0 = arith.constant dense<[1, 0]> : tensor<2xi32>
-  // CHECK: %[[C0:.+]] = arith.constant 0
-  // CHECK: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
-  // CHECK: %[[C1:.+]] = arith.constant 1
-  // CHECK: %[[DIM1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
+  // CHECK-DAG: %[[C0:.+]] = arith.constant 0
+  // CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
+  // CHECK-DAG: %[[C1:.+]] = arith.constant 1
+  // CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
   // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM1]], %[[DIM0]])
   // CHECK: %[[GENERIC:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[ARG0]] : tensor<?x?xf32>) outs([[OUT:%.+]] : tensor<?x?xf32>)
   // CHECK: ^bb0([[ARG1:%.+]]: f32, [[ARG2:%.+]]: f32)
@@ -896,6 +896,29 @@ func.func @test_transpose_dyn_multiple(%arg0: tensor<?x?xf32>) -> () {
 
 // -----
 
+// CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1, d2) -> (d1, d2, d0)>
+// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK-LABEL: @test_transpose_dyn_multiple_3d
+// CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?x?xf32>)
+func.func @test_transpose_dyn_multiple_3d(%arg0: tensor<?x?x?xf32>) {
+  %0 = arith.constant dense<[2, 0, 1]> : tensor<3xi32>
+  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK-DAG: %[[DIM0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?x?xf32>
+  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+  // CHECK-DAG: %[[DIM1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?x?xf32>
+  // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+  // CHECK-DAG: %[[DIM2:.*]] = tensor.dim %[[ARG0]], %[[C2]] : tensor<?x?x?xf32>
+  // CHECK: %[[INIT:.*]] = tensor.empty(%[[DIM2]], %[[DIM0]], %[[DIM1]]) : tensor<?x?x?xf32>
+  // CHECK: %[[GENERIC:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel"]} ins(%[[ARG0]] : tensor<?x?x?xf32>) outs(%[[INIT]] : tensor<?x?x?xf32>) {
+  // CHECK: ^bb0(%[[IN0:.*]]: f32, %[[OUT0:.*]]: f32):
+  // CHECK:   linalg.yield %[[IN0]] : f32
+  // CHECK: } -> tensor<?x?x?xf32>
+  %1 = "tosa.transpose"(%arg0, %0) : (tensor<?x?x?xf32>, tensor<3xi32>) -> tensor<?x?x?xf32>
+  return
+}
+
+// -----
 
 // CHECK-LABEL: @reduce_float
 // CHECK-SAME: [[ARG0:%.+]]: tensor<5x4xf32>