[Mlir-commits] [mlir] [mlir][amdgpu] fold memref.subview/expand_shape/collapse_shape into amdgpu.gather_to_lds for DST operand (PR #152277)
Jakub Kuderski
llvmlistbot at llvm.org
Wed Aug 6 06:38:29 PDT 2025
================
@@ -80,15 +82,82 @@ func.func @test_collapse_shape(%offset_i: index, %offset_j: index) {
// CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
// CHECK: %[[C0:.*]] = arith.constant 0 : index
- // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index
- // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES]]#0, %[[INDICES]]#1], %[[LOCAL]][%[[C0]], %[[C0]]]
+ // CHECK: %[[INDICES_MEM:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index
+ // CHECK: %[[INDICES_LDS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (64, 64) : index, index
+ // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES_MEM]]#0, %[[INDICES_MEM]]#1], %[[LOCAL]][%[[INDICES_LDS]]#0, %[[INDICES_LDS]]#1]
// CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
%alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+ %collapse_alloc = memref.collapse_shape %alloc [[0, 1]] : memref<64x64xf16, #gpu_lds_addrspace> into memref<4096xf16, #gpu_lds_addrspace>
%mem = memref.alloc() : memref<64x128xf16>
- %collapse = memref.collapse_shape %mem [[0, 1]] : memref<64x128xf16> into memref<8192xf16>
+ %collapse_mem = memref.collapse_shape %mem [[0, 1]] : memref<64x128xf16> into memref<8192xf16>
+ %c0 = arith.constant 0 : index
+ amdgpu.gather_to_lds %collapse_mem[%offset_i], %collapse_alloc[%offset_j]
+ : vector<8xf16>, memref<8192xf16>, memref<4096xf16, #gpu_lds_addrspace>
+ func.return
+}
+
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+
+// CHECK: func @test_expand_shape_src_raw_buffer
+// CHECK-SAME: %[[ARG0:.*]]: memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index
+func.func @test_expand_shape_src_raw_buffer(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset_i: index, %offset_j: index) {
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[IDXM:.*]] = affine.linearize_index [%[[ARG1]], %[[ARG2]]] by (64, 128) : index
+ // CHECK: amdgpu.gather_to_lds %[[ARG0]][%[[IDXM]]], %[[LOCAL]][%[[C0]]]
+ // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, 3>
+
+ %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
+ %expand_mem = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>> into memref<64x128xf16, #amdgpu.address_space<fat_raw_buffer>>
+
+ %c0 = arith.constant 0 : index
+ amdgpu.gather_to_lds %expand_mem[%offset_i, %offset_j], %alloc[%c0]
+ : vector<8xf16>, memref<64x128xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu_lds_addrspace>
+ func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+// CHECK: func @test_expand_shape_dst_only
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_expand_shape_dst_only(%offset_i: index, %offset_j: index) {
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+ // CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[IDX_LDS:.*]] = affine.linearize_index [%[[ARG1]], %[[C0]]] by (64, 64) : index
+ // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[ARG0]]], %[[LOCAL]][%[[IDX_LDS]]]
+ // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, 3>
+
+ %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
+ %mem = memref.alloc() : memref<8192xf16>
+ %expand_alloc = memref.expand_shape %alloc [[0, 1]] output_shape [64, 64] : memref<4096xf16, #gpu_lds_addrspace> into memref<64x64xf16, #gpu_lds_addrspace>
+
%c0 = arith.constant 0 : index
- amdgpu.gather_to_lds %collapse[%offset_i], %alloc[%c0, %c0]
+ amdgpu.gather_to_lds %mem[%offset_i], %expand_alloc[%offset_j, %c0]
: vector<8xf16>, memref<8192xf16>, memref<64x64xf16, #gpu_lds_addrspace>
func.return
}
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+// CHECK: func @test_nop
+// CHECK-SAME: %[[ARG0:.*]]: memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index
+func.func @test_nop(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset_i: index, %offset_j: index) {
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+ // CHECK: amdgpu.gather_to_lds %[[ARG0]][%[[ARG1]]], %[[LOCAL]][%[[ARG2]]]
+ // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, 3>
+
+ %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
+ amdgpu.gather_to_lds %mem[%offset_i], %alloc[%offset_j]
+ : vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu_lds_addrspace>
+ func.return
+}
----------------
kuhar wrote:
missing newline at the end of file
https://github.com/llvm/llvm-project/pull/152277
More information about the Mlir-commits
mailing list