[Mlir-commits] [mlir] b13248f - Revert "[mlir][gpu] Add DecomposeMemrefsPass"

Wed Aug 9 18:08:09 PDT 2023

Author: Ivan Butygin
Date: 2023-08-10T03:07:28+02:00
New Revision: b13248f997dd0b47da66be4e51cc61e724e5d76a

URL: https://github.com/llvm/llvm-project/commit/b13248f997dd0b47da66be4e51cc61e724e5d76a
DIFF: https://github.com/llvm/llvm-project/commit/b13248f997dd0b47da66be4e51cc61e724e5d76a.diff

LOG: Revert "[mlir][gpu] Add DecomposeMemrefsPass"

Broke some bots

This reverts commit 2b5b2bfef102b1021d91f2b9485e2443bdea9df5.

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
    mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
    mlir/include/mlir/Dialect/Utils/IndexingUtils.h
    mlir/lib/Dialect/GPU/CMakeLists.txt
    mlir/lib/Dialect/Utils/IndexingUtils.cpp

Removed: 
    mlir/lib/Dialect/GPU/Transforms/DecomposeMemrefs.cpp
    mlir/test/Dialect/GPU/decompose-memrefs.mlir


################################################################################
diff  --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index 970dfea4677d83..1afbcb2128d490 100644

--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -150,12 +150,6 @@ std::unique_ptr<Pass> createGpuSerializeToHsacoPass(StringRef triple,
                                                     StringRef features,
                                                     int optLevel);
 
-/// Collect a set of patterns to decompose memrefs ops.
-void populateGpuDecomposeMemrefsPatterns(RewritePatternSet &patterns);
-
-/// Pass decomposes memref ops inside `gpu.launch` body.
-std::unique_ptr<Pass> createGpuDecomposeMemrefsPass();
-
 /// Generate the code for registering passes.
 #define GEN_PASS_REGISTRATION
 #include "mlir/Dialect/GPU/Transforms/Passes.h.inc"

diff  --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index 7602f8bcc6a482..7ee90b5d0f8437 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -37,22 +37,4 @@ def GpuMapParallelLoopsPass
   let dependentDialects = ["mlir::gpu::GPUDialect"];
 }
 
-def GpuDecomposeMemrefsPass : Pass<"gpu-decompose-memrefs"> {
-  let summary = "Decomposes memref index computation into explicit ops.";
-  let description = [{
-    This pass decomposes memref index computation into explicit computations on
-    sizes/strides, obtained from `memref.extract_memref_metadata` which it tries
-    to place outside of `gpu.launch` body. Memrefs are then reconstructed using
-    `memref.reinterpret_cast`.
-    This is needed for as some targets (SPIR-V) lower memrefs to bare pointers
-    and sizes/strides for dynamically-sized memrefs are not available inside
-    `gpu.launch`.
-  }];
-  let constructor = "mlir::createGpuDecomposeMemrefsPass()";
-  let dependentDialects = [
-    "mlir::gpu::GPUDialect", "mlir::memref::MemRefDialect",
-    "mlir::affine::AffineDialect"
-  ];
-}
-
 #endif // MLIR_DIALECT_GPU_PASSES

diff  --git a/mlir/include/mlir/Dialect/Utils/IndexingUtils.h b/mlir/include/mlir/Dialect/Utils/IndexingUtils.h
index 56d028a2576b52..72becd8cc01c43 100644
--- a/mlir/include/mlir/Dialect/Utils/IndexingUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/IndexingUtils.h
@@ -229,12 +229,6 @@ computePermutationVector(int64_t permSize, ArrayRef<int64_t> positions,
 SmallVector<int64_t> getI64SubArray(ArrayAttr arrayAttr, unsigned dropFront = 0,
                                     unsigned dropBack = 0);
 
-/// Compute linear index from provided strides and indices, assuming strided
-/// layout.
-OpFoldResult computeLinearIndex(OpBuilder &builder, Location loc,
-                                OpFoldResult sourceOffset,
-                                ArrayRef<OpFoldResult> strides,
-                                ArrayRef<OpFoldResult> indices);
 } // namespace mlir
 
 #endif // MLIR_DIALECT_UTILS_INDEXINGUTILS_H

diff  --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 81d7bf96bbf4c9..f3c518fd93066e 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -47,15 +47,14 @@ add_mlir_dialect_library(MLIRGPUDialect
 add_mlir_dialect_library(MLIRGPUTransforms
   Transforms/AllReduceLowering.cpp
   Transforms/AsyncRegionRewriter.cpp
-  Transforms/DecomposeMemrefs.cpp
   Transforms/GlobalIdRewriter.cpp
   Transforms/KernelOutlining.cpp
   Transforms/MemoryPromotion.cpp
   Transforms/ParallelLoopMapper.cpp
+  Transforms/ShuffleRewriter.cpp
   Transforms/SerializeToBlob.cpp
   Transforms/SerializeToCubin.cpp
   Transforms/SerializeToHsaco.cpp
-  Transforms/ShuffleRewriter.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU

diff  --git a/mlir/lib/Dialect/GPU/Transforms/DecomposeMemrefs.cpp b/mlir/lib/Dialect/GPU/Transforms/DecomposeMemrefs.cpp
deleted file mode 100644
index 1e255635edb29d..00000000000000
--- a/mlir/lib/Dialect/GPU/Transforms/DecomposeMemrefs.cpp
+++ /dev/null
@@ -1,232 +0,0 @@
-//===- DecomposeMemrefs.cpp - Decompose memrefs pass implementation -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements decompose memrefs pass.
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/GPU/Transforms/Passes.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/Utils/IndexingUtils.h"
-#include "mlir/IR/AffineExpr.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
-namespace mlir {
-#define GEN_PASS_DEF_GPUDECOMPOSEMEMREFSPASS
-#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
-} // namespace mlir
-
-using namespace mlir;
-
-static void setInsertionPointToStart(OpBuilder &builder, Value val) {
-  if (auto parentOp = val.getDefiningOp()) {
-    builder.setInsertionPointAfter(parentOp);
-  } else {
-    builder.setInsertionPointToStart(val.getParentBlock());
-  }
-}
-
-static bool isInsideLaunch(Operation *op) {
-  return op->getParentOfType<gpu::LaunchOp>();
-}
-
-static std::tuple<Value, OpFoldResult, SmallVector<OpFoldResult>>
-getFlatOffsetAndStrides(OpBuilder &rewriter, Location loc, Value source,
-                        ArrayRef<OpFoldResult> subOffsets,
-                        ArrayRef<OpFoldResult> subStrides = std::nullopt) {
-  auto sourceType = cast<MemRefType>(source.getType());
-  auto sourceRank = static_cast<unsigned>(sourceType.getRank());
-
-  memref::ExtractStridedMetadataOp newExtractStridedMetadata;
-  {
-    OpBuilder::InsertionGuard g(rewriter);
-    setInsertionPointToStart(rewriter, source);
-    newExtractStridedMetadata =
-        rewriter.create<memref::ExtractStridedMetadataOp>(loc, source);
-  }
-
-  auto &&[sourceStrides, sourceOffset] = getStridesAndOffset(sourceType);
-
-  auto getDim = [&](int64_t dim, Value dimVal) -> OpFoldResult {
-    return ShapedType::isDynamic(dim) ? getAsOpFoldResult(dimVal)
-                                      : rewriter.getIndexAttr(dim);
-  };
-
-  OpFoldResult origOffset =
-      getDim(sourceOffset, newExtractStridedMetadata.getOffset());
-  ValueRange sourceStridesVals = newExtractStridedMetadata.getStrides();
-
-  SmallVector<OpFoldResult> origStrides;
-  origStrides.reserve(sourceRank);
-
-  SmallVector<OpFoldResult> strides;
-  strides.reserve(sourceRank);
-
-  AffineExpr s0 = rewriter.getAffineSymbolExpr(0);
-  AffineExpr s1 = rewriter.getAffineSymbolExpr(1);
-  for (auto i : llvm::seq(0u, sourceRank)) {
-    OpFoldResult origStride = getDim(sourceStrides[i], sourceStridesVals[i]);
-
-    if (!subStrides.empty()) {
-      strides.push_back(affine::makeComposedFoldedAffineApply(
-          rewriter, loc, s0 * s1, {subStrides[i], origStride}));
-    }
-
-    origStrides.emplace_back(origStride);
-  }
-
-  OpFoldResult finalOffset =
-      computeLinearIndex(rewriter, loc, origOffset, origStrides, subOffsets);
-  return {newExtractStridedMetadata.getBaseBuffer(), finalOffset, strides};
-}
-
-static Value getFlatMemref(OpBuilder &rewriter, Location loc, Value source,
-                           ValueRange offsets) {
-  SmallVector<OpFoldResult> offsetsTemp = getAsOpFoldResult(offsets);
-  auto &&[base, offset, ignore] =
-      getFlatOffsetAndStrides(rewriter, loc, source, offsetsTemp);
-  auto retType = cast<MemRefType>(base.getType());
-  return rewriter.create<memref::ReinterpretCastOp>(loc, retType, base, offset,
-                                                    std::nullopt, std::nullopt);
-}
-
-static bool needFlatten(Value val) {
-  auto type = cast<MemRefType>(val.getType());
-  return type.getRank() != 0;
-}
-
-static bool checkLayout(Value val) {
-  auto type = cast<MemRefType>(val.getType());
-  return type.getLayout().isIdentity() ||
-         isa<StridedLayoutAttr>(type.getLayout());
-}
-
-namespace {
-struct FlattenLoad : public OpRewritePattern<memref::LoadOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(memref::LoadOp op,
-                                PatternRewriter &rewriter) const override {
-    if (!isInsideLaunch(op))
-      return rewriter.notifyMatchFailure(op, "not inside gpu.launch");
-
-    Value memref = op.getMemref();
-    if (!needFlatten(memref))
-      return rewriter.notifyMatchFailure(op, "nothing to do");
-
-    if (!checkLayout(memref))
-      return rewriter.notifyMatchFailure(op, "unsupported layout");
-
-    Location loc = op.getLoc();
-    Value flatMemref = getFlatMemref(rewriter, loc, memref, op.getIndices());
-    rewriter.replaceOpWithNewOp<memref::LoadOp>(op, flatMemref);
-    return success();
-  }
-};
-
-struct FlattenStore : public OpRewritePattern<memref::StoreOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(memref::StoreOp op,
-                                PatternRewriter &rewriter) const override {
-    if (!isInsideLaunch(op))
-      return rewriter.notifyMatchFailure(op, "not inside gpu.launch");
-
-    Value memref = op.getMemref();
-    if (!needFlatten(memref))
-      return rewriter.notifyMatchFailure(op, "nothing to do");
-
-    if (!checkLayout(memref))
-      return rewriter.notifyMatchFailure(op, "unsupported layout");
-
-    Location loc = op.getLoc();
-    Value flatMemref = getFlatMemref(rewriter, loc, memref, op.getIndices());
-    Value value = op.getValue();
-    rewriter.replaceOpWithNewOp<memref::StoreOp>(op, value, flatMemref);
-    return success();
-  }
-};
-
-struct FlattenSubview : public OpRewritePattern<memref::SubViewOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(memref::SubViewOp op,
-                                PatternRewriter &rewriter) const override {
-    if (!isInsideLaunch(op))
-      return rewriter.notifyMatchFailure(op, "not inside gpu.launch");
-
-    Value memref = op.getSource();
-    if (!needFlatten(memref))
-      return rewriter.notifyMatchFailure(op, "nothing to do");
-
-    if (!checkLayout(memref))
-      return rewriter.notifyMatchFailure(op, "unsupported layout");
-
-    Location loc = op.getLoc();
-    SmallVector<OpFoldResult> subOffsets = op.getMixedOffsets();
-    SmallVector<OpFoldResult> subSizes = op.getMixedSizes();
-    SmallVector<OpFoldResult> subStrides = op.getMixedStrides();
-    auto &&[base, finalOffset, strides] =
-        getFlatOffsetAndStrides(rewriter, loc, memref, subOffsets, subStrides);
-
-    auto srcType = cast<MemRefType>(memref.getType());
-    auto resultType = cast<MemRefType>(op.getType());
-    unsigned subRank = static_cast<unsigned>(resultType.getRank());
-
-    llvm::SmallBitVector droppedDims = op.getDroppedDims();
-
-    SmallVector<OpFoldResult> finalSizes;
-    finalSizes.reserve(subRank);
-
-    SmallVector<OpFoldResult> finalStrides;
-    finalStrides.reserve(subRank);
-
-    for (auto i : llvm::seq(0u, static_cast<unsigned>(srcType.getRank()))) {
-      if (droppedDims.test(i))
-        continue;
-
-      finalSizes.push_back(subSizes[i]);
-      finalStrides.push_back(strides[i]);
-    }
-
-    rewriter.replaceOpWithNewOp<memref::ReinterpretCastOp>(
-        op, resultType, base, finalOffset, finalSizes, finalStrides);
-    return success();
-  }
-};
-
-struct GpuDecomposeMemrefsPass
-    : public impl::GpuDecomposeMemrefsPassBase<GpuDecomposeMemrefsPass> {
-
-  void runOnOperation() override {
-    RewritePatternSet patterns(&getContext());
-
-    populateGpuDecomposeMemrefsPatterns(patterns);
-
-    if (failed(
-            applyPatternsAndFoldGreedily(getOperation(), std::move(patterns))))
-      return signalPassFailure();
-  }
-};
-
-} // namespace
-
-void mlir::populateGpuDecomposeMemrefsPatterns(RewritePatternSet &patterns) {
-  patterns.insert<FlattenLoad, FlattenStore, FlattenSubview>(
-      patterns.getContext());
-}
-
-std::unique_ptr<Pass> mlir::createGpuDecomposeMemrefsPass() {
-  return std::make_unique<GpuDecomposeMemrefsPass>();
-}

diff  --git a/mlir/lib/Dialect/Utils/IndexingUtils.cpp b/mlir/lib/Dialect/Utils/IndexingUtils.cpp
index a344b01a958946..2a774b599a8b68 100644
--- a/mlir/lib/Dialect/Utils/IndexingUtils.cpp
+++ b/mlir/lib/Dialect/Utils/IndexingUtils.cpp
@@ -8,7 +8,6 @@
 
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
@@ -262,34 +261,3 @@ SmallVector<int64_t> mlir::getI64SubArray(ArrayAttr arrayAttr,
     res.push_back((*it).getValue().getSExtValue());
   return res;
 }
-
-OpFoldResult mlir::computeLinearIndex(OpBuilder &builder, Location loc,
-                                      OpFoldResult sourceOffset,
-                                      ArrayRef<OpFoldResult> strides,
-                                      ArrayRef<OpFoldResult> indices) {
-  assert(strides.size() == indices.size());
-  auto sourceRank = static_cast<unsigned>(strides.size());
-
-  // Hold the affine symbols and values for the computation of the offset.
-  SmallVector<OpFoldResult> values(2 * sourceRank + 1);
-  SmallVector<AffineExpr> symbols(2 * sourceRank + 1);
-
-  bindSymbolsList(builder.getContext(), MutableArrayRef{symbols});
-  AffineExpr expr = symbols.front();
-  values[0] = sourceOffset;
-
-  for (unsigned i = 0; i < sourceRank; ++i) {
-    // Compute the stride.
-    OpFoldResult origStride = strides[i];
-
-    // Build up the computation of the offset.
-    unsigned baseIdxForDim = 1 + 2 * i;
-    unsigned subOffsetForDim = baseIdxForDim;
-    unsigned origStrideForDim = baseIdxForDim + 1;
-    expr = expr + symbols[subOffsetForDim] * symbols[origStrideForDim];
-    values[subOffsetForDim] = indices[i];
-    values[origStrideForDim] = origStride;
-  }
-
-  return affine::makeComposedFoldedAffineApply(builder, loc, expr, values);
-}

diff  --git a/mlir/test/Dialect/GPU/decompose-memrefs.mlir b/mlir/test/Dialect/GPU/decompose-memrefs.mlir
deleted file mode 100644
index d714010d0f254b..00000000000000
--- a/mlir/test/Dialect/GPU/decompose-memrefs.mlir
+++ /dev/null
@@ -1,137 +0,0 @@
-// RUN: mlir-opt -gpu-decompose-memrefs -allow-unregistered-dialect -split-input-file %s | FileCheck %s
-
-//       CHECK: #[[MAP:.*]] = affine_map<()[s0, s1, s2, s3, s4] -> (s0 * s1 + s2 * s3 + s4)>
-//       CHECK: @decompose_store
-//  CHECK-SAME: (%[[VAL:.*]]: f32, %[[MEM:.*]]: memref<?x?x?xf32>)
-//       CHECK:  %[[BASE:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:3, %[[STRIDES:.*]]:3 = memref.extract_strided_metadata %[[MEM]]
-//       CHECK:  gpu.launch
-//  CHECK-SAME:  threads(%[[TX:.*]], %[[TY:.*]], %[[TZ:.*]]) in
-//       CHECK:  %[[IDX:.*]] = affine.apply #[[MAP]]()[%[[TX]], %[[STRIDES]]#0, %[[TY]], %[[STRIDES]]#1, %[[TZ]]]
-//       CHECK:  %[[PTR:.*]] = memref.reinterpret_cast %[[BASE]] to offset: [%[[IDX]]], sizes: [], strides: [] : memref<f32> to memref<f32>
-//       CHECK:  memref.store %[[VAL]], %[[PTR]][] : memref<f32>
-func.func @decompose_store(%arg0 : f32, %arg1 : memref<?x?x?xf32>) {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %block_dim0 = memref.dim %arg1, %c0 : memref<?x?x?xf32>
-  %block_dim1 = memref.dim %arg1, %c1 : memref<?x?x?xf32>
-  %block_dim2 = memref.dim %arg1, %c2 : memref<?x?x?xf32>
-  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
-             threads(%tx, %ty, %tz) in (%block_x = %block_dim0, %block_y = %block_dim1, %block_z = %block_dim2) {
-    memref.store %arg0, %arg1[%tx, %ty, %tz] : memref<?x?x?xf32>
-    gpu.terminator
-  }
-  return
-}
-
-// -----
-
-//       CHECK: #[[MAP:.*]] = affine_map<()[s0, s1, s2, s3, s4, s5, s6] -> (s0 + s1 * s2 + s3 * s4 + s5 * s6)>
-//       CHECK: @decompose_store_strided
-//  CHECK-SAME: (%[[VAL:.*]]: f32, %[[MEM:.*]]: memref<?x?x?xf32, strided<[?, ?, ?], offset: ?>>)
-//       CHECK:  %[[BASE:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:3, %[[STRIDES:.*]]:3 = memref.extract_strided_metadata %[[MEM]]
-//       CHECK:  gpu.launch
-//  CHECK-SAME:  threads(%[[TX:.*]], %[[TY:.*]], %[[TZ:.*]]) in
-//       CHECK:  %[[IDX:.*]] = affine.apply #[[MAP]]()[%[[OFFSET]], %[[TX]], %[[STRIDES]]#0, %[[TY]], %[[STRIDES]]#1, %[[TZ]], %[[STRIDES]]#2]
-//       CHECK:  %[[PTR:.*]] = memref.reinterpret_cast %[[BASE]] to offset: [%[[IDX]]], sizes: [], strides: [] : memref<f32> to memref<f32>
-//       CHECK:  memref.store %[[VAL]], %[[PTR]][] : memref<f32>
-func.func @decompose_store_strided(%arg0 : f32, %arg1 : memref<?x?x?xf32, strided<[?, ?, ?], offset: ?>>) {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %block_dim0 = memref.dim %arg1, %c0 : memref<?x?x?xf32, strided<[?, ?, ?], offset: ?>>
-  %block_dim1 = memref.dim %arg1, %c1 : memref<?x?x?xf32, strided<[?, ?, ?], offset: ?>>
-  %block_dim2 = memref.dim %arg1, %c2 : memref<?x?x?xf32, strided<[?, ?, ?], offset: ?>>
-  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
-             threads(%tx, %ty, %tz) in (%block_x = %block_dim0, %block_y = %block_dim1, %block_z = %block_dim2) {
-    memref.store %arg0, %arg1[%tx, %ty, %tz] : memref<?x?x?xf32, strided<[?, ?, ?], offset: ?>>
-    gpu.terminator
-  }
-  return
-}
-
-// -----
-
-//       CHECK: #[[MAP:.*]] = affine_map<()[s0, s1, s2, s3, s4] -> (s0 * s1 + s2 * s3 + s4)>
-//       CHECK: @decompose_load
-//  CHECK-SAME: (%[[MEM:.*]]: memref<?x?x?xf32>)
-//       CHECK:  %[[BASE:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:3, %[[STRIDES:.*]]:3 = memref.extract_strided_metadata %[[MEM]]
-//       CHECK:  gpu.launch
-//  CHECK-SAME:  threads(%[[TX:.*]], %[[TY:.*]], %[[TZ:.*]]) in
-//       CHECK:  %[[IDX:.*]] = affine.apply #[[MAP]]()[%[[TX]], %[[STRIDES]]#0, %[[TY]], %[[STRIDES]]#1, %[[TZ]]]
-//       CHECK:  %[[PTR:.*]] = memref.reinterpret_cast %[[BASE]] to offset: [%[[IDX]]], sizes: [], strides: [] : memref<f32> to memref<f32>
-//       CHECK:  %[[RES:.*]] = memref.load %[[PTR]][] : memref<f32>
-//       CHECK:  "test.test"(%[[RES]]) : (f32) -> ()
-func.func @decompose_load(%arg0 : memref<?x?x?xf32>) {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %block_dim0 = memref.dim %arg0, %c0 : memref<?x?x?xf32>
-  %block_dim1 = memref.dim %arg0, %c1 : memref<?x?x?xf32>
-  %block_dim2 = memref.dim %arg0, %c2 : memref<?x?x?xf32>
-  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
-             threads(%tx, %ty, %tz) in (%block_x = %block_dim0, %block_y = %block_dim1, %block_z = %block_dim2) {
-    %res = memref.load %arg0[%tx, %ty, %tz] : memref<?x?x?xf32>
-    "test.test"(%res) : (f32) -> ()
-    gpu.terminator
-  }
-  return
-}
-
-// -----
-
-//       CHECK: #[[MAP:.*]] = affine_map<()[s0, s1, s2, s3, s4] -> (s0 * s1 + s2 * s3 + s4)>
-//       CHECK: @decompose_subview
-//  CHECK-SAME: (%[[MEM:.*]]: memref<?x?x?xf32>)
-//       CHECK:  %[[BASE:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:3, %[[STRIDES:.*]]:3 = memref.extract_strided_metadata %[[MEM]]
-//       CHECK:  gpu.launch
-//  CHECK-SAME:  threads(%[[TX:.*]], %[[TY:.*]], %[[TZ:.*]]) in
-//       CHECK:  %[[IDX:.*]] = affine.apply #[[MAP]]()[%[[TX]], %[[STRIDES]]#0, %[[TY]], %[[STRIDES]]#1, %[[TZ]]]
-//       CHECK:  %[[PTR:.*]] = memref.reinterpret_cast %[[BASE]] to offset: [%[[IDX]]], sizes: [%{{.*}}, %{{.*}}, %{{.*}}], strides: [%[[STRIDES]]#0, %[[STRIDES]]#1, 1]
-//       CHECK:  "test.test"(%[[PTR]]) : (memref<?x?x?xf32, strided<[?, ?, ?], offset: ?>>) -> ()
-func.func @decompose_subview(%arg0 : memref<?x?x?xf32>) {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %block_dim0 = memref.dim %arg0, %c0 : memref<?x?x?xf32>
-  %block_dim1 = memref.dim %arg0, %c1 : memref<?x?x?xf32>
-  %block_dim2 = memref.dim %arg0, %c2 : memref<?x?x?xf32>
-  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
-             threads(%tx, %ty, %tz) in (%block_x = %block_dim0, %block_y = %block_dim1, %block_z = %block_dim2) {
-    %res = memref.subview %arg0[%tx, %ty, %tz] [%c2, %c2, %c2] [%c1, %c1, %c1] : memref<?x?x?xf32> to memref<?x?x?xf32, strided<[?, ?, ?], offset: ?>>
-    "test.test"(%res) : (memref<?x?x?xf32, strided<[?, ?, ?], offset: ?>>) -> ()
-    gpu.terminator
-  }
-  return
-}
-
-// -----
-
-//       CHECK: #[[MAP:.*]] = affine_map<()[s0] -> (s0 * 2)>
-//       CHECK: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 * 3)>
-//       CHECK: #[[MAP2:.*]] = affine_map<()[s0, s1, s2, s3, s4] -> (s0 * s1 + s2 * s3 + s4)>
-//       CHECK: @decompose_subview_strided
-//  CHECK-SAME: (%[[MEM:.*]]: memref<?x?x?xf32>)
-//       CHECK:  %[[BASE:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:3, %[[STRIDES:.*]]:3 = memref.extract_strided_metadata %[[MEM]]
-//       CHECK:  gpu.launch
-//  CHECK-SAME:  threads(%[[TX:.*]], %[[TY:.*]], %[[TZ:.*]]) in
-//       CHECK:  %[[IDX:.*]] = affine.apply #[[MAP]]()[%[[STRIDES]]#0]
-//       CHECK:  %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[STRIDES]]#1]
-//       CHECK:  %[[IDX2:.*]] = affine.apply #[[MAP2]]()[%[[TX]], %[[STRIDES]]#0, %[[TY]], %[[STRIDES]]#1, %[[TZ]]]
-//       CHECK:  %[[PTR:.*]] = memref.reinterpret_cast %[[BASE]] to offset: [%[[IDX2]]], sizes: [%{{.*}}, %{{.*}}, %{{.*}}], strides: [%[[IDX]], %[[IDX1]], 4]
-//       CHECK:  "test.test"(%[[PTR]]) : (memref<?x?x?xf32, strided<[?, ?, ?], offset: ?>>) -> ()
-func.func @decompose_subview_strided(%arg0 : memref<?x?x?xf32>) {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %block_dim0 = memref.dim %arg0, %c0 : memref<?x?x?xf32>
-  %block_dim1 = memref.dim %arg0, %c1 : memref<?x?x?xf32>
-  %block_dim2 = memref.dim %arg0, %c2 : memref<?x?x?xf32>
-  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
-             threads(%tx, %ty, %tz) in (%block_x = %block_dim0, %block_y = %block_dim1, %block_z = %block_dim2) {
-    %res = memref.subview %arg0[%tx, %ty, %tz] [%c2, %c2, %c2] [2, 3, 4] : memref<?x?x?xf32> to memref<?x?x?xf32, strided<[?, ?, ?], offset: ?>>
-    "test.test"(%res) : (memref<?x?x?xf32, strided<[?, ?, ?], offset: ?>>) -> ()
-    gpu.terminator
-  }
-  return
-}