[Mlir-commits] [mlir] fc421d7 - [MLIR] Remove all-reduce lowering from GPU to NVVM. Use in-dialect lowering instead.

Wed Mar 11 07:18:05 PDT 2020

Author: Christian Sigg
Date: 2020-03-11T15:17:54+01:00
New Revision: fc421d7ca3ecc4864c547e587eac4dacd98c33da

URL: https://github.com/llvm/llvm-project/commit/fc421d7ca3ecc4864c547e587eac4dacd98c33da
DIFF: https://github.com/llvm/llvm-project/commit/fc421d7ca3ecc4864c547e587eac4dacd98c33da.diff

LOG: [MLIR] Remove all-reduce lowering from GPU to NVVM. Use in-dialect lowering instead.

Reviewers: herhut, mravishankar

Reviewed By: herhut

Subscribers: merge_guards_bot, jholewinski, mehdi_amini, rriddle, jpienaar, burmako, shauheen, antiagainst, nicolasvasilache, arpith-jacob, mgester, lucyrfox, aartbik, liufengdb, Joonsoo, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D73794

Added: 
    

Modified: 
    mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
    mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 7af62dfb3e4a..67c335e629fe 100644

--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -15,6 +15,7 @@
 
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/Passes.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/Pass/Pass.h"
@@ -28,448 +29,6 @@ using namespace mlir;
 
 namespace {
 
-/// Converts all_reduce op to LLVM/NVVM ops.
-struct GPUAllReduceOpLowering : public ConvertToLLVMPattern {
-  using AccumulatorFactory =
-      std::function<Value(Location, Value, Value, ConversionPatternRewriter &)>;
-
-  explicit GPUAllReduceOpLowering(LLVMTypeConverter &lowering_)
-      : ConvertToLLVMPattern(gpu::AllReduceOp::getOperationName(),
-                             lowering_.getDialect()->getContext(), lowering_),
-        int32Type(LLVM::LLVMType::getInt32Ty(lowering_.getDialect())) {}
-
-  PatternMatchResult
-  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
-                  ConversionPatternRewriter &rewriter) const override {
-    Location loc = op->getLoc();
-    Value operand = operands.front();
-
-    // TODO(csigg): Generalize to other types of accumulation.
-    assert(op->getOperand(0).getType().isSignlessIntOrFloat());
-
-    // Create the reduction using an accumulator factory.
-    AccumulatorFactory factory =
-        getFactory(cast<gpu::AllReduceOp>(op), operand);
-    assert(factory && "failed to create accumulator factory");
-    Value result = createBlockReduce(loc, operand, factory, rewriter);
-
-    rewriter.replaceOp(op, {result});
-    return matchSuccess();
-  }
-
-private:
-  /// Returns an accumulator factory using either the op attribute or the body
-  /// region.
-  AccumulatorFactory getFactory(gpu::AllReduceOp allReduce,
-                                Value operand) const {
-    if (!allReduce.body().empty()) {
-      return getFactory(allReduce.body());
-    }
-    if (allReduce.op()) {
-      auto type = operand.getType().cast<LLVM::LLVMType>();
-      return getFactory(*allReduce.op(), type.getUnderlyingType());
-    }
-    return AccumulatorFactory();
-  }
-
-  /// Returns an accumulator factory that clones the body. The body's entry
-  /// block is expected to have 2 arguments. The gpu.yield return the
-  /// accumulated value of the same type.
-  AccumulatorFactory getFactory(Region &body) const {
-    return AccumulatorFactory([&](Location loc, Value lhs, Value rhs,
-                                  ConversionPatternRewriter &rewriter) {
-      Block *block = rewriter.getInsertionBlock();
-      Block *split = rewriter.splitBlock(block, rewriter.getInsertionPoint());
-
-      // Insert accumulator body between split block.
-      BlockAndValueMapping mapping;
-      mapping.map(body.front().getArgument(0), lhs);
-      mapping.map(body.front().getArgument(1), rhs);
-      rewriter.cloneRegionBefore(body, *split->getParent(),
-                                 split->getIterator(), mapping);
-
-      // Add branch before inserted body, into body.
-      block = block->getNextNode();
-      rewriter.create<LLVM::BrOp>(loc, ValueRange(), block);
-
-      // Replace all gpu.yield ops with branch out of body.
-      for (; block != split; block = block->getNextNode()) {
-        Operation *terminator = block->getTerminator();
-        if (!llvm::isa<gpu::YieldOp>(terminator))
-          continue;
-        rewriter.setInsertionPointToEnd(block);
-        rewriter.replaceOpWithNewOp<LLVM::BrOp>(
-            terminator, terminator->getOperand(0), split);
-      }
-
-      // Return accumulator result.
-      rewriter.setInsertionPointToStart(split);
-      return split->addArgument(lhs.getType());
-    });
-  }
-
-  /// Returns an accumulator factory that creates an op specified by opName.
-  AccumulatorFactory getFactory(StringRef opName, llvm::Type *type) const {
-    if (type->isVectorTy() || type->isArrayTy())
-      return getFactory(opName, type->getSequentialElementType());
-
-    bool isFloatingPoint = type->isFloatingPointTy();
-
-    if (opName == "add") {
-      return isFloatingPoint ? getFactory<LLVM::FAddOp>()
-                             : getFactory<LLVM::AddOp>();
-    }
-    if (opName == "mul") {
-      return isFloatingPoint ? getFactory<LLVM::FMulOp>()
-                             : getFactory<LLVM::MulOp>();
-    }
-    if (opName == "and") {
-      return getFactory<LLVM::AndOp>();
-    }
-    if (opName == "or") {
-      return getFactory<LLVM::OrOp>();
-    }
-    if (opName == "xor") {
-      return getFactory<LLVM::XOrOp>();
-    }
-    if (opName == "max") {
-      return isFloatingPoint ? getCmpFactory<LLVM::FCmpOp, LLVM::FCmpPredicate,
-                                             LLVM::FCmpPredicate::ugt>()
-                             : getCmpFactory<LLVM::ICmpOp, LLVM::ICmpPredicate,
-                                             LLVM::ICmpPredicate::ugt>();
-    }
-    if (opName == "min") {
-      return isFloatingPoint ? getCmpFactory<LLVM::FCmpOp, LLVM::FCmpPredicate,
-                                             LLVM::FCmpPredicate::ult>()
-                             : getCmpFactory<LLVM::ICmpOp, LLVM::ICmpPredicate,
-                                             LLVM::ICmpPredicate::ult>();
-    }
-
-    return AccumulatorFactory();
-  }
-
-  /// Returns an accumulator factory that creates an op of type T.
-  template <typename T>
-  AccumulatorFactory getFactory() const {
-    return [](Location loc, Value lhs, Value rhs,
-              ConversionPatternRewriter &rewriter) {
-      return rewriter.create<T>(loc, lhs.getType(), lhs, rhs);
-    };
-  }
-
-  /// Returns an accumulator for comparaison such as min, max. T is the type
-  /// of the compare op.
-  template <typename T, typename PredicateEnum, PredicateEnum predicate>
-  AccumulatorFactory getCmpFactory() const {
-    return [](Location loc, Value lhs, Value rhs,
-              ConversionPatternRewriter &rewriter) {
-      Value cmp = rewriter.create<T>(loc, predicate, lhs, rhs);
-      return rewriter.create<LLVM::SelectOp>(loc, cmp, lhs, rhs);
-    };
-  }
-
-  /// Creates an all_reduce across the block.
-  ///
-  /// First reduce the elements within a warp. The first thread of each warp
-  /// writes the intermediate result to shared memory. After synchronizing the
-  /// block, the first warp reduces the values from shared memory. The result
-  /// is broadcasted to all threads through shared memory.
-  ///
-  ///     %warp_reduce = `createWarpReduce(%operand)`
-  ///     %shared_mem_ptr = llvm.mlir.addressof @reduce_buffer
-  ///     %zero = llvm.mlir.constant(0 : i32) : !llvm.i32
-  ///     %lane_id = nvvm.read.ptx.sreg.laneid  : !llvm.i32
-  ///     %is_first_lane = llvm.icmp "eq" %lane_id, %zero : !llvm.i1
-  ///     %thread_idx = `getLinearThreadIndex()` : !llvm.i32
-  ///     llvm.cond_br %is_first_lane, ^then1, ^continue1
-  ///   ^then1:
-  ///     %warp_id = `getWarpId()`
-  ///     %store_dst = llvm.getelementptr %shared_mem_ptr[%zero, %warp_id]
-  ///     llvm.store %store_dst, %warp_reduce
-  ///     llvm.br ^continue1
-  ///   ^continue1:
-  ///     nvvm.barrier0
-  ///     %num_warps = `getNumWarps()` : !llvm.i32
-  ///     %is_valid_warp = llvm.icmp "slt" %thread_idx, %num_warps
-  ///     %result_ptr = llvm.getelementptr %shared_mem_ptr[%zero, %zero]
-  ///     llvm.cond_br %is_first_lane, ^then2, ^continue2
-  ///   ^then2:
-  ///     %load_src = llvm.getelementptr %shared_mem_ptr[%zero, %thread_idx]
-  ///     %value = llvm.load %load_src
-  ///     %result = `createWarpReduce(%value)`
-  ///     llvm.store %result_ptr, %result
-  ///     llvm.br ^continue2
-  ///   ^continue2:
-  ///     nvvm.barrier0
-  ///     %result = llvm.load %result_ptr
-  ///     return %result
-  ///
-  Value createBlockReduce(Location loc, Value operand,
-                          AccumulatorFactory &accumFactory,
-                          ConversionPatternRewriter &rewriter) const {
-    auto type = operand.getType().cast<LLVM::LLVMType>();
-
-    // Create shared memory array to store the warp reduction.
-    auto module = operand.getDefiningOp()->getParentOfType<gpu::GPUModuleOp>();
-    assert(module && "op must belong to a module");
-    Value sharedMemPtr =
-        createSharedMemoryArray(loc, module, type, kWarpSize, rewriter);
-
-    Value zero = rewriter.create<LLVM::ConstantOp>(
-        loc, int32Type, rewriter.getI32IntegerAttr(0u));
-    Value laneId = rewriter.create<NVVM::LaneIdOp>(loc, int32Type);
-    Value isFirstLane = rewriter.create<LLVM::ICmpOp>(
-        loc, LLVM::ICmpPredicate::eq, laneId, zero);
-    Value threadIdx = getLinearThreadIndex(loc, rewriter);
-    Value blockSize = getBlockSize(loc, rewriter);
-    Value activeWidth = getActiveWidth(loc, threadIdx, blockSize, rewriter);
-
-    // Reduce elements within each warp to produce the intermediate results.
-    Value warpReduce = createWarpReduce(loc, activeWidth, laneId, operand,
-                                        accumFactory, rewriter);
-
-    // Write the intermediate results to shared memory, using the first lane of
-    // each warp.
-    createPredicatedBlock(loc, rewriter, isFirstLane, [&] {
-      Value warpId = getDivideByWarpSize(threadIdx, rewriter);
-      Value storeDst = rewriter.create<LLVM::GEPOp>(
-          loc, type, sharedMemPtr, ArrayRef<Value>({zero, warpId}));
-      rewriter.create<LLVM::StoreOp>(loc, warpReduce, storeDst);
-    });
-    rewriter.create<NVVM::Barrier0Op>(loc);
-
-    Value numWarps = getNumWarps(loc, blockSize, rewriter);
-    Value isValidWarp = rewriter.create<LLVM::ICmpOp>(
-        loc, LLVM::ICmpPredicate::slt, threadIdx, numWarps);
-    Value resultPtr = rewriter.create<LLVM::GEPOp>(
-        loc, type, sharedMemPtr, ArrayRef<Value>({zero, zero}));
-
-    // Use the first numWarps threads to reduce the intermediate results from
-    // shared memory. The final result is written to shared memory again.
-    createPredicatedBlock(loc, rewriter, isValidWarp, [&] {
-      Value loadSrc = rewriter.create<LLVM::GEPOp>(
-          loc, type, sharedMemPtr, ArrayRef<Value>({zero, threadIdx}));
-      Value value = rewriter.create<LLVM::LoadOp>(loc, type, loadSrc);
-      Value result = createWarpReduce(loc, numWarps, laneId, value,
-                                      accumFactory, rewriter);
-      rewriter.create<LLVM::StoreOp>(loc, result, resultPtr);
-    });
-    rewriter.create<NVVM::Barrier0Op>(loc);
-
-    // Load and return result from shared memory.
-    Value result = rewriter.create<LLVM::LoadOp>(loc, type, resultPtr);
-    return result;
-  }
-
-  /// Creates an if-block skeleton and calls the two factories to generate the
-  /// ops in the `then` and `else` block..
-  ///
-  ///     llvm.cond_br %condition, ^then, ^continue
-  ///   ^then:
-  ///     %then_operands = `thenOpsFactory()`
-  ///     llvm.br ^continue(%then_operands)
-  ///   ^else:
-  ///     %else_operands = `elseOpsFactory()`
-  ///     llvm.br ^continue(%else_operands)
-  ///   ^continue(%block_operands):
-  ///
-  template <typename ThenOpsFactory, typename ElseOpsFactory>
-  void createIf(Location loc, ConversionPatternRewriter &rewriter,
-                Value condition, ThenOpsFactory &&thenOpsFactory,
-                ElseOpsFactory &&elseOpsFactory) const {
-    Block *currentBlock = rewriter.getInsertionBlock();
-    auto currentPoint = rewriter.getInsertionPoint();
-
-    Block *thenBlock = rewriter.splitBlock(currentBlock, currentPoint);
-    Block *elseBlock = rewriter.splitBlock(thenBlock, thenBlock->begin());
-    Block *continueBlock = rewriter.splitBlock(elseBlock, elseBlock->begin());
-
-    rewriter.setInsertionPointToEnd(currentBlock);
-    rewriter.create<LLVM::CondBrOp>(loc, condition, thenBlock, elseBlock);
-
-    auto addBranch = [&](ValueRange operands) {
-      rewriter.create<LLVM::BrOp>(loc, operands, continueBlock);
-    };
-
-    rewriter.setInsertionPointToStart(thenBlock);
-    auto thenOperands = thenOpsFactory();
-    addBranch(thenOperands);
-
-    rewriter.setInsertionPointToStart(elseBlock);
-    auto elseOperands = elseOpsFactory();
-    addBranch(elseOperands);
-
-    assert(thenOperands.size() == elseOperands.size());
-    rewriter.setInsertionPointToStart(continueBlock);
-    for (auto operand : thenOperands)
-      continueBlock->addArgument(operand.getType());
-  }
-
-  /// Shortcut for createIf with empty else block and no block operands.
-  template <typename Factory>
-  void createPredicatedBlock(Location loc, ConversionPatternRewriter &rewriter,
-                             Value condition,
-                             Factory &&predicatedOpsFactory) const {
-    createIf(
-        loc, rewriter, condition,
-        [&] {
-          predicatedOpsFactory();
-          return ArrayRef<Value>();
-        },
-        [&] { return ArrayRef<Value>(); });
-  }
-
-  /// Creates a reduction across the first activeWidth lanes of a warp.
-  /// The first lane returns the result, all others return values are undefined.
-  Value createWarpReduce(Location loc, Value activeWidth, Value laneId,
-                         Value operand, AccumulatorFactory accumFactory,
-                         ConversionPatternRewriter &rewriter) const {
-    Value warpSize = rewriter.create<LLVM::ConstantOp>(
-        loc, int32Type, rewriter.getI32IntegerAttr(kWarpSize));
-    Value isPartialWarp = rewriter.create<LLVM::ICmpOp>(
-        loc, LLVM::ICmpPredicate::slt, activeWidth, warpSize);
-    auto type = operand.getType().cast<LLVM::LLVMType>();
-
-    createIf(
-        loc, rewriter, isPartialWarp,
-        // Generate reduction over a (potentially) partial warp.
-        [&] {
-          Value value = operand;
-          Value one = rewriter.create<LLVM::ConstantOp>(
-              loc, int32Type, rewriter.getI32IntegerAttr(1));
-          // Bit mask of active lanes: `(1 << activeWidth) - 1`.
-          Value activeMask = rewriter.create<LLVM::SubOp>(
-              loc, int32Type,
-              rewriter.create<LLVM::ShlOp>(loc, int32Type, one, activeWidth),
-              one);
-          // Clamp lane: `activeWidth - 1`
-          Value maskAndClamp =
-              rewriter.create<LLVM::SubOp>(loc, int32Type, activeWidth, one);
-          auto dialect = typeConverter.getDialect();
-          auto predTy = LLVM::LLVMType::getInt1Ty(dialect);
-          auto shflTy = LLVM::LLVMType::getStructTy(dialect, {type, predTy});
-          auto returnValueAndIsValidAttr = rewriter.getUnitAttr();
-
-          // Repeatedly shuffle value from 'laneId ^ i' and accumulate if source
-          // lane is within the active range. All lanes contain the final
-          // result, but only the first lane's result is used.
-          for (int i = 1; i < kWarpSize; i <<= 1) {
-            Value offset = rewriter.create<LLVM::ConstantOp>(
-                loc, int32Type, rewriter.getI32IntegerAttr(i));
-            Value shfl = rewriter.create<NVVM::ShflBflyOp>(
-                loc, shflTy, activeMask, value, offset, maskAndClamp,
-                returnValueAndIsValidAttr);
-            Value isActiveSrcLane = rewriter.create<LLVM::ExtractValueOp>(
-                loc, predTy, shfl, rewriter.getIndexArrayAttr(1));
-            // Skip the accumulation if the shuffle op read from a lane outside
-            // of the active range.
-            createIf(
-                loc, rewriter, isActiveSrcLane,
-                [&] {
-                  Value shflValue = rewriter.create<LLVM::ExtractValueOp>(
-                      loc, type, shfl, rewriter.getIndexArrayAttr(0));
-                  return SmallVector<Value, 1>{
-                      accumFactory(loc, value, shflValue, rewriter)};
-                },
-                [&] { return llvm::makeArrayRef(value); });
-            value = rewriter.getInsertionBlock()->getArgument(0);
-          }
-          return SmallVector<Value, 1>{value};
-        },
-        // Generate a reduction over the entire warp. This is a specialization
-        // of the above reduction with unconditional accumulation.
-        [&] {
-          Value value = operand;
-          Value activeMask = rewriter.create<LLVM::ConstantOp>(
-              loc, int32Type, rewriter.getI32IntegerAttr(~0u));
-          Value maskAndClamp = rewriter.create<LLVM::ConstantOp>(
-              loc, int32Type, rewriter.getI32IntegerAttr(kWarpSize - 1));
-          for (int i = 1; i < kWarpSize; i <<= 1) {
-            Value offset = rewriter.create<LLVM::ConstantOp>(
-                loc, int32Type, rewriter.getI32IntegerAttr(i));
-            Value shflValue = rewriter.create<NVVM::ShflBflyOp>(
-                loc, type, activeMask, value, offset, maskAndClamp,
-                /*return_value_and_is_valid=*/UnitAttr());
-            value = accumFactory(loc, value, shflValue, rewriter);
-          }
-          return SmallVector<Value, 1>{value};
-        });
-    return rewriter.getInsertionBlock()->getArgument(0);
-  }
-
-  /// Creates a global array stored in shared memory.
-  Value createSharedMemoryArray(Location loc, gpu::GPUModuleOp module,
-                                LLVM::LLVMType elementType, int numElements,
-                                ConversionPatternRewriter &rewriter) const {
-    OpBuilder builder(module.body());
-
-    auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements);
-    StringRef name = "reduce_buffer";
-    auto globalOp = builder.create<LLVM::GlobalOp>(
-        loc, arrayType.cast<LLVM::LLVMType>(),
-        /*isConstant=*/false, LLVM::Linkage::Internal, name,
-        /*value=*/Attribute(), gpu::GPUDialect::getWorkgroupAddressSpace());
-
-    return rewriter.create<LLVM::AddressOfOp>(loc, globalOp);
-  }
-
-  /// Returns the index of the thread within the block.
-  Value getLinearThreadIndex(Location loc,
-                             ConversionPatternRewriter &rewriter) const {
-    Value dimX = rewriter.create<NVVM::BlockDimXOp>(loc, int32Type);
-    Value dimY = rewriter.create<NVVM::BlockDimYOp>(loc, int32Type);
-    Value idX = rewriter.create<NVVM::ThreadIdXOp>(loc, int32Type);
-    Value idY = rewriter.create<NVVM::ThreadIdYOp>(loc, int32Type);
-    Value idZ = rewriter.create<NVVM::ThreadIdZOp>(loc, int32Type);
-    Value tmp1 = rewriter.create<LLVM::MulOp>(loc, int32Type, idZ, dimY);
-    Value tmp2 = rewriter.create<LLVM::AddOp>(loc, int32Type, tmp1, idY);
-    Value tmp3 = rewriter.create<LLVM::MulOp>(loc, int32Type, tmp2, dimX);
-    return rewriter.create<LLVM::AddOp>(loc, int32Type, tmp3, idX);
-  }
-
-  /// Returns the number of threads in the block.
-  Value getBlockSize(Location loc, ConversionPatternRewriter &rewriter) const {
-    Value dimX = rewriter.create<NVVM::BlockDimXOp>(loc, int32Type);
-    Value dimY = rewriter.create<NVVM::BlockDimYOp>(loc, int32Type);
-    Value dimZ = rewriter.create<NVVM::BlockDimZOp>(loc, int32Type);
-    Value dimXY = rewriter.create<LLVM::MulOp>(loc, int32Type, dimX, dimY);
-    return rewriter.create<LLVM::MulOp>(loc, int32Type, dimXY, dimZ);
-  }
-
-  /// Returns the number of warps in the block.
-  Value getNumWarps(Location loc, Value blockSize,
-                    ConversionPatternRewriter &rewriter) const {
-    auto warpSizeMinusOne = rewriter.create<LLVM::ConstantOp>(
-        loc, int32Type, rewriter.getI32IntegerAttr(kWarpSize - 1));
-    auto biasedBlockSize = rewriter.create<LLVM::AddOp>(
-        loc, int32Type, blockSize, warpSizeMinusOne);
-    return getDivideByWarpSize(biasedBlockSize, rewriter);
-  }
-
-  /// Returns the number of active threads in the warp, not clamped to 32.
-  Value getActiveWidth(Location loc, Value threadIdx, Value blockSize,
-                       ConversionPatternRewriter &rewriter) const {
-    Value threadIdxMask = rewriter.create<LLVM::ConstantOp>(
-        loc, int32Type, rewriter.getI32IntegerAttr(~(kWarpSize - 1)));
-    Value numThreadsWithSmallerWarpId =
-        rewriter.create<LLVM::AndOp>(loc, threadIdx, threadIdxMask);
-    return rewriter.create<LLVM::SubOp>(loc, blockSize,
-                                        numThreadsWithSmallerWarpId);
-  }
-
-  /// Returns value divided by the warp size (i.e. 32).
-  Value getDivideByWarpSize(Value value,
-                            ConversionPatternRewriter &rewriter) const {
-    auto loc = value.getLoc();
-    auto warpSize = rewriter.create<LLVM::ConstantOp>(
-        loc, int32Type, rewriter.getI32IntegerAttr(kWarpSize));
-    return rewriter.create<LLVM::SDivOp>(loc, int32Type, value, warpSize);
-  }
-
-  LLVM::LLVMType int32Type;
-
-  static constexpr int kWarpSize = 32;
-};
 
 struct GPUShuffleOpLowering : public ConvertToLLVMPattern {
   explicit GPUShuffleOpLowering(LLVMTypeConverter &lowering_)
@@ -704,6 +263,14 @@ class LowerGpuOpsToNVVMOpsPass
     });
 
     OwningRewritePatternList patterns;
+
+    // Apply in-dialect lowering first. In-dialect lowering will replace ops
+    // which need to be lowered further, which is not supported by a single
+    // conversion pass.
+    populateGpuRewritePatterns(m.getContext(), patterns);
+    applyPatternsGreedily(m, patterns);
+    patterns.clear();
+
     populateStdToLLVMConversionPatterns(converter, patterns);
     populateGpuToNVVMConversionPatterns(converter, patterns);
     LLVMConversionTarget target(getContext());
@@ -735,8 +302,8 @@ void mlir::populateGpuToNVVMConversionPatterns(
                                           NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
               GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
                                           NVVM::GridDimYOp, NVVM::GridDimZOp>,
-              GPUAllReduceOpLowering, GPUShuffleOpLowering, GPUFuncOpLowering,
-              GPUReturnOpLowering>(converter);
+              GPUShuffleOpLowering, GPUFuncOpLowering, GPUReturnOpLowering>(
+          converter);
   patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__nv_fabsf",
                                                 "__nv_fabs");
   patterns.insert<OpToFuncCallLowering<CeilFOp>>(converter, "__nv_ceilf",

diff  --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 4ae9d8022bc3..f05c9af1b30f 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -1,9 +1,10 @@
-// RUN: mlir-opt %s -convert-gpu-to-nvvm -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-nvvm -split-input-file | FileCheck %s --dump-input-on-failure
 
 gpu.module @test_module {
   // CHECK-LABEL: func @gpu_index_ops()
   func @gpu_index_ops()
-      attributes { gpu.kernel } {
+      -> (index, index, index, index, index, index,
+          index, index, index, index, index, index) {
     // CHECK: = nvvm.read.ptx.sreg.tid.x : !llvm.i32
     %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
     // CHECK: = nvvm.read.ptx.sreg.tid.y : !llvm.i32
@@ -32,7 +33,10 @@ gpu.module @test_module {
     // CHECK: = nvvm.read.ptx.sreg.nctaid.z : !llvm.i32
     %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
 
-    std.return
+    std.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ,
+               %bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ
+        : index, index, index, index, index, index,
+          index, index, index, index, index, index
   }
 }
 
@@ -40,8 +44,7 @@ gpu.module @test_module {
 
 gpu.module @test_module {
   // CHECK-LABEL: func @gpu_all_reduce_op()
-  func @gpu_all_reduce_op()
-      attributes { gpu.kernel } {
+  gpu.func @gpu_all_reduce_op() {
     %arg0 = constant 1.0 : f32
     // TODO(csigg): Check full IR expansion once lowering has settled.
     // CHECK: nvvm.shfl.sync.bfly
@@ -49,7 +52,7 @@ gpu.module @test_module {
     // CHECK: llvm.fadd
     %result = "gpu.all_reduce"(%arg0) ({}) {op = "add"} : (f32) -> (f32)
 
-    std.return
+    gpu.return
   }
 }
 
@@ -57,8 +60,7 @@ gpu.module @test_module {
 
 gpu.module @test_module {
   // CHECK-LABEL: func @gpu_all_reduce_region()
-  func @gpu_all_reduce_region()
-      attributes { gpu.kernel } {
+  gpu.func @gpu_all_reduce_region() {
     %arg0 = constant 1 : i32
     // TODO(csigg): Check full IR expansion once lowering has settled.
     // CHECK: nvvm.shfl.sync.bfly
@@ -68,7 +70,7 @@ gpu.module @test_module {
       %xor = xor %lhs, %rhs : i32
       "gpu.yield"(%xor) : (i32) -> ()
     }) : (i32) -> (i32)
-    std.return
+    gpu.return
   }
 }
 
@@ -76,8 +78,7 @@ gpu.module @test_module {
 
 gpu.module @test_module {
   // CHECK-LABEL: func @gpu_shuffle()
-  func @gpu_shuffle()
-      attributes { gpu.kernel } {
+  func @gpu_shuffle() -> (f32) {
     // CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : !llvm.float
     %arg0 = constant 1.0 : f32
     // CHECK: %[[#OFFSET:]] = llvm.mlir.constant(4 : i32) : !llvm.i32
@@ -93,7 +94,7 @@ gpu.module @test_module {
     // CHECK: llvm.extractvalue %[[#SHFL]][1 : index] : !llvm<"{ float, i1 }">
     %shfl, %pred = "gpu.shuffle"(%arg0, %arg1, %arg2) { mode = "xor" } : (f32, i32, i32) -> (f32, i1)
 
-    std.return
+    std.return %shfl : f32
   }
 }
 
@@ -101,8 +102,7 @@ gpu.module @test_module {
 
 gpu.module @test_module {
   // CHECK-LABEL: func @gpu_sync()
-  func @gpu_sync()
-      attributes { gpu.kernel } {
+  func @gpu_sync() {
     // CHECK: nvvm.barrier0
     gpu.barrier
     std.return
@@ -115,12 +115,12 @@ gpu.module @test_module {
   // CHECK: llvm.func @__nv_fabsf(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__nv_fabs(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_fabs
-  func @gpu_fabs(%arg_f32 : f32, %arg_f64 : f64) {
+  func @gpu_fabs(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
     %result32 = std.absf %arg_f32 : f32
     // CHECK: llvm.call @__nv_fabsf(%{{.*}}) : (!llvm.float) -> !llvm.float
     %result64 = std.absf %arg_f64 : f64
     // CHECK: llvm.call @__nv_fabs(%{{.*}}) : (!llvm.double) -> !llvm.double
-    std.return
+    std.return %result32, %result64 : f32, f64
   }
 }
 
@@ -130,12 +130,12 @@ gpu.module @test_module {
   // CHECK: llvm.func @__nv_ceilf(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__nv_ceil(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_ceil
-  func @gpu_ceil(%arg_f32 : f32, %arg_f64 : f64) {
+  func @gpu_ceil(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
     %result32 = std.ceilf %arg_f32 : f32
     // CHECK: llvm.call @__nv_ceilf(%{{.*}}) : (!llvm.float) -> !llvm.float
     %result64 = std.ceilf %arg_f64 : f64
     // CHECK: llvm.call @__nv_ceil(%{{.*}}) : (!llvm.double) -> !llvm.double
-    std.return
+    std.return %result32, %result64 : f32, f64
   }
 }
 
@@ -145,12 +145,12 @@ gpu.module @test_module {
   // CHECK: llvm.func @__nv_cosf(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__nv_cos(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_cos
-  func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) {
+  func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
     %result32 = std.cos %arg_f32 : f32
     // CHECK: llvm.call @__nv_cosf(%{{.*}}) : (!llvm.float) -> !llvm.float
     %result64 = std.cos %arg_f64 : f64
     // CHECK: llvm.call @__nv_cos(%{{.*}}) : (!llvm.double) -> !llvm.double
-    std.return
+    std.return %result32, %result64 : f32, f64
   }
 }
 
@@ -159,14 +159,12 @@ gpu.module @test_module {
   // CHECK: llvm.func @__nv_expf(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__nv_exp(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_exp
-  func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) {
-    %exp_f32 = std.exp %arg_f32 : f32
-    // CHECK: llvm.call @__nv_expf(%{{.*}}) : (!llvm.float) -> !llvm.float
-    %result_f32 = std.exp %exp_f32 : f32
+  func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+    %result32 = std.exp %arg_f32 : f32
     // CHECK: llvm.call @__nv_expf(%{{.*}}) : (!llvm.float) -> !llvm.float
     %result64 = std.exp %arg_f64 : f64
     // CHECK: llvm.call @__nv_exp(%{{.*}}) : (!llvm.double) -> !llvm.double
-    std.return
+    std.return %result32, %result64 : f32, f64
   }
 }
 
@@ -176,12 +174,12 @@ gpu.module @test_module {
   // CHECK: llvm.func @__nv_logf(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__nv_log(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_log
-  func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) {
+  func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
     %result32 = std.log %arg_f32 : f32
     // CHECK: llvm.call @__nv_logf(%{{.*}}) : (!llvm.float) -> !llvm.float
     %result64 = std.log %arg_f64 : f64
     // CHECK: llvm.call @__nv_log(%{{.*}}) : (!llvm.double) -> !llvm.double
-    std.return
+    std.return %result32, %result64 : f32, f64
   }
 }
 
@@ -191,12 +189,12 @@ gpu.module @test_module {
   // CHECK: llvm.func @__nv_log10f(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__nv_log10(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_log10
-  func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) {
+  func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
     %result32 = std.log10 %arg_f32 : f32
     // CHECK: llvm.call @__nv_log10f(%{{.*}}) : (!llvm.float) -> !llvm.float
     %result64 = std.log10 %arg_f64 : f64
     // CHECK: llvm.call @__nv_log10(%{{.*}}) : (!llvm.double) -> !llvm.double
-    std.return
+    std.return %result32, %result64 : f32, f64
   }
 }
 
@@ -206,12 +204,12 @@ gpu.module @test_module {
   // CHECK: llvm.func @__nv_log2f(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__nv_log2(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_log2
-  func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) {
+  func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
     %result32 = std.log2 %arg_f32 : f32
     // CHECK: llvm.call @__nv_log2f(%{{.*}}) : (!llvm.float) -> !llvm.float
     %result64 = std.log2 %arg_f64 : f64
     // CHECK: llvm.call @__nv_log2(%{{.*}}) : (!llvm.double) -> !llvm.double
-    std.return
+    std.return %result32, %result64 : f32, f64
   }
 }
 
@@ -221,12 +219,12 @@ gpu.module @test_module {
   // CHECK: llvm.func @__nv_tanhf(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__nv_tanh(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_tanh
-  func @gpu_tanh(%arg_f32 : f32, %arg_f64 : f64) {
+  func @gpu_tanh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
     %result32 = std.tanh %arg_f32 : f32
     // CHECK: llvm.call @__nv_tanhf(%{{.*}}) : (!llvm.float) -> !llvm.float
     %result64 = std.tanh %arg_f64 : f64
     // CHECK: llvm.call @__nv_tanh(%{{.*}}) : (!llvm.double) -> !llvm.double
-    std.return
+    std.return %result32, %result64 : f32, f64
   }
 }
 
@@ -239,14 +237,12 @@ gpu.module @test_module {
   // CHECK: llvm.func @__nv_expf(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__nv_exp(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_exp
-    func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) {
-      %exp_f32 = std.exp %arg_f32 : f32
-      // CHECK: llvm.call @__nv_expf(%{{.*}}) : (!llvm.float) -> !llvm.float
-      %result_f32 = std.exp %exp_f32 : f32
+    func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
+      %result32 = std.exp %arg_f32 : f32
       // CHECK: llvm.call @__nv_expf(%{{.*}}) : (!llvm.float) -> !llvm.float
       %result64 = std.exp %arg_f64 : f64
       // CHECK: llvm.call @__nv_exp(%{{.*}}) : (!llvm.double) -> !llvm.double
-      std.return
+      std.return %result32, %result64 : f32, f64
     }
     "test.finish" () : () -> ()
   }) : () -> ()