[llvm] [AMDGPU] Automatic conversion from wave32 to wave64 (PR #137376)

Fri Apr 25 10:58:18 PDT 2025

https://github.com/alex-t created https://github.com/llvm/llvm-project/pull/137376

Small short living kernels may become waveslot limited. To work around the problem an optimization is proposed to convert such kernels from wave32 to wave64 automatically. These kernels shall conform to a strict set of limitations and satisfy profitability conditions.

1. A kernel shall have no function calls as we cannot analyze call stack requirements (nor will it fall into a category of short living kernels anyway).
2. A kernel itself shall not be called from a device enqueue call.
3. A kernel shall not attempt to access EXEC or VCC in any user visible way.
4. A kernel must not use readlane/readfirstlane or any cross-lane/DPP operations in general.
5. A kernel shall not read wavefront size or use ballot through intrinsics (a use of pre-defined frontend wave size macro was deemed permissible for now).
6. There shall be no atomic operations of any sort as these may be used for cross-thread communication.
7. There shall be no LDS access as the allocation is usually tied to the workgroup size and we generally cannot extend it. It is also changing occupancy which is tied to the wave size.
8. There shall be no inline asm calls.
9. There shall be no dynamic VGPRs.
10. Starting from GFX11 some instructions (such as WMMA on GFX11+ and transpose loads on GFX12+) work differently (have different operands) in wave32 and wave64. The kernel shall not have intrinsics to invoke such instructions.

>From 0e647db27581991071d6e69abe977206806c5548 Mon Sep 17 00:00:00 2001
From: alex-t <atimofee at amd.com>
Date: Wed, 23 Apr 2025 22:13:15 +0200
Subject: [PATCH] [AMDGPU] Automatic conversion from wave32 to wave64

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |   4 +
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |   1 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   2 +
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp  | 321 ++++++++++++++++++
 llvm/lib/Target/AMDGPU/SIConvertWaveSize.h    |  30 ++
 .../AMDGPU/wave32-to-64-auto-convert.ll       | 121 +++++++
 7 files changed, 480 insertions(+)
 create mode 100644 llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/SIConvertWaveSize.h
 create mode 100644 llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 4ff761ec19b3c..76ef87ba44913 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -51,6 +51,7 @@ FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createSIPreAllocateWWMRegsLegacyPass();
 FunctionPass *createSIFormMemoryClausesLegacyPass();
+FunctionPass *createSIConvertWaveSizeLegacyPass(const TargetMachine *);
 
 FunctionPass *createSIPostRABundlerPass();
 FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *);
@@ -174,6 +175,9 @@ extern char &SIShrinkInstructionsLegacyID;
 void initializeSIFixSGPRCopiesLegacyPass(PassRegistry &);
 extern char &SIFixSGPRCopiesLegacyID;
 
+void initializeSIConvertWaveSizeLegacyPass(PassRegistry &);
+extern char &SIConvertWaveSizeLegacyID;
+
 void initializeSIFixVGPRCopiesLegacyPass(PassRegistry &);
 extern char &SIFixVGPRCopiesID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 98a1147ef6d66..0cbd3ef8da761 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -67,6 +67,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
               AMDGPUUnifyDivergentExitNodesPass())
 FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
 FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS("si-convert-wave-size", SIConvertWaveSizePass(*static_cast<const GCNTargetMachine *>(this)))
 #undef FUNCTION_PASS
 
 #ifndef FUNCTION_ANALYSIS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b6cc5137d711a..5be1640fd3db6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -44,6 +44,7 @@
 #include "R600TargetMachine.h"
 #include "SIFixSGPRCopies.h"
 #include "SIFixVGPRCopies.h"
+#include "SIConvertWaveSize.h"
 #include "SIFoldOperands.h"
 #include "SIFormMemoryClauses.h"
 #include "SILoadStoreOptimizer.h"
@@ -506,6 +507,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSILowerSGPRSpillsLegacyPass(*PR);
   initializeSIFixSGPRCopiesLegacyPass(*PR);
   initializeSIFixVGPRCopiesLegacyPass(*PR);
+  initializeSIConvertWaveSizeLegacyPass(*PR);
   initializeSIFoldOperandsLegacyPass(*PR);
   initializeSIPeepholeSDWALegacyPass(*PR);
   initializeSIShrinkInstructionsLegacyPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 09a3096602fc3..663361face090 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -150,6 +150,7 @@ add_llvm_target(AMDGPUCodeGen
   SIAnnotateControlFlow.cpp
   SIFixSGPRCopies.cpp
   SIFixVGPRCopies.cpp
+  SIConvertWaveSize.cpp
   SIFoldOperands.cpp
   SIFormMemoryClauses.cpp
   SIFrameLowering.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp b/llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp
new file mode 100644
index 0000000000000..4f5b839000c77
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp
@@ -0,0 +1,321 @@
+//===- SIConvertWaveSize.cpp - Automatically converts wave32 kernels to wave64
+//---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 WITH LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+// Small short living kernels may become waveslot limited.
+// To work around the problem an optimization is proposed to convert such
+// kernels from wave32 to wave64 automatically.These kernels shall conform to a
+// strict set of limitations and satisfy profitability conditions.
+//
+// 1. A kernel shall have no function calls as we cannot analyze call stack
+// requirements (nor will it fall into a category of short living kernels
+// anyway).
+// 2. A kernel itself shall not be called from a device enqueue call.
+// 3. A kernel shall not attempt to access EXEC or VCC in any user visible
+// way.
+// 4. A kernel must not use readlane/readfirstlane or any cross-lane/DPP
+// operations in general.
+// 5. A kernel shall not read wavefront size or use ballot through
+// intrinsics (a use of pre-defined frontend wave size macro was deemed
+// permissible for now).
+// 6. There shall be no atomic operations of any sort as these may be used
+// for cross-thread communication.
+// 7. There shall be no LDS access as the allocation is usually tied to the
+// workgroup size and we generally cannot extend it. It is also changing
+// occupancy which is tied to the wave size.
+// 8. There shall be no inline asm calls.
+// 9 .There shall be no dynamic VGPRs.
+// 10 .Starting from GFX11 some instructions (such as WMMA on GFX11+ and
+// transpose loads on GFX12+) work differently (have different operands) in
+// wave32 and wave64. The kernel shall not have intrinsics to invoke such
+// instructions.
+
+#include "SIConvertWaveSize.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-convert-wave-size"
+
+namespace {
+class SIConvertWaveSize {
+  const TargetMachine *TM;
+  const LoopInfo *LI;
+  ScalarEvolution *SE;
+  TargetTransformInfo *TTI;
+
+  InstructionCost TotalCost = 0;
+
+  static const unsigned MaxLatency = 2000;
+
+  SmallVector<Function *> Callees;
+
+public:
+  SIConvertWaveSize(const TargetMachine *TM, const LoopInfo *LI,
+                    ScalarEvolution *SE, TargetTransformInfo *TTI)
+      : TM(TM), LI(LI), SE(SE), TTI(TTI) {}
+
+  bool run(Function &F);
+
+  bool changeWaveSizeAttr(Function *F);
+};
+
+class SIConvertWaveSizeLegacy : public FunctionPass {
+  const TargetMachine *TM;
+
+public:
+  static char ID;
+  SIConvertWaveSizeLegacy(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {}
+  bool runOnFunction(Function &F) override {
+    auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    SIConvertWaveSize Impl(TM, &LI, &SE, &TTI);
+    return Impl.run(F);
+  }
+  StringRef getPassName() const override { return "SI convert wave size"; }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.setPreservesAll();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+};
+} // end anonymous namespace
+
+void printFunctionAttributes(const Function &F) {
+  LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");
+  for (const auto &Attr : F.getAttributes()) {
+    LLVM_DEBUG(dbgs() << "  Attribute: " << Attr.getAsString() << "\n");
+  }
+}
+
+bool SIConvertWaveSize::run(Function &F) {
+  LLVM_DEBUG(dbgs() << "Running SIConvertWaveSize on function: " << F.getName() << "\n");
+  LLVM_DEBUG(printFunctionAttributes(F));
+
+  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+  if (ST.getGeneration() < AMDGPUSubtarget::GFX11)
+    return false;
+
+  // Check if the function is a kernel.
+  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
+    return false;
+
+  // Check if the kernel is wave32
+  if (F.hasFnAttribute("target-features")) {
+    if (!F.getFnAttribute("target-features")
+            .getValueAsString().contains("wavefrontsize32")) {
+      LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Kernel is not wave32.\n");
+      return false;
+    }
+  }
+
+  // Check if the function is a device enqueue call.
+  if (F.hasFnAttribute("amdgpu-device-enqueue")) {
+    LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Device enqueue call detected.\n");
+    return false;
+  }
+
+  // Check if a trip count is a compile time constant for all loops in the
+  // kernel
+  for (Loop *L : *LI) {
+    const SCEV *TripCountSCEV = SE->getBackedgeTakenCount(L);
+    if (!isa<SCEVConstant>(TripCountSCEV)) {
+      LLVM_DEBUG(
+          dbgs() << "SIConvertWaveSize: Trip count is not a compile time "
+                    "constant.\n");
+      return false;
+    }
+  }
+
+  for (const auto &BB : F) {
+    InstructionCost BlockCost = 0;
+    for (const auto &I : BB) {
+      if (const CallBase *CB = dyn_cast<CallBase>(&I)) {
+        // FIXME: Any calls are not allowed. Only non-converged intrinsic clls
+        // and amdgsn_s_barrier are exempt. InlineAsm and Atomics are checkedd
+        // separately for debug purposes. This will be changed in the final
+        // version.
+        if (CB->isInlineAsm()) {
+          // Inline assembly is not allowed.
+          LLVM_DEBUG(dbgs()
+                     << "SIConvertWaveSize: Inline assembly detected.\n");
+          return false;
+        }
+        if (CB->isAtomic()) {
+          // Atomic operations are not allowed.
+          LLVM_DEBUG(dbgs()
+                     << "SIConvertWaveSize: Atomic operation detected.\n");
+          return false;
+        }
+        if (Function *Callee = CB->getCalledFunction()) {
+          // assuming readlane/readfirstlane or any cross-lane/DPP
+          // operations have "let isConvergent = 1" in IntrinsicsAMDGPU.td
+          if (Callee->isIntrinsic()) {
+              if (Callee->hasFnAttribute(Attribute::Convergent)) {
+                if (Callee->getIntrinsicID() != Intrinsic::amdgcn_s_barrier) {
+                  // TODO: what else should go in a "white list" ?
+                  // Intrinsic::amdgcn_s_barrier_wavefront ?
+                  // Intrinsic::amdgcn_s_barrier_signal ?
+                  LLVM_DEBUG(dbgs()
+                             << "SIConvertWaveSize: Convergent intrinsic "
+                             << Callee->getName() << " detected.\n");
+                  return false;
+                }
+              }
+
+            if (Callee->getIntrinsicID() == Intrinsic::read_register) {
+              if (const auto *MDVal =
+                      dyn_cast<MetadataAsValue>(CB->getArgOperand(0))) {
+                Metadata *MD = MDVal->getMetadata();
+                if (auto *MDNodeVal = dyn_cast<MDNode>(MD)) {
+                  if (MDNodeVal->getNumOperands() >= 1) {
+                    if (auto *MDStr =
+                            dyn_cast<MDString>(MDNodeVal->getOperand(0))) {
+                      if (MDStr->getString().starts_with("exec") ||
+                          MDStr->getString().starts_with("vcc")) {
+                        LLVM_DEBUG(dbgs() << "SIConvertWaveSize: read_register("
+                                          << MDStr->getString()
+                                          << ") intrinsic detected.\n");
+                        return false;
+                      }
+                    }
+                  }
+                }
+              }
+            }
+
+            // Save callee as a candidate for attribute change
+            Callees.push_back(Callee);
+          }
+        } else {
+          // General calls are not allowed.
+          LLVM_DEBUG(dbgs() << "SIConvertWaveSize: function call detected.\n");
+          return false;
+        }
+      }
+      // No  LDS access is allowed
+      if (auto LI = dyn_cast<LoadInst>(&I)) {
+        if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+          LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access detected.\n");
+          return false;
+        }
+      }
+      if (auto SI = dyn_cast<StoreInst>(&I)) {
+        if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+          LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access detected.\n");
+          return false;
+        }
+      }
+      // TODO: All atomics are not allowed?
+      // if (auto AI = dyn_cast<AtomicRMWInst>(&I)) {
+      //   if (AI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+      //     LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access
+      //     detected.\n"); return false;
+      //   }
+      // }
+
+      // TODO: Dynamic VGPRS and GFX11+ special operations ???
+      BlockCost +=
+          TTI->getInstructionCost(&I, TargetTransformInfo::TCK_RecipThroughput);
+    }
+    if (auto L = LI->getLoopFor(&BB)) {
+      const SCEV *TripCount = SE->getBackedgeTakenCount(L);
+      if (auto *C = dyn_cast<SCEVConstant>(TripCount)) {
+        uint64_t TC = C->getValue()->getZExtValue() + 1;
+        size_t Depth = LI->getLoopDepth(&BB);
+        BlockCost *= TC * Depth;
+      } else
+        llvm_unreachable("SIConvertWaveSize: only loops with compile time "
+                         "constant trip count could reach here!\n");
+    }
+    TotalCost += BlockCost;
+    if (TotalCost.isValid()) {
+      if (TotalCost.getValue().value() >= MaxLatency) {
+        LLVM_DEBUG(
+            dbgs() << "SIConvertWaveSize: Total latency of the kernel ["
+                   << TotalCost.getValue().value()
+                   << "] exceeds the limit of 2000 cycles - not profitable!\n");
+        return false;
+      }
+    } else
+      llvm_unreachable(
+          "SIConvertWaveSize: Cost model error - invalid state!\n");
+  }
+
+  // Additional checks can be added here...
+
+  // If all checks pass, convert wave size from wave32 to wave64.
+  // Conversion logic goes here...
+  bool Changed = changeWaveSizeAttr(&F);
+  if (Changed)
+    // Now take care of the intrinsic calls
+    for (auto C : Callees) {
+      // TODO: if we could not change Attr for one of the callee
+      // we need to rollback all the changes!
+      changeWaveSizeAttr(C);
+    }
+
+  return Changed;
+  }
+
+bool SIConvertWaveSize::changeWaveSizeAttr(Function *F) {
+  auto Attr = F->getFnAttribute("target-features");
+  if (Attr.isValid()) {
+    StringRef AttrStr = Attr.getValueAsString();
+    size_t Pos = AttrStr.find("+wavefrontsize32");
+    if (Pos != StringRef::npos) {
+      // Remove the "+wavefrontsize32" attribute.
+      std::string NewBegin = AttrStr.substr(0, Pos).str().append("+wavefrontsize64");
+      std::string End = AttrStr.substr(Pos + strlen("+wavefrontsize32")).str();
+      std::string NewAttrStr = NewBegin + End;
+      // Add the "+wavefrontsize64" attribute.
+      F->removeFnAttr("target-features");
+      F->addFnAttr("target-features", NewAttrStr);
+      LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Converted wave size for "
+                        << F->getName()
+                        << " from wave32 "
+                           "to wave64.\n");
+      return true;
+    }
+  }
+  return false;
+}
+
+INITIALIZE_PASS_BEGIN(SIConvertWaveSizeLegacy, DEBUG_TYPE, "SI convert wave size",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(SIConvertWaveSizeLegacy, DEBUG_TYPE, "SI convert wave size",
+                    false, false)
+
+char SIConvertWaveSizeLegacy::ID = 0;
+
+char &llvm::SIConvertWaveSizeLegacyID = SIConvertWaveSizeLegacy::ID;
+
+FunctionPass *llvm::createSIConvertWaveSizeLegacyPass(const TargetMachine *TM) {
+  return new SIConvertWaveSizeLegacy(TM);
+}
+
+PreservedAnalyses SIConvertWaveSizePass::run(
+    Function &F, FunctionAnalysisManager &FAM) {
+      auto &LI = FAM.getResult<LoopAnalysis>(F);
+  auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
+  auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
+
+  SIConvertWaveSize Impl(TM, &LI, &SE, &TTI);
+  bool Changed = Impl.run(F);
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/SIConvertWaveSize.h b/llvm/lib/Target/AMDGPU/SIConvertWaveSize.h
new file mode 100644
index 0000000000000..78b8365ed9ebc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIConvertWaveSize.h
@@ -0,0 +1,30 @@
+//===- SIConvertWaveSize.h ----------------------------------------*- C++- *-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_AMDGPU_SICONVERTWAVESIZE_H
+#define LLVM_LIB_TARGET_AMDGPU_SICONVERTWAVESIZE_H
+
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class SIConvertWaveSizePass : public PassInfoMixin<SIConvertWaveSizePass> {
+  /// The target machine.
+  const TargetMachine *TM;
+
+public:
+  SIConvertWaveSizePass(const TargetMachine &TM)
+      : TM(&TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SICONVERTWAVESIZE_H
diff --git a/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll b/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll
new file mode 100644
index 0000000000000..d90e524e9cc2e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll
@@ -0,0 +1,121 @@
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=si-convert-wave-size < %s | FileCheck %s
+
+define amdgpu_kernel void @test_not_wave32(ptr addrspace(1) %out) #0 {
+  ; CHECK:  @test_not_wave32{{.*}}) #0
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 2
+  %tmp = load i32, ptr addrspace(1) %gep
+  store i32 %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @intr_non_convergent(ptr addrspace(1) nocapture %arg) #1 {
+  ; CHECK: @intr_non_convergent{{.*}} #0
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
+  %tmp1 = icmp ugt i32 %tmp, 32
+  %tmp2 = select i1 %tmp1, i32 2, i32 1
+  store i32 %tmp2, ptr addrspace(1) %arg
+  ret void
+}
+
+define amdgpu_kernel void @intr_convergent(ptr addrspace(1) nocapture %arg, i32 %X) #1 {
+  ; CHECK: @intr_convergent{{.*}}) #1
+bb:
+  %tmp = icmp ugt i32 %X, 32
+  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %tmp)
+  store i32 %ballot, ptr addrspace(1) %arg
+  ret void
+}
+
+define amdgpu_kernel void @test_barrier(ptr addrspace(1) %in, ptr addrspace(1) %out) #1 {
+  ; CHECK: @test_barrier{{.*}}) #0
+entry:
+  %val = load <2 x half>, ptr addrspace(1) %in
+  call void @llvm.amdgcn.s.barrier() #2
+  store <2 x half> %val, ptr addrspace(1) %out
+  ret void
+}
+
+
+define amdgpu_kernel void @test_read_exec(ptr addrspace(1) %out) #1 {
+  ; CHECK: @test_read_exec{{.*}}) #1
+  %exec = call i64 @llvm.read_register.i64(metadata !0)
+  store i64 %exec, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_read_vcc_lo(ptr addrspace(1) %out) #1 {
+  ; CHECK: @test_read_vcc_lo{{.*}}) #1
+  %vcc_lo = call i32 @llvm.read_register.i32(metadata !1)
+  store i32 %vcc_lo, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_read_vcc_hi(ptr addrspace(1) %out) #1 {
+  ; CHECK: @test_read_vcc_hi{{.*}}) #1
+  %vcc_hi = call i32 @llvm.read_register.i32(metadata !2)
+  store i32 %vcc_hi, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_lds_access(ptr addrspace(3) %out) #1 {
+  ; CHECK: @test_lds_access{{.*}}) #1
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 2
+  %tmp = load i32, ptr addrspace(3) %gep
+  store i32 %tmp, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_simple_loop(ptr addrspace(1) nocapture %arg) #1 {
+  ; CHECK: @test_simple_loop{{.*}}) #1
+bb:
+  br label %bb2
+
+bb1:
+  ret void
+
+bb2:
+  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
+  %tmp2 = add nuw nsw i32 %tmp1, 1
+  %tmp3 = icmp eq i32 %tmp2, 1024
+  tail call void @llvm.amdgcn.s.sleep(i32 0)
+  br i1 %tmp3, label %bb1, label %bb2
+}
+
+define amdgpu_kernel void @test_nested_loop(ptr addrspace(1) nocapture %arg) #1 {
+  ; CHECK: @test_nested_loop{{.*}}) #1
+bb:
+  br label %bb2
+
+bb1:
+  ret void
+
+bb2:
+  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb4 ]
+  %tmp2 = add nuw nsw i32 %tmp1, 1
+  %tmp3 = icmp eq i32 %tmp2, 8
+  br label %bb3
+
+bb3:
+  %tmp4 = phi i32 [ 0, %bb2 ], [ %tmp5, %bb3 ]
+  %tmp5 = add nuw nsw i32 %tmp4, 1
+  %tmp6 = icmp eq i32 %tmp5, 128
+  tail call void @llvm.amdgcn.s.sleep(i32 0)
+  br i1 %tmp6, label %bb4, label %bb3
+
+bb4:
+  br i1 %tmp3, label %bb1, label %bb2
+}
+
+declare void @llvm.amdgcn.s.sleep(i32)
+declare i32 @llvm.amdgcn.wavefrontsize()
+declare i32 @llvm.amdgcn.ballot.i32(i1)
+declare i32 @llvm.read_register.i32(metadata)
+declare i64 @llvm.read_register.i64(metadata)
+
+attributes #0 = { nounwind "target-features"="+wavefrontsize64" }
+attributes #1 = { nounwind "target-features"="+wavefrontsize32" }
+
+!0 = !{!"exec"}
+!1 = !{!"vcc_lo"}
+!2 = !{!"vcc_hi"}