[llvm-branch-commits] [llvm] e07c05b - [AMDGPU] Clear bodies of function with incompatible features

Pierre van Houtryve via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Nov 30 03:55:58 PST 2022


Author: Pierre van Houtryve
Date: 2022-11-30T06:14:35-05:00
New Revision: e07c05bc91ae1dfb625b7b0d93a83e5c6039fcb2

URL: https://github.com/llvm/llvm-project/commit/e07c05bc91ae1dfb625b7b0d93a83e5c6039fcb2
DIFF: https://github.com/llvm/llvm-project/commit/e07c05bc91ae1dfb625b7b0d93a83e5c6039fcb2.diff

LOG: [AMDGPU] Clear bodies of function with incompatible features

Adds a new pass that replaces the body of a function with trap+unreachable
if it uses features that are not supported on the current GPU.

This change is aimed at preventing crashes when building code at O0 that
uses idioms such as `if (ISA_VERSION >= N) intrinsic_a(); else intrinsic_b();`
where ISA_VERSION is not constexpr, and intrinsic_a is not selectable
on older targets.
This is a pattern that's used all over the ROCm device libs. The main
motive behind this change is to allow code using ROCm device libs
to be built at O0.

Note: the feature checking logic is done ad-hoc in the pass. There is no other
pass that needs (or will need in the foreseeable future) to do similar
feature-checking logic so I did not see a need to generalize the feature
checking logic yet. It can (and should probably) be generalized later and
moved to a TargetInfo-like class or helper file.

Added: 
    llvm/lib/Target/AMDGPU/AMDGPUClearIncompatibleFunctions.cpp
    llvm/test/CodeGen/AMDGPU/clear-incompatible-functions.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPU.h
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/CMakeLists.txt
    llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
    llvm/test/CodeGen/AMDGPU/llc-pipeline.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 355aa0ba465b4..6a9ac1d165724 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -47,6 +47,7 @@ FunctionPass *createSIFormMemoryClausesPass();
 FunctionPass *createSIPostRABundlerPass();
 FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
 FunctionPass *createAMDGPUUseNativeCallsPass();
+FunctionPass *createAMDGPUClearIncompatibleFunctionsPass(const TargetMachine *);
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPULateCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
@@ -287,6 +288,9 @@ extern char &AMDGPUAnnotateUniformValuesPassID;
 void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
 extern char &AMDGPUCodeGenPrepareID;
 
+void initializeAMDGPUClearIncompatibleFunctionsPass(PassRegistry &);
+extern char &AMDGPUClearIncompatibleFunctionsID;
+
 void initializeAMDGPULateCodeGenPreparePass(PassRegistry &);
 extern char &AMDGPULateCodeGenPrepareID;
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUClearIncompatibleFunctions.cpp b/llvm/lib/Target/AMDGPU/AMDGPUClearIncompatibleFunctions.cpp
new file mode 100644
index 0000000000000..e0ea3aac5b7f5
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUClearIncompatibleFunctions.cpp
@@ -0,0 +1,120 @@
+//===-- AMDGPUClearIncompatibleFunctions.cpp ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass replaces the bodies of functions that have attributes incompatible
+/// with the current target with trap/unreachable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/Pass.h"
+
+#define DEBUG_TYPE "amdgpu-clear-incompatible-functions"
+
+using namespace llvm;
+
+namespace llvm {
+extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1];
+} // end namespace llvm
+
+namespace {
+
+using Generation = AMDGPUSubtarget::Generation;
+
+class AMDGPUClearIncompatibleFunctions : public FunctionPass {
+public:
+  static char ID;
+  /// \p TM must be non-null (asserted); it supplies each function's subtarget.
+  AMDGPUClearIncompatibleFunctions(const TargetMachine *TM = nullptr) : FunctionPass(ID), TM(TM) {
+    assert(TM && "No TargetMachine!");
+  }
+  /// Human-readable name; this is the string listed in pass-pipeline dumps.
+  StringRef getPassName() const override {
+    return "AMDGPU Clear Incompatible Functions Bodies";
+  }
+  /// Requests no analyses and marks none as preserved.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // If changes are made, no analyses are preserved.
+  }
+  /// Defined out of line; returns true when the function body was replaced.
+  bool runOnFunction(Function &F) override;
+
+private:
+  const TargetMachine *TM = nullptr;
+};
+
+// Checked features, each with the oldest GPU generation that supports it.
+constexpr std::array<std::pair<unsigned, Generation>, 6> FeatureAndMinGen = {{
+  { AMDGPU::FeatureGFX11Insts, Generation::GFX11 },
+  { AMDGPU::FeatureGFX10Insts, Generation::GFX10 },
+  { AMDGPU::FeatureGFX9Insts, Generation::GFX9 },
+  { AMDGPU::FeatureGFX8Insts, Generation::VOLCANIC_ISLANDS },
+  { AMDGPU::FeatureDPP, Generation::VOLCANIC_ISLANDS },
+  { AMDGPU::Feature16BitInsts, Generation::VOLCANIC_ISLANDS }
+}};
+
+StringRef GetFeatureName(unsigned Feature) {
+  for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV)
+    if (Feature == KV.Value)
+      return KV.Key;
+  // No table entry matched: callers must only pass features in AMDGPUFeatureKV.
+  llvm_unreachable("Unknown Target feature");
+}
+
+} // end anonymous namespace
+
+bool AMDGPUClearIncompatibleFunctions::runOnFunction(Function &F) {
+  // Only amdgcn has a GCNSubtarget, so check the triple before downcasting.
+  if (skipFunction(F) || F.empty() ||
+      TM->getTargetTriple().getArch() != Triple::amdgcn)
+    return false;
+
+  LLVMContext &Ctx = F.getContext();
+  const auto *ST = static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F));
+  const Generation GPUGen = ST->getGeneration();
+  if (GPUGen < Generation::SOUTHERN_ISLANDS)
+    return false;
+
+  // Diagnose every unsupported feature, not only the first one found.
+  bool Remove = false;
+  for (const auto &[Feature, MinGPUGen] : FeatureAndMinGen) {
+    if (!ST->hasFeature(Feature) || GPUGen >= MinGPUGen)
+      continue;
+    Remove = true;
+    const std::string Msg = "+" + GetFeatureName(Feature).str() +
+        " is not supported on the current target. Deleting function body.";
+    DiagnosticInfoUnsupported DiagInfo(F, Msg, DiagnosticLocation(), DS_Warning);
+    Ctx.diagnose(DiagInfo);
+  }
+  if (!Remove)
+    return false;
+
+  // Replace the body with a single block that traps and is then unreachable.
+  F.dropAllReferences();
+  assert(F.empty());
+  IRBuilder<> Builder(BasicBlock::Create(Ctx, "entry", &F));
+  Builder.CreateIntrinsic(Intrinsic::trap, {}, {});
+  Builder.CreateUnreachable();
+  return true;
+}
+
+INITIALIZE_PASS(AMDGPUClearIncompatibleFunctions, DEBUG_TYPE,
+                "AMDGPU Clear Incompatible Functions Bodies", false, false)
+// Definition of the static ID member declared in the pass class.
+char AMDGPUClearIncompatibleFunctions::ID = 0;
+// Factory declared in AMDGPU.h; hands back a newly allocated pass instance.
+FunctionPass *llvm::createAMDGPUClearIncompatibleFunctionsPass(const TargetMachine *TM) {
+  return new AMDGPUClearIncompatibleFunctions(TM);
+}

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0cecf95c007d0..84d6879d4c36c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -213,6 +213,13 @@ static cl::opt<bool> EarlyInlineAll(
   cl::init(false),
   cl::Hidden);
 
+static cl::opt<bool> ClearIncompatibleFunctionsBodies(
+    "amdgpu-incompatible-features-clear-fns",
+    cl::Hidden,
+    cl::desc("Enable deletion of function bodies when they "
+             "use features not supported by the target GPU"),
+    cl::init(true)); // On by default; pass =0 on the command line to disable.
+
 static cl::opt<bool> EnableSDWAPeephole(
   "amdgpu-sdwa-peephole",
   cl::desc("Enable SDWA peepholer"),
@@ -376,6 +383,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPULateCodeGenPreparePass(*PR);
   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
   initializeAMDGPUPropagateAttributesLatePass(*PR);
+  initializeAMDGPUClearIncompatibleFunctionsPass(*PR);
   initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
   initializeAMDGPULowerModuleLDSPass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -1058,6 +1066,10 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
 bool AMDGPUPassConfig::addPreISel() {
   if (TM->getOptLevel() > CodeGenOpt::None)
     addPass(createFlattenCFGPass());
+  // Not gated on the optimization level, so this also runs at -O0.
+  if(ClearIncompatibleFunctionsBodies)
+    addPass(createAMDGPUClearIncompatibleFunctionsPass(&getAMDGPUTargetMachine()));
+
   return false;
 }
 

diff  --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 76ae4faf792e7..1d7b0d6c26dc9 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -50,6 +50,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUAtomicOptimizer.cpp
   AMDGPUAttributor.cpp
   AMDGPUCallLowering.cpp
+  AMDGPUClearIncompatibleFunctions.cpp
   AMDGPUCodeGenPrepare.cpp
   AMDGPUCombinerHelper.cpp
   AMDGPUCtorDtorLowering.cpp

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
index 5dfde116785db..ac121f9e6661c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -stop-after=legalizer -o - %s | FileCheck %s
+; RUN: llc -global-isel -amdgpu-incompatible-features-clear-fns=0 -mtriple=amdgcn-amd-amdhsa -stop-after=legalizer -o - %s | FileCheck %s
 
 ; Make sure legalizer info doesn't assert on dummy targets
 

diff  --git a/llvm/test/CodeGen/AMDGPU/clear-incompatible-functions.ll b/llvm/test/CodeGen/AMDGPU/clear-incompatible-functions.ll
new file mode 100644
index 0000000000000..a19497d6a03b6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/clear-incompatible-functions.ll
@@ -0,0 +1,628 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s 2>%t | FileCheck -check-prefix=GFX7 %s
+; RUN: FileCheck --check-prefixes=GFX8-WARN,GFX9-WARN,GFX10-WARN,GFX11-WARN %s < %t
+
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s 2>%t | FileCheck -check-prefix=GFX8 %s
+; RUN: FileCheck --check-prefixes=GFX9-WARN,GFX10-WARN,GFX11-WARN %s < %t
+
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s 2>%t | FileCheck -check-prefix=GFX9 %s
+; RUN: FileCheck --check-prefixes=GFX10-WARN,GFX11-WARN %s < %t
+
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s 2>%t | FileCheck -check-prefix=GFX10 %s
+; RUN: FileCheck --check-prefixes=GFX11-WARN %s < %t
+
+; Use --fatal-warnings to confirm no diagnostics are emitted for GFX11.
+; RUN: llc --fatal-warnings -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+
+; GFX8-WARN: needs_dpp {{.*}} +dpp is not supported on the current target. Deleting function body.
+; GFX8-WARN: needs_16bit_insts {{.*}} +16-bit-insts is not supported on the current target. Deleting function body.
+; GFX8-WARN: needs_gfx8_insts {{.*}} +gfx8-insts is not supported on the current target. Deleting function body.
+; GFX9-WARN: needs_gfx9_insts {{.*}} +gfx9-insts is not supported on the current target. Deleting function body.
+; GFX10-WARN: needs_gfx10_insts {{.*}} +gfx10-insts is not supported on the current target. Deleting function body.
+; GFX11-WARN: needs_gfx11_insts {{.*}} +gfx11-insts is not supported on the current target. Deleting function body.
+
+define void @needs_dpp(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #0 {
+; GFX7-LABEL: needs_dpp:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: needs_dpp:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX8-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT:  ; %bb.1: ; %else
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v4, v6
+; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v5, v7, vcc
+; GFX8-NEXT:    ; implicit-def: $vgpr2
+; GFX8-NEXT:  ; %bb.2: ; %Flow
+; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_cbranch_execz .LBB0_4
+; GFX8-NEXT:  ; %bb.3: ; %if
+; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[2:3]
+; GFX8-NEXT:  .LBB0_4: ; %endif
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[8:9]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: needs_dpp:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT:  ; %bb.1: ; %else
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v4, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v5, v7, vcc
+; GFX9-NEXT:    ; implicit-def: $vgpr2
+; GFX9-NEXT:  ; %bb.2: ; %Flow
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT:    s_cbranch_execz .LBB0_4
+; GFX9-NEXT:  ; %bb.3: ; %if
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
+; GFX9-NEXT:  .LBB0_4: ; %endif
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: needs_dpp:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT:  ; %bb.1: ; %else
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX10-NEXT:    ; implicit-def: $vgpr2
+; GFX10-NEXT:  ; %bb.2: ; %Flow
+; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_cbranch_execz .LBB0_4
+; GFX10-NEXT:  ; %bb.3: ; %if
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
+; GFX10-NEXT:  .LBB0_4: ; %endif
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: needs_dpp:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX11-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
+; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT:  ; %bb.1: ; %else
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr2
+; GFX11-NEXT:  ; %bb.2: ; %Flow
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execz .LBB0_4
+; GFX11-NEXT:  ; %bb.3: ; %if
+; GFX11-NEXT:    global_load_b64 v[8:9], v[2:3], off
+; GFX11-NEXT:  .LBB0_4: ; %endif
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b64 v[0:1], v[8:9], off
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = icmp eq i64 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = load i64, i64 addrspace(1)* %in
+  br label %endif
+
+else:
+  %2 = add i64 %a, %b
+  br label %endif
+
+endif:
+  %3 = phi i64 [%1, %if], [%2, %else]
+  store i64 %3, i64 addrspace(1)* %out
+  ret void
+}
+
+define void @needs_16bit_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #1 {
+; GFX7-LABEL: needs_16bit_insts:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: needs_16bit_insts:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX8-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT:  ; %bb.1: ; %else
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v4, v6
+; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v5, v7, vcc
+; GFX8-NEXT:    ; implicit-def: $vgpr2
+; GFX8-NEXT:  ; %bb.2: ; %Flow
+; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_cbranch_execz .LBB1_4
+; GFX8-NEXT:  ; %bb.3: ; %if
+; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[2:3]
+; GFX8-NEXT:  .LBB1_4: ; %endif
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[8:9]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: needs_16bit_insts:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT:  ; %bb.1: ; %else
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v4, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v5, v7, vcc
+; GFX9-NEXT:    ; implicit-def: $vgpr2
+; GFX9-NEXT:  ; %bb.2: ; %Flow
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT:    s_cbranch_execz .LBB1_4
+; GFX9-NEXT:  ; %bb.3: ; %if
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
+; GFX9-NEXT:  .LBB1_4: ; %endif
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: needs_16bit_insts:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT:  ; %bb.1: ; %else
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX10-NEXT:    ; implicit-def: $vgpr2
+; GFX10-NEXT:  ; %bb.2: ; %Flow
+; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_cbranch_execz .LBB1_4
+; GFX10-NEXT:  ; %bb.3: ; %if
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
+; GFX10-NEXT:  .LBB1_4: ; %endif
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: needs_16bit_insts:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX11-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
+; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT:  ; %bb.1: ; %else
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr2
+; GFX11-NEXT:  ; %bb.2: ; %Flow
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execz .LBB1_4
+; GFX11-NEXT:  ; %bb.3: ; %if
+; GFX11-NEXT:    global_load_b64 v[8:9], v[2:3], off
+; GFX11-NEXT:  .LBB1_4: ; %endif
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b64 v[0:1], v[8:9], off
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = icmp eq i64 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = load i64, i64 addrspace(1)* %in
+  br label %endif
+
+else:
+  %2 = add i64 %a, %b
+  br label %endif
+
+endif:
+  %3 = phi i64 [%1, %if], [%2, %else]
+  store i64 %3, i64 addrspace(1)* %out
+  ret void
+}
+
+define void @needs_gfx8_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #2 {
+; GFX7-LABEL: needs_gfx8_insts:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: needs_gfx8_insts:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX8-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT:  ; %bb.1: ; %else
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v4, v6
+; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v5, v7, vcc
+; GFX8-NEXT:    ; implicit-def: $vgpr2
+; GFX8-NEXT:  ; %bb.2: ; %Flow
+; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_cbranch_execz .LBB2_4
+; GFX8-NEXT:  ; %bb.3: ; %if
+; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[2:3]
+; GFX8-NEXT:  .LBB2_4: ; %endif
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[8:9]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: needs_gfx8_insts:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT:  ; %bb.1: ; %else
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v4, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v5, v7, vcc
+; GFX9-NEXT:    ; implicit-def: $vgpr2
+; GFX9-NEXT:  ; %bb.2: ; %Flow
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT:    s_cbranch_execz .LBB2_4
+; GFX9-NEXT:  ; %bb.3: ; %if
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
+; GFX9-NEXT:  .LBB2_4: ; %endif
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: needs_gfx8_insts:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT:  ; %bb.1: ; %else
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX10-NEXT:    ; implicit-def: $vgpr2
+; GFX10-NEXT:  ; %bb.2: ; %Flow
+; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_cbranch_execz .LBB2_4
+; GFX10-NEXT:  ; %bb.3: ; %if
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
+; GFX10-NEXT:  .LBB2_4: ; %endif
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: needs_gfx8_insts:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX11-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
+; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT:  ; %bb.1: ; %else
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr2
+; GFX11-NEXT:  ; %bb.2: ; %Flow
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execz .LBB2_4
+; GFX11-NEXT:  ; %bb.3: ; %if
+; GFX11-NEXT:    global_load_b64 v[8:9], v[2:3], off
+; GFX11-NEXT:  .LBB2_4: ; %endif
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b64 v[0:1], v[8:9], off
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = icmp eq i64 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = load i64, i64 addrspace(1)* %in
+  br label %endif
+
+else:
+  %2 = add i64 %a, %b
+  br label %endif
+
+endif:
+  %3 = phi i64 [%1, %if], [%2, %else]
+  store i64 %3, i64 addrspace(1)* %out
+  ret void
+}
+
+define void @needs_gfx9_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #3 {
+; GFX7-LABEL: needs_gfx9_insts:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: needs_gfx9_insts:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: needs_gfx9_insts:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT:  ; %bb.1: ; %else
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v4, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v5, v7, vcc
+; GFX9-NEXT:    ; implicit-def: $vgpr2
+; GFX9-NEXT:  ; %bb.2: ; %Flow
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT:    s_cbranch_execz .LBB3_4
+; GFX9-NEXT:  ; %bb.3: ; %if
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
+; GFX9-NEXT:  .LBB3_4: ; %endif
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: needs_gfx9_insts:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT:  ; %bb.1: ; %else
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX10-NEXT:    ; implicit-def: $vgpr2
+; GFX10-NEXT:  ; %bb.2: ; %Flow
+; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_cbranch_execz .LBB3_4
+; GFX10-NEXT:  ; %bb.3: ; %if
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
+; GFX10-NEXT:  .LBB3_4: ; %endif
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: needs_gfx9_insts:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX11-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
+; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT:  ; %bb.1: ; %else
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr2
+; GFX11-NEXT:  ; %bb.2: ; %Flow
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execz .LBB3_4
+; GFX11-NEXT:  ; %bb.3: ; %if
+; GFX11-NEXT:    global_load_b64 v[8:9], v[2:3], off
+; GFX11-NEXT:  .LBB3_4: ; %endif
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b64 v[0:1], v[8:9], off
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = icmp eq i64 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = load i64, i64 addrspace(1)* %in
+  br label %endif
+
+else:
+  %2 = add i64 %a, %b
+  br label %endif
+
+endif:
+  %3 = phi i64 [%1, %if], [%2, %else]
+  store i64 %3, i64 addrspace(1)* %out
+  ret void
+}
+
+define void @needs_gfx10_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #4 {
+; GFX7-LABEL: needs_gfx10_insts:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: needs_gfx10_insts:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: needs_gfx10_insts:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: needs_gfx10_insts:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT:  ; %bb.1: ; %else
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX10-NEXT:    ; implicit-def: $vgpr2
+; GFX10-NEXT:  ; %bb.2: ; %Flow
+; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_cbranch_execz .LBB4_4
+; GFX10-NEXT:  ; %bb.3: ; %if
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
+; GFX10-NEXT:  .LBB4_4: ; %endif
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: needs_gfx10_insts:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX11-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
+; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT:  ; %bb.1: ; %else
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr2
+; GFX11-NEXT:  ; %bb.2: ; %Flow
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execz .LBB4_4
+; GFX11-NEXT:  ; %bb.3: ; %if
+; GFX11-NEXT:    global_load_b64 v[8:9], v[2:3], off
+; GFX11-NEXT:  .LBB4_4: ; %endif
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b64 v[0:1], v[8:9], off
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = icmp eq i64 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = load i64, i64 addrspace(1)* %in
+  br label %endif
+
+else:
+  %2 = add i64 %a, %b
+  br label %endif
+
+endif:
+  %3 = phi i64 [%1, %if], [%2, %else]
+  store i64 %3, i64 addrspace(1)* %out
+  ret void
+}
+
+define void @needs_gfx11_insts(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) #5 {
+; GFX7-LABEL: needs_gfx11_insts:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: needs_gfx11_insts:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: needs_gfx11_insts:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: needs_gfx11_insts:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: needs_gfx11_insts:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX11-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
+; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT:  ; %bb.1: ; %else
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v4, v6
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr2
+; GFX11-NEXT:  ; %bb.2: ; %Flow
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execz .LBB5_4
+; GFX11-NEXT:  ; %bb.3: ; %if
+; GFX11-NEXT:    global_load_b64 v[8:9], v[2:3], off
+; GFX11-NEXT:  .LBB5_4: ; %endif
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b64 v[0:1], v[8:9], off
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = icmp eq i64 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = load i64, i64 addrspace(1)* %in
+  br label %endif
+
+else:
+  %2 = add i64 %a, %b
+  br label %endif
+
+endif:
+  %3 = phi i64 [%1, %if], [%2, %else]
+  store i64 %3, i64 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { "target-features"="+dpp" }
+attributes #1 = { "target-features"="+16-bit-insts" }
+attributes #2 = { "target-features"="+gfx8-insts" }
+attributes #3 = { "target-features"="+gfx9-insts" }
+attributes #4 = { "target-features"="+gfx10-insts" }
+attributes #5 = { "target-features"="+gfx11-insts" }

diff  --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 9055e5c9ba799..3013cf9175cbc 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -59,6 +59,7 @@
 ; GCN-O0-NEXT:        Lower SwitchInst's to branches
 ; GCN-O0-NEXT:        Lower invoke and unwind, for unwindless code generators
 ; GCN-O0-NEXT:        Remove unreachable blocks from the CFG
+; GCN-O0-NEXT:        AMDGPU Clear Incompatible Functions Bodies
 ; GCN-O0-NEXT:        Post-Dominator Tree Construction
 ; GCN-O0-NEXT:        Dominator Tree Construction
 ; GCN-O0-NEXT:        Natural Loop Information
@@ -238,6 +239,7 @@
 ; GCN-O1-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-NEXT:        Function Alias Analysis Results
 ; GCN-O1-NEXT:        Flatten the CFG
+; GCN-O1-NEXT:        AMDGPU Clear Incompatible Functions Bodies
 ; GCN-O1-NEXT:        Dominator Tree Construction
 ; GCN-O1-NEXT:        Post-Dominator Tree Construction
 ; GCN-O1-NEXT:        Natural Loop Information
@@ -525,6 +527,7 @@
 ; GCN-O1-OPTS-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-OPTS-NEXT:        Function Alias Analysis Results
 ; GCN-O1-OPTS-NEXT:        Flatten the CFG
+; GCN-O1-OPTS-NEXT:        AMDGPU Clear Incompatible Functions Bodies
 ; GCN-O1-OPTS-NEXT:        Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:        Post-Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:        Natural Loop Information
@@ -820,6 +823,7 @@
 ; GCN-O2-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O2-NEXT:        Function Alias Analysis Results
 ; GCN-O2-NEXT:        Flatten the CFG
+; GCN-O2-NEXT:        AMDGPU Clear Incompatible Functions Bodies
 ; GCN-O2-NEXT:        Dominator Tree Construction
 ; GCN-O2-NEXT:        Post-Dominator Tree Construction
 ; GCN-O2-NEXT:        Natural Loop Information
@@ -1130,6 +1134,7 @@
 ; GCN-O3-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O3-NEXT:        Function Alias Analysis Results
 ; GCN-O3-NEXT:        Flatten the CFG
+; GCN-O3-NEXT:        AMDGPU Clear Incompatible Functions Bodies
 ; GCN-O3-NEXT:        Dominator Tree Construction
 ; GCN-O3-NEXT:        Post-Dominator Tree Construction
 ; GCN-O3-NEXT:        Natural Loop Information


        


More information about the llvm-branch-commits mailing list