[llvm] aef781b - [AMDGPU] Add llvm.amdgcn.wqm.demote intrinsic

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Sun Feb 14 16:40:45 PST 2021


Author: Carl Ritson
Date: 2021-02-15T08:45:46+09:00
New Revision: aef781b47a5e3a82eb70a9c96595915fc7fe6cb3

URL: https://github.com/llvm/llvm-project/commit/aef781b47a5e3a82eb70a9c96595915fc7fe6cb3
DIFF: https://github.com/llvm/llvm-project/commit/aef781b47a5e3a82eb70a9c96595915fc7fe6cb3.diff

LOG: [AMDGPU] Add llvm.amdgcn.wqm.demote intrinsic

Add intrinsic which demotes all active lanes to helper lanes.
This is used to implement the demote-to-helper Vulkan extension.

In practice demoting a lane to helper simply means removing it
from the mask of live lanes used for WQM/WWM/Exact mode.
Where the shader does not use WQM, demotes just become kills.

Additionally add llvm.amdgcn.live.mask intrinsic to complement
demote operations. In theory llvm.amdgcn.ps.live can be used
to detect helper lanes; however, ps.live can be moved by LICM.
The movement of ps.live cannot be remedied without changing
its type signature, and such a change would require existing
ps.live users to update as well.

Reviewed By: piotr

Differential Revision: https://reviews.llvm.org/D94747

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.live.mask.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll

Modified: 
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
    llvm/lib/Target/AMDGPU/SIInstructions.td
    llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
    llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ac2291f9d43b..30bdef28230b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1349,13 +1349,18 @@ def int_amdgcn_interp_p2_f16 :
             [IntrNoMem, IntrSpeculatable, IntrWillReturn,
              ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
 
-// Pixel shaders only: whether the current pixel is live (i.e. not a helper
-// invocation for derivative computation).
+// Deprecated: use llvm.amdgcn.live.mask instead.
 def int_amdgcn_ps_live : Intrinsic <
   [llvm_i1_ty],
   [],
   [IntrNoMem, IntrWillReturn]>;
 
+// Query currently live lanes.
+// Returns true if lane is live (and not a helper lane).
+def int_amdgcn_live_mask : Intrinsic <[llvm_i1_ty],
+  [], [IntrReadMem, IntrInaccessibleMemOnly]
+>;
+
 def int_amdgcn_mbcnt_lo :
   GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
@@ -1585,6 +1590,11 @@ def int_amdgcn_endpgm : GCCBuiltin<"__builtin_amdgcn_endpgm">,
   Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects]
 >;
 
+// If false, mark all active lanes as helper lanes until the end of program.
+def int_amdgcn_wqm_demote : Intrinsic<[],
+  [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly]
+>;
+
 // Copies the active channels of the source value to the destination value,
 // with the guarantee that the source value is computed as if the entire
 // program were executed in Whole Wavefront Mode, i.e. with all channels

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 78be4db163f7..bb3d0bf8304a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4194,6 +4194,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
       break;
     }
+    case Intrinsic::amdgcn_live_mask: {
+      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+      break;
+    }
+    case Intrinsic::amdgcn_wqm_demote:
     case Intrinsic::amdgcn_kill: {
       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
       break;

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index fd65727f04d4..e63294b55ea8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -243,6 +243,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
 def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
 def : SourceOfDivergence<int_amdgcn_ps_live>;
+def : SourceOfDivergence<int_amdgcn_live_mask>;
 def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
 def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
 def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>;

diff  --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 91d93016248e..bdc6b94ca6d6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -378,6 +378,18 @@ def SI_PS_LIVE : PseudoInstSI <
   let SALU = 1;
 }
 
+let Uses = [EXEC] in {
+def SI_LIVE_MASK : PseudoInstSI <
+  (outs SReg_1:$dst), (ins),
+  [(set i1:$dst, (int_amdgcn_live_mask))]> {
+  let SALU = 1;
+}
+let Defs = [EXEC,SCC] in {
+// Demote: Turn a pixel shader thread into a helper lane.
+def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)>;
+} // End Defs = [EXEC,SCC]
+} // End Uses = [EXEC]
+
 def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
   [(int_amdgcn_unreachable)],
   "; divergent unreachable"> {
@@ -751,6 +763,16 @@ def : Pat <
   (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
 >;
 
+def : Pat <
+  (int_amdgcn_wqm_demote i1:$src),
+  (SI_DEMOTE_I1 SCSrc_i1:$src, 0)
+>;
+
+def : Pat <
+  (int_amdgcn_wqm_demote (i1 (not i1:$src))),
+  (SI_DEMOTE_I1 SCSrc_i1:$src, -1)
+>;
+
   // TODO: we could add more variants for other types of conditionals
 
 def : Pat <

diff  --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 3e9537573977..e053e8618b9d 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -72,6 +72,7 @@ class SILowerControlFlow : public MachineFunctionPass {
   MachineRegisterInfo *MRI = nullptr;
   SetVector<MachineInstr*> LoweredEndCf;
   DenseSet<Register> LoweredIf;
+  SmallSet<MachineBasicBlock *, 4> KillBlocks;
 
   const TargetRegisterClass *BoolRC = nullptr;
   unsigned AndOpc;
@@ -84,6 +85,8 @@ class SILowerControlFlow : public MachineFunctionPass {
   unsigned OrSaveExecOpc;
   unsigned Exec;
 
+  bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End);
+
   void emitIf(MachineInstr &MI);
   void emitElse(MachineInstr &MI);
   void emitIfBreak(MachineInstr &MI);
@@ -161,8 +164,8 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
 
 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 
-static bool hasKill(const MachineBasicBlock *Begin,
-                    const MachineBasicBlock *End, const SIInstrInfo *TII) {
+bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin,
+                                 const MachineBasicBlock *End) {
   DenseSet<const MachineBasicBlock*> Visited;
   SmallVector<MachineBasicBlock *, 4> Worklist(Begin->successors());
 
@@ -171,9 +174,8 @@ static bool hasKill(const MachineBasicBlock *Begin,
 
     if (MBB == End || !Visited.insert(MBB).second)
       continue;
-    for (auto &Term : MBB->terminators())
-      if (TII->isKillTerminator(Term.getOpcode()))
-        return true;
+    if (KillBlocks.contains(MBB))
+      return true;
 
     Worklist.append(MBB->succ_begin(), MBB->succ_end());
   }
@@ -213,7 +215,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
     // Check for SI_KILL_*_TERMINATOR on path from if to endif.
     // if there is any such terminator simplifications are not safe.
     auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
-    SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
+    SimpleIf = !hasKill(MI.getParent(), UseMI->getParent());
   }
 
   // Add an implicit def of exec to discourage scheduling VALU after this which
@@ -799,6 +801,28 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
     Exec = AMDGPU::EXEC;
   }
 
+  // Compute set of blocks with kills
+  const bool CanDemote =
+      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
+  for (auto &MBB : MF) {
+    bool IsKillBlock = false;
+    for (auto &Term : MBB.terminators()) {
+      if (TII->isKillTerminator(Term.getOpcode())) {
+        KillBlocks.insert(&MBB);
+        IsKillBlock = true;
+        break;
+      }
+    }
+    if (CanDemote && !IsKillBlock) {
+      for (auto &MI : MBB) {
+        if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
+          KillBlocks.insert(&MBB);
+          break;
+        }
+      }
+    }
+  }
+
   MachineFunction::iterator NextBB;
   for (MachineFunction::iterator BI = MF.begin();
        BI != MF.end(); BI = NextBB) {
@@ -848,6 +872,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
 
   LoweredEndCf.clear();
   LoweredIf.clear();
+  KillBlocks.clear();
 
   return true;
 }

diff  --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 2d0d55893f7f..1deef5080318 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -457,10 +457,11 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         III.Disabled = StateWQM | StateWWM;
         continue;
       } else {
-        if (Opcode == AMDGPU::SI_PS_LIVE) {
+        if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
           LiveMaskQueries.push_back(&MI);
         } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
-                   Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR) {
+                   Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
+                   Opcode == AMDGPU::SI_DEMOTE_I1) {
           KillInstrs.push_back(&MI);
           BBI.NeedsLowering = true;
         } else if (WQMOutputs) {
@@ -799,6 +800,7 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
   const DebugLoc &DL = MI.getDebugLoc();
   MachineInstr *MaskUpdateMI = nullptr;
 
+  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
   const MachineOperand &Op = MI.getOperand(0);
   int64_t KillVal = MI.getOperand(1).getImm();
   MachineInstr *ComputeKilledMaskMI = nullptr;
@@ -815,10 +817,14 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
     } else {
       // Static: kill does nothing
       MachineInstr *NewTerm = nullptr;
-      assert(MBB.succ_size() == 1);
-      NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
-                    .addMBB(*MBB.succ_begin());
-      LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
+      if (IsDemote) {
+        LIS->RemoveMachineInstrFromMaps(MI);
+      } else {
+        assert(MBB.succ_size() == 1);
+        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
+                      .addMBB(*MBB.succ_begin());
+        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
+      }
       MBB.remove(&MI);
       return NewTerm;
     }
@@ -848,17 +854,30 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
   // In the case we got this far some lanes are still live,
   // update EXEC to deactivate lanes as appropriate.
   MachineInstr *NewTerm;
-  if (Op.isImm()) {
-    unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
-  } else if (!IsWQM) {
-    NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
+  MachineInstr *WQMMaskMI = nullptr;
+  Register LiveMaskWQM;
+  if (IsDemote) {
+    // Demotes deactivate quads with only helper lanes
+    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
+    WQMMaskMI =
+        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
+    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
                   .addReg(Exec)
-                  .addReg(LiveMaskReg);
+                  .addReg(LiveMaskWQM);
   } else {
-    unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
-    NewTerm =
-        BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
+    // Kills deactivate lanes
+    if (Op.isImm()) {
+      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
+    } else if (!IsWQM) {
+      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
+                    .addReg(Exec)
+                    .addReg(LiveMaskReg);
+    } else {
+      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
+      NewTerm =
+          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
+    }
   }
 
   // Update live intervals
@@ -871,6 +890,8 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
     LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
+  if (WQMMaskMI)
+    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
   LIS->InsertMachineInstrInMaps(*NewTerm);
 
   if (CndReg) {
@@ -879,6 +900,8 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
   }
   if (TmpReg)
     LIS->createAndComputeVirtRegInterval(TmpReg);
+  if (LiveMaskWQM)
+    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
 
   return NewTerm;
 }
@@ -910,6 +933,7 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
 
     MachineInstr *SplitPoint = nullptr;
     switch (MI.getOpcode()) {
+    case AMDGPU::SI_DEMOTE_I1:
     case AMDGPU::SI_KILL_I1_TERMINATOR:
       SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
       break;
@@ -1319,6 +1343,7 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
     MachineBasicBlock *MBB = MI->getParent();
     MachineInstr *SplitPoint = nullptr;
     switch (MI->getOpcode()) {
+    case AMDGPU::SI_DEMOTE_I1:
     case AMDGPU::SI_KILL_I1_TERMINATOR:
       SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
       break;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
new file mode 100644
index 000000000000..3dc5cba7bc85
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
@@ -0,0 +1,1186 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-32 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-64 %s
+
+define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
+; SI-LABEL: static_exact:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_andn2_b64 exec, exec, exec
+; SI-NEXT:    s_cbranch_scc0 BB0_2
+; SI-NEXT:  ; %bb.1: ; %.entry
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB0_2:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: static_exact:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, exec
+; GFX9-NEXT:    s_cbranch_scc0 BB0_2
+; GFX9-NEXT:  ; %bb.1: ; %.entry
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB0_2:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: static_exact:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB0_2
+; GFX10-32-NEXT:  ; %bb.1: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB0_2:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: static_exact:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_andn2_b64 exec, exec, exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB0_2
+; GFX10-64-NEXT:  ; %bb.1: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB0_2:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry:
+  %c0 = fcmp olt float %arg0, 0.000000e+00
+  %c1 = fcmp oge float %arg1, 0.0
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
+; SI-LABEL: dynamic_exact:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
+; SI-NEXT:    s_mov_b64 s[2:3], exec
+; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_cbranch_scc0 BB1_2
+; SI-NEXT:  ; %bb.1: ; %.entry
+; SI-NEXT:    s_and_b64 exec, exec, s[2:3]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB1_2:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: dynamic_exact:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_cbranch_scc0 BB1_2
+; GFX9-NEXT:  ; %bb.1: ; %.entry
+; GFX9-NEXT:    s_and_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB1_2:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: dynamic_exact:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    v_cmp_le_f32_e64 s0, 0, v1
+; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
+; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_xor_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_andn2_b32 s1, s1, s0
+; GFX10-32-NEXT:    s_cbranch_scc0 BB1_2
+; GFX10-32-NEXT:  ; %bb.1: ; %.entry
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB1_2:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: dynamic_exact:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
+; GFX10-64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; GFX10-64-NEXT:    s_cbranch_scc0 BB1_2
+; GFX10-64-NEXT:  ; %bb.1: ; %.entry
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB1_2:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry:
+  %c0 = fcmp olt float %arg0, 0.000000e+00
+  %c1 = fcmp oge float %arg1, 0.0
+  call void @llvm.amdgcn.wqm.demote(i1 %c1)
+  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+define amdgpu_ps void @branch(float %arg0, float %arg1) {
+; SI-LABEL: branch:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; SI-NEXT:    s_mov_b64 s[0:1], exec
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_and_b32_e32 v0, 1, v0
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; SI-NEXT:    s_xor_b64 s[2:3], vcc, -1
+; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; SI-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; SI-NEXT:  ; %bb.1: ; %.demote
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB2_4
+; SI-NEXT:  ; %bb.2: ; %.demote
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB2_4:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: branch:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_xor_b64 s[2:3], vcc, -1
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX9-NEXT:  ; %bb.1: ; %.demote
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB2_4
+; GFX9-NEXT:  ; %bb.2: ; %.demote
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB2_4:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: branch:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
+; GFX10-32-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-32-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_xor_b32 s1, vcc_lo, -1
+; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
+; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB2_4
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB2_4:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: branch:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX10-64-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-64-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], vcc, -1
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB2_4
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB2_4:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry:
+  %i0 = fptosi float %arg0 to i32
+  %i1 = fptosi float %arg1 to i32
+  %c0 = or i32 %i0, %i1
+  %c1 = and i32 %c0, 1
+  %c2 = icmp eq i32 %c1, 0
+  br i1 %c2, label %.continue, label %.demote
+
+.demote:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue
+
+.continue:
+  %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
+; SI-LABEL: wqm_demote_1:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[12:13], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
+; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; SI-NEXT:  ; %bb.1: ; %.demote
+; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; SI-NEXT:    s_cbranch_scc0 BB3_4
+; SI-NEXT:  ; %bb.2: ; %.demote
+; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
+; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_f32_e32 v0, v0, v0
+; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_branch BB3_5
+; SI-NEXT:  BB3_4:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB3_5:
+;
+; GFX9-LABEL: wqm_demote_1:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
+; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; GFX9-NEXT:  ; %bb.1: ; %.demote
+; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB3_4
+; GFX9-NEXT:  ; %bb.2: ; %.demote
+; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
+; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_branch BB3_5
+; GFX9-NEXT:  BB3_4:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB3_5:
+;
+; GFX10-32-LABEL: wqm_demote_1:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v1
+; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote
+; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB3_4
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote
+; GFX10-32-NEXT:    s_wqm_b32 s28, s12
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    s_branch BB3_5
+; GFX10-32-NEXT:  BB3_4:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB3_5:
+;
+; GFX10-64-LABEL: wqm_demote_1:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote
+; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB3_4
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote
+; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    s_branch BB3_5
+; GFX10-64-NEXT:  BB3_4:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB3_5:
+.entry:
+  %z.cmp = fcmp olt float %z, 0.0
+  br i1 %z.cmp, label %.continue, label %.demote
+
+.demote:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue
+
+.continue:
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %tex1 = extractelement <4 x float> %tex, i32 0
+  %coord1 = fadd float %tex0, %tex1
+  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+
+  ret <4 x float> %rtex
+}
+
+define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
+; SI-LABEL: wqm_demote_2:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[12:13], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; SI-NEXT:  ; %bb.1: ; %.demote
+; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; SI-NEXT:    s_cbranch_scc0 BB4_4
+; SI-NEXT:  ; %bb.2: ; %.demote
+; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
+; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
+; SI-NEXT:    v_add_f32_e32 v0, v0, v0
+; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_branch BB4_5
+; SI-NEXT:  BB4_4:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB4_5:
+;
+; GFX9-LABEL: wqm_demote_2:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; GFX9-NEXT:  ; %bb.1: ; %.demote
+; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB4_4
+; GFX9-NEXT:  ; %bb.2: ; %.demote
+; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
+; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_branch BB4_5
+; GFX9-NEXT:  BB4_4:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB4_5:
+;
+; GFX10-32-LABEL: wqm_demote_2:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote
+; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB4_4
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote
+; GFX10-32-NEXT:    s_wqm_b32 s28, s12
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
+; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    s_branch BB4_5
+; GFX10-32-NEXT:  BB4_4:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB4_5:
+;
+; GFX10-64-LABEL: wqm_demote_2:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote
+; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB4_4
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote
+; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    s_branch BB4_5
+; GFX10-64-NEXT:  BB4_4:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB4_5:
+.entry:
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %tex1 = extractelement <4 x float> %tex, i32 0
+  %z.cmp = fcmp olt float %tex0, 0.0
+  br i1 %z.cmp, label %.continue, label %.demote
+
+.demote:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue
+
+.continue:
+  %coord1 = fadd float %tex0, %tex1
+  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+
+  ret <4 x float> %rtex
+}
+
+define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
+; SI-LABEL: wqm_demote_dynamic:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[12:13], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_xor_b64 s[14:15], vcc, exec
+; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
+; SI-NEXT:    s_cbranch_scc0 BB5_2
+; SI-NEXT:  ; %bb.1: ; %.entry
+; SI-NEXT:    s_wqm_b64 s[14:15], s[12:13]
+; SI-NEXT:    s_and_b64 exec, exec, s[14:15]
+; SI-NEXT:    v_add_f32_e32 v0, v0, v0
+; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_branch BB5_3
+; SI-NEXT:  BB5_2:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB5_3:
+;
+; GFX9-LABEL: wqm_demote_dynamic:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_xor_b64 s[14:15], vcc, exec
+; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
+; GFX9-NEXT:    s_cbranch_scc0 BB5_2
+; GFX9-NEXT:  ; %bb.1: ; %.entry
+; GFX9-NEXT:    s_wqm_b64 s[14:15], s[12:13]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_branch BB5_3
+; GFX9-NEXT:  BB5_2:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB5_3:
+;
+; GFX10-32-LABEL: wqm_demote_dynamic:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_xor_b32 s13, vcc_lo, exec_lo
+; GFX10-32-NEXT:    s_andn2_b32 s12, s12, s13
+; GFX10-32-NEXT:    s_cbranch_scc0 BB5_2
+; GFX10-32-NEXT:  ; %bb.1: ; %.entry
+; GFX10-32-NEXT:    s_wqm_b32 s13, s12
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s13
+; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    s_branch BB5_3
+; GFX10-32-NEXT:  BB5_2:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB5_3:
+;
+; GFX10-64-LABEL: wqm_demote_dynamic:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_xor_b64 s[14:15], vcc, exec
+; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
+; GFX10-64-NEXT:    s_cbranch_scc0 BB5_2
+; GFX10-64-NEXT:  ; %bb.1: ; %.entry
+; GFX10-64-NEXT:    s_wqm_b64 s[28:29], s[12:13]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    s_branch BB5_3
+; GFX10-64-NEXT:  BB5_2:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB5_3:
+.entry:
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %tex1 = extractelement <4 x float> %tex, i32 0
+  %z.cmp = fcmp olt float %tex0, 0.0
+  call void @llvm.amdgcn.wqm.demote(i1 %z.cmp)
+  %coord1 = fadd float %tex0, %tex1
+  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+
+  ret <4 x float> %rtex
+}
+
+define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
+; SI-LABEL: wqm_deriv:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[0:1], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; SI-NEXT:    s_movk_i32 s2, 0x3c00
+; SI-NEXT:    s_bfe_u32 s4, 0, 0x100000
+; SI-NEXT:    s_bfe_u32 s3, s2, 0x100000
+; SI-NEXT:    s_lshl_b32 s2, s4, 16
+; SI-NEXT:    s_or_b32 s2, s3, s2
+; SI-NEXT:    s_lshl_b32 s3, s3, 16
+; SI-NEXT:    s_or_b32 s3, s4, s3
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:  ; %bb.1: ; %.demote0
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB6_7
+; SI-NEXT:  ; %bb.2: ; %.demote0
+; SI-NEXT:    s_wqm_b64 s[6:7], s[0:1]
+; SI-NEXT:    s_and_b64 exec, exec, s[6:7]
+; SI-NEXT:  ; %bb.3: ; %.continue0
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v1, v0
+; SI-NEXT:    s_nop 1
+; SI-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT:    s_nop 1
+; SI-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
+; SI-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_and_b64 s[4:5], s[0:1], vcc
+; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
+; SI-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; SI-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; SI-NEXT:  ; %bb.4: ; %.demote1
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB6_7
+; SI-NEXT:  ; %bb.5: ; %.demote1
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:  ; %bb.6: ; %.continue1
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB6_7:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: wqm_deriv:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:  ; %bb.1: ; %.demote0
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB6_7
+; GFX9-NEXT:  ; %bb.2: ; %.demote0
+; GFX9-NEXT:    s_wqm_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
+; GFX9-NEXT:  ; %bb.3: ; %.continue0
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_b64 s[2:3], s[0:1], vcc
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX9-NEXT:  ; %bb.4: ; %.demote1
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB6_7
+; GFX9-NEXT:  ; %bb.5: ; %.demote1
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:  ; %bb.6: ; %.continue1
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX9-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB6_7:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: wqm_deriv:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
+; GFX10-32-NEXT:    s_wqm_b32 s2, s0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue0
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_mov_b32 s1, s0
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s1
+; GFX10-32-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-32-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
+; GFX10-32-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_and_b32 s1, s0, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s1, s1, -1
+; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
+; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
+; GFX10-32-NEXT:  ; %bb.4: ; %.demote1
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
+; GFX10-32-NEXT:  ; %bb.5: ; %.demote1
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:  ; %bb.6: ; %.continue1
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX10-32-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB6_7:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: wqm_deriv:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
+; GFX10-64-NEXT:    s_wqm_b64 s[4:5], s[0:1]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue0
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
+; GFX10-64-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-64-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX10-64-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_and_b64 s[2:3], s[0:1], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX10-64-NEXT:  ; %bb.4: ; %.demote1
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
+; GFX10-64-NEXT:  ; %bb.5: ; %.demote1
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:  ; %bb.6: ; %.continue1
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX10-64-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB6_7:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry:
+  %p0 = extractelement <2 x float> %input, i32 0
+  %p1 = extractelement <2 x float> %input, i32 1
+  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %argi = fptosi float %arg to i32
+  %cond0 = icmp eq i32 %argi, 0
+  br i1 %cond0, label %.continue0, label %.demote0
+
+.demote0:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue0
+
+.continue0:
+  %live = call i1 @llvm.amdgcn.live.mask()
+  %live.cond = select i1 %live, i32 0, i32 1065353216
+  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
+  %live.v0f = bitcast i32 %live.v0 to float
+  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
+  %live.v1f = bitcast i32 %live.v1 to float
+  %v0 = fsub float %live.v0f, %live.v1f
+  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
+  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
+  %cond2 = and i1 %live, %cond1
+  br i1 %cond2, label %.continue1, label %.demote1
+
+.demote1:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue1
+
+.continue1:
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
+  ret void
+}
+
+define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) {
+; SI-LABEL: wqm_deriv_loop:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[0:1], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; SI-NEXT:    s_movk_i32 s2, 0x3c00
+; SI-NEXT:    s_bfe_u32 s4, 0, 0x100000
+; SI-NEXT:    s_bfe_u32 s3, s2, 0x100000
+; SI-NEXT:    s_lshl_b32 s2, s4, 16
+; SI-NEXT:    s_or_b32 s2, s3, s2
+; SI-NEXT:    s_lshl_b32 s3, s3, 16
+; SI-NEXT:    s_or_b32 s3, s4, s3
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:  ; %bb.1: ; %.demote0
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB7_9
+; SI-NEXT:  ; %bb.2: ; %.demote0
+; SI-NEXT:    s_wqm_b64 s[8:9], s[0:1]
+; SI-NEXT:    s_and_b64 exec, exec, s[8:9]
+; SI-NEXT:  ; %bb.3: ; %.continue0.preheader
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    s_branch BB7_5
+; SI-NEXT:  BB7_4: ; %.continue1
+; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; SI-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
+; SI-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
+; SI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB7_8
+; SI-NEXT:  BB7_5: ; %.continue0
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[6:7]
+; SI-NEXT:    v_mov_b32_e32 v3, v2
+; SI-NEXT:    s_nop 1
+; SI-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT:    s_nop 1
+; SI-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
+; SI-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; SI-NEXT:    s_and_b64 s[6:7], s[0:1], vcc
+; SI-NEXT:    s_xor_b64 s[6:7], s[6:7], -1
+; SI-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
+; SI-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
+; SI-NEXT:    s_cbranch_execz BB7_4
+; SI-NEXT:  ; %bb.6: ; %.demote1
+; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB7_9
+; SI-NEXT:  ; %bb.7: ; %.demote1
+; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; SI-NEXT:    s_wqm_b64 s[8:9], s[0:1]
+; SI-NEXT:    s_and_b64 exec, exec, s[8:9]
+; SI-NEXT:    s_branch BB7_4
+; SI-NEXT:  BB7_8: ; %.return
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB7_9:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: wqm_deriv_loop:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:  ; %bb.1: ; %.demote0
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB7_9
+; GFX9-NEXT:  ; %bb.2: ; %.demote0
+; GFX9-NEXT:    s_wqm_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[6:7]
+; GFX9-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_branch BB7_5
+; GFX9-NEXT:  BB7_4: ; %.continue1
+; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz BB7_8
+; GFX9-NEXT:  BB7_5: ; %.continue0
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    s_and_b64 s[4:5], s[0:1], vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
+; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; GFX9-NEXT:    s_cbranch_execz BB7_4
+; GFX9-NEXT:  ; %bb.6: ; %.demote1
+; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB7_9
+; GFX9-NEXT:  ; %bb.7: ; %.demote1
+; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX9-NEXT:    s_wqm_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_branch BB7_4
+; GFX9-NEXT:  BB7_8: ; %.return
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX9-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB7_9:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: wqm_deriv_loop:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-32-NEXT:    s_mov_b32 s1, 0
+; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s2, exec_lo, s2
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB7_9
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
+; GFX10-32-NEXT:    s_wqm_b32 s3, s0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s3
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10-32-NEXT:    v_mov_b32_e32 v0, s1
+; GFX10-32-NEXT:    s_branch BB7_5
+; GFX10-32-NEXT:  BB7_4: ; %.continue1
+; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10-32-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; GFX10-32-NEXT:    v_cmp_ge_i32_e32 vcc_lo, v0, v1
+; GFX10-32-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_cbranch_execz BB7_8
+; GFX10-32-NEXT:  BB7_5: ; %.continue0
+; GFX10-32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-32-NEXT:    s_mov_b32 s2, s0
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s2
+; GFX10-32-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-32-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
+; GFX10-32-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-32-NEXT:    s_and_b32 s2, s0, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s2, s2, -1
+; GFX10-32-NEXT:    s_and_saveexec_b32 s3, s2
+; GFX10-32-NEXT:    s_xor_b32 s2, exec_lo, s3
+; GFX10-32-NEXT:    s_cbranch_execz BB7_4
+; GFX10-32-NEXT:  ; %bb.6: ; %.demote1
+; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB7_9
+; GFX10-32-NEXT:  ; %bb.7: ; %.demote1
+; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-32-NEXT:    s_wqm_b32 s3, s0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s3
+; GFX10-32-NEXT:    s_branch BB7_4
+; GFX10-32-NEXT:  BB7_8: ; %.return
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
+; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX10-32-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB7_9:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: wqm_deriv_loop:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-64-NEXT:    s_mov_b32 s2, 0
+; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB7_9
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
+; GFX10-64-NEXT:    s_wqm_b64 s[6:7], s[0:1]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[6:7]
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-64-NEXT:    s_mov_b64 s[2:3], 0
+; GFX10-64-NEXT:    s_branch BB7_5
+; GFX10-64-NEXT:  BB7_4: ; %.continue1
+; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; GFX10-64-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
+; GFX10-64-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX10-64-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    s_cbranch_execz BB7_8
+; GFX10-64-NEXT:  BB7_5: ; %.continue0
+; GFX10-64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-64-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[4:5]
+; GFX10-64-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-64-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
+; GFX10-64-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX10-64-NEXT:    s_and_b64 s[4:5], s[0:1], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; GFX10-64-NEXT:    s_cbranch_execz BB7_4
+; GFX10-64-NEXT:  ; %bb.6: ; %.demote1
+; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB7_9
+; GFX10-64-NEXT:  ; %bb.7: ; %.demote1
+; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-64-NEXT:    s_wqm_b64 s[6:7], s[0:1]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[6:7]
+; GFX10-64-NEXT:    s_branch BB7_4
+; GFX10-64-NEXT:  BB7_8: ; %.return
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX10-64-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB7_9:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry:
+  %p0 = extractelement <2 x float> %input, i32 0
+  %p1 = extractelement <2 x float> %input, i32 1
+  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %argi = fptosi float %arg to i32
+  %cond0 = icmp eq i32 %argi, 0
+  br i1 %cond0, label %.continue0, label %.demote0
+
+.demote0:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue0
+
+.continue0:
+  %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ]
+  %live = call i1 @llvm.amdgcn.live.mask()
+  %live.cond = select i1 %live, i32 0, i32 %count
+  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
+  %live.v0f = bitcast i32 %live.v0 to float
+  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
+  %live.v1f = bitcast i32 %live.v1 to float
+  %v0 = fsub float %live.v0f, %live.v1f
+  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
+  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
+  %cond2 = and i1 %live, %cond1
+  br i1 %cond2, label %.continue1, label %.demote1
+
+.demote1:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue1
+
+.continue1:
+  %next = add i32 %count, 1
+  %loop.cond = icmp slt i32 %next, %limit
+  br i1 %loop.cond, label %.continue0, label %.return
+
+.return:
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
+  ret void
+}
+
+declare void @llvm.amdgcn.wqm.demote(i1) #0
+declare i1 @llvm.amdgcn.live.mask() #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare float @llvm.amdgcn.wqm.f32(float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
+declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
+declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
+declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readnone speculatable }
+attributes #3 = { inaccessiblememonly nounwind }
+attributes #4 = { convergent nounwind readnone }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.live.mask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.live.mask.mir
new file mode 100644
index 000000000000..8b6a69d0d6a8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.live.mask.mir
@@ -0,0 +1,16 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: live_mask
+legalized: true
+
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: live_mask
+    ; CHECK: [[INT:%[0-9]+]]:vcc(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.live.mask)
+    ; CHECK: S_ENDPGM 0, implicit [[INT]](s1)
+    %0:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.live.mask)
+    S_ENDPGM 0, implicit %0
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir
new file mode 100644
index 000000000000..2a70dbe67283
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir
@@ -0,0 +1,69 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o -  %s| FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o -  %s| FileCheck %s
+
+---
+name: wqm_demote_scc
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    ; CHECK-LABEL: name: wqm_demote_scc
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
+    ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
+    ; CHECK: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1)
+    ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY2]](s1)
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s1) = G_ICMP intpred(eq), %0, %1
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %2
+...
+
+---
+name: wqm_demote_vcc
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; CHECK-LABEL: name: wqm_demote_vcc
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
+    ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[ICMP]](s1)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s1) = G_ICMP intpred(eq), %0, %1
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %2
+...
+
+---
+name: wqm_demote_constant_true
+legalized: true
+
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: wqm_demote_constant_true
+    ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true
+    ; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1)
+    ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1)
+    %0:_(s1) = G_CONSTANT i1 true
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0
+...
+
+---
+name: wqm_demote_constant_false
+legalized: true
+
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: wqm_demote_constant_false
+    ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 false
+    ; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1)
+    ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1)
+    %0:_(s1) = G_CONSTANT i1 false
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
new file mode 100644
index 000000000000..b6718b3423a1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -0,0 +1,1177 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s
+
+define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
+; SI-LABEL: static_exact:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_andn2_b64 exec, exec, exec
+; SI-NEXT:    s_cbranch_scc0 BB0_2
+; SI-NEXT:  ; %bb.1: ; %.entry
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB0_2:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: static_exact:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, exec
+; GFX9-NEXT:    s_cbranch_scc0 BB0_2
+; GFX9-NEXT:  ; %bb.1: ; %.entry
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB0_2:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: static_exact:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB0_2
+; GFX10-32-NEXT:  ; %bb.1: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB0_2:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: static_exact:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_andn2_b64 exec, exec, exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB0_2
+; GFX10-64-NEXT:  ; %bb.1: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB0_2:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry:
+  %c0 = fcmp olt float %arg0, 0.000000e+00
+  %c1 = fcmp oge float %arg1, 0.0
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
+; SI-LABEL: dynamic_exact:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
+; SI-NEXT:    s_mov_b64 s[2:3], exec
+; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_cbranch_scc0 BB1_2
+; SI-NEXT:  ; %bb.1: ; %.entry
+; SI-NEXT:    s_and_b64 exec, exec, s[2:3]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB1_2:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: dynamic_exact:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_cbranch_scc0 BB1_2
+; GFX9-NEXT:  ; %bb.1: ; %.entry
+; GFX9-NEXT:    s_and_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB1_2:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: dynamic_exact:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    v_cmp_le_f32_e64 s0, 0, v1
+; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
+; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_xor_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_andn2_b32 s1, s1, s0
+; GFX10-32-NEXT:    s_cbranch_scc0 BB1_2
+; GFX10-32-NEXT:  ; %bb.1: ; %.entry
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB1_2:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: dynamic_exact:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
+; GFX10-64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; GFX10-64-NEXT:    s_cbranch_scc0 BB1_2
+; GFX10-64-NEXT:  ; %bb.1: ; %.entry
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB1_2:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry:
+  %c0 = fcmp olt float %arg0, 0.000000e+00
+  %c1 = fcmp oge float %arg1, 0.0
+  call void @llvm.amdgcn.wqm.demote(i1 %c1)
+  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+define amdgpu_ps void @branch(float %arg0, float %arg1) {
+; SI-LABEL: branch:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; SI-NEXT:    s_mov_b64 s[2:3], exec
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_and_b32_e32 v1, 1, v0
+; SI-NEXT:    v_and_b32_e32 v0, 1, v0
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; SI-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
+; SI-NEXT:  ; %bb.1: ; %.demote
+; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; SI-NEXT:    s_cbranch_scc0 BB2_4
+; SI-NEXT:  ; %bb.2: ; %.demote
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB2_4:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: branch:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
+; GFX9-NEXT:  ; %bb.1: ; %.demote
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB2_4
+; GFX9-NEXT:  ; %bb.2: ; %.demote
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB2_4:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: branch:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
+; GFX10-32-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-32-NEXT:    v_and_b32_e32 v1, 1, v0
+; GFX10-32-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-32-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s0
+; GFX10-32-NEXT:    s_xor_b32 s0, exec_lo, s2
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote
+; GFX10-32-NEXT:    s_andn2_b32 s1, s1, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB2_4
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB2_4:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: branch:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX10-64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10-64-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-64-NEXT:    v_and_b32_e32 v1, 1, v0
+; GFX10-64-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX10-64-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX10-64-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote
+; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB2_4
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB2_4:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry:
+  %i0 = fptosi float %arg0 to i32
+  %i1 = fptosi float %arg1 to i32
+  %c0 = or i32 %i0, %i1
+  %c1 = and i32 %c0, 1
+  %c2 = icmp eq i32 %c1, 0
+  br i1 %c2, label %.continue, label %.demote
+
+.demote:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue
+
+.continue:
+  %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+
+define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
+; SI-LABEL: wqm_demote_1:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[12:13], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
+; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; SI-NEXT:  ; %bb.1: ; %.demote
+; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; SI-NEXT:    s_cbranch_scc0 BB3_4
+; SI-NEXT:  ; %bb.2: ; %.demote
+; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
+; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
+; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_f32_e32 v0, v0, v0
+; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_branch BB3_5
+; SI-NEXT:  BB3_4:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB3_5:
+;
+; GFX9-LABEL: wqm_demote_1:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
+; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; GFX9-NEXT:  ; %bb.1: ; %.demote
+; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB3_4
+; GFX9-NEXT:  ; %bb.2: ; %.demote
+; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
+; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
+; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_branch BB3_5
+; GFX9-NEXT:  BB3_4:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB3_5:
+;
+; GFX10-32-LABEL: wqm_demote_1:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v1
+; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote
+; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB3_4
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote
+; GFX10-32-NEXT:    s_wqm_b32 s28, s12
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
+; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    s_branch BB3_5
+; GFX10-32-NEXT:  BB3_4:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB3_5:
+;
+; GFX10-64-LABEL: wqm_demote_1:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote
+; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB3_4
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote
+; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    s_branch BB3_5
+; GFX10-64-NEXT:  BB3_4:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB3_5:
+.entry:
+  %z.cmp = fcmp olt float %z, 0.0
+  br i1 %z.cmp, label %.continue, label %.demote
+
+.demote:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue
+
+.continue:
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %tex1 = extractelement <4 x float> %tex, i32 0
+  %coord1 = fadd float %tex0, %tex1
+  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+
+  ret <4 x float> %rtex
+}
+
+define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
+; SI-LABEL: wqm_demote_2:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[12:13], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; SI-NEXT:  ; %bb.1: ; %.demote
+; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; SI-NEXT:    s_cbranch_scc0 BB4_4
+; SI-NEXT:  ; %bb.2: ; %.demote
+; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
+; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
+; SI-NEXT:    v_add_f32_e32 v0, v0, v0
+; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_branch BB4_5
+; SI-NEXT:  BB4_4:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB4_5:
+;
+; GFX9-LABEL: wqm_demote_2:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; GFX9-NEXT:  ; %bb.1: ; %.demote
+; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB4_4
+; GFX9-NEXT:  ; %bb.2: ; %.demote
+; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
+; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_branch BB4_5
+; GFX9-NEXT:  BB4_4:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB4_5:
+;
+; GFX10-32-LABEL: wqm_demote_2:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote
+; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB4_4
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote
+; GFX10-32-NEXT:    s_wqm_b32 s28, s12
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
+; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    s_branch BB4_5
+; GFX10-32-NEXT:  BB4_4:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB4_5:
+;
+; GFX10-64-LABEL: wqm_demote_2:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote
+; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB4_4
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote
+; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    s_branch BB4_5
+; GFX10-64-NEXT:  BB4_4:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB4_5:
+.entry:
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %tex1 = extractelement <4 x float> %tex, i32 0
+  %z.cmp = fcmp olt float %tex0, 0.0
+  br i1 %z.cmp, label %.continue, label %.demote
+
+.demote:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue
+
+.continue:
+  %coord1 = fadd float %tex0, %tex1
+  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+
+  ret <4 x float> %rtex
+}
+
+define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
+; SI-LABEL: wqm_demote_dynamic:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[12:13], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_xor_b64 s[14:15], vcc, exec
+; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
+; SI-NEXT:    s_cbranch_scc0 BB5_2
+; SI-NEXT:  ; %bb.1: ; %.entry
+; SI-NEXT:    s_wqm_b64 s[14:15], s[12:13]
+; SI-NEXT:    s_and_b64 exec, exec, s[14:15]
+; SI-NEXT:    v_add_f32_e32 v0, v0, v0
+; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_branch BB5_3
+; SI-NEXT:  BB5_2:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB5_3:
+;
+; GFX9-LABEL: wqm_demote_dynamic:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_xor_b64 s[14:15], vcc, exec
+; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
+; GFX9-NEXT:    s_cbranch_scc0 BB5_2
+; GFX9-NEXT:  ; %bb.1: ; %.entry
+; GFX9-NEXT:    s_wqm_b64 s[14:15], s[12:13]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_branch BB5_3
+; GFX9-NEXT:  BB5_2:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB5_3:
+;
+; GFX10-32-LABEL: wqm_demote_dynamic:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_xor_b32 s13, vcc_lo, exec_lo
+; GFX10-32-NEXT:    s_andn2_b32 s12, s12, s13
+; GFX10-32-NEXT:    s_cbranch_scc0 BB5_2
+; GFX10-32-NEXT:  ; %bb.1: ; %.entry
+; GFX10-32-NEXT:    s_wqm_b32 s13, s12
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s13
+; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    s_branch BB5_3
+; GFX10-32-NEXT:  BB5_2:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB5_3:
+;
+; GFX10-64-LABEL: wqm_demote_dynamic:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_xor_b64 s[14:15], vcc, exec
+; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
+; GFX10-64-NEXT:    s_cbranch_scc0 BB5_2
+; GFX10-64-NEXT:  ; %bb.1: ; %.entry
+; GFX10-64-NEXT:    s_wqm_b64 s[28:29], s[12:13]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    s_branch BB5_3
+; GFX10-64-NEXT:  BB5_2:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB5_3:
+.entry:
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %tex1 = extractelement <4 x float> %tex, i32 0
+  %z.cmp = fcmp olt float %tex0, 0.0
+  call void @llvm.amdgcn.wqm.demote(i1 %z.cmp)
+  %coord1 = fadd float %tex0, %tex1
+  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+
+  ret <4 x float> %rtex
+}
+
+
+; Checks that a demote inside whole-quad-mode (WQM) keeps helper lanes alive
+; for the DPP-based derivative computation that follows: after each demote,
+; exec is restored to s_wqm_b64/s_wqm_b32 of the remaining live mask rather
+; than being fully cleared, and llvm.amdgcn.live.mask reads that mask to tell
+; real lanes apart from helper lanes.
+; NOTE(review): all "; *-LABEL:" / "; *-NEXT:" lines below are autogenerated
+; by utils/update_llc_test_checks.py; hand edits to them are lost on
+; regeneration.
+define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
+; SI-LABEL: wqm_deriv:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[0:1], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; SI-NEXT:  ; %bb.1: ; %.demote0
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB6_7
+; SI-NEXT:  ; %bb.2: ; %.demote0
+; SI-NEXT:    s_wqm_b64 s[4:5], s[0:1]
+; SI-NEXT:    s_and_b64 exec, exec, s[4:5]
+; SI-NEXT:  ; %bb.3: ; %.continue0
+; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
+; SI-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v1, v0
+; SI-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; SI-NEXT:    s_nop 0
+; SI-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT:    s_nop 1
+; SI-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
+; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; SI-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; SI-NEXT:  ; %bb.4: ; %.demote1
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB6_7
+; SI-NEXT:  ; %bb.5: ; %.demote1
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:  ; %bb.6: ; %.continue1
+; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
+; SI-NEXT:    v_bfrev_b32_e32 v0, 60
+; SI-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; SI-NEXT:    exp mrt0 v1, v1, v0, v0 done compr vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB6_7:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: wqm_deriv:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:  ; %bb.1: ; %.demote0
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB6_7
+; GFX9-NEXT:  ; %bb.2: ; %.demote0
+; GFX9-NEXT:    s_wqm_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
+; GFX9-NEXT:  ; %bb.3: ; %.continue0
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX9-NEXT:  ; %bb.4: ; %.demote1
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB6_7
+; GFX9-NEXT:  ; %bb.5: ; %.demote1
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:  ; %bb.6: ; %.continue1
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX9-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB6_7:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: wqm_deriv:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
+; GFX10-32-NEXT:    s_wqm_b32 s2, s0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue0
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_mov_b32 s1, s0
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s1
+; GFX10-32-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-32-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
+; GFX10-32-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_xor_b32 s1, s0, -1
+; GFX10-32-NEXT:    s_or_b32 s1, s1, vcc_lo
+; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
+; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
+; GFX10-32-NEXT:  ; %bb.4: ; %.demote1
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
+; GFX10-32-NEXT:  ; %bb.5: ; %.demote1
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:  ; %bb.6: ; %.continue1
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX10-32-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB6_7:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: wqm_deriv:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
+; GFX10-64-NEXT:    s_wqm_b64 s[4:5], s[0:1]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue0
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
+; GFX10-64-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-64-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX10-64-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX10-64-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX10-64-NEXT:  ; %bb.4: ; %.demote1
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
+; GFX10-64-NEXT:  ; %bb.5: ; %.demote1
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:  ; %bb.6: ; %.continue1
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX10-64-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB6_7:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry:
+  %p0 = extractelement <2 x float> %input, i32 0
+  %p1 = extractelement <2 x float> %input, i32 1
+  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %argi = fptosi float %arg to i32
+  %cond0 = icmp eq i32 %argi, 0
+  br i1 %cond0, label %.continue0, label %.demote0
+
+; Unconditional demote (i1 false): the branching lanes leave the live mask.
+.demote0:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue0
+
+.continue0:
+  ; live.mask (unlike ps.live) is control-flow dependent, so it reads the
+  ; post-demote mask here; its result feeds a cross-lane DPP computation
+  ; that still needs helper lanes active.
+  %live = call i1 @llvm.amdgcn.live.mask()
+  %live.cond = select i1 %live, i32 0, i32 1065353216
+  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
+  %live.v0f = bitcast i32 %live.v0 to float
+  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
+  %live.v1f = bitcast i32 %live.v1 to float
+  %v0 = fsub float %live.v0f, %live.v1f
+  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
+  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
+  %cond2 = and i1 %live, %cond1
+  br i1 %cond2, label %.continue1, label %.demote1
+
+; Second demote after the derivative: no further WQM use follows, so exec can
+; simply be cleared for the demoted lanes (s_mov_b64 exec, 0 in the checks).
+.demote1:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue1
+
+.continue1:
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
+  ret void
+}
+
+; Loop variant of wqm_deriv: a demote inside a divergent loop body must keep
+; the accumulated live mask (s[0:1] / s0) correct across iterations, restoring
+; a WQM'd exec after each in-loop demote so the DPP derivative in the loop
+; header still sees helper lanes; the final export runs under the live mask.
+; NOTE(review): all "; *-LABEL:" / "; *-NEXT:" lines below are autogenerated
+; by utils/update_llc_test_checks.py; hand edits to them are lost on
+; regeneration.
+define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) {
+; SI-LABEL: wqm_deriv_loop:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[0:1], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:  ; %bb.1: ; %.demote0
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB7_9
+; SI-NEXT:  ; %bb.2: ; %.demote0
+; SI-NEXT:    s_wqm_b64 s[6:7], s[0:1]
+; SI-NEXT:    s_and_b64 exec, exec, s[6:7]
+; SI-NEXT:  ; %bb.3: ; %.continue0.preheader
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_branch BB7_5
+; SI-NEXT:  BB7_4: ; %.continue1
+; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; SI-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SI-NEXT:    s_add_i32 s2, s2, 1
+; SI-NEXT:    v_cmp_ge_i32_e32 vcc, s2, v1
+; SI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB7_8
+; SI-NEXT:  BB7_5: ; %.continue0
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[6:7]
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; SI-NEXT:    s_nop 0
+; SI-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT:    s_nop 1
+; SI-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
+; SI-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
+; SI-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
+; SI-NEXT:    s_cbranch_execz BB7_4
+; SI-NEXT:  ; %bb.6: ; %.demote1
+; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB7_9
+; SI-NEXT:  ; %bb.7: ; %.demote1
+; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; SI-NEXT:    s_wqm_b64 s[8:9], s[0:1]
+; SI-NEXT:    s_and_b64 exec, exec, s[8:9]
+; SI-NEXT:    s_branch BB7_4
+; SI-NEXT:  BB7_8: ; %.return
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
+; SI-NEXT:    v_bfrev_b32_e32 v0, 60
+; SI-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; SI-NEXT:    exp mrt0 v1, v1, v0, v0 done compr vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB7_9:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: wqm_deriv_loop:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_mov_b32 s2, 0
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT:  ; %bb.1: ; %.demote0
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB7_9
+; GFX9-NEXT:  ; %bb.2: ; %.demote0
+; GFX9-NEXT:    s_wqm_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[6:7]
+; GFX9-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_branch BB7_5
+; GFX9-NEXT:  BB7_4: ; %.continue1
+; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_add_i32 s2, s2, 1
+; GFX9-NEXT:    v_cmp_ge_i32_e32 vcc, s2, v1
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execz BB7_8
+; GFX9-NEXT:  BB7_5: ; %.continue0
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
+; GFX9-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
+; GFX9-NEXT:    s_cbranch_execz BB7_4
+; GFX9-NEXT:  ; %bb.6: ; %.demote1
+; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB7_9
+; GFX9-NEXT:  ; %bb.7: ; %.demote1
+; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX9-NEXT:    s_wqm_b64 s[8:9], s[0:1]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[8:9]
+; GFX9-NEXT:    s_branch BB7_4
+; GFX9-NEXT:  BB7_8: ; %.return
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX9-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB7_9:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: wqm_deriv_loop:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-32-NEXT:    s_mov_b32 s1, 0
+; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s2, exec_lo, s2
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB7_9
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
+; GFX10-32-NEXT:    s_wqm_b32 s3, s0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s3
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10-32-NEXT:    s_mov_b32 s2, 0
+; GFX10-32-NEXT:    s_branch BB7_5
+; GFX10-32-NEXT:  BB7_4: ; %.continue1
+; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX10-32-NEXT:    s_add_i32 s2, s2, 1
+; GFX10-32-NEXT:    v_cmp_ge_i32_e32 vcc_lo, s2, v1
+; GFX10-32-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_cbranch_execz BB7_8
+; GFX10-32-NEXT:  BB7_5: ; %.continue0
+; GFX10-32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-32-NEXT:    s_mov_b32 s3, s0
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, s2, 0, s3
+; GFX10-32-NEXT:    s_xor_b32 s3, s0, -1
+; GFX10-32-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-32-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX10-32-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_or_b32 s3, s3, vcc_lo
+; GFX10-32-NEXT:    s_and_saveexec_b32 s4, s3
+; GFX10-32-NEXT:    s_xor_b32 s3, exec_lo, s4
+; GFX10-32-NEXT:    s_cbranch_execz BB7_4
+; GFX10-32-NEXT:  ; %bb.6: ; %.demote1
+; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB7_9
+; GFX10-32-NEXT:  ; %bb.7: ; %.demote1
+; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-32-NEXT:    s_wqm_b32 s4, s0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s4
+; GFX10-32-NEXT:    s_branch BB7_4
+; GFX10-32-NEXT:  BB7_8: ; %.return
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
+; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX10-32-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB7_9:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: wqm_deriv_loop:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-64-NEXT:    s_mov_b32 s2, 0
+; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB7_9
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
+; GFX10-64-NEXT:    s_wqm_b64 s[6:7], s[0:1]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[6:7]
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:    s_mov_b64 s[4:5], 0
+; GFX10-64-NEXT:    s_branch BB7_5
+; GFX10-64-NEXT:  BB7_4: ; %.continue1
+; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX10-64-NEXT:    s_add_i32 s2, s2, 1
+; GFX10-64-NEXT:    v_cmp_ge_i32_e32 vcc, s2, v1
+; GFX10-64-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX10-64-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:    s_cbranch_execz BB7_8
+; GFX10-64-NEXT:  BB7_5: ; %.continue0
+; GFX10-64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-64-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, s2, 0, s[6:7]
+; GFX10-64-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; GFX10-64-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-64-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX10-64-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
+; GFX10-64-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
+; GFX10-64-NEXT:    s_cbranch_execz BB7_4
+; GFX10-64-NEXT:  ; %bb.6: ; %.demote1
+; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB7_9
+; GFX10-64-NEXT:  ; %bb.7: ; %.demote1
+; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-64-NEXT:    s_wqm_b64 s[8:9], s[0:1]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[8:9]
+; GFX10-64-NEXT:    s_branch BB7_4
+; GFX10-64-NEXT:  BB7_8: ; %.return
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX10-64-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB7_9:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry:
+  %p0 = extractelement <2 x float> %input, i32 0
+  %p1 = extractelement <2 x float> %input, i32 1
+  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %argi = fptosi float %arg to i32
+  %cond0 = icmp eq i32 %argi, 0
+  br i1 %cond0, label %.continue0, label %.demote0
+
+; Pre-loop unconditional demote of the lanes that take this path.
+.demote0:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue0
+
+; Loop header: reads the live mask each iteration and feeds it into a DPP
+; derivative, so demotes inside the loop must preserve helper-lane execution.
+.continue0:
+  %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ]
+  %live = call i1 @llvm.amdgcn.live.mask()
+  %live.cond = select i1 %live, i32 0, i32 %count
+  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
+  %live.v0f = bitcast i32 %live.v0 to float
+  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
+  %live.v1f = bitcast i32 %live.v1 to float
+  %v0 = fsub float %live.v0f, %live.v1f
+  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
+  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
+  %cond2 = and i1 %live, %cond1
+  br i1 %cond2, label %.continue1, label %.demote1
+
+; In-loop demote: because the loop iterates back to more WQM work, exec is
+; restored to the WQM of the updated live mask (see %.demote1 checks above).
+.demote1:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue1
+
+.continue1:
+  %next = add i32 %count, 1
+  %loop.cond = icmp slt i32 %next, %limit
+  br i1 %loop.cond, label %.continue0, label %.return
+
+.return:
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
+  ret void
+}
+
+; Intrinsic declarations and attribute groups shared by the tests above.
+; wqm.demote removes the calling lanes from the live mask when its i1
+; argument is false; live.mask returns true only for lanes still live
+; (i.e. not demoted to helper lanes).
+declare void @llvm.amdgcn.wqm.demote(i1) #0
+declare i1 @llvm.amdgcn.live.mask() #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare float @llvm.amdgcn.wqm.f32(float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
+declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
+declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
+declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4
+
+; #4 is convergent: mov.dpp is a cross-lane op and must not be sunk/hoisted
+; across control-flow that changes the set of active lanes.
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readnone speculatable }
+attributes #3 = { inaccessiblememonly nounwind }
+attributes #4 = { convergent nounwind readnone }


        


More information about the llvm-commits mailing list